# Configuration hash for the 'uplug-coocfreq' module
# (co-occurrence frequency counter).
#
# Top-level sections:
#   module      - program name, location and display name
#   description - short human-readable summary
#   input       - input streams read by the module
#   output      - output streams written by the module
#   parameter   - token and runtime settings
#   arguments   - short option names mapped to 'section:...:key' paths
#   widgets     - empty in this file
#
# Every '#' comment must stay on its own physical line (or at the end of
# one): a comment runs to the end of the line, so joining lines would
# silently comment out the entries that follow it.
{
  'module' => {
    'program'  => 'uplug-coocfreq',
    'location' => '$UplugBin',
    'name'     => 'co-occurrence frequency counter',
    # 'stdin' => 'bitext',
  },

  'description' => 'This module counts co-occurrence frequencies of words and phrases.',

  'input' => {
    'bitext' => {
      'stream name' => 'runtime xml',
    },
  },

  # All output streams are tab-format files that are overwritten on each run.
  'output' => {
    'cooc freq' => {
      'file'       => 'data/runtime/cooc.tab',
      'format'     => 'tab',
      'write_mode' => 'overwrite',
    },
    'source freq' => {
      'file'       => 'data/runtime/src.tab',
      'format'     => 'tab',
      'write_mode' => 'overwrite',
    },
    'target freq' => {
      'file'       => 'data/runtime/trg.tab',
      'format'     => 'tab',
      'write_mode' => 'overwrite',
    },
    'source vocabulary' => {
      'file'       => 'data/runtime/src.voc',
      'format'     => 'tab',
      'write_mode' => 'overwrite',
    },
    'target vocabulary' => {
      'file'       => 'data/runtime/trg.voc',
      'format'     => 'tab',
      'write_mode' => 'overwrite',
    },
  },

  'parameter' => {
    'token' => {

      #------------------------------------------------------------------
      # token pair features
      # define contextual features for counting
      # for example:
      #
      # 'features (source)' => {        # source language features:
      #   'left:pos' => '^(..).*$$1',   # first 2 characters of the POS of the left neighbor
      #   '#text' => '(.{4})$$1',       # last 4 characters of the word itself
      #   'c.*:right:type' => undef,    # type attribute of a right neighbor
      # },                              # of a parent tag that starts with 'c'
      # 'features (target)' => {        # target language features:
      #   'pos' => undef,               # POS attribute of the current token
      # },
      #
      #------------------------------------------------------------------

      #------------------------------------------------------------------
      # other token parameters:
      #   chunks: use marked chunks, argument: xml-tag-pattern
      #   minimal frequency: threshold for token pair frequencies
      #
      # 'minimal length diff' => 0.1,           # string length difference ratio
      # 'matching word class' => 'same',        # don't mix content and stop words
      # 'minimal length (source)' => 2,
      # 'minimal length (target)' => 2,
      # 'use attribute (source)' => 'stem',     # use the 'stem' attribute
      # 'use attribute (target)' => 'stem',     # for all tokens
      # 'grep token (source)' => 'alphabetic',  # restrict tokens to
      # 'grep token (target)' => 'alphabetic',  # alphabetic only
      # 'exclude stop words (source)' => 0,     # don't count stop words
      # 'exclude stop words (target)' => 0,
      # 'language (source)' => 'english',       # use language-specific
      # 'language (target)' => 'swedish',       # information (inilang.ini)
      #------------------------------------------------------------------

      'chunks (source)'               => 'c.*',  # use marked chunks
      'chunks (target)'               => 'c.*',  # use marked chunks
      'minimal frequency'             => 2,
      'minimal frequency (source)'    => 2,
      'minimal frequency (target)'    => 2,
      'maximal ngram length (source)' => 1,      # >1 --> use N-grams
      'maximal ngram length (target)' => 1,      # >1 --> use N-grams
      'lower case (source)'           => 0,      # =1 --> lower case
      'lower case (target)'           => 0,      # =1 --> lower case
      'token label'                   => 'w',    # xml-tag for (single) tokens
      'remove linked'                 => 1,      # =1 --> don't count aligned data!
    },

    'runtime' => {

      #------------------------------------------------------------------
      # runtime parameters
      # NOTE(review): a '#' directly precedes the next entry in the
      # flattened original, so it is kept commented out here -- confirm
      # whether 'print progress' was meant to be active.
      # 'print progress' => 1,        # verbose output
      'buffer'        => 2000000,     # number of token pairs buffered in a hash
      'source buffer' => 2000000,     # source token buffer
      'target buffer' => 2000000,     # target token buffer

      #------------------------------------------------------------
      # clean buffer:
      #   if set to 1: remove low-frequency pairs from the buffer in
      #   cases of buffer overflows
      'clean buffer' => 1,
      #------------------------------------------------------------
    },
  },

  #------------------------------------------------------------------
  # Short argument names, each mapped to a colon-separated path into
  # the sections above.
  'arguments' => {
    'shortcuts' => {
      'in'       => 'input:bitext:file',
      'informat' => 'input:bitext:format',
      'src'      => 'output:source freq:file',
      'trg'      => 'output:target freq:file',
      'cooc'     => 'output:cooc freq:file',
      'freq'     => 'parameter:token:minimal frequency',
      'srclang'  => 'parameter:token:language (source)',
      'trglang'  => 'parameter:token:language (target)',
      'max'      => 'parameter:runtime:max nr of segments',
      'buf'      => 'parameter:runtime:buffer',
      'clean'    => 'parameter:runtime:clean buffer',
      'sa'       => 'parameter:token:use attribute (source)',
      'ta'       => 'parameter:token:use attribute (target)',
      'w'        => 'parameter:token:token label',
    },
  },

  'widgets' => {
  },
}