#!/usr/bin/env perl -w #-*-perl-*- # # this is a sentence aligner based on Phillip Koehn's implementation used # for aligning the EuroParl corpus # http://people.csail.mit.edu/koehn/publications/europarl/ # (and his aligner is based on Gale&Church's length-based alignment approach) # # USAGE: sentalign [OPTIONS] srcfile trgfile > alignfile # # OPTIONS # # -h 'hard-tag-re' .......... regular expression to match hard boundary tags # -i min_length ............. check for identical words (set minimal length) # -c threshold .............. use LCSR cognate filter (with threshold) # -w window ................. define size of sliding window (for all filters) # -d dic-file ............... use dictionary filter (using dic-file) # (the dictionary has to be in UTF-8!) # -t ........................ use time-slot length instead of char length # -u ........................ cognate/identical filter for upper case words # -v ........................ verbose mode # # # TODO # # - re-align + dictionary filter? # - distance weighted cognate filter # - character pair weights for LCSR # - add at least a warning if number of hardboundaries is not identical # use strict; use vars qw($opt_h $opt_l $opt_c $opt_w $opt_d $opt_i $opt_v $opt_t $opt_u); use Getopt::Std; use XML::Parser; use Encode; getopts('h:c:w:l:i:d:vtu'); ## set some global variables ## mostly command line arguments my $VERBOSE = $opt_v; my $SENT_TAG = 's'; my $WORD_TAG = 'w'; my $HARD_TAG = $opt_h; # hard tag is a regular expression! my $USE_IDENTICAL = $opt_i; # use cognate filter (identical words) my $USE_COGNATES = $opt_c; # use cognate filter (lcsr) my $USE_DICTIONARY = $opt_d; # use dictionary filter my $UPPER_CASE = $opt_u; # cognate filter with upper case words only my $TIME_LENGTH = $opt_t; # align based on time slot length my $MINLENGTH = $opt_l || 7; my $WINDOW = $opt_w || 5; ## read dictionary if necessary my %DIC=(); if ($USE_DICTIONARY){ ReadDictionary(\%DIC,$USE_DICTIONARY); } ## open source and target files my $src_file = shift(@ARGV); my $trg_file = shift(@ARGV); if ($src_file=~/\.gz$/){ open SRC,"gzip -cd < $src_file |" || die "cannot open $src_file!\n"; } else{open SRC,"<$src_file" || die "cannot open $src_file!\n";} if ($trg_file=~/\.gz$/){ open TRG,"gzip -cd < $trg_file |" || die "cannot open $trg_file!\n"; } else{ open TRG,"<$trg_file" || die "cannot open $trg_file!\n"; } ## create & initialize XML parser instances my $src_parser = new XML::Parser(Handlers => {Start => \&XmlStart, End => \&XmlEnd, Default => \&XmlChar, },); my $trg_parser = new XML::Parser(Handlers => {Start => \&XmlStart, End => \&XmlEnd, Default => \&XmlChar, },); my $src_ph = $src_parser->parse_start; my $trg_ph = $trg_parser->parse_start; if ($USE_COGNATES || $USE_IDENTICAL || $USE_DICTIONARY){ $src_ph->{STORE_WORDS} = 1; # we want to store the actual words $trg_ph->{STORE_WORDS} = 1; # if we want to apply a filter for } ##--------------------------------------------------------------------- ## now the main part! ## - print the XML header for sentence alignment file ## - run through the XML documents (one section section pair at a time) ## - close XML files and print XML footer &print_xmlheader($src_file,$trg_file); my ($src_len,$trg_len,$src_id,$trg_id); my $LINK_ID=0; do{ ($src_len,$trg_len, $src_id,$trg_id) = &GetNextBitextSegment(*SRC,*TRG,$src_ph,$trg_ph); if (@{$src_len} or @{$trg_len}){ my @links = &sentence_align($src_len,$trg_len,$src_id,$trg_id); print_alignments(\@links); } } until ((not @{$src_len}) && (not @{$trg_len})); close SRC; close TRG; &print_xmlfooter(); ## end of main part ##--------------------------------------------------------------------- sub print_xmlheader{ my ($src_file,$trg_file) = @_; print ''."\n"; print ''."\n"; print ''."\n"; print ' '."\n"; } sub print_xmlfooter{ print "\n"; print "\n"; } ##--------------------------------------------------------------------- ## read sections from the XML documents ## read one sentence sub ReadNextSentence{ my ($FH,$parser) = @_; $parser->{SENTENCE_END} = 0; while (not $parser->{SENTENCE_END}){ my $line = <$FH>; # read next line if (not $line){ # end of file? $parser->{SENTENCE_END}=1; # --> stop $parser->{SECTION_END}=1; } else{ $parser->parse_more($line); # else: parse line } } } ## read one section sub GetNextBitextSegment{ my ($SFH,$TFH,$src_parser,$trg_parser) = @_; my @src_length=(); my @trg_length=(); my @src_ids=(); my @trg_ids=(); my $last_src = undef; # idx of last source sentence in section my $last_trg = undef; # idx of last target sentence in section $src_parser->{SENT_IDS} = [] unless defined $src_parser->{SENT_IDS}; $trg_parser->{SENT_IDS} = [] unless defined $trg_parser->{SENT_IDS}; if (not @{$src_parser->{SENT_IDS}}){ # no more sentences stored? $src_parser->{SENT_LENGTHS} = [0]; # start again at position 0 $src_parser->{SECTION_END} = 0; # reset end-of-section flag if ($TIME_LENGTH){ $src_parser->{TIME_LENGTHS} = [0]; } } if (not @{$trg_parser->{SENT_IDS}}){ $trg_parser->{SENT_LENGTHS} = [0]; $trg_parser->{SECTION_END} = 0; if ($TIME_LENGTH){ $trg_parser->{TIME_LENGTHS} = [0]; } } ## main loop to read sentences from XML files while ((not $src_parser->{SECTION_END}) || (not $trg_parser->{SECTION_END})){ ## we like to have about the same number of sentences in both languages ## --> read source sentences while there are less than target sents ## --> read target sentences while there are less than source sents ## (stop if end-of-section reached) if (($#{$src_parser->{SENT_IDS}} < $#{$trg_parser->{SENT_IDS}}) && (not $src_parser->{SECTION_END})){ ReadNextSentence($SFH,$src_parser); } elsif (($#{$trg_parser->{SENT_IDS}} < $#{$src_parser->{SENT_IDS}}) && (not $trg_parser->{SECTION_END})){ ReadNextSentence($TFH,$trg_parser); } ## in case we reached end-of-section in one of them ## or we have the same amount of sentences ## --> read from source and target else{ if (not $src_parser->{SECTION_END}){ ReadNextSentence($SFH,$src_parser); } if (not $trg_parser->{SECTION_END}){ ReadNextSentence($TFH,$trg_parser); } } ## apply word filter to find possible boundaries ($last_src,$last_trg) = WordFilter($src_parser,$trg_parser); if ((defined $last_src) && (defined $last_trg)){last;} } # if no filter sugestion any new section borders # --> take all sentences! if (not defined $last_src){ $last_src = $#{$src_parser->{SENT_IDS}}; } if (not defined $last_trg){ $last_trg = $#{$trg_parser->{SENT_IDS}}; } # get sentence parameters from section if ($last_src>=0){ foreach (0..$last_src){ my $time_length = shift(@{$src_parser->{TIME_LENGTHS}}); my $char_length = shift(@{$src_parser->{SENT_LENGTHS}}); shift(@{$src_parser->{TIME}}); if ($TIME_LENGTH){push(@src_length,$time_length);} else{push(@src_length,$char_length);} push(@src_ids,shift(@{$src_parser->{SENT_IDS}})); if (defined $src_parser->{WORDS}){ shift(@{$src_parser->{WORDS}}); } } if ($TIME_LENGTH){ push(@src_length,$src_parser->{TIME_LENGTHS}->[-1]);; } else{ push(@src_length,$src_parser->{SENT_LENGTHS}->[-1]);; } } if ($last_trg>=0){ foreach (0..$last_trg){ my $time_length = shift(@{$trg_parser->{TIME_LENGTHS}}); my $char_length = shift(@{$trg_parser->{SENT_LENGTHS}}); shift(@{$trg_parser->{TIME}}); if ($TIME_LENGTH){push(@trg_length,$time_length);} else{push(@trg_length,$char_length);} push(@trg_ids,shift(@{$trg_parser->{SENT_IDS}})); if (defined $trg_parser->{WORDS}){ shift(@{$trg_parser->{WORDS}}); } } if ($TIME_LENGTH){ push(@trg_length,$trg_parser->{TIME_LENGTHS}->[-1]);; } else{ push(@trg_length,$trg_parser->{SENT_LENGTHS}->[-1]);; } } return (\@src_length,\@trg_length,\@src_ids,\@trg_ids); } ##--------------------------------------------------------------------- ## WordFilter ## run with a sliding window through the bitext segment ## and check if there are matching word pairs ## in the sentence pairs considered sub WordFilter{ my ($src_parser,$trg_parser)=@_; my $i = $#{$src_parser->{SENT_IDS}}; my $j = $#{$trg_parser->{SENT_IDS}}; if (not exists $src_parser->{WORDS}){return (undef,undef);} if (not exists $trg_parser->{WORDS}){return (undef,undef);} my $src_sents = $src_parser->{WORDS}; my $trg_sents = $trg_parser->{WORDS}; if ($i<0){return ($i,$j);} if ($j<0){return ($i,$j);} my $w = 0; while ($w < $WINDOW){ if ($i>$j){ if ($i-$w >= 0 && $i-$w <= $j){ if (find_match($$src_sents[$i],$$trg_sents[$i-$w])){ print STDERR " in ",$i,' ',$i-$w,"!\n" if $VERBOSE; return($i,$i-$w); } } if ($w > 0 && $i+$w <= $j){ if (find_match($$src_sents[$i],$$trg_sents[$i+$w])){ print STDERR " in ",$i,' ',$i+$w,"!\n" if $VERBOSE; return($i-$w,$j); } } } else{ if ($j-$w >= 0 && $j-$w <= $i){ if (find_match($$src_sents[$j-$w],$$trg_sents[$j])){ print STDERR " in ",$j-$w,' ',$j,"!\n" if $VERBOSE; return($j-$w,$j); } } if ($w > 0 && $j+$w <= $i){ if (find_match($$src_sents[$j+$w],$$trg_sents[$j])){ print STDERR " in ",$j+$w,' ',$j,"!\n" if $VERBOSE; return($j+$w,$j); } } } $w++; } return (undef,undef); } sub find_match{ if ($USE_IDENTICAL){ if (identical(@_,$USE_IDENTICAL)){return 1;} } if ($USE_COGNATES){ if (cognates(@_,$MINLENGTH,$USE_COGNATES)){return 1;} } if ($USE_DICTIONARY){ if (dictionary(@_)){return 1;} } return 0; } sub identical{ my ($src,$trg,$minlength)=@_; ## make lower case version of first word ## to avoid problems with the 'only upper case words' ## (quite ad-hoc) if ($UPPER_CASE){ $$src[0] = lc($$src[0]); $$trg[0] = lc($$trg[0]); } my %src_words=(); my %trg_words=(); foreach (@{$src}){ $src_words{$_}++; } foreach (@{$trg}){ $trg_words{$_}++; } foreach (keys %src_words){ if (length($_)<$minlength){next;} if ($UPPER_CASE){if ($_!~/^\p{Lu}/){next;}} if (defined $trg_words{$_}){ print STDERR "found identical string '$_' " if $VERBOSE; return 1; } } return 0; } sub cognates{ my ($src,$trg,$minlength,$minscore)=@_; ## make lower case version of first word ## to avoid problems with the 'only upper case words' ## (quite ad-hoc) if ($UPPER_CASE){ $$src[0] = lc($$src[0]); $$trg[0] = lc($$trg[0]); } my %src_words=(); my %trg_words=(); foreach (@{$src}){ $src_words{$_}++; } foreach (@{$trg}){ $trg_words{$_}++; } foreach my $s (keys %src_words){ my $s_len = length($s); if ($s_len < $minlength){next;} if ($UPPER_CASE){if ($s!~/^\p{Lu}/){next;}} foreach my $t (keys %trg_words){ my $t_len = length($t); if ($t_len < $minlength){next;} if ($UPPER_CASE){if ($t!~/^\p{Lu}/){next;}} if ($s eq $t){ print STDERR "found cognate '$s' - '$t' " if $VERBOSE; return 1; } if ($s_len > $t_len){ if ($t_len/$s_len < $minscore){next;} if (LCS($s,$t)/$s_len > $minscore){ print STDERR "found cognate '$s' - '$t' " if $VERBOSE; return 1; } } else{ if ($s_len/$t_len < $minscore){next;} if (LCS($s,$t)/$t_len > $minscore){ print STDERR "found cognate '$s' - '$t' " if $VERBOSE; return 1; } } } } return 0; } sub LCS { my ($src,$trg)=@_; my (@l,$i,$j); my @src_let=split(//,$src); # split string into char my @trg_let=split(//,$trg); unshift (@src_let,''); unshift (@trg_let,''); for ($i=0;$i<=$#src_let;$i++){ # initialize the matrix $l[$i][0]=0; } for ($i=0;$i<=$#trg_let;$i++){ $l[0][$i]=0; } # weight function is for $i (1..$#src_let){ for $j (1..$#trg_let){ if ($src_let[$i] eq $trg_let[$j]){ $l[$i][$j]=$l[$i-1][$j-1]+1; } else{ if ($l[$i][$j-1]>$l[$i-1][$j]){ $l[$i][$j]=$l[$i][$j-1]; } else{ $l[$i][$j]=$l[$i-1][$j]; } } } } return $l[$#src_let][$#trg_let]; } sub ReadDictionary{ my ($dic,$file)=@_; if (-e $file){ if ($file=~/\.gz$/){ open DIC,"gzip -cd < $file |" || die "cannot open dictionary file $file!\n"; } else{ open DIC,"< $file " || die "cannot open dictionary file $file!\n"; } binmode(DIC,":encoding(utf8)"); while (){ chomp; my ($src,$trg) = split(/\s/); $$dic{$src}{$trg}++; } } } sub dictionary{ my ($src,$trg)=@_; my %src_words=(); my %trg_words=(); foreach (@{$src}){ $src_words{$_}++; } foreach (@{$trg}){ $trg_words{$_}++; } foreach my $s (keys %src_words){ if (exists $DIC{$s}){ foreach my $t (keys %trg_words){ if (exists $DIC{$s}{$t}){ print STDERR "found in dic '$s' - '$t' " if $VERBOSE; return 1; } } } } return 0; } ##--------------------------------------------------------------------- ## handlers for XML parser sub XmlStart{ my ($p,$e,%attr) = @_; if ($e eq $SENT_TAG){ $p->{START2SID}->{$p->{SENT_LENGTHS}->[-1]}=$attr{id}; push(@{$p->{SENT_IDS}},$attr{id}); push(@{$p->{SENT_LENGTHS}},$p->{SENT_LENGTHS}->[-1]); $p->{SENT_OPEN} = 1; } if ($e eq $WORD_TAG){ $p->{WORD_OPEN} = 1; if ($p->{STORE_WORDS}){ my $sidx = $#{$p->{SENT_IDS}}; push (@{$p->{WORDS}->[$sidx]},''); } } if ($TIME_LENGTH){ if ($e eq 'time'){ if (ref($p->{TIME}) ne 'ARRAY'){ $p->{TIME}=[]; $p->{SEC_PER_CHAR} = 1; # wild guess .... } my $sidx = $#{$p->{SENT_IDS}}; if ($sidx>=0){ ## time at sent start --> easy case: just store as start point if ($p->{SENT_LENGTHS}->[-1] == $p->{SENT_LENGTHS}->[-2]){ $p->{TIME}->[$sidx]->{start}=time2sec($attr{value}); $p->{TIME}->[$sidx]->{first}=$p->{TIME}->[$sidx]->{start}; $p->{TIME}->[$sidx]->{first_pos}=$p->{SENT_LENGTHS}->[-1]; } ## first time tag in this sentence but not at start position elsif ($#{$p->{TIME}} < $sidx){ $p->{TIME}->[$sidx]->{first}=time2sec($attr{value}); $p->{TIME}->[$sidx]->{first_pos}=$p->{SENT_LENGTHS}->[-1]; } ## other time events -> just store as last seen time else{ $p->{TIME}->[$sidx]->{last}=time2sec($attr{value}); $p->{TIME}->[$sidx]->{last_pos}=$p->{SENT_LENGTHS}->[-1]; ## update SEC_PER_CHAR estimation my $sec = $p->{TIME}->[$sidx]->{last} - $p->{TIME}->[$sidx]->{first}; my $chars = $p->{TIME}->[$sidx]->{last_pos} - $p->{TIME}->[$sidx]->{first_pos}; if ($sec>0 && $chars>0){ $p->{SEC_PER_CHAR} = $sec/$chars; } } } } } } sub XmlEnd{ my $p=shift; my $e=shift; if ($e eq $SENT_TAG){ delete $p->{SENT_OPEN}; $p->{SENTENCE_END} = 1; # flag to indicate end-of-sentence } elsif (defined $HARD_TAG){ if ($e=~/$HARD_TAG/){ $p->{SECTION_END} = 1; print STDERR "hard boundary ($e) found!\n" if $VERBOSE; } } if ($e eq $WORD_TAG){ delete $p->{WORD_OPEN}; } if ($TIME_LENGTH && ($e eq $SENT_TAG)){ my $sidx = $#{$p->{SENT_IDS}}; ## hardest case: no time at all found in this sentence if ($#{$p->{TIME}}<$sidx){ if ($sidx == 0){ $p->{TIME}->[$sidx]->{start} = 0; } else{ ## take end of last sentence as start for new one $p->{TIME}->[$sidx]->{start} = $p->{TIME}->[$sidx-1]->{end}; } } elsif (not $p->{TIME}->[$sidx]->{start}){ my $before = $p->{TIME}->[$sidx]->{first_pos} - $p->{SENT_LENGTHS}->[-2]; if (defined $p->{TIME}->[$sidx]->{last}){ my $secs = $p->{TIME}->[$sidx]->{last} - $p->{TIME}->[$sidx]->{first}; my $chars = $p->{TIME}->[$sidx]->{last_pos} - $p->{TIME}->[$sidx]->{first_pos}; if ($secs>0 && $chars>0){ $p->{TIME}->[$sidx]->{start} = $p->{TIME}->[$sidx]->{first} - $before*$secs/$chars; } else{ $p->{TIME}->[$sidx]->{start} = $p->{TIME}->[$sidx]->{first}- $p->{SEC_PER_CHAR}*$before; } } else{ $p->{TIME}->[$sidx]->{start} = $p->{TIME}->[$sidx]->{first} - $p->{SEC_PER_CHAR}*$before; } } if (defined $p->{TIME}->[$sidx]->{last}){ my $after = $p->{SENT_LENGTHS}->[-1] - $p->{TIME}->[$sidx]->{last_pos}; if ($after){ my $secs = $p->{TIME}->[$sidx]->{last} - $p->{TIME}->[$sidx]->{first}; my $chars = $p->{TIME}->[$sidx]->{last_pos} - $p->{TIME}->[$sidx]->{first_pos}; if ($secs>0 && $chars>0){ $p->{TIME}->[$sidx]->{end} = $p->{TIME}->[$sidx]->{last} + $after*$secs/$chars; } else{ $p->{TIME}->[$sidx]->{end} = $p->{TIME}->[$sidx]->{first}+ $after*$p->{SEC_PER_CHAR}; } } else{ $p->{TIME}->[$sidx]->{end} = $p->{TIME}->[$sidx]->{last}; } } else{ my $chars=$p->{SENT_LENGTHS}->[-1]-$p->{SENT_LENGTHS}->[-2]; $p->{TIME}->[$sidx]->{end} = $p->{TIME}->[$sidx]->{start} + $chars*$p->{SEC_PER_CHAR}; } my $time = $p->{TIME}->[$sidx]->{end}-$p->{TIME}->[$sidx]->{start}; if ($time>0){ push (@{$p->{TIME_LENGTHS}},$p->{TIME_LENGTHS}->[-1]+$time); } else{ ## don't allow time<=0 --> add small time length ... print STDERR "warning! sentence $sidx is $time seconds long\n"; push (@{$p->{TIME_LENGTHS}},$p->{TIME_LENGTHS}->[-1]+0.001); } } } sub XmlChar{ my $p=shift; my $e=shift; if ($p->{SENT_OPEN}){ if ($p->{WORD_OPEN}){ $p->{SENT_LENGTHS}->[-1] += length($e); } } if ($p->{STORE_WORDS}){ if ($p->{WORD_OPEN}){ my $sidx = $#{$p->{SENT_IDS}}; $p->{WORDS}->[$sidx]->[-1] .= $e; } } } sub time2sec{ my $time=shift; my ($h,$m,$s,$ms)=split(/[^0-9\-]/,$time); if (not $h){$h=0;} if (not $m){$m=0;} if (not $s){$s=0;} if (not $ms){$ms=0;} my $sec = 3600*$h+60*$m+$s+$ms/1000; return $sec; } ##--------------------------------------------------------------------- ## alignment related functions # print sentence links sub print_alignments{ my $LINKS = shift; foreach my $i (0..$#{$LINKS}){ print '\n"; $LINK_ID++; } } # this is a vanilla implementation of church and gale sub sentence_align { my ($LEN1,$LEN2,$IDS1,$IDS2) = @_; if (not @{$LEN1}){ my @LINKS=(); $LINKS[0]{src} = []; foreach (0..$#{$LEN2}-1){ # push(@{$LINKS[0]{trg}},$$IDS2{$$LEN2[$_]}); push(@{$LINKS[0]{trg}},$$IDS2[$_]); } return @LINKS; } if (not @{$LEN2}){ my @LINKS=(); $LINKS[0]{trg} = []; foreach (0..$#{$LEN1}-1){ # push(@{$LINKS[0]{src}},$$IDS1{$$LEN1[$_]}); push(@{$LINKS[0]{src}},$$IDS1[$_]); } return @LINKS; } # parameters my %PRIOR; # $PRIOR{1}{1} = 0.89; # $PRIOR{1}{0} = 0.01/2; # $PRIOR{0}{1} = 0.01/2; # $PRIOR{2}{1} = 0.089/2; # $PRIOR{1}{2} = 0.089/2; ## $PRIOR{2}{2} = 0.011; # $PRIOR{3}{1} = 0.011/2; # $PRIOR{1}{3} = 0.011/2; # for subtitles: $PRIOR{1}{1} = 0.6829; # 896 $PRIOR{2}{1} = 0.0762; # 100 $PRIOR{0}{1} = 0.0694; # 91 $PRIOR{1}{0} = 0.0564; # 74 $PRIOR{1}{2} = 0.0549; # 72 $PRIOR{1}{3} = 0.0183; # 24 $PRIOR{3}{1} = 0.0122; # 16 # dynamic programming my (@COST,@BACK); $COST[0][0] = 0; for(my $i1=0;$i1<=$#{$LEN1};$i1++) { for(my $i2=0;$i2<=$#{$LEN2};$i2++) { next if $i1 + $i2 == 0; $COST[$i1][$i2] = 1e10; foreach my $d1 (keys %PRIOR) { next if $d1>$i1; foreach my $d2 (keys %{$PRIOR{$d1}}) { next if $d2>$i2; my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) + &match($$LEN1[$i1]-$$LEN1[$i1-$d1], $$LEN2[$i2]-$$LEN2[$i2-$d2]); # print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n"; if ($cost < $COST[$i1][$i2]) { $COST[$i1][$i2] = $cost; @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2); } } } # print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) "; } # print "\n"; } # back tracking my %NEXT=(); my @LINKS=(); my $i1 = $#{$LEN1}; my $i2 = $#{$LEN2}; while($i1>0 || $i2>0) { # print "back $i1 $i2\n"; @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2); ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]); } while($i1<$#{$LEN1} || $i2<$#{$LEN2}) { push(@LINKS,{}); # print "fwd $i1 $i2\n"; # print '\n"; ($i1,$i2) = @{$NEXT{$i1}{$i2}}; } return @LINKS; } sub match { my ($len1,$len2) = @_; my $c = 1; my $s2 = 6.8; if ($len1==0 && $len2==0) { return 0; } my $mean = ($len1 + $len2/$c) / 2; my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean); if ($z < 0) { $z = -$z; } my $pd = 2 * (1 - &pnorm($z)); if ($pd>0) { return -log($pd); } return 25; } sub pnorm { my ($z) = @_; my $t = 1/(1 + 0.2316419 * $z); return 1 - 0.3989423 * exp(-$z * $z / 2) * ((((1.330274429 * $t - 1.821255978) * $t + 1.781477937) * $t - 0.356563782) * $t + 0.319381530) * $t; }