#!/usr/bin/perl #----------------------------------------------------------------------- # -*-perl-*- # # Copyright (C) 2004 Jörg Tiedemann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # declclue.pl: load declarative clues into DBM-databases # this script OVERWRITES ./clue.dbm and # ./clue.dbm.head #----------------------------------------------------------------------- # usage: declclue.pl [OPTIONS] target1 frequency1 # source2 target2 frequency2 # # frequencies are optional! # #----------------------------------------------------------------------- # example clue.db file: # # #------------------------------------------------------------- # #<--- this is column 0 in the file # # data fields have to be speparated with ONE tab-character # # # # features (source): { 'c.*:type' => undef } # # features (target): { 'c.*:type' => undef } # # # NP NP # NPMAX NP PP 10 # NPMAX NP APMIN PP 5 # NPMAX NP PP NP 200 # NPMAX NP 8 # APMIN ADVP # PP PP # VC VP # ADVP ADVP # INFP VP # #------------------------------- end of the file ------------ # # use strict; use FindBin qw($Bin); use lib "$Bin/.."; use Uplug::Data; use Uplug::IO::Any; my $prob=0; my $delimiter="\t"; my %dic=('file' => 'clue.dbm', 'format' => 'dbm', 'write_mode' => 'overwrite', 'key' => ['source','target']); my %inStream=('format' => 'tab', 'columns' => ['source','target','value',], 'field delimiter' => $delimiter); while ($ARGV[0]=~/^\-/){ my $opt=shift(@ARGV); if ($opt eq '-d'){ $delimiter=shift(@ARGV); $inStream{'field delimiter'}=$delimiter; } if ($opt eq '-c'){ my $col=shift(@ARGV); @{$inStream{columns}}=split(/[\,\:]/,$col); } if ($opt eq '-ci'){ $inStream{encoding}=shift(@ARGV); } if ($opt eq '-co'){ $dic{encoding}=shift(@ARGV); } if ($opt eq '-p'){$prob=1;} if ($opt eq '-f'){$prob=1;} if ($opt eq '-o'){$dic{file}=shift(@ARGV);} } my %lex=(); my $data=Uplug::Data->new; my $in=Uplug::IO::Any->new(\%inStream); $in->open('read',\%inStream); while ($in->read($data)){ my $src=$data->attribute('source'); my $trg=$data->attribute('target'); if ((not $src) or (not $trg)){next;} my $value=$data->attribute('value'); if (not $value){$value=1;} if ($value<1){$prob=1;} # value<1 --> probabilities if ($prob){$lex{$src}{$trg}=$value;} # probabilities: set value else{$lex{$src}{$trg}+=$value;} # frequencies: add up if (($src=~s/\_/ /gs) or ($trg=~s/\_/ /gs)){ # (for giza-clue:) if ($prob){$lex{$src}{$trg}=$value;} # '_' means ' ' else{$lex{$src}{$trg}+=$value;} } } my $header=$in->header; my $out=Uplug::IO::Any->new(\%dic); $out->open('write',\%dic); $out->addheader($header); $out->writeheader(); foreach my $s (keys %lex){ my $total; if (not $prob){map ($total+=$_,values %{$lex{$s}});} foreach my $t (keys %{$lex{$s}}){ my $score=$lex{$s}{$t}; if (not $prob){ # for frequencies: if (not $total){next;} # calculate relative frequencies $score/=$total; # score = freq/total-freq } my $data=Uplug::Data->new; $data->setAttribute('source',$s); $data->setAttribute('target',$t); $data->setAttribute('score',$score); $out->write($data); } } $out->close; $in->close;