The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#
# ngramstat.pl: n-gram statistics
#
#---------------------------------------------------------------------------
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#---------------------------------------------------------------------------
#
# $Id$
#
# usage: 
#
#

use strict;
use FindBin qw($Bin);
use lib "$Bin/../lib";


use Uplug::IO::Any;
use Uplug::Data;
use Uplug::Config;
use Uplug::CoocStat;

my %IniData=&GetDefaultIni;
my $IniFile='NgramStat.ini';
&CheckParameter(\%IniData,\@ARGV,$IniFile);

#---------------------------------------------------------------------------

my ($InputStreamName,$InputStream)=             # input stream
    each %{$IniData{'input'}};
my ($OutputStreamName,$OutputStream)=             # output stream
    each %{$IniData{'output'}};

my $input=Uplug::IO::Any->new($InputStream);
my $output=Uplug::IO::Any->new($OutputStream);

#---------------------------------------------------------------------------

$input->open('read',$InputStream);
$output->open('write',$OutputStream) || exit;

#---------------------------------------------------------------------------

my $Param={};
$Param=$IniData{parameter};
my $FreqHeader=$input->header();

my $MinFreq=$IniData{parameter}{token}{'minimal frequency'};
my $measure=$IniData{parameter}{statistics}{measure};
my $MinScore=$IniData{parameter}{statistics}{'minimal score'};
my $precision=$IniData{parameter}{statistics}{precision};
my $PrintProgr=$IniData{parameter}{runtime}{'print progress'};

if (ref($FreqHeader) eq 'HASH'){
    if ((not defined $MinFreq) and 
	(defined $FreqHeader->{'minimal frequency'})){
	$MinFreq=$FreqHeader->{'minimal frequency'};
    }
    $FreqHeader->{measure}=$measure;
    $FreqHeader->{threshold}=$MinScore;
    $output->addheader($FreqHeader);
}

my $stat=Uplug::CoocStat->new($measure);
if (not ref($stat)){die "# ngramstat.pl: cannot find '$measure'!\n";}

#---------------------------------------------------------------------------

if ($PrintProgr){
    print STDERR "read frequencies\n";
}

my $data=Uplug::Data->new();
my $time=time();
my $count=0;
my $accepted=0;

while ($input->read($data)){

    if ($PrintProgr){
	$count++;
	if (not ($count % 500)){
	    $|=1;print STDERR '.';$|=0;
	}
	if (not ($count % 10000)){
	    $|=1;
	    print STDERR "$accepted ($count) ngrams (";
	    print STDERR time()-$time;
	    print STDERR " sec)\n";
	    $|=0;
	}
    }

    my $tok=$data->data();
    my @words=split(/\s+/,$tok->{token});
    my $len=@words;
    if ($len<2){next;}

    my ($left,$right);
    my ($first,$second);
    if ($measure=~/left/i){
	$second=shift(@words);                # take the initial word
    }
    else{
	$second=pop(@words);                  # take the final word
    }
    $first=join (' ',@words);
    $left=&GetTokFreq($input,$first);
    $right=&GetTokFreq($input,$second);

    my $NgramLength=$#words+2;
    my $NrNgrams=$FreqHeader->{"$NgramLength-gram freq"};
    my $score=$stat->compute($tok->{freq},$left,$right,$NrNgrams);


########### old: use built-in sub's
#
#       my $score;
#	my $LenLeft=scalar @words;
#	my $LenRight=1;
#	if ($measure=~/dice/i){
#	    $score=&Dice($tok->{freq},$left,$right);
#	}
#	if ($measure=~/mutual/i){
#	    $score=&Mutual($tok->{freq},
#			   $left,$right,
#			   $LenLeft,$LenRight,
#			   $FreqHeader);
#	}
#	if ($measure=~/t-?score/i){
#	    $score=&Tscore($tok->{freq},
#			   $left,$right,
#			   $LenLeft,$LenRight,
#			   $FreqHeader);
#	}
#######################

    if (defined $precision){
	$score=int($score*10**$precision+0.5)/(10**$precision);
    }
    if ($MinScore){
	if ($score<$MinScore){next;}
    }

    $accepted++;
    my $OutData=Uplug::Data->new;
    $OutData->setAttribute('ngram',$tok->{token});
    $OutData->setAttribute('score',$score);

    $output->write($OutData);
}


$input->close;
$output->close;



#---------------------------------------------------------------------------
#---------------------------------------------------------------------------

sub Dice{
    my ($freq,$left,$right)=@_;
    my $score=0;
    if ($freq and $left and $right){
	$score=$freq*2/($left+$right);
    }
    return $score;
}

sub Mutual{
    my ($freq,$left,$right,$LenLeft,$LenRight,$counts)=@_;
    my $LenAll=$LenLeft+$LenRight;

    my $CountLeft=$counts->{"$LenLeft-gram freq"};
    my $CountRight=$counts->{"$LenRight-gram freq"};
    my $CountAll=$counts->{"$LenAll-gram freq"};

    my $score=0;
    if ($CountLeft and $CountRight and $CountAll){
	my $probleft=$left/$CountLeft;
	my $probright=$right/$CountRight;
	my $probpair=$freq/$CountAll;
	if ($probleft and $probright and $probpair){
	    $score=log(($probpair)/
		       (($probleft)*($probright)))/
			   log(2);
	}
    }
    return $score;
}

sub Tscore{
    my ($freq,$left,$right,$LenLeft,$LenRight,$counts)=@_;
    my $LenAll=$LenLeft+$LenRight;

    my $CountLeft=$counts->{"$LenLeft-gram freq"};
    my $CountRight=$counts->{"$LenRight-gram freq"};
    my $CountAll=$counts->{"$LenAll-gram freq"};

    my $score=0;
    if ($CountLeft and $CountRight and $CountAll){
	my $probleft=$left/$CountLeft;
	my $probright=$right/$CountRight;
	my $probpair=$freq/$CountAll;
	if ($probleft and $probright and $probpair){
	    $score=($probpair-$probleft*$probright)/
		sqrt($probpair/$CountAll);
	}
    }
    return $score;
}


#---------------------------------------------------------------------------

sub GetTokFreq{
    my ($freq,$token)=@_;
    my %search=('token' => $token);
    my $found=Uplug::Data->new;
    if ($freq->select($found,\%search)){
	return $found->attribute('freq');
    }
    return 0;
}


#---------------------------------------------------------------------------

sub GetDefaultIni{

    my $DefaultIni = {
  'module' => {
    'name' => 'N-gram statistics',
    'program' => 'ngramstat.pl',
    'location' => '$UplugBin',
    'stin' => 'text',
    'stdout' => 'text',
  },
  'input' => {
    'ngram freq' => {
      'format' => 'DBM',
      'key' => ['token']
    },
  },
  'output' => {
    'ngram stat' => {
       'format' => 'uwa tab',
       'columns' => ['ngram'],
       'write_mode' => 'overwrite',
    },
  },
  'parameter' => {
    'token' => {
      'minimal frequency' => 2,
#      'delimiter' => '\s+',
    },
    'statistics' => {
       'minimal score' => 0.3,
       'measure' => 'dice',
    },
    'runtime' => {
      'print progress' => 1,
    },
  },
  'arguments' => {
    'shortcuts' => {
       'in' => 'input:ngram freq:file',
       'infile' => 'input:ngram freq:file',
       'informat' => 'input:ngram freq:format',
       'out' => 'output:ngram stat:file',
       'stat' => 'parameter:statistics:measure',
       'min' => 'parameter:statistics:minimal score',
       'prec' => 'parameter:statistics:precision',
       'lang' => 'parameter:token:language (target)',
    }
  },
};
    return %{$DefaultIni};
}