#!/usr/local/bin/perl -w

=head1 NAME

frequency.pl - Compute the distribution of senses in a Senseval-2 data file

=head1 SYNOPSIS

 frequency.pl [OPTIONS] SOURCE

You can find begin.v-test.xml in samples/Data

 frequency.pl begin.v-test.xml

Output =>

 Total Instances = 255
 Total Distinct Senses=3
 Distribution={64.31,21.18,14.51}
 % of Majority Sense = 64.31

Type C<frequency.pl --help> for a quick summary of options

=head1 DESCRIPTION

Displays distribution of senses in a given Senseval-2 file to STDOUT. This
information can be used to better understand the data, and also to decide
to filter low frequency senses (using L<filter.pl>) or balance the
distribution of senses (using L<balance.pl>).

=head1 INPUT

=head2 Required Arguments:

=head4 SOURCE

SOURCE should be a Senseval-2 formatted file. The sense ids are searched by
matching a regex /sense\s*id="S"/.

An instance having multiple sense ids should appear only once with multiple
sense tags.

e.g. If an instance IID has 2 sense ids SID1 and SID2, then in the SOURCE
file, instance IID should be formatted as -

 <instance id="IID">
 <answer instance="IID" senseid="SID1"/>
 <answer instance="IID" senseid="SID2"/>
 <context>
 Context Data comes here ....
 </context>
 </instance>

=head2 Optional Arguments:

=head4 --help

Displays this message.

=head4 --version

Displays the version information.

=head1 OUTPUT

Output displays

=over 4

=item 1. Total number of instances in SOURCE

These are counted by matching regex /instance id=\"ID\"/ for unique
instance ids.

=item 2. Total number of distinct sense tags found in SOURCE

These are searched by matching a regex /sense\s*id="S"/.

=item 3. Sense Distribution

Output shows

 <sense id="S" percent="P"/>

for each sense id found in SOURCE. P is the percentage frequency of the
sense S.

=item 4. % of Majority sense

This will be the highest sense percentage found in SOURCE.

=back

=head2 Sample Output

 Total Instances = 548
 Total Distinct Senses=5
 Distribution={59.49,18.99,13.38,4.70,3.44}
 % of Majority Sense = 59.49

Shows that there are total 548 instances and 5 senses. The senses are
distributed with frequencies {59.49,18.99,13.38,4.70,3.44} where majority
sense has frequency = 59.49

The <sense> tags show the frequency of each individual tag.

=head1 AUTHORS

Ted Pedersen, University of Minnesota, Duluth
tpederse at d.umn.edu

Amruta Purandare, University of Pittsburgh

=head1 COPYRIGHT

Copyright (c) 2002-2008, Amruta Purandare and Ted Pedersen

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to :

The Free Software Foundation, Inc.,
59 Temple Place - Suite 330,
Boston, MA  02111-1307, USA.

=cut

###############################################################################
#                          THE CODE STARTS HERE
###############################################################################

use strict;
use warnings;

#                       ================================
#                        COMMAND LINE OPTIONS AND USAGE
#                       ================================

# command line options
use Getopt::Long;

# Options are bound to lexicals so the script is clean under "use strict".
# An option is "given" when its variable is defined.
my ($opt_help, $opt_version);
GetOptions('help' => \$opt_help, 'version' => \$opt_version);

# show help option
if (defined $opt_help) {
    showhelp();
    exit;
}

# show version information
if (defined $opt_version) {
    showversion();
    exit;
}

# show minimal usage message if no arguments
if (@ARGV < 1) {
    showminimal();
    exit;
}

#############################################################################

#                       ================================
#                          INITIALIZATION AND INPUT
#                       ================================

# $0 contains the program name along with the complete path.
# Extract just the program name and use it in error messages.
$0 =~ s{.*/(.+)}{$1};

# accept input (the usage check above guarantees $ARGV[0] exists)
my $infile = $ARGV[0];

# check if exists
if (!-e $infile) {
    print STDERR "ERROR($0): Source file <$infile> doesn't exist ... \n";
    exit;
}

# Three-argument open with an explicit read mode: a file name that starts
# with '>' or ends with '|' must never be interpreted as a mode or a command.
open(my $in, '<', $infile)
    or die "ERROR($0): Error(code=$!) in opening Source file <$infile>.\n";

##############################################################################

#                       ==============================
#                             Get Tag Frequency
#                       ==============================

my $total     = 0;    # number of sense tags seen (all senses together)
my $instances = 0;    # number of instance tags seen
my %histo;            # sense id    => number of occurrences
my %instance_hash;    # instance id => 1, used to detect repeated ids

while (my $line = <$in>) {
    # get the instance id
    if ($line =~ /instance id=\"([^\"]+)\"/) {
        my $id = $1;
        if (defined $instance_hash{$id}) {
            # A repeated id is reported, but the instance is still counted.
            print STDERR "ERROR($0): Instance Id <$id> is repeated in file <$infile>.\n";
        }
        $instances++;
        $instance_hash{$id} = 1;
    }
    # get the sense tag (at most one sense id is picked up per input line)
    if ($line =~ /sense\s*id=\"([^\"]+)\"/) {
        $histo{$1}++;
        $total++;
    }
}
close $in;

# now find percentages
# The loop body only runs when %histo is non-empty, so $total is never 0 here.
my @sense_distri;
foreach my $sense (sort keys %histo) {
    my $percent = sprintf("%2.2f", $histo{$sense} / $total * 100);
    # NOTE(review): the per-sense tag text had been stripped from this print
    # (only "\n" was left). It is restored to match the OUTPUT section of the
    # POD ("for each sense id ... P is the percentage frequency of the sense
    # S"); confirm the exact attribute layout against a reference copy.
    print "<sense id=\"$sense\" percent=\"$percent\"/>\n";
    push @sense_distri, $percent;
}

# sort frequency array in descending order
my @sorted   = sort { $b <=> $a } @sense_distri;
my $distinct = scalar @sense_distri;

# display
print "Total Instances = $instances\n";
print "Total Distinct Senses=$distinct\nDistribution={" . join(",", @sorted) . "}\n";
if (defined $sorted[0]) {
    print "% of Majority Sense = $sorted[0]\n";
}

##############################################################################

#                      ==========================
#                          SUBROUTINE SECTION
#                      ==========================

#-----------------------------------------------------------------------------
# show minimal usage message
sub showminimal {
    print "Usage: frequency.pl [OPTIONS] SOURCE";
    print "\nTYPE frequency.pl --help for help\n";
    return;
}

#-----------------------------------------------------------------------------
# show help
sub showhelp {
    # Single-quoted heredoc: nothing is interpolated, so the regex is shown
    # to the user exactly as written below.
    print <<'END_HELP';
Usage:  frequency.pl [OPTIONS] SOURCE

Displays total number of instances, total distinct senses, sense distribution
and % frequency of majority sense in a given Senseval-2 formatted SOURCE file.

SOURCE
        Specify a Senseval-2 formatted file for which sense distribution is to be
        shown. Sense ids for instances in SOURCE are searched by matching regex
        /sense\s*id="S"/.

OPTIONS:
--help
        Displays this message.
--version
        Displays the version information.
END_HELP
    return;
}

#------------------------------------------------------------------------------
# version information
sub showversion {
#    print "frequency.pl      -       Version 0.11";
    print '$Id: frequency.pl,v 1.11 2008/03/29 20:52:27 tpederse Exp $';
    print "\nDisplay sense distribution of a given Senseval-2 file\n";
#    print "\nCopyright (c) 2002-2005, Amruta Purandare, Ted Pedersen.\n";
#    print "Date of Last Update:     05/07/2003\n";
    return;
}

#############################################################################