#!/usr/local/bin/perl -w =head1 NAME balance.pl Created balanced data with same number of senses for each possible outcome =head1 SYNOPSIS Chooses exactly given number of instances of each sense from a given Senseval-2 file. =head1 USGAE balance.pl [OPTIONS] DATA N =head1 INPUT =head2 Required Arguments: =head4 DATA balance.pl accepts a Senseval-2 data file. =head4 N Specifies the number of instances to be selected from each sense. =head2 Optional Arguments: --count COUNT Balances the COUNT file created by SenseTool's preprocess.pl along with the DATA file. COUNT file is balanced such that it stays consistent with the new balanced DATA file and contains only those instances left after balancing, in the same order as they appear in the output. Balanced COUNT is written to file COUNT.balanced and every ith line in COUNT.balanced is instance data within and tags for the ith instance in the output of balance. =head3 Other Options : =head4 --help Displays this message. =head4 --version Displays the version information. =head1 OUTPUT Output is a sense balanced Senseval-2 file and is displayed to stdout. Output will show exactly N instances of each sense that has atleast N instances. All senses in the output Senseval-2 will have equal number of instances meaning the senses will be equally distributed. =head1 BUGS output of balance.pl will have un-balanced distribution of senses when some of the instances have multiple sense tags in the given DATA file. =head1 AUTHOR Amruta Purandare, Ted Pedersen. University of Minnesota, Duluth. =head1 COPYRIGHT Copyright (c) 2002-2005, Amruta Purandare, University of Pittsburgh. amruta@cs.pitt.edu Ted Pedersen, University of Minnesota, Duluth. tpederse@umn.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to The Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =cut ############################################################################### # ==================== # THE CODE STARTS HERE # ==================== ############################################################################### # ============================== # COMMAND LINE OPTIONS AND USAGE # ============================== #command line options use Getopt::Long; GetOptions ("help","version","count=s"); #show help message if(defined $opt_help) { $opt_help=1; &showhelp(); exit; } #show version information if(defined $opt_version) { $opt_version=1; &showversion(); exit; } #show minimal usage note if($#ARGV<1) { &minimal(); exit; } #if --count file is provided if(defined $opt_count) { $countfile=$opt_count; } ############################################################################## # ================================ # INITIALIZATION AND INPUT # ================================ #$0 contains the program name along with #the complete path. Extract just the program #name and use in error messages $0=~s/.*\/(.+)/$1/; #getting the source file name if(!defined $ARGV[0]) { print STDERR "ERROR($0): Please specify Senseval-2 formatted Data file to be balanced.\n"; exit; } $infile=$ARGV[0]; if(!(-e $infile)) { print STDERR "ERROR($0): Source file $infile doesn't exist.\n"; exit; } open(IN,$infile) || die "ERROR($0): Error(code=$!) in opening file $infile.\n"; #getting no of instances to be selected from each sense if(!defined $ARGV[1]) { print STDERR "ERROR($0): Please specify the Number of instances to be selected from each sense.\n"; exit; } $number=$ARGV[1]; # -------------------------- # if count file is provided # -------------------------- if(defined $countfile) { if(!-e $countfile) { print STDERR "ERROR($0): Count file <$countfile> doesn't exist.\n"; exit; } open(COUNT,$countfile) || die "Error($0): Error(code=$!) in opening <$countfile> file.\n"; #----------------------------- # Creating out file for count #----------------------------- $count_outfile=$countfile.".balanced"; $ans="N"; if(-e $count_outfile) { print STDERR "Warning($0): Balanced file <$count_outfile> for count file <$countfile> already exists, overwrite (y/n)? "; $ans=; } if(!-e $count_outfile || $ans=~/Y|y/) { open(COUNT_OUT,">$count_outfile") || die "Error($0): Error(code=$!) in opening balanced count file <$count_outfile>.\n"; } else { undef $countfile; } } ############################################################################## # ==================== # Actual Balancing # ==================== # if sense tagged, get senses from data file while() { push @text,$_; if(/instance id=\"([^\"]+)\"/) { $instance=$1; } if(/sense\s*id=\"([^\"]+)\"/) { # storing instances per sense push @{$instances{$1}},$instance; } } # selecting N instances of each sense foreach $sense (keys %instances) { # shuffle and select only the instances of that sense which have # number of instances > specified number N if($#{$instances{$sense}} >= ($number-1)) { #selecting randomly by shuffling and selecting top N shuffle(\@{$instances{$sense}}); foreach (0..$number-1) { # selected will contain all the instances that are # to be displayed $select_it=shift @{$instances{$sense}}; $selected{$select_it}=1; } } } # now display only the selected instances from the text $write=1; $line_num=0; $count_flag=0; foreach (@text) { if(/<\/context>/) { $count_flag=0; } if($count_flag==1) { $line_num++; } if(/instance id=\"([^\"]+)\"/) { if(!defined $selected{$1}) { $write=0; } } if($write==1) { print; if(defined $opt_count && $count_flag==1) { push @count_lines,$line_num; } } if(/<\/instance>/) { $write=1; } if(//) { $count_flag=1; } } ############################################################################### # --------------------- # balancing count file # --------------------- # @count_lines contains line numbers of lines to be written from count file if(defined $countfile) { $line_num=0; $next_line=shift @count_lines; while() { $line_num++; # write this line if(defined $next_line && $line_num==$next_line) { print COUNT_OUT $_; # get the next line number if($#count_lines>=0) { $next_line=shift @count_lines; } else { last; } } } # catching inconsistency between given count and Data file # all line nos in count_lines array must occur in count file if($#count_lines>=0) { print STDERR "ERROR($0): Data File <$infile> and Count file <$countfile> are inconsistent.\n"; exit; } } ############################################################################## #------------------- #shuffle subroutine #------------------- #this code is taken from the book PerlCookbook Chapter 4 that describes #randomizing array(Page 121-122) #Reference : Perl Cookbook, Tom Christiansen & Nathan Torkington, O'Reilly # publication, 1998, Chapter 4, section 4.17, Randomizing an Array sub shuffle { my $array = shift; my $i; for ($i = @$array; --$i; ) { my $j = int rand ($i+1); next if $i == $j; @$array[$i,$j] = @$array[$j,$i]; } } #show minimal usage message sub minimal() { print "Usage: balance.pl [OPTIONS] DATA N"; print "\nTYPE balance.pl --help for help\n"; } #show help sub showhelp() { print "Usage: balance.pl [OPTIONS] DATA N Balances sense distribution in a given Senseval-2 formatted DATA file by randomly selecting exactly N instances of each sense tag. DATA Specify a Senseval-2 file to be balanced for sense tags. N Specify the number of instances to be selected from sense found in the SOURCE file."; print "\nOPTIONS:"; print " --count COUNT_FILE Specify the COUNT_FILE created by preprocess.pl program that shows instance data within tags corresponding to each instance in a given DATA file. COUNT_FILE will also be balanced along with the DATA file and updated COUNT_FILE will be written into COUNT_FILE.balanced. --help Displays this message."; print " --version Displays the version information.\n"; } #version information sub showversion() { print "balance.pl - Version 0.11\n"; print "A component of SenseClusters Package that balances sense distribution in a\ngiven Senseval-2 file.\n"; print "Copyright (c) 2002-2005, Amruta Purandare, Ted Pedersen.\n"; print "Date Of Last Update: 05/23/2003\n"; }