#!/usr/local/bin/perl -w

=head1 NAME

balance.pl - Create a balanced Senseval-2 data file that has the same
number of instances for each possible sense.

=head1 SYNOPSIS

 balance.pl [OPTIONS] DATA N > Balanced-DATA

This is the original distribution of senses in a Senseval-2 data file:

 frequency.pl begin-v.test.xml

Output =>

 Total Instances = 255
 Total Distinct Senses=3
 Distribution={64.31,21.18,14.51}
 % of Majority Sense = 64.31

Here they are balanced with 20 instances per sense.

 balance.pl begin-v.test.xml 20 > bal-output

 frequency.pl bal-output

Output =>

 Total Instances = 60
 Total Distinct Senses=3
 Distribution={33.33,33.33,33.33}
 % of Majority Sense = 33.33

Here they are balanced with 50 instances per sense. Please note that any
sense with less than 50 instances is removed from the data.

 balance.pl begin-v.test.xml 50 > bal-output

 frequency.pl bal-output

Output =>

 Total Instances = 100
 Total Distinct Senses=2
 Distribution={50.00,50.00}
 % of Majority Sense = 50.00

You can find begin-v.test.xml in samples/Data

Type C<balance.pl --help> for a quick summary of options

=head1 DESCRIPTION

This program will choose exactly the same number of instances for each
sense found in a given Senseval-2 file. The number of instances per sense,
N, must be given on the command line (a short usage note is printed
otherwise); any sense that has fewer than N instances is removed from the
output. Output is to STDOUT, so the original input data is unchanged.

=head1 INPUT

=head2 Required Arguments:

=head4 DATA

balance.pl accepts a Senseval-2 data file.

=head4 N

Specifies the number of instances to be selected from each sense.

=head2 Optional Arguments:

=head4 --count COUNT

Balances the COUNT file created by SenseTool's L<preprocess.pl> along with
the DATA file. The COUNT file is balanced such that it stays consistent with
the new balanced DATA file and contains only those instances left after
balancing, in the same order as they appear in the output. The balanced
COUNT is written to the file COUNT.balanced, and every ith line in
COUNT.balanced is the instance data within the C<< <context> >> and
C<< </context> >> tags for the ith instance in the output of balance.pl.

=head3 Other Options :

=head4 --help

Displays this message.

=head4 --version

Displays the version information.

=head1 OUTPUT

Output is a sense balanced Senseval-2 file and is displayed to stdout.
Output will show exactly N instances of each sense that has at least
N instances. All senses in the output Senseval-2 file will have an equal
number of instances, meaning the senses will be equally distributed.

=head1 BUGS

The output of balance.pl will have an un-balanced distribution of senses
when some of the instances have multiple sense tags in the given DATA file.

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Amruta Purandare, University of Pittsburgh

=head1 COPYRIGHT

Copyright (c) 2002-2008, Amruta Purandare and Ted Pedersen

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA.

=cut

=begin comment

Maintainer notes (not rendered in the user documentation).

The program is organised as a set of small subroutines driven by main():

  collect_instances_by_sense  - map each sense id to the instance ids tagged
                                with it
  select_balanced_instances   - randomly pick N instance ids per sense
  filter_selected_text        - print only the selected instances
  write_balanced_count        - copy the matching lines of the --count file
  shuffle                     - in-place Fisher-Yates shuffle

main() only runs when the file is executed directly, so the subroutines can
be loaded with require/do without side effects.

=end comment

=cut

###############################################################################
#                          ====================
#                          THE CODE STARTS HERE
#                          ====================
###############################################################################

use strict;
use warnings;

use Getopt::Long;

# Run as a program only when executed directly (not when loaded by another
# file); the process exit status is whatever main() returns.
exit main() unless caller;

# main()
#
# Parses the command line, balances the DATA file onto STDOUT and, when
# --count is given, writes the matching lines of the count file to
# COUNT.balanced.
#
# Returns the process exit status: 0 on success (and for --help, --version
# and the minimal usage note), 1 on a reported error. Failures to open a
# file that exists are raised with die.
sub main {
    # $0 contains the program name along with the complete path. Extract
    # just the program name for use in error messages.
    ( my $prog = $0 ) =~ s{.*/(.+)}{$1};

    # ==============================
    # COMMAND LINE OPTIONS AND USAGE
    # ==============================
    my ( $opt_help, $opt_version, $opt_count );
    GetOptions(
        'help'    => \$opt_help,
        'version' => \$opt_version,
        'count=s' => \$opt_count,
    );

    if ( defined $opt_help ) {
        showhelp();
        return 0;
    }
    if ( defined $opt_version ) {
        showversion();
        return 0;
    }

    # Both DATA and N are required.
    if ( @ARGV < 2 ) {
        minimal();
        return 0;
    }

    # ========================
    # INITIALIZATION AND INPUT
    # ========================
    my ( $infile, $number ) = @ARGV;

    if ( !-e $infile ) {
        print STDERR "ERROR($prog): Source file $infile doesn't exist.\n";
        return 1;
    }
    open( my $in, '<', $infile )
        or die "ERROR($prog): Error(code=$!) in opening file $infile.\n";

    # N must be a positive integer: zero, negative or non-numeric values
    # would otherwise silently produce a file with every instance removed.
    if ( $number !~ /^\d+$/ || $number < 1 ) {
        print STDERR "ERROR($prog): Please specify the Number of instances to be selected from each sense.\n";
        return 1;
    }

    # --------------------------
    # if count file is provided
    # --------------------------
    my $countfile = $opt_count;
    my ( $count_in, $count_out, $count_outfile );
    if ( defined $countfile ) {
        if ( !-e $countfile ) {
            print STDERR "ERROR($prog): Count file <$countfile> doesn't exist.\n";
            return 1;
        }
        open( $count_in, '<', $countfile )
            or die "Error($prog): Error(code=$!) in opening <$countfile> file.\n";

        # Balanced count output goes next to the original count file.
        $count_outfile = $countfile . ".balanced";
        my $ans = "N";
        if ( -e $count_outfile ) {
            print STDERR "Warning($prog): Balanced file <$count_outfile> for count file <$countfile> already exists, overwrite (y/n)? ";
            $ans = <STDIN>;

            # End-of-file on STDIN counts as "no".
            $ans = "N" unless defined $ans;
        }

        # Only an answer that starts with y/Y confirms the overwrite.
        if ( !-e $count_outfile || $ans =~ /^\s*y/i ) {
            open( $count_out, '>', $count_outfile )
                or die "Error($prog): Error(code=$!) in opening balanced count file <$count_outfile>.\n";
        }
        else {
            # The user declined: balance the DATA file only.
            close $count_in;
            undef $countfile;
        }
    }

    # ================
    # Actual Balancing
    # ================
    my @text = <$in>;
    close $in;

    my $instances   = collect_instances_by_sense( \@text );
    my $selected    = select_balanced_instances( $instances, $number );
    my $count_lines = filter_selected_text( \@text, $selected, \*STDOUT,
        defined $countfile );

    # ---------------------
    # balancing count file
    # ---------------------
    if ( defined $countfile ) {
        my $consistent =
            write_balanced_count( $count_in, $count_out, $count_lines );
        close $count_in;
        close($count_out)
            or die "Error($prog): Error(code=$!) in closing balanced count file <$count_outfile>.\n";

        # Every wanted line number must occur in the count file.
        if ( !$consistent ) {
            print STDERR "ERROR($prog): Data File <$infile> and Count file <$countfile> are inconsistent.\n";
            return 1;
        }
    }

    return 0;
}

# collect_instances_by_sense(\@text)
#
# Scans the lines of a Senseval-2 file and returns a hash reference mapping
# each sense id to an array reference of the instance ids tagged with that
# sense, in file order. A sense tag is attributed to the most recently seen
# instance id; sense tags seen before any instance are ignored.
sub collect_instances_by_sense {
    my ($text) = @_;

    my %instances;
    my $instance;
    for my $line ( @{$text} ) {
        if ( $line =~ /instance id=\"([^\"]+)\"/ ) {
            $instance = $1;
        }
        if ( $line =~ /sense\s*id=\"([^\"]+)\"/ && defined $instance ) {
            push @{ $instances{$1} }, $instance;
        }
    }
    return \%instances;
}

# select_balanced_instances(\%instances, $number)
#
# For every sense that has at least $number instances, shuffles that sense's
# instance list in place and marks its first $number ids as selected. Senses
# with fewer instances contribute nothing.
#
# Returns a hash reference whose keys are the selected instance ids.
sub select_balanced_instances {
    my ( $instances, $number ) = @_;

    my %selected;
    for my $sense ( keys %{$instances} ) {
        my $ids = $instances->{$sense};
        next if @{$ids} < $number;

        # Select randomly by shuffling and taking the top N.
        shuffle($ids);
        $selected{$_} = 1 for @{$ids}[ 0 .. $number - 1 ];
    }
    return \%selected;
}

# filter_selected_text(\@text, \%selected, $out_fh, $track_count)
#
# Prints to $out_fh every line of @text except those belonging to an
# instance whose id is not a key of %selected (from its "instance id" line
# through its closing </instance> line).
#
# Lines strictly between <context> and </context> lines are numbered 1, 2,
# ... across the whole file, whether printed or not. When $track_count is
# true, the numbers of the printed context lines are collected.
#
# Returns an array reference of those line numbers (empty when
# $track_count is false).
sub filter_selected_text {
    my ( $text, $selected, $out_fh, $track_count ) = @_;

    my @count_lines;
    my $write      = 1;
    my $line_num   = 0;
    my $in_context = 0;

    for my $line ( @{$text} ) {
        # The closing tag line itself is not counted as context data.
        $in_context = 0 if $line =~ /<\/context>/;
        $line_num++ if $in_context;

        if ( $line =~ /instance id=\"([^\"]+)\"/ ) {
            $write = 0 if !defined $selected->{$1};
        }

        if ($write) {
            print {$out_fh} $line;
            push @count_lines, $line_num if $track_count && $in_context;
        }

        # Resume writing after a skipped instance has ended.
        $write = 1 if $line =~ /<\/instance>/;

        # Counting starts on the line after the opening tag.
        $in_context = 1 if $line =~ /<context>/;
    }
    return \@count_lines;
}

# write_balanced_count($in_fh, $out_fh, \@count_lines)
#
# Copies from $in_fh to $out_fh exactly those lines whose 1-based line
# number appears in @count_lines (which must be in increasing order).
# Reading stops as soon as the last wanted line has been written.
#
# Returns 1 when every wanted line was found, 0 when the input ended first
# (i.e. the count file is inconsistent with the data file).
sub write_balanced_count {
    my ( $in_fh, $out_fh, $count_lines ) = @_;

    my @wanted   = @{$count_lines};
    my $line_num = 0;
    while ( @wanted && defined( my $line = <$in_fh> ) ) {
        $line_num++;
        next if $line_num != $wanted[0];
        print {$out_fh} $line;
        shift @wanted;
    }

    # Anything still wanted lies beyond the end of the count file.
    return @wanted ? 0 : 1;
}

#-------------------
#shuffle subroutine
#-------------------
#this code is taken from the book Perl Cookbook Chapter 4 that describes
#randomizing array(Page 121-122)
#Reference : Perl Cookbook, Tom Christiansen & Nathan Torkington, O'Reilly
#            publication, 1998, Chapter 4, section 4.17, Randomizing an Array
#
# shuffle(\@array)
#
# Shuffles the referenced array in place (Fisher-Yates). Arrays with fewer
# than two elements are left untouched. Returns nothing.
sub shuffle {
    my ($array) = @_;

    return if @{$array} < 2;

    for ( my $i = $#{$array}; $i > 0; $i-- ) {
        my $j = int rand( $i + 1 );
        next if $i == $j;
        @{$array}[ $i, $j ] = @{$array}[ $j, $i ];
    }
    return;
}

#show minimal usage message
sub minimal {
    print "Usage: balance.pl [OPTIONS] DATA N";
    print "\nTYPE balance.pl --help for help\n";
    return;
}

#show help
sub showhelp {
    print <<'END_HELP';
Usage: balance.pl [OPTIONS] DATA N

Balances sense distribution in a given Senseval-2 formatted DATA file by
randomly selecting exactly N instances of each sense tag.

DATA
        Specify a Senseval-2 file to be balanced for sense tags.

N
        Specify the number of instances to be selected from sense found in
        the SOURCE file.
OPTIONS:
--count COUNT_FILE
        Specify the COUNT_FILE created by preprocess.pl program that shows
        instance data within <context> tags corresponding to each instance
        in a given DATA file. COUNT_FILE will also be balanced along with
        the DATA file and updated COUNT_FILE will be written into
        COUNT_FILE.balanced.
--help
        Displays this message.
--version
        Displays the version information.
END_HELP
    return;
}

#version information
sub showversion {
    print '$Id: balance.pl,v 1.12 2008/03/31 16:19:57 tpederse Exp $';
#    print "balance.pl - Version 0.11\n";
    print "\nBalance sense distribution in a Senseval-2 file\n";
#    print "Copyright (c) 2002-2005, Amruta Purandare, Ted Pedersen.\n";
#    print "Date Of Last Update: 05/23/2003\n";
    return;
}

1;