#!/usr/local/bin/perl -w

=head1 NAME

windower.pl - Limit window of context around a target word specified in a Senseval-2 input file

=head1 SYNOPSIS

Suppose we have a very small Senseval-2 file (small-test.xml) with just 2
instances. We would like to limit the surrounding context to 5 words to the
left and 5 words to the right of the target word:

 windower.pl small-test.xml 5

Output =>

 greats hardly knowns and unknowns begin a game three month season
 late november it expects to begin construction by year end and

This is from the first two lines of the file begin.v-test.xml. You can see
the full contexts at /samples/Data.

Type C<windower.pl --help> for a quick summary of options.

=head1 DESCRIPTION

Limits the contexts of given instances to W tokens around the target word.

=head1 USAGE

 windower.pl [OPTIONS] SVAL2 W

=head1 INPUT

=head2 Required Arguments:

=head3 SVAL2

SVAL2 must be a tokenized and preprocessed instance file in the Senseval-2
format.

=head3 W

Should be a positive integer number specifying the window size. windower
will display only the tokens that appear in the window of [-W, +W] centered
around the target word.

=head2 Optional Arguments:

=head3 --plain

Output will be displayed in plain text format showing the context of each
instance on a single separate line, i.e. the i'th line on stdout will show
the context of the i'th instance in the given SVAL2 file.

By default, output is created in Senseval-2 format.

=head3 --token TOKENREGEX

TOKENREGEX should be a file containing Perl regular expressions that define
the tokenization scheme in SVAL2. windower recognizes only those character
sequences from SVAL2 that match the specified token regex/s; everything else
will be ignored.

If --token is not specified, windower searches the default token.regex file
in the current directory.

=head3 --target TARGETREGEX

Specify a file containing Perl regular expressions that define the target
word/s. Target words must be valid tokens recognizable by the specified
tokenization scheme (via --token or token.regex).

If --target is not specified, windower searches the default target.regex
file in the current directory.

Following are some examples of TARGET word regex files -

=over 4

=item 1. /<head>[Ll]ines?<\/head>/

which specifies that the target word could be line, Line, lines or Lines
delimited in <head> and </head> tags.

=item 2. Above regex can also be specified as multiple regexes in TARGET as -

 /<head>line<\/head>/
 /<head>lines<\/head>/
 /<head>Line<\/head>/
 /<head>Lines<\/head>/

with a single regex per line.

=item 3. Regex /<head>\w+<\/head>/

shows a more general regex for target words marked in <head> tags.

=item 4. Regex /<head.*>\w+<\/head>/

shows the regex for matching target words in the original Senseval-2 data.

=item 5. /[Ll]ines?/

shows that any occurrences of the words Line, line, Lines, lines are target
words (that are not delimited in any special tags).

=back

=head3 Other Options :

=head4 --help

Displays this message.

=head4 --version

Displays the version information.

=head1 OUTPUT

When --plain is not selected, OUTPUT is in Senseval-2 format that looks the
same as the input SVAL2 file, except that the context of each instance shows
at most W words on either side of the target word.

When --plain is ON, OUTPUT shows each context on a single line, i.e. the
context of the i'th instance in the given SVAL2 file is shown on the i'th
line on stdout.

=head1 AUTHORS

 Amruta Purandare, University of Pittsburgh

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

=head1 COPYRIGHT

Copyright (c) 2002-2008, Amruta Purandare and Ted Pedersen

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

=cut
=pod

You should have received a copy of the GNU General Public License along with
this program; if not, write to

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

###############################################################################
#                           THE CODE STARTS HERE
###############################################################################

use strict;
use warnings;

#                       ================================
#                        COMMAND LINE OPTIONS AND USAGE
#                       ================================

# command line options are collected in a lexical hash so that the script
# runs cleanly under 'use strict'
use Getopt::Long;

my %opt;
GetOptions(\%opt, "help", "version", "target=s", "token=s", "plain");

# show help option
if ($opt{help}) {
    showhelp();
    exit;
}

# show version information
if ($opt{version}) {
    showversion();
    exit;
}

# show minimal usage message unless both SVAL2 and W are given
if (@ARGV < 2) {
    showminimal();
    exit;
}

#############################################################################

#                       ================================
#                          INITIALIZATION AND INPUT
#                       ================================

# $0 contains the program name along with the complete path.
# Extract just the program name and use it in error messages.
$0 =~ s/.*\/(.+)/$1/;

# Both arguments are guaranteed to be present by the usage check above.
my ($infile, $window) = @ARGV;

# check that the input file exists
if (!-e $infile) {
    error_exit("ERROR($0): SVAL2 file <$infile> doesn't exist...\n");
}

# open and get handle (3-arg open: the file name can never be read as a mode)
open(my $in_fh, '<', $infile)
    or die "Error($0): Error(code=$!) in opening input SVAL2 file <$infile>.\n";

# -------
# Window
# -------

# The window size is used in index arithmetic below, so reject anything
# that is not a plain non-negative integer.
if ($window !~ /\A\d+\z/) {
    error_exit("ERROR($0): Window size <$window> should be a non-negative integer.\n");
}

# -------------------
# Target Word regex
# -------------------

# file containing regex/s for target word
my $target_file;
if (defined $opt{target}) {
    $target_file = $opt{target};
    if (!-e $target_file) {
        error_exit("ERROR($0): Target regex file <$target_file> doesn't exist.\n");
    }
}
else {
    $target_file = "target.regex";
    if (!-e $target_file) {
        error_exit("ERROR($0): Please copy the target.regex file into the current directory or specify the target regex file via --target option.\n");
    }
}

my $target_re = load_regex_file($target_file, "the target regex file");

# -----------------------
# Token regex
# -----------------------

my $token_file;
if (defined $opt{token}) {
    $token_file = $opt{token};
    if (!-e $token_file) {
        error_exit("ERROR($0): Token regex file <$token_file> doesn't exist.\n");
    }
}
else {
    $token_file = "token.regex";
    if (!-e $token_file) {
        error_exit("ERROR($0): Please copy the file token.regex into the current directory or specify the token regex file via --token option.\n");
    }
}

my $token_re = load_regex_file($token_file, "token regex file");

##############################################################################

#                       =========================
#                              CODE SECTION
#                       =========================

# Output is staged in a temporary file so that nothing reaches stdout if the
# input turns out to be invalid half way through.  An anonymous temporary
# file (third argument undef) has no name that could collide with another
# run and is removed automatically, even when we exit on an error.
open(my $temp_fh, '+>', undef)
    or die "ERROR($0): Error(code=$!) in opening temporary internal file.\n";

my $instance;           # id of the current instance, undef outside <instance>
my $in_context = 0;     # true while reading lines between <context> tags
my @tokens;             # tokens of the current context
my $target_index;       # index of the target word in @tokens, undef if none
my $line_num = 0;

while (my $line = <$in_fh>) {
    $line_num++;

    # instance start
    if ($line =~ /instance id\s*=\s*\"([^"]+)\"/) {
        $instance = $1;
    }

    # instance ends
    if ($line =~ /<\/instance>/) {
        undef $instance;
    }

    # end of context
    if ($line =~ /<\/context>/) {
        $in_context = 0;

        if (!defined $target_index) {
            my $id = defined $instance ? $instance : '';
            error_exit("ERROR($0): No matching target word found in the context of instance <$id> in SVAL2 file <$infile>.\n");
        }

        # actual windowing: clamp [target-W, target+W] to the token list
        my $lower = $target_index - $window;
        $lower = 0 if $lower < 0;
        my $upper = $target_index + $window;
        $upper = $#tokens if $upper > $#tokens;

        # every token is followed by a single space, then the line ends
        print {$temp_fh} "$_ " for @tokens[$lower .. $upper];
        print {$temp_fh} "\n";
    }

    # context data
    if ($in_context) {
        # Tokenize by repeatedly matching at the front of what is left of
        # the line; text between tokens is dropped.
        my $rest = $line;
        while ($rest =~ /$token_re/) {
            my ($start, $end) = ($-[0], $+[0]);

            # An empty match consumes nothing and would loop forever.
            if ($end == $start) {
                error_exit("ERROR($0): Token regex in <$token_file> matched an empty string at line <$line_num> in SVAL2 file <$infile>.\n");
            }

            my $token = substr($rest, $start, $end - $start);
            $rest = substr($rest, $end);

            # check if target
            if ($token =~ /$target_re/) {
                # error on multiple targets
                if (defined $target_index) {
                    my $id = defined $instance ? $instance : '';
                    error_exit("ERROR($0): Multiple target words matched in the context of instance <$id> in SVAL2 file <$infile>.\n");
                }
                $target_index = scalar @tokens;
            }
            push @tokens, $token;
        }
    }

    # everything outside the context data is copied through unchanged,
    # unless plain output was requested
    if (!$in_context && !$opt{plain}) {
        print {$temp_fh} $line;
    }

    # context start (checked last so the <context> line itself is copied
    # through above and is never tokenized)
    if ($line =~ /<context>/) {
        $in_context = 1;
        if (!defined $instance) {
            error_exit("ERROR($0): No instance id found for the context at line <$line_num> in SVAL2 file <$infile>.\n");
        }
        undef $target_index;
        @tokens = ();
    }
}

close($in_fh);

# -----------------------
#  printing to stdout
# -----------------------

# rewind the staged output and copy it to stdout
seek($temp_fh, 0, 0)
    or die "ERROR($0): Error(code=$!) in reading temporary internal file.\n";

while (my $out_line = <$temp_fh>) {
    print $out_line;
}

close($temp_fh);

##############################################################################

#                      ==========================
#                          SUBROUTINE SECTION
#                      ==========================

#-----------------------------------------------------------------------------
# Print the given message on STDERR and terminate with exit status 1.
sub error_exit
{
    my ($message) = @_;
    print STDERR $message;
    exit 1;
}

#-----------------------------------------------------------------------------
# Read a file of Perl regexes, one /regex/ per line, and return a single
# compiled regex that matches any of them, i.e. (re1)|(re2)|...
#
#   $file         path of the regex file
#   $description  how the file is named in error messages,
#                 e.g. "the target regex file"
#
# Blank lines are skipped.  Exits with status 1 if a line is not delimited
# by '/' on both ends, if the file holds no regex at all, or if the
# combined pattern does not compile.
sub load_regex_file
{
    my ($file, $description) = @_;

    open(my $fh, '<', $file)
        or die "ERROR($0): Error(error code=$!) in opening $description <$file>.\n";

    my @alternatives;
    while (my $regex = <$fh>) {
        chomp $regex;
        $regex =~ s/^\s+//;
        $regex =~ s/\s+$//;
        next if $regex eq '';

        # strip the leading and trailing '/' delimiters, insisting on both
        if (!($regex =~ s{^/}{})) {
            error_exit("ERROR($0): Regular Expression <$regex> should start with '/'\n");
        }
        if (!($regex =~ s{/$}{})) {
            error_exit("ERROR($0): Regular Expression <$regex> should end with '/'\n");
        }

        push @alternatives, "($regex)";
    }
    close($fh);

    if (!@alternatives) {
        error_exit("ERROR($0): No valid Perl regular expression found in $description <$file>.\n");
    }

    # compile once; report a malformed pattern instead of dying mid-run
    my $pattern  = join('|', @alternatives);
    my $compiled = eval { qr/$pattern/ };
    if (!defined $compiled) {
        error_exit("ERROR($0): Invalid Perl regular expression in $description <$file>: $@");
    }

    return $compiled;
}

#-----------------------------------------------------------------------------
# show minimal usage message
sub showminimal
{
    print "Usage: windower.pl SVAL2 W";
    print "\nTYPE windower.pl --help for help\n";
}

#-----------------------------------------------------------------------------
# show help
sub showhelp
{
    print <<'END_OF_HELP';
Usage:  windower.pl [OPTIONS] SVAL2 W

Context of each instance in the given SVAL2 file is limited to W tokens around
the target word. Input and output are both in Senseval-2 format.

SVAL2
        A tokenized and preprocessed input instance file in Senseval-2 format.

W
        Window size. Limits the contexts to W tokens on left and right of the
        target word.

OPTIONS:

--plain
        Output will be in plain text format showing context of each instance
        on a single line. By default, output is in Senseval-2 format.

--target TARGETREGEX
        A file containing Perl regex/s that define the target word/s.
        By default, file target.regex is searched in the current directory.

--token TOKENREGEX
        A file containing Perl regex/s that define valid tokens in the SVAL2
        file. By default, file token.regex is searched in the current
        directory.

--help
        Displays this message.

--version
        Displays the version information.

Type 'perldoc windower.pl' to view detailed documentation of windower.
END_OF_HELP
}

#------------------------------------------------------------------------------
# version information
sub showversion
{
#    print "windower.pl      -       Version 0.07\n";
    print '$Id: windower.pl,v 1.13 2008/03/29 20:52:30 tpederse Exp $';
    print "\nLimit contexts in a Senseval-2 file to N tokens around the target word\n";
#    print "Copyright (c) 2002-2005, Amruta Purandare & Ted Pedersen.\n";
#    print "Date of Last Update:     27/07/2006\n";
}

#############################################################################