#!/usr/local/bin/perl -w =head1 NAME maketarget.pl - Create target.regex file for a given Senseval-2 data file that shows all the forms of the target word =head1 SYNOPSIS maketarget.pl -head begin.v-test.xml This creates a file called target.regex with the following contents: /\s*(began)|(begin)|(beginning)|(begins)|(begun)\s*/ maketarget.pl begin.v-test.xml This creates a file called target.regex with the following contents: /(\bbegan\b)|(\bbegin\b)|(\bbeginning\b)|(\bbegins\b)|(\bbegun\b)/ These are regular expressions that show all the forms of "begin" that appear in the given Senseval-2 data file with and without a surrounding head tag. You can find begin.v-test.xml at samples/Data Type C for a quick list of options =head1 DESCRIPTION This program creates a Perl regex for the TARGET word by detecting its various forms from the given SVAL2 file. This program will create a regular expression file called target.regex that can be used to match target words via the --target option in many SenseClusters programs. The target.regex file can be of two forms: /\s*(target1|target2)\s*/ or /(\btarget1\b)|(\btarget2\b)/ The first form is appropriate when the corpus already has the target word marked with head tags, while the second should be used when the corpus is plain unannotated text. The second form is the default, while the first is available with the --head option. Note that in the first form the tag acts as a delimiter on word boundaries, while in the second form the \b character class is used for that purpose. =head1 INPUT =head2 Required Arguments: =head3 SVAL2 Should be a file in Senseval-2 format from which various possible forms of the TARGET word are to be detected. =head2 Optional Arguments: =head3 --head Create target word regex in the form: \s*(target1|target2)\s* =head3 --help Displays the summary of command line options. =head3 --version Displays the version information. =head1 OUTPUT maketarget.pl automatically creates the file with name 'target.regex' that shows the Perl regex for the TARGET word. The regex is a OR of various forms of the word detected placed within a single regex, optionally surrounded by and tags. For example: Contents of a sample file: /(\bLine\b)|(\bLines\b)|(\bline\b)|(\blined\b)|(\blines\b)/ (default) /\s*(Line)|(Lines)|(line)|(lined)|(lines)\s*/ (with --head) =head1 BUGS This program does not recognize target words of the form: Bill Clinton It is restricted to target words that are a single string, such as Bill_Clinton =head1 AUTHORS Ted Pedersen, University of Minnesota, Duluth tpederse at d.umn.edu Amruta Purandare, University of Pittsburgh Anagha Kulkarni, Carnegie-Mellon University =head1 COPYRIGHT Copyright (c) 2002-2008, Ted Pedersen, Amurta Purandare, Anagha Kulkarni This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to The Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =cut ############################################################################### # THE CODE STARTS HERE #$0 contains the program name along with #the complete path. Extract just the program #name and use in error messages $0=~s/.*\/(.+)/$1/; ############################################################################### # ================================ # COMMAND LINE OPTIONS AND USAGE # ================================ # command line options use Getopt::Long; GetOptions ("head","help","version"); # show help option if(defined $opt_help) { $opt_help=1; &showhelp(); exit; } # show version information if(defined $opt_version) { $opt_version=1; &showversion(); exit; } # use head tags if --head is given if(defined $opt_head) { $opt_head=1; } # show minimal usage message if no arguments if($#ARGV<0) { &showminimal(); exit; } ############################################################################# # ================================ # INITIALIZATION AND INPUT # ================================ #accept the SVAL2 file name $infile=$ARGV[0]; if(!-e $infile) { print STDERR "ERROR($0): Source file <$infile> doesn't exist...\n"; exit; } open(IN,$infile) || die "Error($0): Error(code=$!) in opening source file <$infile>.\n"; ############################################################################## while() { # modified by tdp on august 5, 2006 to handle \s* that # may appear inside the head tags # first match the head tags, and then find what is inside # them. Match those to a regex that defines a single # string that is (optionally) preceded and/or followed # by spaces. If that matches, then we have a legal head tag. if(/(.*)<\/head>/) { $match = $1; if( $match =~ /\s*(\w+)\s*/) { $target_form{$1}=1; } else { print STDERR "ERROR($0): Target word <$match> doesn't look like a valid word.\n"; exit 1; } } } if(defined %target_form) { $targetfile="target.regex"; if(-e $targetfile) { print STDERR "Warning($0): Target File <$targetfile> already exists, overwrite (y/n)? "; $ans=; if($ans !~ /^y|Y/) { exit; } } open(TARGET,">$targetfile") || die "Error($0): Error(code=$!) in opening file <$targetfile>.\n"; # modified by TDP August 1, 2006 to use head option # if using head tag, word boundaries delimited by # if not, then should use \b to avoid substring matches if ($opt_head) { # Modified by AKK July 30, 2006. # Added the surrounding head tags to the regex of # target forms. # tdp modified $target assignments to single quotes # to avoid interpolation of regex, which seemed to # be causing problem with passed through \s errors $target= '/\s*'; foreach $form (sort keys %target_form) { $target.="($form)|"; } chop $target; $target.= '\s*/'; print TARGET $target; print TARGET "\n"; } else { $target="/"; foreach $form (sort keys %target_form) { $target.="(\\b$form\\b)|"; } chop $target; $target.="/"; print TARGET $target; print TARGET "\n"; } } ############################################################################## # ========================== # SUBROUTINE SECTION # ========================== #----------------------------------------------------------------------------- #show minimal usage message sub showminimal() { print "Usage: maketarget.pl [OPTIONS] SVAL2"; print "\nTYPE maketarget.pl --help for help\n"; } #----------------------------------------------------------------------------- #show help sub showhelp() { print "Usage: maketarget.pl [OPTIONS] SVAL2 Creates a Perl regex for the TARGET word by detecting its various forms from the given SVAL2 file. SVAL2 File in Senseval-2 format from which different forms of the target word are to be searched. OPTIONS: --head Create target word regex as follows: \\s*target\\s* --help Displays this message. --version Displays the version information. Type 'perldoc maketarget.pl' to view detailed documentation of maketarget.\n"; } #------------------------------------------------------------------------------ #version information sub showversion() { print '$Id: maketarget.pl,v 1.17 2008/03/29 20:52:30 tpederse Exp $'; ## print "\nCopyright (c) 2002-2006, Ted Pedersen, Amruta Purandare, & Anagha Kulkarni\n"; ## print "maketarget.pl - Version 0.01\n"; print "\nCreate a Perl regex for a target word in a Senseval-2 file\n"; # print "Date of Last Update: 07/30/2006\n"; } #############################################################################