#! /usr/bin/perl -w
#
# rawtextFreq.pl version 2.05
# (Last updated $Id: rawtextFreq.pl,v 1.27 2008/06/03 00:04:43 sidz1979 Exp $)
#
# -----------------------------------------------------------------------------
#
# Counts how often words occur in raw (untagged) text, distributes those
# counts to WordNet synsets, propagates them up the WordNet hierarchies and
# writes the resulting frequency file (see the POD after __END__).

use strict;

use File::Find;
use WordNet::Similarity::FrequencyCounter;

# Variable declarations
my %stopWords;     # stop-listed words, used as a set (word => 1)
my %offsetFreq;    # raw counts; the smoothing code below indexes it as
                   # $offsetFreq{pos}{offset} with pos 'n' or 'v'

# Some modules used
use Getopt::Long;
use WordNet::QueryData;
use WordNet::Tools;

# First check if no commandline options have been provided... in which case
# print out the usage notes!
if($#ARGV == -1) {
    minimalUsageNotes();
    exit 1;
}

# Getopt::Long stores options given without an explicit destination in the
# package variables $opt_<name>, hence the 'our' declarations.
our @opt_infiles;
our ($opt_version, $opt_help, $opt_stopfile, $opt_outfile);
our ($opt_wnpath, $opt_resnik, $opt_smooth, $opt_stdin);

# Now get the options!
my $ok = GetOptions("version", "help", "stopfile=s", "outfile=s", "wnpath=s",
                    "resnik", "smooth=s", "stdin",
                    "infile=s" => \@opt_infiles);

# GetOptions should have already printed out a detail error message if
# $ok is false
$ok or die "Error getting command-line arguments\n";

# If the version information has been requested
if(defined $opt_version) {
    printVersion();
    exit 0;
}

# If detailed help has been requested
if(defined $opt_help) {
    printHelp();
    exit 0;
}

# make sure either --stdin or --infile was given
unless($opt_stdin or scalar @opt_infiles) {
    minimalUsageNotes();
    exit 1;
}

# make sure that both --stdin and --infile were NOT given
if($opt_stdin and scalar @opt_infiles) {
    minimalUsageNotes();
    exit 1;
}

# make sure --outfile is given
if(!defined $opt_outfile) {
    minimalUsageNotes();
    exit 1;
}

# Load the stop words if specified
if(defined $opt_stopfile) {
    print STDERR "Loading stoplist... ";
    open(my $stopFH, '<', $opt_stopfile)
        or die "Couldn't open $opt_stopfile.\n";
    while(my $word = <$stopFH>) {
        $word =~ s/[\r\f\n]//g;

        # process() lower-cases the text before looking words up here, so
        # a stop word containing upper-case letters or stray surrounding
        # blanks could never match; normalise it the same way.
        $word =~ s/^\s+//;
        $word =~ s/\s+$//;
        next if($word eq "");
        $stopWords{lc $word} = 1;
    }
    close $stopFH;
    print STDERR "done.\n";
}

# Get the path to WordNet...
my ($wnPCPath, $wnUnixPath);
if(defined $opt_wnpath) {
    $wnPCPath   = $opt_wnpath;
    $wnUnixPath = $opt_wnpath;
}
elsif(defined $ENV{WNSEARCHDIR}) {
    $wnPCPath   = $ENV{WNSEARCHDIR};
    $wnUnixPath = $ENV{WNSEARCHDIR};
}
elsif(defined $ENV{WNHOME}) {
    $wnPCPath   = $ENV{WNHOME} . "\\dict";
    $wnUnixPath = $ENV{WNHOME} . "/dict";
}
else {
    $wnPCPath   = "C:\\Program Files\\WordNet\\3.0\\dict";
    $wnUnixPath = "/usr/local/WordNet-3.0/dict";
}

# Load up WordNet
print STDERR "Loading WordNet... ";
my $wn = (defined $opt_wnpath)
       ? (WordNet::QueryData->new($opt_wnpath))
       : (WordNet::QueryData->new());
die "Unable to create WordNet::QueryData object.\n" if(!$wn);

# If this QueryData object has a dataPath() method, use the path it
# reports in place of the guesses made above.
$wnPCPath = $wnUnixPath = $wn->dataPath() if(defined $wn->can('dataPath'));
my $wntools = WordNet::Tools->new($wn);
die "Unable to create WordNet::Tools object.\n" if(!$wntools);
print STDERR "done.\n";

# Load the topmost nodes of the hierarchies
print STDERR "Loading topmost nodes of the hierarchies... ";
my $topHash = WordNet::Similarity::FrequencyCounter::createTopHash($wn);
die "Unable to create topHash.\n" if(!defined($topHash));
print STDERR "done.\n";

# Read the input, form sentences and process each
# We need to process each sentence (or clause really) at the same time.
# This is because process() tries to find compounds in the sentence.  It is
# not sufficient to process a line at a time because a sentence often
# spans more than one line; therefore, a compound could also span a line
# break (for example, the first word of a compound could be the last word
# on a line, and the second word of the compound could be the first word
# on the next line).

# check if we're reading from a file or from STDIN
if(scalar @opt_infiles) {
    # first we have to figure out what files to process.  The files
    # are specified with in --infile option.  The value of the option
    # can be a filename, a directory, or a pattern (as understood by
    # Perl's glob() function).
    my @infiles = getFiles(@opt_infiles);

    print STDERR "Computing frequencies...\n";
    foreach my $i (0..$#infiles) {
        my $infile = $infiles[$i];
        print STDERR (" Processing '$infile' (", $i + 1, "/",
                      $#infiles + 1, " files)... ");
        open(my $ifh, '<', $infile)
            or die "Cannot open file '$infile': $!";
        processHandle($ifh);
        close $ifh or die "Cannot close file '$infile': $!";
        print STDERR " done.\n";
    }
}
else {
    print STDERR "Computing frequencies... ";
    processHandle(\*STDIN);
    print STDERR "done.\n";
}

# Smoothing!
if(defined $opt_smooth) {
    print STDERR "Smoothing... ";
    if($opt_smooth eq 'ADD1') {
        # Add-one smoothing: every entry of the WordNet noun and verb data
        # files gets one extra count.
        foreach my $pos ("noun", "verb") {
            my $idx;
            if(!open($idx, '<', $wnUnixPath."/data.$pos")) {
                if(!open($idx, '<', $wnPCPath."/$pos.dat")) {
                    print STDERR "Unable to open WordNet data files.\n";
                    exit 1;    # this is a failure: do not report success
                }
            }

            # %offsetFreq is keyed by the one-letter part of speech
            my $localpos = substr($pos, 0, 1);

            # The file starts with header lines that begin with white
            # space; entries start at the first line that does not.
            my $inData = 0;
            while(my $entry = <$idx>) {
                $inData = 1 if($entry =~ /^\S/);
                next if(!$inData);

                my ($offset) = split(/\s+/, $entry, 2);
                next if(!defined $offset);
                $offset =~ s/^0*//;
                next if($offset eq "");
                $offsetFreq{$localpos}{$offset}++;
            }
            close($idx);
        }
        print STDERR "done.\n";
    }
    else {
        print STDERR "\nWarning: Unknown smoothing '$opt_smooth'.\n";
        print STDERR "Use --help for details.\n";
        print STDERR "Continuing without smoothing.\n";
    }
}

# Propagating frequencies up the WordNet hierarchies...
print STDERR "Propagating frequencies up through WordNet... ";
my $newFreq = WordNet::Similarity::FrequencyCounter::propagateFrequency(\%offsetFreq, $wn, $topHash);
print STDERR "done.\n";

# Print the output to file
print STDERR "Writing output file... ";
open(my $out, '>', $opt_outfile)
    or die "Unable to open $opt_outfile for writing: $!\n";
print $out "wnver::".$wntools->hashCode()."\n";
foreach my $pos ("n", "v") {
    foreach my $offset (sort {$a <=> $b} keys %{$newFreq->{$pos}}) {
        print $out "$offset$pos ".($newFreq->{$pos}->{$offset});
        print $out " ROOT" if($topHash->{$pos}->{$offset});
        print $out "\n";
    }
}

# Buffered write errors only show up when the handle is closed.
close($out) or die "Unable to close $opt_outfile: $!\n";
print STDERR "done.\n";

# ----------------- Subroutines start Here ----------------------

# Reads all text from an open filehandle, cuts it into clauses at the
# punctuation characters . ? ! , ; : and hands each clause to process().
# A clause may span several input lines.
#   Parameter: $fh - filehandle (or glob reference) to read from.
# The pending clause is local to each call, so text left over at the end
# of one input is never carried into (and counted again with) the next.
sub processHandle {
    my ($fh) = @_;
    my $sentence = "";

    while(my $line = <$fh>) {
        $line =~ s/[\r\f\n]//g;

        # LIMIT -1 keeps trailing empty fields, so punctuation at the very
        # end of a line still terminates the clause.
        my @parts = split(/[.?!,;:]/, $line, -1);

        # Everything after the last delimiter stays pending.
        my $pending = pop @parts;

        foreach my $part (@parts) {
            process($sentence . $part . " ");
            $sentence = "";
        }
        $sentence .= $pending . " " if(defined $pending);
    }

    # whatever is left when the input ends is the final clause
    process($sentence);
}

# Processing of each sentence
# (1) Convert to lowercase
# (2) Remove all unwanted characters
# (3) Combine all consecutive occurrence of numbers into one
# (4) Remove leading and trailing spaces
# (5) Form all possible compounds in the words
# (6) Get the frequency counts
#   Parameter: the text of one sentence/clause.
#   Side effect: updates %offsetFreq for every word not in %stopWords.
sub process {
    my $block = lc(shift);
    $block =~ s/\'//g;
    $block =~ s/[^a-z0-9]+/ /g;
    while($block =~ s/([0-9]+)\s+([0-9]+)/$1$2/g) {}
    $block =~ s/^\s+//;
    $block =~ s/\s+$//;

    # nothing to count in an empty clause
    return if($block eq "");

    $block = $wntools->compoundify($block);
    while($block =~ /([\w_]+)/g) {
        my $word = $1;
        next if(defined $stopWords{$word});
        WordNet::Similarity::FrequencyCounter::updateWordFrequency($word, \%offsetFreq, $wn, $opt_resnik);
    }
}

# Expands the values of the --infile options into a list of files.
#   Parameters: file names, directory names and/or glob() patterns.
#   Returns: the list of files to read.  A directory contributes every
#            non-directory found below it.
sub getFiles {
    my @inpatterns = @_;
    my @infiles;

    # the options to pass to File::Find::find()
    my %options = (
        wanted => sub {
            unless(-d $File::Find::name) {
                push @infiles, $File::Find::name;
            }
        },
        follow_fast => 1,

        # Without this, find() chdir()s into each directory, and the -d
        # test above would resolve a relative $File::Find::name against
        # the wrong directory.
        no_chdir => 1,
    );

    foreach my $pattern (@inpatterns) {
        if(-d $pattern) {
            find(\%options, $pattern);
        }
        elsif(-e $pattern) {
            push @infiles, $pattern;
        }
        else {
            foreach my $file (glob $pattern) {
                if(-d $file) {
                    # descend into the matched directory, not the pattern
                    find(\%options, $file);
                }
                else {
                    push @infiles, $file;
                }
            }
        }
    }

    return @infiles;
}

# Subroutine to print detailed help
sub printHelp {
    printUsage();
    print "\nThis program computes the information content of concepts, by\n";
    print "counting the frequency of their occurrence in raw text.\n";
    print "Options: \n";
    print "--outfile Specifies the output file OUTFILE.\n";
    print "--stdin Read the input from the standard input\n";
    print "--infile INFILE is the name of an input file\n";
    print "--stopfile STOPFILE is a list of stop listed words that will\n";
    print " not be considered in the frequency count.\n";
    print "--wnpath Option to specify WNPATH as the location of WordNet data\n";
    print " files. If this option is not specified, the program tries\n";
    print " to determine the path to the WordNet data files using the\n";
    print " WNHOME environment variable.\n";
    print "--resnik Option to specify that the frequency counting should\n";
    print " be performed according to the method described by\n";
    print " Resnik (1995).\n";
    print "--smooth Specifies the smoothing to be used on the probabilities\n";
    print " computed. SCHEME specifies the type of smoothing to\n";
    print " perform. It is a string, which can be only be 'ADD1'\n";
    print " as of now. Other smoothing schemes will be added in\n";
    print " future releases.\n";
    print "--help Displays this help screen.\n";
    print "--version Displays version information.\n\n";
}

# Subroutine to print minimal usage notes
sub minimalUsageNotes {
    printUsage();
    print "Type rawtextFreq.pl --help for detailed help.\n";
}

# Subroutine that prints the usage
sub printUsage {
    print <<'EOT';
Usage: rawtextFreq.pl --outfile OUTFILE
                      {--stdin | --infile FILE [--infile FILE ...]}
                      [--stopfile FILE] [--resnik] [--wnpath PATH]
                      [--smooth SCHEME]
                      | --help | --version
EOT
}

# Subroutine to print the version information
sub printVersion {
    print "rawtextFreq.pl version 2.05\n";
    print "Copyright (c) 2005-2008, Ted Pedersen, Satanjeev Banerjee, Siddharth Patwardhan and Jason Michelizzi.\n";
}

__END__

=head1 NAME

rawtextFreq.pl - Compute Information Content from Raw / Plain Text

=head1 SYNOPSIS

 rawtextFreq.pl --outfile OUTFILE [--stopfile=STOPFILE]
                {--stdin | --infile FILE [--infile FILE ...]}
                [--wnpath WNPATH] [--resnik] [--smooth=SCHEME]
                | --help | --version

=head1 OPTIONS

B<--outfile>=I<OUTFILE>

The name of a file to which output should be written

B<--stopfile>=I<STOPFILE>

A file containing a list of stop listed
words that will not be considered in the frequency counts. A sample file
can be downloaded from
http://www.d.umn.edu/~tpederse/Group01/WordNet/words.txt

B<--wnpath>=I<WNPATH>

Location of the WordNet data files (e.g.,
/usr/local/WordNet-3.0/dict)

B<--resnik>

Use Resnik (1995) frequency counting

B<--smooth>=I<SCHEME>

The smoothing to be used on the probabilities computed. SCHEME can
only be ADD1 at this time

B<--help>

Show a help message

B<--version>

Display version information

B<--stdin>

Read from the standard input the text that is to be used for
counting the frequency of words.

B<--infile>=I<FILE>

The name of a raw text file to be used to count word frequencies.
This can actually be a filename, a directory name, or a pattern (as
understood by Perl's glob() function). If the value is a directory
name, then all the files in that directory and its subdirectories will
be used.

If you are looking for some interesting files to use, check out
Project Gutenberg: L<http://www.gutenberg.org>.

This option may be given more than once (if more than one file
should be used).

=head1 DESCRIPTION

This program reads a corpus of plain text and computes frequency counts
from that corpus and then uses those to determine the information content
of each synset in WordNet. In brief it does this by first assigning counts
to each synset for which it obtains a frequency count in the corpus, and
then those counts are propagated up the WordNet hierarchy. More details
on this process can be found in the documentation of the lin, res, and jcn
measures in L<WordNet::Similarity> and in the publication by Patwardhan,
et al. (2003) referred to below.

The utility programs L<BNCFreq.pl>, L<brownFreq.pl>, L<semCorRawFreq.pl>,
L<treebankFreq.pl> all function in exactly the same way as this plain text
program (rawtextFreq.pl), except that they include the ability to deal with
the format of the corpus with which they are used.

None of these programs requires sense-tagged text; instead they simply
distribute the counts of the observed form of a word to all the synsets
with which it could be associated.
The different forms of a word are found via the validForms and querySense
methods of L<WordNet::QueryData>. For example, if the observed word is
'bank', then a count is given to the synsets associated with the financial
institution, a river shore, the act of turning a plane, etc.

=head2 Distributing Counts to Synsets

If the corpus is sense-tagged, then distributing the counts of sense-tagged
words to synsets is trivial; you increment the count of each synset for
which you have a sense tagged instance. It is very hard to obtain large
quantities of sense tagged text, so in general it is not feasible to
obtain information content values from large sense-tagged corpora.

As such this program and the related *Freq.pl utilities are all trying to
increment the counts of synsets based on the occurrence of raw untagged
word forms. In this case it is less obvious how to proceed. This program
supports two methods for distributing the counts of observed word forms
in untagged text to synsets. One is our default method, and we refer to
the other as Resnik counting. In our default counting scheme, each synset
receives the total count of each word form associated with it. Suppose the
word 'bank' can be associated with six different synsets. In our default
scheme each of those synsets would receive a count for each occurrence of
'bank'. In Resnik counting, the count would be divided between the possible
synsets, so in this case each synset would get one sixth (1/6) of the total
count.

=head2 How are These Counts Used?

This program maps word forms to synsets. These synset counts are then
propagated up the WordNet hierarchy to arrive at Information Content values
for each synset, which are then used by the Lin (lin), Resnik (res), and
Jiang & Conrath (jcn) measures of semantic similarity.

By default these measures use counts derived from the cntlist file provided
by WordNet, which is based on frequency counts from the sense-tagged SemCor
corpus.
This consists of approximately 200,000 sense tagged tokens taken from the
Brown Corpus and the Red Badge of Courage.

A file called ic-semcor.dat is created during installation of
L<WordNet::Similarity> from cntlist. In fact, the util program
semCorFreq.pl is used to do this. This is the only one of the *Freq.pl
utility programs that uses sense tagged text, and in fact it only uses the
counts from cntlist, not the actual sense tagged text.

This program simply creates an alternative version of the ic-semcor.dat
file based on counts obtained from raw untagged text.

=head2 Why Use This Program?

The default information content file (ic-semcor.dat) is based on SemCor,
which includes sense tagged portions of the Brown Corpus and the Red Badge
of Courage. It has the advantage of being sense tagged, but is from a
rather limited domain and is somewhat small in size (200,000 sense tagged
tokens).

If you are working in a different domain or have access to a larger
quantity of corpora, you might find that this program provides information
content values that better reflect your underlying domain or problem.

=head2 How can these counts be reliable if they aren't based on sense tagged text?

Remember once the counts are given to a synset, those counts are propagated
upwards, so that each synset receives the counts of its children. These are
then used in the calculation of the information content of each synset,
which is simply :

 information content (synset) = - log [probability (synset)]

More details on this calculation and how they are used in the res, lin, and
jcn measures can be found in the WordNet::Similarity module documentation,
and in the following publication:

Using Measures of Semantic Relatedness for Word Sense Disambiguation
(Patwardhan, Banerjee and Pedersen) - Appears in the Proceedings of
the Fourth International Conference on Intelligent Text Processing and
Computational Linguistics, pp. 241-257, February 17-21, 2003, Mexico City.
L<http://www.d.umn.edu/~tpederse/Pubs/cicling2003-3.pdf>

We believe that a propagation effect will result in concentrations or
clusters of information content values in the WordNet hierarchy. For
example, if you have a text about banking, while the different counts of
"bank" will be dispersed around WordNet, there will also be other financial
terms that occur with bank that will occur near the financial synset in
WordNet, and lead to a concentration of counts in that region of WordNet.
It is best to view this as a conjecture or hypothesis at this time.
Evidence for or against would be most interesting.

You can use raw text of any kind in this program. We sometimes use text
from Project Gutenberg, for example the Complete Works of Shakespeare,
available from L<http://www.gutenberg.org/ebooks/100>

=head1 BUGS

Report to WordNet::Similarity mailing list :
L<http://groups.yahoo.com/group/wn-similarity>

=head1 SEE ALSO

L<WordNet::Similarity>

WordNet home page : L<http://wordnet.princeton.edu>

WordNet::Similarity home page : L<http://wn-similarity.sourceforge.net>

=head1 AUTHORS

 Ted Pedersen, University of Minnesota, Duluth
 tpederse at d.umn.edu

 Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
 banerjee+ at cs.cmu.edu

 Siddharth Patwardhan, University of Utah, Salt Lake City
 sidd at cs.utah.edu

 Jason Michelizzi

=head1 COPYRIGHT

Copyright (c) 2005-2008, Ted Pedersen, Satanjeev Banerjee, Siddharth
Patwardhan and Jason Michelizzi

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to

 Free Software Foundation, Inc.
 59 Temple Place - Suite 330
 Boston, MA 02111-1307, USA

=cut