# stem.pm version 1.03
# (Last updated $Id: stem.pm,v 1.7 2006/02/19 19:11:09 sidz1979 Exp $)
#
# Package used by WordNet::Similarity::lesk module that
# computes semantic relatedness of word senses in WordNet
# using gloss overlaps.
#
# Copyright (c) 2005,
#
# Ted Pedersen, University of Minnesota Duluth
# tpederse at d.umn.edu
#
# Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
# banerjee+ at cs.cmu.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to
#
# The Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.
#
# ------------------------------------------------------------------

package stem;

=head1 NAME

stem - Module that finds the stem of a word or the stems of a string of
words, using WordNet.

=head1 SYNOPSIS

  use WordNet::QueryData;
  use stem;

  my $wn = WordNet::QueryData->new();

  my $stemmer = stem->new($wn);

  my @stems = $stemmer->stemWord($word);

  my $string = $stemmer->stemString($inString, $cache);

=head1 DESCRIPTION

This module uses the internal stemming algorithm of WordNet to stem words
and strings of words. This module is used by the lesk measure of the
WordNet::Similarity package.

=head2 Methods

=over

=cut

use strict;
use warnings;

use Exporter;

our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

@ISA = qw(Exporter);

%EXPORT_TAGS = ();

@EXPORT_OK = ();

@EXPORT = ();

$VERSION = '1.03';

=item new

Creates a new stemmer object and initializes it with a WordNet::QueryData
object.

Parameters: $wn

Returns: $stemmer

=cut

# Function to create the stemmer object. $wn is stored as-is and is only
# used later, by stemWord, through its validForms method.
sub new
{
    my $className = shift;
    my $wn        = shift;

    # Both caches must be hash references. (Assigning an empty list here
    # would store undef and leave the caches to be autovivified on first
    # use.)
    my $self = {
        wn             => $wn,
        wordStemHash   => {},    # word   => array ref of its stems
        stringStemHash => {},    # string => stemmed string
    };

    return bless($self, $className);
}

=item stemString

Takes a string of words as input and returns a string of stemmed words.
Every word in the returned string, including the last one, is followed by
a single space. The result is cached when $cache is defined.

Parameters: $inString, $cache (optional)

Returns: $retString

=cut

# Function to take a string, and process it in such a way that all the
# words in it get stemmed. Note that if a single word has two or more
# possible stems, we return the original surface form since there is
# no way to select from the competing stems. The stem of the string
# can be cached if requested. Useful if the calling function knows
# which strings it will have to stem over and over again. Strings that
# will be only stemmed once need not be cached - thereby saving space.
sub stemString
{
    my $self        = shift;
    my $inputString = shift;
    my $cache       = shift;

    # An undefined input contains no words, so its stemmed form is the
    # empty string. Returning early avoids "uninitialized" warnings.
    return "" unless defined $inputString;

    my $stringCache = ($self->{stringStemHash} ||= {});

    # Whether or not this string has been requested for caching, check
    # in the cache.
    return $stringCache->{$inputString}
        if defined $stringCache->{$inputString};

    # Not in cache. Stem. For each word in the input get the stem and
    # put it in the output string.
    my $outputString = "";
    foreach my $word ($inputString =~ /(\w+)/g)
    {
        my @stems = $self->stemWord($word);

        # If multiple or no stems, use the surface form. Each word is
        # followed by one space, so the result ends with a space.
        $outputString .= (@stems == 1 ? $stems[0] : $word) . " ";
    }

    # If cache required, do so. Any defined value of $cache (even a false
    # one) requests caching.
    $stringCache->{$inputString} = $outputString if defined $cache;

    return $outputString;
}

=item stemWord

Takes a word as input and returns its stems. A word may have more than
one stem. All are returned, in sorted order.

Parameters: $word

Returns: @stems

=back

=cut

# Stem the word passed to this function and return an array of words
# that contain all the possible stems of this word. All possible stems
# of the word may include the surface form too if it is a valid WordNet
# lemma. An undefined or empty word has no stems.
sub stemWord
{
    my $self = shift;
    my $word = shift;

    return () unless defined($word) && length($word);

    my $wordCache = ($self->{wordStemHash} ||= {});

    # If not in the cache, create and put in cache.
    if (!defined $wordCache->{$word})
    {
        # The hash allows us to not worry about multiple copies of the
        # same stem.
        my %seen = ();

        # Check all four parts of speech: noun, verb, adjective, adverb.
        foreach my $pos (qw(n v a r))
        {
            foreach my $form ($self->{wn}->validForms($word . '#' . $pos))
            {
                next unless defined $form;

                # Work on a copy so the caller's data is never modified.
                my $stem = $form;

                # Put underscore for space.
                $stem =~ s/ /_/g;

                # Remove part of speech if any.
                $stem =~ s/\#\w$//;

                $seen{$stem} = 1 if length $stem;
            }
        }

        # Sorted so that the order of the stems is the same on every run.
        $wordCache->{$word} = [sort keys %seen];
    }

    # Return the stems.
    return @{$wordCache->{$word}};
}

1;

__END__

=head1 SEE ALSO

perl(1), WordNet::Similarity(3), WordNet::QueryData(3)

http://www.cs.utah.edu/~sidd

http://wordnet.princeton.edu

http://www.ai.mit.edu/~jrennie/WordNet

http://groups.yahoo.com/group/wn-similarity

=head1 AUTHORS

  Ted Pedersen, University of Minnesota Duluth
  tpederse at d.umn.edu

  Satanjeev Banerjee, Carnegie Mellon University, Pittsburgh
  banerjee+ at cs.cmu.edu

=head1 BUGS

None.

To report bugs, go to http://groups.yahoo.com/group/wn-similarity/ or
e-mail "S<tpederse at d.umn.edu>".

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2005, Ted Pedersen and Satanjeev Banerjee

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA  02111-1307, USA.

Note: a copy of the GNU General Public License is available on the web
at L<http://www.gnu.org/licenses/gpl.txt> and is included in this
distribution as GPL.txt.

=cut