package Text::Similarity; # Text::Similarity # Copyright (C) 2004, Jason Michelizzi and Ted Pedersen # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA use 5.006; use strict; use warnings; use constant { COMPFILE => "compfile", STEM => "stem", VERBOSE => "verbose", STOPLIST => "stoplist", NORMALIZE => "normalize" }; require Exporter; our @ISA = qw(Exporter); our $VERSION = '0.02'; # Attributes -- these all have lvalue accessor methods, use those methods # instead of accessing directly. If you add another attribute, be sure # to take the appropriate action in the DESTROY method; otherwise, a memory # leak could occur. my %errorString; my %compounds; my %verbose; my %stem; my %normalize; my %stoplist; sub new { my $class = shift; my $hash_ref = shift; $class = ref $class || $class; my $self = bless [], $class; if (defined $hash_ref) { while (my ($key, $val) = each %$hash_ref) { if (($key eq COMPFILE) and (defined $val)) { $self->loadCompounds ($val); } elsif ($key eq STEM) { $self->stem = $val; } elsif ($key eq VERBOSE) { $self->verbose = $val; } elsif ($key eq NORMALIZE) { $self->normalize = $val; } elsif ($key eq STOPLIST) { $self->stoplist = $val; } else { $self->error ("Unknown option: $key"); } } } return $self; } sub DESTROY { my $self = shift; delete $errorString{$self}; delete $compounds{$self}; delete $stem{$self}; delete $verbose{$self}; delete $normalize{$self}; delete $stoplist{$self}; } #sub _loadStoplist #{ # my $self = shift; # my $file = shift; # # unless (open FH, '<', $file) { # $self->error ("Cannot open '$file': $!"); # return undef; # } # # while () { # chomp; # my $word = lc; # $stoplist{$self}->{$word} = 1; # } # # close FH; #} sub error { my $self = shift; my $msg = shift; if ($msg) { my ($package, $file, $line) = caller; $errorString{$self} .= "\n" if $errorString{$self}; $errorString{$self} .= "($file:$line) $msg"; } return $errorString{$self}; } sub verbose : lvalue { my $self = shift; $verbose{$self} } sub stem : lvalue { my $self = shift; $stem{$self} } sub normalize : lvalue { my $self = shift; $normalize{$self} } sub sanitizeString { my $self = shift; my $str = shift; # get rid of most punctuation $str =~ tr/.;:,?!(){}\x22\x60\x24\x25\x40<>/ /s; # convert to lower case $str =~ tr/A-Z_/a-z /; # convert ampersands into 'and' -- maybe not appropriate? # s/\&/ and /; # get rid of apostrophes not surrounded by word characters $str =~ s/(?error ("Cannot open '$compfile': $!"); return undef; } while () { chomp; $compounds{$self}{$_} = 1; } close FH; } sub removeStopWords { my $self = shift; my $str = shift; foreach my $stopword (keys %{$self->stoplist}) { $str =~ s/\Q $stopword \E/ /g; } return $str; } # compoundifies a block of text # e.g., if you give it "we have a new bird dog", you'll get back # "we have a new bird_dog". # (code borrowed from rawtextFreq.pl) sub compoundify { my $self = shift; my $block = shift; # get the block of text my $done; my $temp; unless ($compounds{$self}) { return $block; } # get all the words into an array my @wordsArray = $block =~ /(\w+)/g; # now compoundify, GREEDILY!! my $firstPtr = 0; my $string = ""; while($firstPtr <= $#wordsArray) { my $secondPtr = $#wordsArray; $done = 0; while($secondPtr > $firstPtr && !$done) { $temp = join ("_", @wordsArray[$firstPtr..$secondPtr]); if(exists $compounds{$self}{$temp}) { $string .= "$temp "; $done = 1; } else { $secondPtr--; } } if(!$done) { $string .= "$wordsArray[$firstPtr] "; } $firstPtr = $secondPtr + 1; } $string =~ s/ $//; return $string; } 1; __END__ =head1 NAME Text::Similarity - module for measuring the similarity of text documents. This module is a superclass for other modules. =head1 SYNOPSIS use Text::Similarity::Overlaps; my $mod = Text::Similarity::Overlaps->new; my $score = $mod->getSimilarity ($text_file1, $text_file2); =head1 DESCRIPTION This module serves as a superclass for other modules that implement measures of text document similarity. =head1 SEE ALSO Text::Similarity::Overlaps http://text-similarity.sourceforge.net =head1 AUTHOR Jason Michelizzi, Ejmichelizzi at sourceforge.netE Ted Pedersen, Etpederse at d.umn.eduE =head1 COPYRIGHT AND LICENSE Copyright (C) 2004 by Jason Michelizzi, Ted Pedersen, and Siddharth Patwardhan This library is free software; you may redistribute it and/or modify it under the terms of the GNU General Public License, version 2 or, at your option, any later version. =cut