package Text::Document; $Text::Document::VERSION = '1.05'; use strict; use v5.6.0; our @FIELDS = qw( lowercase ); our $COMPRESS_AVAILABLE; our @KEYS_FOR_NEW = qw( compress lowercase ); BEGIN { eval "use Compress::Zlib;"; if( $@ ){ $COMPRESS_AVAILABLE = undef; } else { $COMPRESS_AVAILABLE = 1; } } sub new { my $class = shift; my %self = @_; my $self = { lowercase => 1, compress => 1, terms => {}, }; foreach my $k ( @KEYS_FOR_NEW ){ defined( $self{$k} ) and ($self->{$k} = $self{$k}); } bless $self, $class; return $self; } sub AddContent { my $self = shift; my ($text) = @_; # clear frequency cache $self->{freqs} and delete $self->{freqs}; # parse text fragment my @terms = $self->ScanV( $text ); # update word count foreach my $w (@terms){ $self->{terms}->{$w} ++; } undef $self->{WeightedEuclideanNorm}; undef $self->{EuclideanNorm}; return scalar @terms; } # number of occurrences of a given term sub Occurrences { my $self = shift; my ($term) = @_; return $self->{terms}->{$term}; } sub ScanV { my $self = shift; my ($text) = @_; my @words = split( /[^a-zA-Z0-9]+/, $text ); @words = grep( /.+/, @words ); if( $self->{lowercase} ){ return map( lc($_), @words ); } else { return @words; } } sub KeywordFrequency { my $self = shift; return $self->{freqs} if $self->{freqs}; # all the distinct terms in the doc my @terms = $self->Terms(); # total number of terms my $sum = 0; foreach my $t (@terms) { $sum += $self->{terms}->{$t}; } # if zero, frequency is not defined ($sum > 0) or return undef; # list of [term,frequency] pairs my @freqs = map( [$_, $self->{terms}->{$_}/$sum ] , @terms ); # sort by ascending frequency @freqs = sort { $a->[1] <=> $b->[1] } @freqs; # return reference to result return $self->{freqs} = \@freqs; } # all distinct term names sub Terms { my $self = shift; return keys %{$self->{terms}}; } # number of common terms divided by total number of terms sub CommonTermsRatio { my $self = shift; my ($other) = @_; my @terms = $self->Terms(); my %terms; @terms{@terms} = 1 .. @terms; my @oTerms = $other->Terms(); my (%union); @union{@terms} = 1 .. @terms; @union{@oTerms} = 1 .. @oTerms; my @intersection = map( ( $terms{$_} ? 1 : () ), @oTerms ); my $unionCardinality = scalar( keys %union ); ($unionCardinality > 0) or return undef; return scalar(@intersection) / $unionCardinality; } sub PureASCII { my $self = shift; $self->{compress} = 1; } sub WriteToString { my $self = shift; my $block = join( ',', %{$self->{terms}} ); my $compressed = undef; if( $COMPRESS_AVAILABLE && $self->{compress} ){ $block = Compress::Zlib::compress( $block ); # $block = compress( $block ); $compressed = 1; } my $header = 'p=' . __PACKAGE__ . ' v=' . $Text::Document::VERSION . ' l=' . length( $block ) . ' compress=' . ($compressed?'1':'0') . ' ' . join( ' ', map( "$_=$self->{$_}", @FIELDS)) . "\n"; my $str = $header . $block; # add 8-char hex-encoded 4-byte checksum at the end of data return $str . sprintf( '%08x', unpack( '%32C*', $str ) ); } sub NewFromString { my ($str) = @_; my $self = {}; # verify checksum # try to be compatible with version 1.03 my $stored_checksum = unpack( 'N', substr( $str, -4 )); my $data_payload = substr( $str, 0, -4 ); my $computed_checksum = unpack( '%32C*', $data_payload ); if( $stored_checksum != $computed_checksum ){ $stored_checksum = hex( substr( $str, -8 )); $data_payload = substr( $str, 0, -8 ); $computed_checksum = unpack( '%32C*', $data_payload ); } if( $stored_checksum != $computed_checksum ){ die( __PACKAGE__ . '::NewFromString : ' . 'checksum test failed ' . $stored_checksum . ' != ' . $computed_checksum ); } # split data in header and block my ($header,$block) = split( /\n/, $data_payload, 2 ); # parse header line my %header = split( /[ =]+/, $header ); # check that the reading package is the same as the one that wrote if( $header{p} ne __PACKAGE__ ){ die( __PACKAGE__ . '::NewFromString : ' . "file was not written by " . __PACKAGE__ ); } # version must be identical if( $header{v} > $Text::Document::VERSION ){ die( __PACKAGE__ . '::NewFromString : ' . "Current version is $Text::Document::VERSION" . " and the file version is $header{v}" ); } # size of block must match if( $header{l} != length( $block ) ){ die( __PACKAGE__ . '::NewFromString : ' . "data size is " . length( $block ) . "instead of $header{l} " ); } # compressed? if( $header{compress} and not($COMPRESS_AVAILABLE) ){ die( __PACKAGE__ . '::NewFromString : ' . 'header indicates that data is compressed, ' . 'but Compress::Zlib is not available' ); } if( $header{compress} ){ $block = Compress::Zlib::uncompress( $block ); # $block = uncompress( $block ); } @{$self}{@FIELDS} = @header{ @FIELDS }; # retrieve terms and recurrence count %{$self->{terms}} = split( /,/, $block ); bless $self, $header{p}; return $self; } sub JaccardSimilarity { my $self = shift; my ($e) = @_; my @inter = map( ( $self->{terms}->{$_} ? $_ : () ), keys %{$e->{terms}} ); my %union = %{$self->{terms}}; my @keyse = keys %{$e->{terms}}; @union{@keyse} = @keyse; if( (my $unionSize = scalar keys %union) > 0 ){ return scalar(@inter) / $unionSize; } else { return undef; } } sub CosineSimilarity { my $self = shift; my ($e) = @_; my ($Dv,$Ev) = ($self->{terms}, $e->{terms}); my %union = %{$self->{terms}}; my @keyse = keys %{$e->{terms}}; @union{@keyse} = @keyse; my $dotProduct = 0.0; map( $dotProduct += (defined($Dv->{$_}) ? $Dv->{$_} : 0.0) * (defined($Ev->{$_}) ? $Ev->{$_} : 0.0 ), keys %union ); my $nD = $self->EuclideanNorm(); my $nE = $e->EuclideanNorm(); if( ($nD==0) || ($nE==0) ){ return undef; } else { return $dotProduct / $nD / $nE; } } sub EuclideanNorm { my $self = shift; defined( $self->{EuclideanNorm} ) and return $self->{EuclideanNorm}; my $sum = 0.0; map( $sum += $_*$_, values %{$self->{terms}} ); return ($self->{EuclideanNorm} = sqrt( $sum )); } # this is rather rough sub WeightedCosineSimilarity { my $self = shift; my ($e,$weightFunction,$rock) = @_; my ($Dv,$Ev) = ($self->{terms}, $e->{terms}); # compute union my %union = %{$self->{terms}}; my @keyse = keys %{$e->{terms}}; @union{@keyse} = @keyse; my @allkeys = keys %union; # weighted D my @Dw = map(( defined( $Dv->{$_} )? &{$weightFunction}( $rock, $_ )*$Dv->{$_} : 0.0 ), @allkeys ); # weighted E my @Ew = map(( defined( $Ev->{$_} )? &{$weightFunction}( $rock, $_ )*$Ev->{$_} : 0.0 ), @allkeys ); # dot product of D and E my $dotProduct = 0.0; map( $dotProduct += $Dw[$_] * $Ew[$_] , 0..$#Dw ); # norm of D my $nD = 0.0; map( $nD += $Dw[$_] * $Dw[$_] , 0..$#Dw ); $nD = sqrt( $nD ); # norm of E my $nE = 0.0; map( $nE += $Ew[$_] * $Ew[$_] , 0..$#Ew ); $nE = sqrt( $nE ); # dot product scaled by norm if( ($nD==0) || ($nE==0) ){ return undef; } else { return $dotProduct / $nD / $nE; } } 1; __END__ =head1 NAME Text::Document - a text document subject to statistical analysis =head1 SYNOPSIS my $t = Text::Document->new(); $t->AddContent( 'foo bar baz' ); $t->AddContent( 'foo barbaz; ' ); my @freqList = $t->KeywordFrequency(); my $u = Text::Document->new(); ... my $sj = $t->JaccardSimilarity( $u ); my $sc = $t->CosineSimilarity( $u ); my $wsc = $t->WeightedCosineSimilarity( $u, \&MyWeight, $rock ); =head1 DESCRIPTION C allows to perform simple Information-Retrieval-oriented statistics on pure-text documents. Text can be added in chunks, so that the document may be incrementally built, for instance by a class like C. A simple algorithm splits the text into terms; the algorithm may be redefined by subclassing and redefining C. The C function computes term frequency over the whole document. =head1 FORESEEN REUSE The package may be {re}used either by simple instantiation, or by subclassing (defining a descendant package). In the latter case the methods which are foreseen to be redefined are those ending with a C suffix. Redefining other methods will require greater attention. =head1 CLASS METHODS =head2 new The creator method. The optional arguments are in the I<(key,value)> form and allow to specify whether all keywords are trasformed to lowercase (default) and whether the string representation (C) will be compressed (default). my $d = Text::Document->new(); my $dNotCompressed = Text::Document( compressed => 0 ); my $dPreserveCase = Text::Document( lowercase => 0 ); =head2 NewFromString Take a string written by C (see below) and create a new C with the same contents; call C whenever the restore is impossible or ill-advised, for instance when the current version of the package is different from the original one, or the compression library in unavailable. my $b = Text::Document::NewFromString( $str ); The return value is a blessed reference; put in another way, this is an alternative contructor. The string should have been written by C; you may of course tweak the string contents, but at this point you're entirely on you own. =head1 INSTANCE METHODS =head2 AddContent Used as $d->AddContent( 'foo bar baz foo9' ); $d->AddContent( 'mary had a little lamb' ); Successive calls accumulate content; there is currently no way of resetting the content to zero. =head2 Terms Returns a list of all distinct terms in the document, in no particular order. =head2 Occurrences Returns the number of occurrences of a given term. $d->AddContent( 'foo baz bar foo foo'); my $n = $d->Occurrences( 'foo' ); # now $n is 3 =head2 ScanV Scan a string and return a list of terms. Called internally as: my @terms = $self->ScanV( $text ); =head2 KeywordFrequency Returns a reference list of pairs I<[term,frequency]>, sorted by ascending frequency. my $listRef = $d->KeywordFrequency(); foreach my $pair (@{$listRef}){ my ($term,$frequency) = @{$pair}; ... } Terms in the document are sampled and their frequencies of occurrency are sorted in ascending order; finally, the list is returned to the user. =head2 WriteToString Convert the document (actually, some parameters and the term counters) into a string which can be saved and later restored with C. my $str = $d->WriteToString(); The string begins with a header which encodes the originating package, its version, the parameters of the current instance. Whenever possible, C is used in order to compress the bit vector in the most efficient way. On systems without C, the bit string is saved uncompressed. This method is influenced by C. =head2 PureASCII Ensure that the representation in WriteToString does not contain characters with ASCII code >= 128. Needed to easily include document representations into textual databases (e.g. XML files). =head2 JaccardSimilarity Compute the Jaccard measure of document similarity, which is defined as follows: given two documents I and I, let I and I be the set of terms occurring in I and I, respectively. Define I as the intersection of I and I, and I as their union. Then the Jaccerd similarity is the the number of elements of I divided by the number of elements of I. It is called as follows: my $sim = $d->JaccardSimilarity( $e ); If neither document has any terms the result is undef (a rare evenience). Otherwise the similarity is a real number between 0.0 (no terms in common) and 1.0 (all terms in common). =head2 CosineSimilarity Compute the cosine similarity between two documents I and I. Let I and I be the set of terms occurring in I and I, respectively. Define I as the union of I and I, and let I be the I-th element of I. Then the term vectors of I and I are Dv = (nD(t1), nD(t2), ..., nD(tN)) Ev = (nE(t1), nE(t2), ..., nE(tN)) where nD(ti) is the number of occurrences of term ti in I, and nE(ti) the same for I. Now we are at last ready to define the cosine similarity I: CS = (Dv,Ev) / (Norm(Dv)*Norm(Ev)) Here (... , ...) is the scalar product and Norm is the Euclidean norm (square root of the sum of squares). C is called as $sim = $d->CosineSimilarity( $e ); It is C if either I or I have no occurrence of any term. Otherwise, it is a number between 0.0 and 1.0. Since term occurrences are always non-negative, the cosine is obviously always non-negative. =head2 WeightedCosineSimilarity Compute the weighted cosine similarity between two documents I and I. In the setting of C, the term vectors of I and I are Dv = (nD(t1)*w1, nD(t2)*w2, ..., nD(tN)*wN) Ev = (nE(t1)*w1, nE(t2)*w2, ..., nE(tN)*wN) The weights are nonnegative real values; each term has associated a weight. To achieve generality, weights may be defined using a function, like: my $wcs = $d->WeightedCosineSimilarity( $e, \&function, $rock ); The C will be called as follows: my $weight = function( $rock, 'foo' ); C<$rock> is a 'constant' object used for passing a I to the function. For instance, a common way of defining weights is the IDF (inverse document frequency), which is defined in L. In this context, you can weigh terms with their IDF as follows: $sim = $c->WeightedCosineSimilarity( $d, \&Text::DocumentCollection::IDF, $collection ); C will call $collection->IDF( 'foo' ); which is what we expect. Actually, we should return the square root of IDF, but this detail is not necessary here. =head1 AUTHORS spinellia@acm.org (Andrea Spinelli) walter@humans.net (Walter Vannini) =head1 HISTORY 2001-11-02 - initial revision 2001-11-20 - added WeightedCosineSimilarity, suggested by JP Mc Gowan 2002-02-03 - changed representation of checksum. New method C. =head DISCARDED CHOICES We did not use C, because we wanted to fine-tune compression and version compatibility. However, this choice may be easily reversed redefining WriteToString and NewFromString.