package Text::Bloom; use strict; use Bit::Vector; use FileHandle; @Text::Bloom::hashParam = ( [ 1, 0, ], #identity [ 48661, 109441 ], [ 13679, 103651 ], [ 41851, 33413 ], [ 69499, 2399 ], [ 55799, 85037 ], [ 127051, 73571 ], [ 7393, 60821 ], [ 123449, 100297 ], [ 124309, 87547 ], [ 67129, 5531 ], [ 72689, 44389 ], ); BEGIN { $Text::Bloom::VERSION = '1.3'; %Text::Bloom::Radix = (); @Text::Bloom::RadixDomain = (); # previous value was too large for some linux boxes # $Text::Bloom::p = 4294967291; $Text::Bloom::p = 499979; %Text::Bloom::config = ( d => 4, size => 65536*2, compress=> 1, ); # print __PACKAGE__ . ' v' . $Text::Bloom::VERSION . " ready\n"; my @domain = 'a' .. 'z'; push @domain, '0' .. '9'; @Text::Bloom::RadixDomain = @domain; @Text::Bloom::Radix{ @domain } = 0 .. $#domain; eval { require Compress::Zlib; }; if( $@ ){ $Text::Bloom::config{compress} = undef; } } sub new { my $class= shift; my %self = @_; my $self = {}; foreach my $key (qw( d size compress ) ){ if( defined($self{$key}) ){ $self->{$key} = $self{$key}; } else { $self->{$key} = $Text::Bloom::config{$key}; } } bless $self, $class; return $self; } sub Size { my $self = shift; my ($newsize) = @_; $newsize and ($self->{size} = $newsize); return $self->{size}; } sub Compute { my $self = shift; my @terms = @_; my $bv = Bit::Vector->new( $self->{size} ); foreach my $w (@terms) { my $q = $self->QuantizeV( $w ); foreach my $i (1..$self->{d}){ my $index = $self->HashV( $i-1, $q ); if( 0 && $bv->bit_test($index) ){ print STDOUT ("$w -> $q -> $index" . " COLLISION\n"); } $bv->Bit_On( $index ); } } my $nTerms = scalar @terms; my $nBits = $bv->Norm(); if( $nBits > 0 ){ $self->{collisionRatio} = 1.0 - $nBits/$nTerms/$self->{d}; } print STDERR ( $nTerms . ' terms added, bit vector norm is ' . $nBits . ', collision ratio is ' . $self->{collisionRatio} . "\n" ) unless 1; return $self->{bv} = $bv; } sub QuantizeV { my $self = shift; my ($term) = @_; my @chars = split( //, $term ); my $q = 0; for( my $i=$#chars; $i>=0; $i-- ){ $q = $q * (scalar( @Text::Bloom::RadixDomain )+1) + $Text::Bloom::Radix{ $chars[$i] } + 1; $q %= $Text::Bloom::p; } return $q; } sub HashV { my $self =shift; my ($order,$x ) = @_; my ($m,$q) = @{$Text::Bloom::hashParam[$order]}; my $scrambled = $x * $m + $q; $scrambled %= $self->{size}; return $scrambled; } sub WriteToString { my $self = shift; return undef unless $self->{bv}; my $block = $self->{bv}->Block_Read() ; if( $self->{compress} ){ $block = Compress::Zlib::compress( $self->{bv}->Block_Read()); $block or die( __PACKAGE__ . '::WriteToString : ' . 'cannot compress block' ); } my $str = 'p=' . __PACKAGE__ . ' v=' . $Text::Bloom::VERSION . ' size=' . $self->{size} . ' d=' . $self->{d} . ' compress=' . ($self->{compress}?1:0) . ' l=' . length( $block ) . "\n" ; $str .= $block; $str .= pack( 'L', unpack( '%32C*', $str )); return $str; } sub WriteToFile { my $self = shift; my ($file) = @_; my $f = FileHandle->new( '>' . $file ); binmode $f; $f->print( $self->WriteToString() ); } sub NewFromString { my ($string) = @_; my $self = {}; # verify checksum my $stored_checksum = substr( $string, -4 ); $stored_checksum = unpack( 'L', $stored_checksum ); $string = substr( $string, 0, -4 ); my $computed_checksum = unpack( '%32C*', $string ); if( $stored_checksum ne $computed_checksum ){ die( __PACKAGE__ . '::NewFromString : ' . 'checksum test failed ' . $stored_checksum . ' != ' . $computed_checksum ); } # split in two: first line and rest my( $header, $block ) = split( /\n/, $string, 2 ); # parse header line my %header = split( /[ =]+/, $header ); # check that the reading package is the same as the one that wrote if( $header{p} ne __PACKAGE__ ){ die( __PACKAGE__ . '::NewFromString : ' . "file was not written by " . __PACKAGE__ . ". Header is '$header'" ); } # version must be identical if( $header{v} != $Text::Bloom::VERSION ){ die( __PACKAGE__ . '::NewFromString : ' . "Current version is $Text::Bloom::VERSION" . " and the file version is $header{v}" ); } # size of block must match if( $header{l} != length( $block ) ){ die( __PACKAGE__ . '::NewFromString : ' . "data size is " . length( $block ) . "instead of $header{l} " ); } # retrieve header info @{$self}{ qw( size d compress ) } = @header{ qw( size d compress ) }; # if we have to decompress, check that we have the required lib if( not( $Text::Bloom::config{compress} ) and ($header{compress}==1) ){ die( __PACKAGE__ . '::NewFromString : ' . 'file $file is compressed, but ' . 'Compress::Zlib is not available.' ); } if( $header{compress} ){ eval { $block = Compress::Zlib::uncompress( $block ); }; ($block and not($@)) or die( __PACKAGE__ . '::WriteToString : ' . 'cannot uncompress block' ); } $self->{bv} = Bit::Vector->new( $self->{size} ); $self->{bv}->Block_Store( $block ); bless $self, __PACKAGE__; } sub NewFromFile { my ($file) = @_; my $f = FileHandle->new( $file ); binmode $f; local $/ = undef; my $freezed = <$f>; return Text::Bloom::NewFromString( $freezed ); } # similarity is the number of common bits # divided by the number of all nonzero bits sub Similarity { my $self = shift; my ($other) = @_; if( $self->{size} != $other->{size} ){ die( __PACKAGE__ . '::Similarity : ' . 'Bloom signatures cannot be compared ' . "because sizes are $self->{size} and $other->{size}" ); } my $union = Bit::Vector->new( $self->{size} ); my $inter = Bit::Vector->new( $self->{size} ); $union->Union( $self->{bv}, $other->{bv} ); $inter->Intersection( $self->{bv}, $other->{bv} ); my $normUnion = $union->Norm(); ($normUnion == 0) and return 0; my $normInter = $inter->Norm(); return $normInter / $normUnion; } sub GenerateHashes { foreach (0..10){ my $v = int( $Text::Bloom::config{size} * rand() ); $v = Text::Bloom::GreatestPrimeLessThan( $v ); my $vB = int( $Text::Bloom::config{size} * rand() ); $vB = Text::Bloom::GreatestPrimeLessThan( $vB ); print "[ $v,\t$vB ],\n"; } } # very naive primality test sub GreatestPrimeLessThan { my ($number) = @_; while( not Text::Bloom::IsPrime( $number ) ){ $number--; } return $number; } sub IsPrime { my ($number) = @_; my $max = int( sqrt( $number ) ); for( my $i = 2; $i <= $max ; $i++ ){ if( $number % $i == 0 ){ # print "$number is divisible by $i\n"; return undef; } } # print "$number is prime\n"; return 1; } 1; # this line was used to generate hashParam # Text::Bloom::GenerateHashes(); __END__ =head1 NAME Text::Bloom - Evaluate Bloom signature of a set of terms =head1 SYNOPSIS my $b = Text::Bloom->new(); $b->Compute( qw( foo bar baz ) ); my $sig = $b->WriteToString(); $b->WriteToFile( 'afile.sig' ); my $b2 = Text::Bloom::NewFromFile( 'afile.sig' ); my $b3 = Text::Bloom->new(); $b3->Compute( qw( foo bar barbaz ) ); my $sim = $b->Similarity( $b2 ); my $b4 = Text::Bloom::NewFromString( $sig ); =head1 DESCRIPTION C applies the Bloom filtering technique to the statistical analysis of documents. The terms in the document are quantized using a base-36 radix representation; each term thus corresponds to an integer in the range 0..I, where I

is a prime, currently set to the greatest prime less than 2^32. Each quantized value is mapped to I integers in the range 0..I, where I is an integer less than I

, currently 2^17, using a family of hash functions, computed by the C function. Each hashed value is used as the index in a large bit vector. Bits corresponding to terms present in the document are set to 1; all other bits are set to 0. Of course, collisions may cause the same bit to be set twice, by different terms. It follows that, if the document contains I distinct terms, in the resulting bit vector at most I bits are set to 1. The resulting bit string is a very compact representation of the presence/absence of terms in the document, and is therefore characterised as a I. Moreover, it does not depend on a pre-set dictionary of terms. The signature may be used for: =over 4 =item * testing whether a given set of terms is present in the document, =item * computing which fraction of terms are common to two documents. =back The bit representation may be written to and read from a file. C prepends a header to the bit stream proper; moreover, whenever the package C is available, the bit vector is compressed, so that disk space requirements are drastically reduced, especially for small documents. The hash function is obviously a crucial component of the filter; the reference implementation uses a radix representation of strings. Each term must therefore match the regular expression C. There are quite a few viable alternatives, which can be pursued by subclassing and redefining the method C. =head1 FORESEEN REUSE The package may be {re}used either by simple instantiation, or by subclassing (defining a descendant package). In the latter case the methods which are foreseen to be redefined are those ending with a C suffix. Redefining other methods will require greater attention. =head1 CLASS METHODS =head2 new The constructor. No arguments are required. $b = Text::Bloom->new(); =head2 NewFromString Take a string written by C (see below) and create a new C with the same contents; call C whenever the restore is impossible or ill-advised, for instance when the current version of the package is different from the original one, or the compression library in unavailable. my $b = Text::Bloom::NewFromString( $str ); The return value is a blessed reference; put in another way, this is an alternative contructor. The string should have been written by C; you may of course tweak the string contents, but at this point you're entirely on you own. =head2 NewFromFile Utility function that reads a binary file and performs a C on its content; see its counterpart, C. my $b2 = Text::Document::NewFromFile( 'foo.sig' ); =head1 INSTANCE METHODS =head2 Size Set and get the size of the filter, in bits. The default size is currently 128K. print 'size is ' . $b->Size() . "\n"; $b->Size( 65536 ); The C method must be called before the C method in order to have effect. =head2 Compute Compute the Bloom signature from the given set of words and store it internally. $b->Compute( qw( foo bar baz foobar bazbaz ) ); Makes use of the C method. =head2 QuantizeV Convert a term into an integer; must return an integer in the range 0 .. C<$Text::Bloom::p-1>. It is called as my $hash = $b->QuantizeV( $term ); The current version is designed for strings matching C. Other characters do not cause errors, but degrade the hash function performance. This function is a likely candidate for redefinition. =head2 HashV Convert an integer to a (smaller) integer, according to one of a class of similar functions. It is internally called as: my $index = $b->HashV( $order, $value ); The C<$value> must belong to the interval 0..C<$Text::Bloom::p-1>, while the index must lie in 0..I. C<$order> is a small integer from 0 to I. The default implementation is index = m[order] * value + q[order] (mod size) the values of I and I are taken from the array C<@Text::Bloom::hashParam>; the form of the function is taken from [2]. =head2 WriteToString Convert the Bloom signature into a string which can be saved and later restored with C. C must have been called previously. my $str = $b->WriteToString(); The string begins with a header which encodes the originating package, its version, the parameters of the current instance. Whenever possible, C is used in order to compress the bit vector in the most efficient way. On systems without C, the bit string is saved uncompressed. =head2 WriteToFile These convenience functions just call their String counterparts and read/write the file specified in the argument. $b->WriteToFile( 'foo.sig' ); =head1 AUTHORS spinellia@acm.org (Andrea Spinelli) walter@humans.net (Walter Vannini) =head1 BIBLIOGRAPHY =over 4 =item [1] Burton H. Bloom, "Space/time trade-offs in hash coding with allowable errors", I, B<13>, 7, July 1970, pages 422-426. (available electronically from ACM Digital Library). =item [2] M. V. Ramakrishna, "Practical Performance of Bloom FIlters and Parallel Free-Text Searching", I, B<32>, 10, October 1989, pages 1237-1239. (available electronically from ACM Digital Library). =back =head1 BUGS On Win32 we have experienced some instabilities when dealing with a large number of signatures; in this case Perl crashes without apparent explanation. The main suspect is Bit::Vector, but without any evidence. =head1 HISTORY 2001-11-02 - initial revision 2002-02-04 - reduced prime p to pacify some linux boxes