The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

package Text::DocumentCollection;

use strict;
use warnings;
use Text::Document;

use DB_File;

use v5.6.0;

$Text::DocumentCollection::VERSION = 1.02;

sub new
{
	my $class = shift;
	my %self = @_;
	my $self = \%self;
	bless $self, $class;

	defined( $self->{file} ) or die( __PACKAGE__ . '::new : '
		. "keyword 'file' is mandatory"
	);

	my %persistent;

	(tie %persistent, 'DB_File', $self->{file}) or die( __PACKAGE__
		. '::new : '
		. "Cannot tie persistent hash: $!"
	);

	$self->{pdocs} = \%persistent;

	return $self;
}

sub NewFromDB
{
	my $self = Text::DocumentCollection::new(@_);
	while( my @kv = each %{$self->{pdocs}} ){
		$self->{docs}->{$kv[0]} =
			Text::Document::NewFromString( $kv[1] );
	}
	return $self;
}

sub Add
{
	my $self = shift;
	my ($key,$doc) = @_;

	if( defined( $self->{docs}->{$key} ) ){
		die( __PACKAGE__ . '::Add : '
			. "document `$key' is already in this collection"
		);
	}

	$self->{docs}->{$key} = $doc;

	delete $self->{IDF};

	$self->{pdocs}->{$key} = $doc->WriteToString();

	return $doc;
}

sub Delete
{
	my $self = shift;
	my ($key) = @_;

	if( not defined( $self->{docs}->{$key} ) ){
		return undef;
	}
	delete $self->{docs}->{$key};
	delete $self->{pdocs}->{$key};
	return 1;
}

sub EnumerateV
{
	my $self = shift;
	my ($callback,$rock) = @_;

	my @result = ();
	while( my @kv = each %{$self->{docs}} ){
		my @l = &{$callback}( $self, $kv[0], $kv[1], $rock );
		push @result, @l;
	}
	return @result;
}

sub IDF_Help
{
	my $self = shift;
	my ($key,$doc,$term) = @_;

	my $o = $doc->Occurrences( $term );
	$self->{_idf_n}++;
	if( $o and ($o>0) ){
		$self->{_idf_dt}++;
	}
}

sub IDF
{
	my $self = shift;
	my ($term) = @_;

	defined( $self->{IDF}->{$term} ) and return $self->{IDF}->{$term};
	$self->{_idf_n} = 0;
	$self->{_idf_dt} = 0;
	$self->EnumerateV( \&Text::DocumentCollection::IDF_Help, $term );
	if( $self->{_idf_dt} <= 0 ){
		warn( "term $term does not occur in any document" );
		return $self->{IDF}->{$term} = 0.0;
	}
	$self->{IDF}->{$term} =
		log( $self->{_idf_n} / $self->{_idf_dt} ) / log(2.0);

#	print "IDF($term) = $self->{IDF}->{$term}\n";
	return $self->{IDF}->{$term} ;
}

1;

__END__

=head1 NAME

  Text::DocumentCollection - a collection of documents

=head1 SYNOPSIS

=head1 DESCRIPTION

=head1 CLASS METHODS

=head2 new

The constructor; arguments must be passed as maps
from keys to values. The key C<file> is mandatory.

  my $c = Text::DocumentCollection->new( file => 'coll.db' );

Documents from the collection are saved as in the  specified file,
which is  currently handled by a C<DB_File> hash.

=head1 INSTANCE METHODS

=head2 Add

Add a document to the collection, tagging it with
a unique key.

  $c->Add( $key, $doc );

C<Add> C<die>s if the key is already present.

To change an existing key, use C<Delete> and then C<Add>.

=head2 Delete

Discard a document from the collection.

=head2 NewFromDB

Loads the collection from the given DB file:

  my $c = Text::DocumentCollection->NewFromDB( file => 'coll.db' );

The file must be either empty or created by a former invocation
of C<new> or C<NewFromDB>, followed by any number of C<Add>
and/or C<Delete>.

Currently, all documents in  the  collection are  revived
(by calling C<NewFromString>). This poses performance problems
for huge collections; a caching mechanism would be an option
in this case.

=head2 IDF

Inverse Term frequency of a given term.

The definition we used is, given a term I<t>, a set of documents
I<DOC> and the binary relationship I<has-term>:

  IDF(t) = log2( #DOC / #{ d in DOC | d has-term t } )

The logarithm is in base 2, since this is related to an
information measurement, and # is the cardinality operator.

=head2 EnumerateV

Enumerates all the document in the collection. Called as:

  my @result = $c->EnumerateV( \&Callback, 'the rock' );

The function C<Callback> will be called on each element
of the collection as:

  my @l = CallBack( $c, $key, $doc, $rock );

where C<$rock> is the second argument to C<Callback>.

Since C<$c> is the first argument, the callback may be
an instance method of C<Text::DocumentCollection>.

The final result is obtained by concatenating all the
partial results (C<@l> in the example above).  If you do
not want a result, simply return the empty list ().

There is no particular order of enumeration, so there
is no particular order in which results are concatenated.

=head1 AUTHORS

  spinellia@acm.org
  walter@humans.net