package AI::Categorizer::Collection::Files;

use strict;
use warnings;

use AI::Categorizer::Collection;
use base qw(AI::Categorizer::Collection);

use Params::Validate qw(:types);
use File::Spec;

__PACKAGE__->valid_params
  (
   path    => { type => SCALAR|ARRAYREF },
   recurse => { type => BOOLEAN, default => 0 },
  );

# Constructor.  Accepts the superclass parameters plus 'path' (a directory
# name or an array ref of directory names) and 'recurse' (boolean).  Opens
# the first directory immediately, so it dies if that directory is unreadable
# or if the list of directories is empty.
sub new {
  my $class = shift;
  my $self = $class->SUPER::new(@_);

  # Documents are contained in a directory, or list of directories.
  # Always work on a private copy: the list is consumed with shift() as we
  # iterate, and the caller's own array must not be modified.
  $self->{path} = [ ref $self->{path} ? @{ $self->{path} } : $self->{path} ];

  # Directories already opened, kept so that rewind() can restore them.
  $self->{used} = [];

  $self->_next_path;
  return $self;
}

# Closes the current directory (if any) and opens the next pending one,
# recording it in 'cur_dir' and in the 'used' list.  Dies if no directory
# is pending or if the directory cannot be opened.  Returns true on success.
sub _next_path {
  my $self = shift;

  # Test the handle itself rather than the directory name: a directory
  # called "0" is a valid name but a false value.
  if (defined $self->{dir_fh}) {
    closedir $self->{dir_fh};
    $self->{dir_fh} = undef;
  }

  die "No directories left to read" unless @{ $self->{path} };

  my $dir = shift @{ $self->{path} };
  push @{ $self->{used} }, $dir;
  $self->{cur_dir} = $dir;

  opendir(my $dh, $dir) or die "$dir: $!";
  $self->{dir_fh} = $dh;

  return 1;
}

# Returns the document built from the next file in the collection, or
# nothing once every directory has been exhausted.  Warns when the file has
# no entry in the 'category_hash' lookup.
sub next {
  my $self = shift;

  my $file = $self->_read_file;
  return unless defined $file;

  my $cat_names = $self->{category_hash}{$file};
  warn "No category information about '$file'" unless defined $cat_names;

  my @cats = map { AI::Categorizer::Category->by_name(name => $_) }
                 @{ $cat_names || [] };

  # 'cur_dir' is read only after _read_file() has returned, because
  # _read_file() may have advanced to a new directory.
  return $self->call_method('document', 'read',
                            path       => File::Spec->catfile($self->{cur_dir}, $file),
                            name       => $file,
                            categories => \@cats,
                           );
}

# Returns the name (without directory) of the next plain entry, advancing
# through the pending directories as each one is exhausted.  Returns undef
# in scalar context when nothing is left.  '.' and '..' are skipped;
# subdirectories are skipped too, but are queued for later processing when
# 'recurse' is true.
#
# Implemented as a loop rather than by recursion so that a long run of
# skipped entries cannot build up an arbitrarily deep call stack.
sub _read_file {
  my ($self) = @_;

  ENTRY: while (1) {
    # No open handle (e.g. a previous opendir failed) is treated the same
    # as an exhausted directory.
    my $file = defined $self->{dir_fh} ? readdir($self->{dir_fh}) : undef;

    if (!defined $file) {
      # Directory has been exhausted
      return unless @{ $self->{path} };
      $self->_next_path;
      next ENTRY;
    }

    next ENTRY if $file eq '.' or $file eq '..';

    my $path = File::Spec->catdir($self->{cur_dir}, $file);
    if (-d $path) {
      # Add for later processing, unless it is already pending or done.
      push @{ $self->{path} }, $path
        if $self->{recurse}
          and !grep { $_ eq $path } @{ $self->{path} }, @{ $self->{used} };
      next ENTRY;
    }

    return $file;
  }
}

# Restarts iteration from the first directory.  Directories that were
# already visited are put back in front of the ones still pending, so the
# traversal order after a rewind matches the original order even when the
# rewind happens part-way through.
sub rewind {
  my $self = shift;

  unshift @{ $self->{path} }, @{ $self->{used} };
  @{ $self->{used} } = ();

  return $self->_next_path;
}

# Returns the number of files in the collection, caching the result in
# 'document_count'.  The first call walks every directory and rewinds the
# collection before and after doing so, which discards the position of any
# iteration in progress.
#
# This should share an iterator with next()
sub count_documents {
  my $self = shift;
  return $self->{document_count} if defined $self->{document_count};

  $self->rewind;

  my $count = 0;
  $count++ while defined $self->_read_file;

  $self->rewind;

  return $self->{document_count} = $count;
}

1;
__END__

=head1 NAME

AI::Categorizer::Collection::Files - One document per file

=head1 SYNOPSIS

  my $c = AI::Categorizer::Collection::Files->new
    (path => '/tmp/docs/training',
     category_file => '/tmp/docs/cats.txt');
  print "Total number of docs: ", $c->count_documents, "\n";
  while (my $document = $c->next) {
    ...
  }
  $c->rewind; # For further operations

=head1 DESCRIPTION

This implements a Collection class in which each document exists as a
single file on a filesystem.  The documents can exist in a single
directory, or in several directories.

=head1 METHODS

This is a subclass of the abstract AI::Categorizer::Collection class,
so any methods mentioned in its documentation are available here.

=over 4

=item new()

Creates a new Collection object and returns it.  In addition to the
parameters accepted by the superclass, the following parameters are
accepted:

=over 4

=item path

Indicates a location on disk where the documents can be found.  The
path may be specified as a string giving the name of a directory, or
as a reference to an array of such strings if the documents are
located in more than one directory.

=item recurse

Indicates whether subdirectories of the directory (or directories) in
the C<path> parameter should be descended into.  If set to a true
value, they will be descended into.  If false, they will be ignored.
The default is false.

=back

=item next()

Returns the next document in the collection, or nothing when every
file has been read.  A warning is issued for any file that has no
category information.

=item rewind()

Resets the collection so that C<next()> starts again from the first
directory.

=item count_documents()

Returns the number of files in the collection.  The result is cached
after the first call.  Computing it rewinds the collection, so any
iteration that is in progress will start over.

=back

=head1 AUTHOR

Ken Williams, ken@mathforum.org

=head1 COPYRIGHT

Copyright 2002-2003 Ken Williams.  All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

AI::Categorizer::Collection(3)

=cut