package Plucene::Index::Writer; =head1 NAME Plucene::Index::Writer - write an index. =head1 SYNOPSIS my $writer = Plucene::Index::Writer->new($path, $analyser, $create); $writer->add_document($doc); $writer->add_indexes(@dirs); $writer->optimize; # called before close my $doc_count = $writer->doc_count; =head1 DESCRIPTION This is the writer class. If an index will not have more documents added for a while and optimal search performance is desired, then the C method should be called before the index is closed. =head1 METHODS =cut use strict; use warnings; use Carp qw/cluck croak/; use Fcntl qw(O_EXCL O_CREAT O_WRONLY); use File::Path qw(mkpath); use File::Spec::Functions qw(catfile); use List::Util qw(sum); use File::Temp qw(tempdir); use Plucene::Index::DocumentWriter; use Plucene::Index::SegmentInfos; use Plucene::Index::SegmentInfo; use Plucene::Index::SegmentReader; use Plucene::Index::SegmentMerger; use Plucene::Utils; use constant MAX_FIELD_LENGTH => 10_000; our $mergefactor = 10; our $max_merge_docs = ~0; our $DEBUG = 0; =head2 new my $writer = Plucene::Index::Writer->new($path, $analyser, $create); This will create a new Plucene::Index::Writer object. The third argument to the constructor determines whether a new index is created, or whether an existing index is opened for the addition of new documents. =cut sub new { my ($class, $path, $analyzer, $create) = @_; $create = 0 unless defined $create; if (!-d $path) { croak "Couldn't write into $path - it doesn't exist" unless $create; mkpath($path) or croak "Couldn't create $path - $!"; } my $lock = catfile($path, "write.lock"); my $self = bless { directory => $path, analyzer => $analyzer, lock => $lock, # There are many like it, but this one is mine segmentinfos => new Plucene::Index::SegmentInfos(), tmp_directory => tempdir(CLEANUP => 1), }, $class; local *FH; sysopen FH, $lock, O_EXCL | O_CREAT | O_WRONLY or croak "Couldn't get lock"; do_locked { $create ? $self->{segmentinfos}->write($path) : $self->{segmentinfos}->read($path); } catfile($path, "commit.lock"); return $self; } sub DESTROY { my $self = shift; if ($self->{lock}) { unlink $self->{lock} } $self->_flush; } =head2 doc_count my $doc_count = $writer->doc_count; =cut sub doc_count { my $self = shift; return sum map $_->doc_count(), $self->{segmentinfos}->segments; } =head2 add_document $writer->add_document($doc); =cut sub add_document { my ($self, $doc) = @_; my $dw = Plucene::Index::DocumentWriter->new($self->{tmp_directory}, $self->{analyzer}, MAX_FIELD_LENGTH); my $segname = $self->_new_segname; $dw->add_document($segname, $doc); #lock $self; $self->{segmentinfos}->add_element( Plucene::Index::SegmentInfo->new({ name => $segname, doc_count => 1, dir => $self->{tmp_directory} })); $self->_maybe_merge_segments; } sub _new_segname { "_" . shift->{segmentinfos}->{counter}++ # Urgh } sub _flush { my $self = shift; my @segs = $self->{segmentinfos}->segments; my $min_segment = $#segs; my $doc_count = 0; while ($min_segment >= 0 and $segs[$min_segment]->dir eq $self->{tmp_directory}) { $doc_count += $segs[$min_segment]->doc_count; $min_segment--; } if ( $min_segment < 0 or ($doc_count + $segs[$min_segment]->doc_count > $mergefactor) or !($segs[-1]->dir eq $self->{tmp_directory})) { $min_segment++; } return if $min_segment > @segs; $self->_merge_segments($min_segment); } =head2 optimize $writer->optimize; Merges all segments together into a single segment, optimizing an index for search. =cut sub optimize { my $self = shift; my $segments; while ( ($segments = scalar $self->{segmentinfos}->segments) > 1 or # If it's fragmented ( $segments == 1 and # or it's not fragmented ( Plucene::Index::SegmentReader->has_deletions( # but has deletions $self->{segmentinfos}->info(0)))) ) { my $minseg = $segments - $mergefactor; $self->_merge_segments($minseg < 0 ? 0 : $minseg); } } =head2 add_indexes $writer->add_indexes(@dirs); Merges all segments from an array of indexes into this index. This may be used to parallelize batch indexing. A large document collection can be broken into sub-collections. Each sub-collection can be indexed in parallel, on a different thread, process or machine. The complete index can then be created by merging sub-collection indexes with this method. After this completes, the index is optimized. =cut sub add_indexes { my ($self, @dirs) = @_; $self->optimize; for my $dir (@dirs) { my $sis = new Plucene::Index::SegmentInfos; $sis->read($dir); $self->{segmentinfos}->add_element($_) for $sis->segments; } $self->optimize; } # Incremental segment merger. # Or even this code - SC sub _maybe_merge_segments { my $self = shift; my $target_merge_docs = $mergefactor; while ($target_merge_docs <= $max_merge_docs) { cluck("No segments defined!") unless $self->{segmentinfos}; my $min_seg = scalar $self->{segmentinfos}->segments; my $merge_docs = 0; while (--$min_seg >= 0) { my $si = $self->{segmentinfos}->info($min_seg); last if $si->doc_count >= $target_merge_docs; $merge_docs += $si->doc_count; } last unless $merge_docs >= $target_merge_docs; warn "It's time to do a merge\n" if $DEBUG; $self->_merge_segments($min_seg + 1); $target_merge_docs *= $mergefactor; } } # Pops segments off of segmentInfos stack down to minSegment, merges # them, and pushes the merged index onto the top of the segmentInfos stack. sub _merge_segments { my $self = shift; my $min_segment = shift; my $mergedname = $self->_new_segname; my $mergedcount = 0; print "Merging segments\n" if $DEBUG; my $merger = Plucene::Index::SegmentMerger->new({ dir => $self->{directory}, segment => $mergedname }); my @to_delete; my @segments = $self->{segmentinfos}->segments; return if $#segments < $min_segment; for my $si (@segments[ $min_segment .. $#segments ]) { print " " . $si->name . " (" . $si->doc_count . " docs)\n" if $DEBUG; my $reader = Plucene::Index::SegmentReader->new($si); $merger->add($reader); push @to_delete, $reader if $reader->directory eq $self->{directory} or $reader->directory eq $self->{tmp_directory}; $mergedcount += $si->doc_count; } print " into $mergedname ($mergedcount docs)\n" if $DEBUG; $merger->merge; $self->{segmentinfos}->{segments} = # This is a bit naughty [ ($self->{segmentinfos}->segments)[ 0 .. $min_segment - 1 ], Plucene::Index::SegmentInfo->new({ name => $mergedname, dir => $self->{directory}, doc_count => $mergedcount }) ]; do_locked { $self->{segmentinfos}->write($self->{directory}); $self->_delete_segments(@to_delete); } catfile($self->{directory}, "commit.lock"); } sub _delete_segments { my ($self, @to_delete) = @_; my @try_later = $self->_delete($self->_read_deletable_files); for my $reader (@to_delete) { for my $file ($reader->files) { push @try_later, $self->_delete(catfile($reader->{directory}, $file)); } } $self->_write_deletable_files(@try_later); } sub _delete { my ($self, @files) = @_; my @failed; for (@files) { unlink $_ or push @failed, $_ } return @failed; } sub _read_deletable_files { my $self = shift; return unless -e (my $dfile = catfile($self->{directory}, "deletable")); open my $fh, $dfile or die $!; chomp(my @files = <$fh>); return @files; } sub _write_deletable_files { my ($self, @files) = @_; my $dfile = catfile($self->{directory}, "deletable"); open my $fh, ">" . $dfile . ".new" or die $!; print $fh "$_\n" for @files; close($fh); rename $dfile . ".new", $dfile; } 1;