package KinoSearch::Index::TermInfosWriter;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params
        invindex       => undef,
        seg_name       => undef,
        is_index       => 0,
        index_interval => 1024,
        skip_interval  => 16,
    );
}
our %instance_vars;

# Constructor.  Takes labeled params:
#
#   invindex       - object providing file_exists, delete_file and
#                    open_outstream
#   seg_name       - base name of the output file
#   is_index       - false: write "<seg_name>.tis" (primary dictionary);
#                    true:  write "<seg_name>.tii" (index over the primary)
#   index_interval - the primary forwards one entry to the index writer
#                    every index_interval terms; must be >= 1
#   skip_interval  - terms whose doc_freq reaches this value get a
#                    skip_offset written
#
# Any existing file of the same name is deleted first.  A primary writer
# also creates its .tii "doppelganger" and the two are linked to each other.
sub new {
    my $class = shift;
    confess kerror() unless verify_args( \%instance_vars, @_ );
    my %args = ( %instance_vars, @_ );
    my $invindex = $args{invindex};

    # the C layer computes (size % index_interval); reject values that
    # would make that a division by zero
    confess("index_interval must be a positive integer")
        unless ( defined $args{index_interval}
        && $args{index_interval} >= 1 );

    # open an outstream
    my $suffix   = $args{is_index} ? 'tii' : 'tis';
    my $filename = "$args{seg_name}.$suffix";
    $invindex->delete_file($filename)
        if $invindex->file_exists($filename);
    my $outstream = $args{invindex}->open_outstream($filename);

    my $self = _new( $outstream,
        @args{qw( is_index index_interval skip_interval )} );

    # create the tii doppelganger
    if ( !$args{is_index} ) {
        # hand the caller's intervals to the index writer so that both file
        # headers record the same values instead of the index silently
        # falling back to the defaults
        my $other = __PACKAGE__->new(
            invindex       => $invindex,
            seg_name       => $args{seg_name},
            is_index       => 1,
            index_interval => $args{index_interval},
            skip_interval  => $args{skip_interval},
        );

        # NOTE(review): each writer stores a reference to the other, which
        # looks like a reference cycle -- confirm how these get released.
        $self->_set_other($other);
        $other->_set_other($self);
    }

    return $self;
}

# Finalize the file: back-patch the term count into the header, finish the
# doppelganger too (when called on the primary), then close the outstream.
sub finish {
    my $self      = shift;
    my $outstream = $self->_get_outstream;

    # seek to near the head and write the number of terms processed; this
    # overwrites the placeholder written after the format int by
    # Kino_TInfosWriter_new (presumably that int occupies bytes 0-3)
    $outstream->seek(4);
    $outstream->lu_write( 'Q', $self->_get_size );

    # cue the doppelganger's exit
    if ( !$self->_get_is_index ) {
        $self->_get_other()->finish;
    }

    $outstream->close;
}

1;

__END__

__XS__

MODULE = KinoSearch   PACKAGE = KinoSearch::Index::TermInfosWriter

=for comment

Low-level constructor: wraps Kino_TInfosWriter_new.  Called by new() with the
already-opened outstream.

=cut

TermInfosWriter*
_new(outstream_sv, is_index, index_interval, skip_interval)
    SV         *outstream_sv;
    I32         is_index;
    I32         index_interval;
    I32         skip_interval;
CODE:
    RETVAL = Kino_TInfosWriter_new(outstream_sv, is_index, index_interval,
        skip_interval);
OUTPUT: RETVAL

=for comment

Add a Term (encoded as a termstring) and its associated TermInfo.

=cut

void
add(obj, termstring_sv, tinfo)
    TermInfosWriter *obj;
    SV              *termstring_sv;
    TermInfo        *tinfo;
PREINIT:
    ByteBuf bb;
    STRLEN  len;
PPCODE:
    /* wrap the scalar's string buffer in a stack ByteBuf; no copy is made */
    bb.ptr  = SvPV(termstring_sv, len);
    bb.size = len;
    Kino_TInfosWriter_add(obj, &bb, tinfo);

=for comment

Export the FORMAT constant to Perl.

=cut

IV
FORMAT()
CODE:
    RETVAL = KINO_TINFOS_FORMAT;
OUTPUT: RETVAL

=for comment

Combined accessor, dispatched on the ALIAS number.  _set_other stores a copy
of its argument, re-extracts the C struct pointer from it, and then falls
through to return the stored value just as _get_other does.

=cut

SV*
_set_or_get(obj, ...)
    TermInfosWriter *obj;
ALIAS:
    _set_other       = 1
    _get_other       = 2
    _get_outstream   = 4
    _get_is_index    = 6
    _get_size        = 8
CODE:
{
    KINO_START_SET_OR_GET_SWITCH

    case 1:  SvREFCNT_dec(obj->other_sv);
             obj->other_sv = newSVsv( ST(1) );
             Kino_extract_struct(obj->other_sv, obj->other,
                TermInfosWriter*, "KinoSearch::Index::TermInfosWriter");
             /* fall through */
    case 2:  RETVAL = newSVsv(obj->other_sv);
             break;

    case 4:  RETVAL = newSVsv(obj->fh_sv);
             break;

    case 6:  RETVAL = newSViv(obj->is_index);
             break;

    case 8:  RETVAL = newSViv(obj->size);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL

=for comment

Release the C struct and the references it holds.

=cut

void
DESTROY(obj)
    TermInfosWriter *obj;
PPCODE:
    Kino_TInfosWriter_destroy(obj);

__H__

#ifndef H_KINO_TERM_INFOS_WRITER
#define H_KINO_TERM_INFOS_WRITER 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchIndexTerm.h"
#include "KinoSearchIndexTermInfo.h"
#include "KinoSearchStoreOutStream.h"
#include "KinoSearchUtilByteBuf.h"
#include "KinoSearchUtilCClass.h"
#include "KinoSearchUtilMathUtils.h"
#include "KinoSearchUtilMemManager.h"
#include "KinoSearchUtilStringHelper.h"

/* first value written to the file header */
#define KINO_TINFOS_FORMAT -2

typedef struct terminfoswriter {
    OutStream  *fh;               /* struct extracted from fh_sv */
    SV         *fh_sv;            /* our copy of the Perl outstream object */
    I32         is_index;         /* true for the .tii writer */
    I32         index_interval;   /* primary forwards every Nth term */
    I32         skip_interval;    /* doc_freq threshold for skip_offset */
    struct terminfoswriter* other;  /* paired writer, extracted from other_sv */
    SV         *other_sv;         /* Perl object for the paired writer */
    ByteBuf    *last_termstring;  /* previous termstring, for prefix sharing */
    TermInfo   *last_tinfo;       /* previous TermInfo, for delta encoding */
    I32         last_fieldnum;    /* initialized to -1 */
    double      last_tis_ptr;     /* .tis position at the last index entry */
    I32         size;             /* number of terms added so far */
} TermInfosWriter;

TermInfosWriter* Kino_TInfosWriter_new(SV*, I32, I32, I32);
void Kino_TInfosWriter_add(TermInfosWriter*, ByteBuf*, TermInfo*);
void Kino_TInfosWriter_destroy(TermInfosWriter*);

#endif /* include guard */

__C__

#include "KinoSearchIndexTermInfosWriter.h"

/* Allocate a TermInfosWriter around the supplied outstream object and write
 * the file header: format, a placeholder for the term count, index_interval
 * and skip_interval.
 */
TermInfosWriter*
Kino_TInfosWriter_new(SV *outstream_sv, I32 is_index,
                      I32 index_interval, I32 skip_interval) {
    TermInfosWriter *obj;

    /* allocate */
    Kino_New(0, obj, 1, TermInfosWriter);

    /* assign */
    obj->is_index       = is_index;
    obj->index_interval = index_interval;
    obj->skip_interval  = skip_interval;
    obj->fh_sv          = newSVsv(outstream_sv);
    Kino_extract_struct(obj->fh_sv, obj->fh, OutStream*,
        "KinoSearch::Store::OutStream");

    /* NOTE: this value forces the first field_num in the .tii file to -1.
     * Do not change it. */
    obj->last_termstring = Kino_BB_new_string("\xff\xff", 2);
    obj->last_tinfo      = Kino_TInfo_new();
    obj->last_fieldnum   = -1;
    obj->last_tis_ptr    = 0;
    obj->size            = 0;
    obj->other           = NULL;
    obj->other_sv        = &PL_sv_undef;

    /* write file header */
    obj->fh->write_int(obj->fh, KINO_TINFOS_FORMAT);
    obj->fh->write_long(obj->fh, 0.0); /* return to fill in later */
    obj->fh->write_int(obj->fh, index_interval);
    obj->fh->write_int(obj->fh, skip_interval);

    return obj;
}

/* Write out a term/terminfo combo.
 *
 * The termstring must begin with a KINO_FIELD_NUM_LEN-byte packed field
 * number; shorter input is rejected.  On the primary writer, each time the
 * running term count is a multiple of index_interval, the previously written
 * entry is first forwarded to the index writer.
 */
void
Kino_TInfosWriter_add(TermInfosWriter* obj, ByteBuf* termstring_bb,
                      TermInfo* tinfo) {
    char      *termstring, *last_tstring;
    STRLEN     termstring_len, last_tstring_len;
    I32        field_num;
    I32        overlap;
    char      *diff_start_str;
    STRLEN     diff_len;
    OutStream* fh;

    /* refuse input too short to hold the field number, before anything is
     * written: decoding it would read past the buffer and the length
     * arithmetic below would wrap */
    if (termstring_bb->size < KINO_FIELD_NUM_LEN) {
        croak("TermInfosWriter: termstring too short (%lu bytes)",
            (unsigned long)termstring_bb->size);
    }

    /* make local copy */
    fh = obj->fh;

    /* write a subset of the entries to the .tii index */
    if (    (obj->size % obj->index_interval == 0)
         && (!obj->is_index)
    ) {
        Kino_TInfosWriter_add(obj->other, obj->last_termstring,
            obj->last_tinfo);
    }

    /* extract string pointers and string lengths */
    termstring       = termstring_bb->ptr;
    last_tstring     = obj->last_termstring->ptr;
    termstring_len   = termstring_bb->size;
    last_tstring_len = obj->last_termstring->size;

    /* to obtain field number, decode packed 'n' at top of termstring */
    field_num = (I16)Kino_decode_bigend_U16(termstring);

    /* move past field_num */
    termstring       += KINO_FIELD_NUM_LEN;
    last_tstring     += KINO_FIELD_NUM_LEN;
    termstring_len   -= KINO_FIELD_NUM_LEN;
    last_tstring_len -= KINO_FIELD_NUM_LEN;

    /* count how many bytes the strings share at the top */
    overlap = Kino_StrHelp_string_diff(last_tstring, termstring,
        last_tstring_len, termstring_len);
    diff_start_str = termstring + overlap;
    diff_len       = termstring_len - overlap;

    /* write number of common bytes */
    fh->write_vint(fh, overlap);

    /* write the remaining bytes, i.e. those not shared with the last term */
    fh->write_string(fh, diff_start_str, diff_len);

    /* write field number and doc_freq */
    fh->write_vint(fh, field_num);
    fh->write_vint(fh, tinfo->doc_freq);

    /* delta encode filepointers */
    fh->write_vlong(fh, (tinfo->frq_fileptr - obj->last_tinfo->frq_fileptr) );
    fh->write_vlong(fh, (tinfo->prx_fileptr - obj->last_tinfo->prx_fileptr) );

    /* write skipdata */
    if (tinfo->doc_freq >= obj->skip_interval)
        fh->write_vint(fh, tinfo->skip_offset);

    /* the .tii index file gets a pointer to the location of the primary */
    if (obj->is_index) {
        double tis_ptr;
        tis_ptr = obj->other->fh->tell(obj->other->fh);
        obj->fh->write_vlong(obj->fh, (tis_ptr - obj->last_tis_ptr));
        obj->last_tis_ptr = tis_ptr;
    }

    /* track number of terms */
    obj->size++;

    /* remember for delta encoding */
    Kino_BB_assign_string(obj->last_termstring, termstring_bb->ptr,
        termstring_bb->size);
    StructCopy(tinfo, obj->last_tinfo, TermInfo);
}

/* Drop the references to the outstream and the paired writer, then free the
 * remembered term state and the struct itself.
 */
void
Kino_TInfosWriter_destroy(TermInfosWriter *obj) {
    SvREFCNT_dec(obj->fh_sv);
    SvREFCNT_dec(obj->other_sv);
    Kino_BB_destroy(obj->last_termstring);
    Kino_TInfo_destroy(obj->last_tinfo);
    Kino_Safefree(obj);
}

__POD__

=begin devdocs

=head1 NAME

KinoSearch::Index::TermInfosWriter - write a term dictionary

=head1 DESCRIPTION

The TermInfosWriter writes both parts of the term dictionary.  The primary
instance creates a shadow TermInfosWriter that writes the index.

=head1 TODO

Find the optimum TermIndexInterval.

=head1 COPYRIGHT

Copyright 2005-2009 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch> version 0.165.

=end devdocs

=cut