package Search::Estraier; use 5.008; use strict; use warnings; our $VERSION = '0.09'; =head1 NAME Search::Estraier - pure perl module to use Hyper Estraier search engine =head1 SYNOPSIS =head2 Simple indexer use Search::Estraier; # create and configure node my $node = new Search::Estraier::Node( url => 'http://localhost:1978/node/test', user => 'admin', passwd => 'admin', create => 1, label => 'Label for node', croak_on_error => 1, ); # create document my $doc = new Search::Estraier::Document; # add attributes $doc->add_attr('@uri', "http://estraier.gov/example.txt"); $doc->add_attr('@title', "Over the Rainbow"); # add body text to document $doc->add_text("Somewhere over the rainbow. Way up high."); $doc->add_text("There's a land that I heard of once in a lullaby."); die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) }); =head2 Simple searcher use Search::Estraier; # create and configure node my $node = new Search::Estraier::Node( url => 'http://localhost:1978/node/test', user => 'admin', passwd => 'admin', croak_on_error => 1, ); # create condition my $cond = new Search::Estraier::Condition; # set search phrase $cond->set_phrase("rainbow AND lullaby"); my $nres = $node->search($cond, 0); if (defined($nres)) { print "Got ", $nres->hits, " results\n"; # for each document in results for my $i ( 0 ... $nres->doc_num - 1 ) { # get result document my $rdoc = $nres->get_doc($i); # display attribte print "URI: ", $rdoc->attr('@uri'),"\n"; print "Title: ", $rdoc->attr('@title'),"\n"; print $rdoc->snippet,"\n"; } } else { die "error: ", $node->status,"\n"; } =head1 DESCRIPTION This module is implementation of node API of Hyper Estraier. Since it's perl-only module with dependencies only on standard perl modules, it will run on all platforms on which perl runs. It doesn't require compilation or Hyper Estraier development files on target machine. It is implemented as multiple packages which closly resamble Ruby implementation. It also includes methods to manage nodes. There are few examples in C directory of this distribution. =cut =head1 Inheritable common methods This methods should really move somewhere else. =head2 _s Remove multiple whitespaces from string, as well as whitespaces at beginning or end my $text = $self->_s(" this is a text "); $text = 'this is a text'; =cut sub _s { my $text = $_[1]; return unless defined($text); $text =~ s/\s\s+/ /gs; $text =~ s/^\s+//; $text =~ s/\s+$//; return $text; } package Search::Estraier::Document; use Carp qw/croak confess/; use Search::Estraier; our @ISA = qw/Search::Estraier/; =head1 Search::Estraier::Document This class implements Document which is single item in Hyper Estraier. It's is collection of: =over 4 =item attributes C<< 'key' => 'value' >> pairs which can later be used for filtering of results You can add common filters to C in estmaster's C<_conf> file for better performance. See C in L. =item vectors also C<< 'key' => 'value' >> pairs =item display text Text which will be used to create searchable corpus of your index and included in snippet output. =item hidden text Text which will be searchable, but will not be included in snippet. =back =head2 new Create new document, empty or from draft. my $doc = new Search::HyperEstraier::Document; my $doc2 = new Search::HyperEstraier::Document( $draft ); =cut sub new { my $class = shift; my $self = {}; bless($self, $class); $self->{id} = -1; my $draft = shift; if ($draft) { my $in_text = 0; foreach my $line (split(/\n/, $draft)) { if ($in_text) { if ($line =~ /^\t/) { push @{ $self->{htexts} }, substr($line, 1); } else { push @{ $self->{dtexts} }, $line; } next; } if ($line =~ m/^%VECTOR\t(.+)$/) { my @fields = split(/\t/, $1); if ($#fields % 2 == 1) { $self->{kwords} = { @fields }; } else { warn "can't decode $line\n"; } next; } elsif ($line =~ m/^%SCORE\t(.+)$/) { $self->{score} = $1; next; } elsif ($line =~ m/^%/) { # What is this? comment? #warn "$line\n"; next; } elsif ($line =~ m/^$/) { $in_text = 1; next; } elsif ($line =~ m/^(.+)=(.*)$/) { $self->{attrs}->{ $1 } = $2; next; } warn "draft ignored: '$line'\n"; } } $self ? return $self : return undef; } =head2 add_attr Add an attribute. $doc->add_attr( name => 'value' ); Delete attribute using $doc->add_attr( name => undef ); =cut sub add_attr { my $self = shift; my $attrs = {@_}; while (my ($name, $value) = each %{ $attrs }) { if (! defined($value)) { delete( $self->{attrs}->{ $self->_s($name) } ); } else { $self->{attrs}->{ $self->_s($name) } = $self->_s($value); } } return 1; } =head2 add_text Add a sentence of text. $doc->add_text('this is example text to display'); =cut sub add_text { my $self = shift; my $text = shift; return unless defined($text); push @{ $self->{dtexts} }, $self->_s($text); } =head2 add_hidden_text Add a hidden sentence. $doc->add_hidden_text('this is example text just for search'); =cut sub add_hidden_text { my $self = shift; my $text = shift; return unless defined($text); push @{ $self->{htexts} }, $self->_s($text); } =head2 add_vectors Add a vectors $doc->add_vector( 'vector_name' => 42, 'another' => 12345, ); =cut sub add_vectors { my $self = shift; return unless (@_); # this is ugly, but works die "add_vector needs HASH as argument" unless ($#_ % 2 == 1); $self->{kwords} = {@_}; } =head2 set_score Set the substitute score $doc->set_score(12345); =cut sub set_score { my $self = shift; my $score = shift; return unless (defined($score)); $self->{score} = $score; } =head2 score Get the substitute score =cut sub score { my $self = shift; return -1 unless (defined($self->{score})); return $self->{score}; } =head2 id Get the ID number of document. If the object has never been registred, C<-1> is returned. print $doc->id; =cut sub id { my $self = shift; return $self->{id}; } =head2 attr_names Returns array with attribute names from document object. my @attrs = $doc->attr_names; =cut sub attr_names { my $self = shift; return unless ($self->{attrs}); #croak "attr_names return array, not scalar" if (! wantarray); return sort keys %{ $self->{attrs} }; } =head2 attr Returns value of an attribute. my $value = $doc->attr( 'attribute' ); =cut sub attr { my $self = shift; my $name = shift; return unless (defined($name) && $self->{attrs}); return $self->{attrs}->{ $name }; } =head2 texts Returns array with text sentences. my @texts = $doc->texts; =cut sub texts { my $self = shift; #confess "texts return array, not scalar" if (! wantarray); return @{ $self->{dtexts} } if ($self->{dtexts}); } =head2 cat_texts Return whole text as single scalar. my $text = $doc->cat_texts; =cut sub cat_texts { my $self = shift; return join(' ',@{ $self->{dtexts} }) if ($self->{dtexts}); } =head2 dump_draft Dump draft data from document object. print $doc->dump_draft; =cut sub dump_draft { my $self = shift; my $draft; foreach my $attr_name (sort keys %{ $self->{attrs} }) { next unless defined(my $v = $self->{attrs}->{$attr_name}); $draft .= $attr_name . '=' . $v . "\n"; } if ($self->{kwords}) { $draft .= '%VECTOR'; while (my ($key, $value) = each %{ $self->{kwords} }) { $draft .= "\t$key\t$value"; } $draft .= "\n"; } if (defined($self->{score}) && $self->{score} >= 0) { $draft .= "%SCORE\t" . $self->{score} . "\n"; } $draft .= "\n"; $draft .= join("\n", @{ $self->{dtexts} }) . "\n" if ($self->{dtexts}); $draft .= "\t" . join("\n\t", @{ $self->{htexts} }) . "\n" if ($self->{htexts}); return $draft; } =head2 delete Empty document object $doc->delete; This function is addition to original Ruby API, and since it was included in C wrappers it's here as a convinience. Document objects which go out of scope will be destroyed automatically. =cut sub delete { my $self = shift; foreach my $data (qw/attrs dtexts stexts kwords/) { delete($self->{$data}); } $self->{id} = -1; return 1; } package Search::Estraier::Condition; use Carp qw/carp confess croak/; use Search::Estraier; our @ISA = qw/Search::Estraier/; =head1 Search::Estraier::Condition =head2 new my $cond = new Search::HyperEstraier::Condition; =cut sub new { my $class = shift; my $self = {}; bless($self, $class); $self->{max} = -1; $self->{options} = 0; $self ? return $self : return undef; } =head2 set_phrase $cond->set_phrase('search phrase'); =cut sub set_phrase { my $self = shift; $self->{phrase} = $self->_s( shift ); } =head2 add_attr $cond->add_attr('@URI STRINC /~dpavlin/'); =cut sub add_attr { my $self = shift; my $attr = shift || return; push @{ $self->{attrs} }, $self->_s( $attr ); } =head2 set_order $cond->set_order('@mdate NUMD'); =cut sub set_order { my $self = shift; $self->{order} = shift; } =head2 set_max $cond->set_max(42); =cut sub set_max { my $self = shift; my $max = shift; croak "set_max needs number, not '$max'" unless ($max =~ m/^\d+$/); $self->{max} = $max; } =head2 set_options $cond->set_options( 'SURE' ); $cond->set_options( qw/AGITO NOIDF SIMPLE/ ); Possible options are: =over 8 =item SURE check every N-gram =item USUAL check every second N-gram =item FAST check every third N-gram =item AGITO check every fourth N-gram =item NOIDF don't perform TF-IDF tuning =item SIMPLE use simplified query phrase =back Skipping N-grams will speed up search, but reduce accuracy. Every call to C will reset previous options; This option changed in version C<0.04> of this module. It's backwards compatibile. =cut my $options = { SURE => 1 << 0, USUAL => 1 << 1, FAST => 1 << 2, AGITO => 1 << 3, NOIDF => 1 << 4, SIMPLE => 1 << 10, }; sub set_options { my $self = shift; my $opt = 0; foreach my $option (@_) { my $mask; unless ($mask = $options->{$option}) { if ($option eq '1') { next; } else { croak "unknown option $option"; } } $opt += $mask; } $self->{options} = $opt; } =head2 phrase Return search phrase. print $cond->phrase; =cut sub phrase { my $self = shift; return $self->{phrase}; } =head2 order Return search result order. print $cond->order; =cut sub order { my $self = shift; return $self->{order}; } =head2 attrs Return search result attrs. my @cond_attrs = $cond->attrs; =cut sub attrs { my $self = shift; #croak "attrs return array, not scalar" if (! wantarray); return @{ $self->{attrs} } if ($self->{attrs}); } =head2 max Return maximum number of results. print $cond->max; C<-1> is returned for unitialized value, C<0> is unlimited. =cut sub max { my $self = shift; return $self->{max}; } =head2 options Return options for this condition. print $cond->options; Options are returned in numerical form. =cut sub options { my $self = shift; return $self->{options}; } =head2 set_skip Set number of skipped documents from beginning of results $cond->set_skip(42); Similar to C in RDBMS. =cut sub set_skip { my $self = shift; $self->{skip} = shift; } =head2 skip Return skip for this condition. print $cond->skip; =cut sub skip { my $self = shift; return $self->{skip}; } =head2 set_distinct $cond->set_distinct('@author'); =cut sub set_distinct { my $self = shift; $self->{distinct} = shift; } =head2 distinct Return distinct attribute print $cond->distinct; =cut sub distinct { my $self = shift; return $self->{distinct}; } =head2 set_mask Filter out some links when searching. Argument array of link numbers, starting with 0 (current node). $cond->set_mask(qw/0 1 4/); =cut sub set_mask { my $self = shift; return unless (@_); $self->{mask} = \@_; } package Search::Estraier::ResultDocument; use Carp qw/croak/; #use Search::Estraier; #our @ISA = qw/Search::Estraier/; =head1 Search::Estraier::ResultDocument =head2 new my $rdoc = new Search::HyperEstraier::ResultDocument( uri => 'http://localhost/document/uri/42', attrs => { foo => 1, bar => 2, }, snippet => 'this is a text of snippet' keywords => 'this\tare\tkeywords' ); =cut sub new { my $class = shift; my $self = {@_}; bless($self, $class); croak "missing uri for ResultDocument" unless defined($self->{uri}); $self ? return $self : return undef; } =head2 uri Return URI of result document print $rdoc->uri; =cut sub uri { my $self = shift; return $self->{uri}; } =head2 attr_names Returns array with attribute names from result document object. my @attrs = $rdoc->attr_names; =cut sub attr_names { my $self = shift; croak "attr_names return array, not scalar" if (! wantarray); return sort keys %{ $self->{attrs} }; } =head2 attr Returns value of an attribute. my $value = $rdoc->attr( 'attribute' ); =cut sub attr { my $self = shift; my $name = shift || return; return $self->{attrs}->{ $name }; } =head2 snippet Return snippet from result document print $rdoc->snippet; =cut sub snippet { my $self = shift; return $self->{snippet}; } =head2 keywords Return keywords from result document print $rdoc->keywords; =cut sub keywords { my $self = shift; return $self->{keywords}; } package Search::Estraier::NodeResult; use Carp qw/croak/; #use Search::Estraier; #our @ISA = qw/Search::Estraier/; =head1 Search::Estraier::NodeResult =head2 new my $res = new Search::HyperEstraier::NodeResult( docs => @array_of_rdocs, hits => %hash_with_hints, ); =cut sub new { my $class = shift; my $self = {@_}; bless($self, $class); foreach my $f (qw/docs hints/) { croak "missing $f for ResultDocument" unless defined($self->{$f}); } $self ? return $self : return undef; } =head2 doc_num Return number of documents print $res->doc_num; This will return real number of documents (limited by C). If you want to get total number of hits, see C. =cut sub doc_num { my $self = shift; return $#{$self->{docs}} + 1; } =head2 get_doc Return single document my $doc = $res->get_doc( 42 ); Returns undef if document doesn't exist. =cut sub get_doc { my $self = shift; my $num = shift; croak "expect number as argument, not '$num'" unless ($num =~ m/^\d+$/); return undef if ($num < 0 || $num > $self->{docs}); return $self->{docs}->[$num]; } =head2 hint Return specific hint from results. print $res->hint( 'VERSION' ); Possible hints are: C, C, C, C, C, C, C