package KSx::Highlight::Summarizer; $VERSION = '0.06'; @ISA = KinoSearch::Highlight::Highlighter; use KinoSearch::Highlight::Highlighter; use strict; use List::Util qw 'min'; use Number::Range; use Hash::Util::FieldHash::Compat 'fieldhashes'; fieldhashes \my( %ellipsis, %summ_len, %page_h, %encoder ); sub _range_endpoints { my $range = shift; my @range = $range->range; my $previous = shift @range; my $subrange = [($previous) x 2]; my @arrays; foreach my $current (@range) { if ($current == ($previous + 1)) { $subrange->[1] = $current; } else { push @arrays, $subrange; $subrange = [($current) x 2]; } $previous = $current; } return @arrays, $subrange; # Make sure the last subrange isn’t left out! } sub new { my($pack, %args) = @_; my $ellipsis = exists $args{ellipsis} ? delete $args{ellipsis} : ' ... '; my $summ_len = exists $args{summary_length} ? delete $args{summary_length} : 0; my $page_h = delete $args{page_handler}; my $encoder = delete $args{encoder}; # accept args that the superclass only allows one to set through # accessor methods: my $pre_tag = delete $args{pre_tag}; my $post_tag = delete $args{post_tag}; my $self = SUPER::new $pack %args; $ellipsis{$self} = $ellipsis; $summ_len{$self} = $summ_len; $page_h{$self} = $page_h; $encoder{$self} = $encoder; defined $pre_tag and $self->set_pre_tag($pre_tag); defined $post_tag and $self->set_post_tag($post_tag); return $self; } sub create_excerpt { my ($self, $hitdoc) = @_; my $field = $self->get_field; my $x_len = $self->get_excerpt_length; my $limit = int($x_len /3 ); # retrieve the text from the chosen field my $text = $hitdoc->{$field}; return unless defined $text; my $text_length = length $text; return '' unless $text_length; # get offsets and weights of words that match my $searcher = $self->get_searchable; my $posits = $self->get_compiler->highlight_spans( searchable => $searcher, field => $field, doc_vec => $searcher->fetch_doc_vec( $hitdoc->get_doc_id ), ); my @locs = map [$_->get_offset,$_->get_weight], @{ KinoSearch::Highlight::HeatMap->new( spans => $posits, window => $limit*2 )->get_spans }; @locs = map $$_[0], sort { $$b[1] <=> $$a[1] } @locs; @locs or @locs = 0; #warn "@locs" if $summ_len{$self}; # determine the rough boundaries of the excerpts my $range = new Number::Range; my $summ_len = $summ_len{$self}; for(@locs) { no warnings; # suppress Number::Range’s nasty warnings my $start = $_-$limit; $start = 0 if $start < 0; $range->addrange($start . '..' . min($start+$x_len, $text_length)); last if !$summ_len || $range->size >= $summ_len; } my @excerpt_bounds = _range_endpoints($range); #use DDS; warn Dump \@excerpt_bounds if $summ_len; # close small gaps between ranges for(my $c = 1; $c < @excerpt_bounds;++$c) { $excerpt_bounds[$c][0] - $excerpt_bounds[$c-1][1] <= 10 and $excerpt_bounds[$c-1][1] = $excerpt_bounds[$c][1], splice(@excerpt_bounds, $c, 1), --$c; } # extract the offsets from the highlight spans my(@starts, @ends); for(@$posits) { push(@starts, my $start = $_->get_offset); push(@ends, $start + $_->get_length); } # make the summary my $summary = ''; my $ellipsis = $ellipsis{$self}; my $token_re = qr/\b\w+(?:'\w+)?\b/; my $prev_ellipsis; # whether the previous excerpt ended with an ellip. my $prev_page = 0; # last page number of previous excerpt my $page_h = $page_h{$self}; for(@excerpt_bounds) { # make the excerpt my ($start,$end) = @$_; # determine the page number that $start falls within my $page_no; $page_h and $page_no = substr($text, 0,$start) =~ y/\014// + 1; my $x; # short for x-cerpt my $need_ellipsis; #warn "<<".substr($text,$start,$limit).">>"; # look for a page break within $limit chars from $start (except we # shouldn’t do it if $start is 0 because there’s a good chance # we’ll go past the very word for whose sake this excerpt exists) # ~~~ What about a case in which a page break plus maybe a few # spaces occur just *before* $start. That shouldn’t get an # ellipsis (as in the elsif block below), should it? if($page_h && $start && substr($text,$start,$limit) =~ /^(.*)\014/s) { $start += length($1) + 1; $page_no += 1 + $1 =~ y/\014//; $x = substr $text, $start; } elsif( $start ) { # if this is not the beginning of the doc my $sb = $self->find_sentences( text => $text, offset => $start, length => $limit ); if(@$sb) { $start = $$sb[0]; } else { ++ $need_ellipsis } $x = substr $text, $start; if($need_ellipsis) { # skip past possible partial tokens, but don’t insert an # ellipsis yet, because it might need to come after a # page marker if ($x =~ s/ \A ( .{1,$limit}? # don't go outside the window ) (?=$token_re) # just b4 the start of a full token //xsm ) { $start += length($1); } } } else { $x = substr $text, $start } # trim unwanted text from the end of the excerpt $x = substr $x, 0, $end-$start+1; # +1 ’cos we need that extra # char later my $end_with_ellipsis = 0; # if we’ve trimmed the end of the text if ( $end < $text_length) {{ # doubled so ‘last’ will work # check to see whether there are page breaks after the high- # lighted word, and stop at the first one if so if ($page_h and substr($x, $limit*-2) =~ s/(\014[^\014]*)//) { $end -= length $1; last; } # remove possible partial tokens from the end of the excerpt my $extra_char = chop $x; # the char we left dangling earlier # if the extra char wasn't part of a token, then we’re not # splitting one if ( $extra_char =~ $token_re ) { $x =~ s/$token_re$//; # if this fails, that's fine } # if the excerpt doesn't end with a full stop, end with # an ellipsis if ( $x !~ /\.\s*\Z/xsm ) { $x =~ s/\W+\Z//xsm; $x .= $ellipsis; ++$end_with_ellipsis; } }} #warn $x if $page_h; # get the offsets that are within range for the excerpt, and make # them relative to $start my @relative_starts = map $_-$start, @starts; my @relative_ends = map $_-$start, @ends; my $this_x_len = $end - $start; while ( @relative_starts and $relative_starts[0] < 0 ) { shift @relative_starts; shift @relative_ends; } while ( @relative_ends and $relative_ends[-1] > $this_x_len ) { pop @relative_starts; pop @relative_ends; } # insert highlight tags and page break markers # sstart and send stand for span start and end my ( $sstart, $send, $last_sstart, $last_send ) = ( undef, undef, 0, 0 ); if($page_h) { # Some of this code *is* repeated redundantly, but it # should theoretically run faster since the # if($page_h) check doesn’t have to be made every # time through the loop. $prev_page != $page_no ? ( $summary .= &$page_h($hitdoc, $page_no), $need_ellipsis && ($summary .= $ellipsis) ) : $need_ellipsis && !$prev_ellipsis && ($summary .= $ellipsis) ; while (@relative_starts) { $send = shift @relative_ends; $sstart = shift @relative_starts; $summary .= _encode_with_pb( $self, substr( $x, $last_send, $sstart - $last_send ), $page_h, \$page_no, $hitdoc ) unless !$last_send && !$sstart; $summary .= $self->highlight( _encode_with_pb( $self, substr( $x, $sstart, $send - $sstart ), $page_h, \$page_no, $hitdoc ) ); $last_send = $send; } $summary .= _encode_with_pb( $self, substr( $x, $last_send ), $page_h, \$page_no, $hitdoc ) unless $last_send == length $x; $prev_page = $page_no; } else { $need_ellipsis and !$prev_ellipsis and $summary .= $ellipsis; while (@relative_starts) { $send = shift @relative_ends; $sstart = shift @relative_starts; $summary .= $self->encode( substr( $x, $last_send, $sstart - $last_send ) ) unless !$last_send && !$sstart; $summary .= $self->highlight( $self->encode( substr( $x, $sstart, $send - $sstart ) ) ); $last_send = $send; } $summary .= $self->encode( substr( $x, $last_send ) ) unless $last_send == length $x; } $prev_ellipsis = $end_with_ellipsis; } return $summary; } # This is not called as a method above, because it’s a private routine that # should not be overridden (it is not guaranteed to exist in future ver- # sions), and it’s faster to call it as a function. sub _encode_with_pb { # w/page breaks my ($self, $text, $page_h, $page_no_ref, $hitdoc) = @_; my @to_encode = split /\014/, $text, -1; # -1 to allow trailing my $ret = ''; # null fields $ret .= $self->encode(shift @to_encode) if length $to_encode[0]; for(@to_encode) { $ret .= &$page_h($hitdoc, ++$$page_no_ref); $ret .= $self->encode($_) if length; } $ret; } sub encode { my @__ = @_; # workaround for perl5.8.8 bug &{ $encoder{$__[0]} or return shift(@__)->SUPER::encode(@__) }($__[1]) } 1; __END__ =head1 NAME KSx::Highlight::Summarizer - KinoSearch Highlighter subclass that provides more comprehensive summaries =head1 VERSION 0.06 (beta) =head1 SYNOPSIS use KSx::Highlight::Summarizer; my $summarizer = new KSx::Highlight::Summarizer searchable => $searcher, query => 'foo bar', field => 'content', # optional: pre_tag => '', post_tag => '', encoder => sub { my $str = shift; $str =~ s/([&'"<])/''.ord($1).';'/eg; $str }, page_handler => sub { "