## $Id: Matcher.pm 152 2006-09-27 13:10:57Z anders $ # Copyright (c) 1996-1998 LUB NetLab, 2002-2006 Anders Ardö # # See the file LICENCE included in the distribution. package Combine::Matcher; use Exporter(); @ISA=qw(Exporter); @EXPORT=(Match); use strict; ##use Combine::LoadTermList; use HTML::Entities; use locale; #needed for \b in regexps to work OK ????KOLLA???? #CONFIG my $DoTermStat = 0; # Dont save matched terms in a file for statitics #my $DoTermStat = 1; # Do save matched terms in a file for statitics # #my $KeepTerms = 0; # Dont keep matched terms in an internal hash my $KeepTerms = 1; # Do keep matched terms in an internal hash - MAY CAUSE MEMORY PROBLEM my %mterms; # hash to keep macthed terms # Keep frequency of matched terms in an internal hash - MAY CAUSE MEMORY PROBLEM my $KeepFreqTerms = 0; my %fterms; # hash to keep frequency of macthed terms #Matcher.pm ------------------ sub GetTermsM { # return matched terms up to now # empty hash of matched terms my %t; my $k; %t = %mterms; foreach $k (keys %mterms) { delete $mterms{$k}; } return (%t); } sub GetTermsF { # return frequency of matched terms up to now my %t; my $k; %t = %fterms; foreach $k (keys %fterms) { delete $fterms{$k}; } return (%t); } sub Match { my ($t, $termlist) = @_; #Accepts a text (either SCALAR or reference) and returns a list of classifications and scores my %score; my $cl = ""; my $k; my $i; my $ant; my $text; #Holds a reference to the text to be matched. if (ref($t)) {$text=$t;} else {$text=\$t;} if (length($$text) < 3) { return %score; } #unitialized? study($text); #OK att skicka in en reference? if ( $DoTermStat ) { open(TMS,">>TermMatchStat.txt"); } foreach $i (0 .. $#{$termlist->{Term}}) { $k=@{$termlist->{Term}}[$i]; if ( $k =~ /\@and/ ) { $ant= &boolmatch($k,$text); if ( $ant > 0 ) { if ( $DoTermStat ) { print TMS "$k\n"; } foreach $cl ( split('\s*,\s+', @{$termlist->{TermClass}}[$i] ) ) { if (length($cl)>0) { #print "BHIT: $ant, $cl\n"; $score{$cl} += $ant * @{$termlist->{TermWeight}}[$i]; #fel if ( $KeepTerms && defined($mterms{$cl}) && ! ($mterms{$cl} =~ /$k, /) ) { #OK warn if ( $KeepTerms && ! ($mterms{$cl} =~ /$k, /) ) { if ( $KeepTerms && ( !defined($mterms{$cl}) || (! ($mterms{$cl} =~ /$k, /)) ) ) { $mterms{$cl} .= "$k, "; } if ( $KeepFreqTerms ) { $fterms{$k}++; } } } } } else { while ($$text =~ /\b$k\b/g) { if ( $DoTermStat ) { print TMS "$k\n"; } foreach $cl ( split('\s*,\s+', @{$termlist->{TermClass}}[$i] ) ) { if (length($cl)>0) { #print "HIT: $cl ($k)\n"; $score{$cl} += @{$termlist->{TermWeight}}[$i]; #fel if ( $KeepTerms && defined($mterms{$cl}) && ! ($mterms{$cl} =~ /$k, /) ) { #OK warn if ( $KeepTerms && ! ($mterms{$cl} =~ /$k, /) ) { if ( $KeepTerms && ( !defined($mterms{$cl}) || (! ($mterms{$cl} =~ /$k, /)) ) ) { $mterms{$cl} .= "$k, "; } if ( $KeepFreqTerms ) { $fterms{$k}++; } } } } } } if ( $DoTermStat ) { close(TMS); } return (%score); } sub boolmatch { my ($terms, $text) = @_; my $m; my $min=100000; my $t; my @term; @term = split('\s+\@and\s+', $terms); foreach $t (@term) { $m=0; while ( $$text =~ /\b$t\b/g ) { $m++; } if ( $m == 0 ) { return 0; } if ( $min > $m ) { $min = $m; } } return $min; } #GetText.pm ------------------ sub getTextXWI { my ( $xwi, $DoStem, $stoplist, $simple) = @_; my $url =""; my $title="No Title"; my $size=0; # my $DoStem = 0 unless $DoStem; #False my $meta=""; my $head=""; my $text=""; $xwi->meta_rewind; my ($name,$content); while (1) { ($name,$content) = $xwi->meta_get; last unless $name; next if ($name eq 'Rsummary'); next if ($name =~ /^autoclass/); $meta .= $content . " "; } $title = $xwi->title; # $head = $xwi->title; # AA0 Treated separately $xwi->heading_rewind; my $this; while (1) { $this = $xwi->heading_get or last; $head .= $this . " "; } $this = $xwi->text; if ($this) { $this = $$this; $text .= $this ; } $size = $xwi->length; #unitialized? if ($simple) { if ( defined($meta) && ($meta ne '') ) { SimpletextConv(\$meta, $DoStem); } if ( defined($head) && ($head ne '') ) { SimpletextConv(\$head, $DoStem); } if ( defined($text) && ($text ne '') ) { SimpletextConv(\$text, $DoStem); } if ( defined($title) && ($title ne '') ) { SimpletextConv(\$title, $DoStem); } } else { if ( defined($meta) && ($meta ne '') ) { textConv(\$meta, $DoStem, $stoplist); } if ( defined($head) && ($head ne '') ) { textConv(\$head, $DoStem, $stoplist); } if ( defined($text) && ($text ne '') ) { textConv(\$text, $DoStem, $stoplist); } if ( defined($title) && ($title ne '') ) { textConv(\$title, $DoStem, $stoplist); } } return ($meta, $head, $text, $url, $title, $size); } #Keep?? sub getTextURL { my ( $url, $DoStem, $stoplist, $simple) = @_; my $title = ""; my $size = 0; # my $DoStem = 0; #False my $meta=""; my $head=""; my $text=""; my $html = `GET $url`; if ($html eq "") { return ($meta, $head, $text, $url, $title, -1); } $size = length($html); $html =~ s/\n/ /g; # converting HTML chars to Latin1 $html = HTML::Entities::decode_entities($html); #title if ($html =~ s/