package classifyMPCA; ####NOT TESTED############## use Combine::XWI; #Mandatory use Combine::Config; #Optional if you want to use the Combine configuration system use Saa; #This comes from the MPCA suite. Make sure it's available to Perl my $saa = new Saa(); my $wait=1; my %MSG=('command'=>'call', 'object'=>'classify', 'function'=>'query' ); use Combine::Config; my ($host,$port)=split(':',Combine::Config::Get('MPCAHostPort')); #API: # a subroutine named 'classify' taking a XWI-object as in parameter # return values: 0/1 # 0: record fails to meet the classification criteria, ie ignore this record # 1: record is OK and should be stored in the database, and links followed by the crawler sub classify { my ($self,$xwi) = @_; #utility routines to extract information from the XWI-object #Title: my $text = $xwi->title . ' '; #Metadata: $xwi->meta_rewind; my ($name,$content); while (1) { ($name,$content) = $xwi->meta_get; last unless $name; next if ($name eq 'Rsummary'); next if ($name =~ /^autoclass/); $text .= $content . " "; } #Headings:? # $xwi->heading_rewind; # my $this; # while (1) { # $this = $xwi->heading_get or last; # $head .= $this . " "; # } #Text: $this = $xwi->text; if ($this) { $text .= $$this; } ############################### #Apply your classification algorithm here # assign $result a value (0/1) ############################### $MSG{'content'}="$text"; my $pca = &_send_query(\%MSG); #print "PCA=$pca\n"; #utility routines for saving detailed results (optional) in the database. These data may appear # in exported XML-records #Topic takes 5 parameters # $xwi->topic_add(topic_class_notation, topic_absolute_score, topic_normalized_score, topic_terms, algorithm_id); # topic_class_notation, topic_terms, and algorithm_id are strings # max length topic_class_notation: 50, algorithm_id: 25 # topic_absolute_score, and topic_normalized_score are integers # topic_normalized_score and topic_terms are optional and may be replaced with 0, '' respectively $xwi->topic_add('ALL', 1000*$pca, 1000*$pca,'','mpca'); # return true (1) if you want to keep the record # otherwise return false (0) if ($pca>0.5) {return 1;} else {return 0;} } sub _send_query { my $msg=shift; # warn "query_client(): before queue()"; $saa->queue($host, $port, $msg, arb_name => undef, arb => undef) || die($saa->{'err'} . "\n"); # warn "query_client(): after queue()"; my($ok, $sent, $received, $pending); $received = []; $sent = []; while(scalar(@$sent) < 1) { # ($ok, $sent, $received, $pending) = $saa->process(0.01); ($ok, $sent, $received, $pending) = $saa->process(10); $ok || die($saa->{'err'} . "\n"); } if($wait) { while(scalar(@$received) < 1) { # ($ok, $sent, $received, $pending) = $saa->process(0.01); ($ok, $sent, $received, $pending) = $saa->process(10); $ok || die($saa->{'err'} . "\n"); } } if ($received->[0]->{msg}->{result} ne 'ok') { print "$received->[0]->{msg}->{'result-text'}\n"; return 0.5; ########################?????????????? } return $received->[0]->{msg}->{score}; } 1;