# $Id: XWI.pm,v 1.3 2006/12/13 15:06:46 anders Exp $

# Copyright (c) 1996-1998 LUB NetLab, 2002-2006 Anders Ardö
#
# See the file LICENCE included in the distribution.

package Combine::XWI;

use strict;
use HTML::Entities;

# Storage layout
# --------------
# The object is a plain blessed hash.  Every multi-valued list <L> (url,
# meta, xmeta, robot, topic, heading, link) is kept as flat hash keys:
#
#   <L>_count            number of stored entries
#   <L>_point            1-based read cursor used by <L>_get
#   <L>_<n>              entry n of a single-field list (url, heading)
#   <L>_<n>_<field>      field of entry n of a multi-field list
#
# These key names are reachable from outside through AUTOLOAD (for example
# $xwi->url_count), so they must not be renamed.
#
# All *_get methods return a single undef when the cursor has passed the
# last entry.  This is deliberately 'return undef' and not a bare 'return',
# so that existing callers see the same value in list context as before.

# Constructor: returns an empty record with every list reset.
sub new {
    my ($class) = @_;
    my $self = bless {}, $class;

    $self->url_reset;
    $self->heading_reset;
    $self->link_reset;
    $self->meta_reset;
    # The xmeta list must be initialised as well; without it the bounds
    # check in xmeta_get compares undef with undef and lets the read through.
    $self->xmeta_reset;
    $self->robot_reset;
    $self->topic_reset;

    return $self;
}

# Defined (and empty) so that object destruction is not routed through
# AUTOLOAD.
sub DESTROY {
    # print "an XWI object is destroyed" if Combine::Config::GetLoglev() > 2;
}

# Generic accessor for single-valued record variables.
#   $xwi->name($value)  stores $value under 'name' and returns undef
#   $xwi->name          returns the stored value
# NOTE: the setter is selected by the truth of $value, so a false value
# (0, '' or undef) is never stored; such a call behaves as a read.
sub AUTOLOAD {
    my ($self, $value) = @_;

    my $name = $Combine::XWI::AUTOLOAD;
    $name =~ s/.*://;    # strip the package qualifier

    if ($value) {
        $self->{$name} = $value;
        return undef;
    }
    return $self->{$name};
}

#---------------------------------------------------------------------
# url list: one field per entry (the URL string)
#---------------------------------------------------------------------

# Removes the first entry equal to $url.
# Returns undef if $url is not stored, otherwise the entry count as it was
# before the removal.
sub url_remove {
    my ($self, $url) = @_;
    my $count = $self->{'url_count'};

    # Locate the first matching entry.
    my $index = 1;
    while ( $index <= $count and $self->{"url_$index"} ne $url ) {
        $index++;
    }
    return undef if $index > $count;

    # Close the gap by moving every later entry one slot down ...
    for my $i ( $index .. $count - 1 ) {
        $self->{"url_$i"} = $self->{ 'url_' . ( $i + 1 ) };
    }
    # ... and drop the now duplicated last slot.
    delete $self->{"url_$count"};

    # An entry in front of the read cursor disappeared: step the cursor
    # back so that the next url_get does not skip an entry.
    $self->{'url_point'}-- if $index < $self->{'url_point'};

    return $self->{'url_count'}--;
}

# Forgets all URLs.
sub url_reset {
    my ($self) = @_;
    $self->{'url_point'} = 1;
    $self->{'url_count'} = 0;
}

# Makes the next url_get start from the first URL.
sub url_rewind {
    my ($self) = @_;
    $self->{'url_point'} = 1;
}

# Appends $url; returns the new number of URLs.
sub url_add {
    my ($self, $url) = @_;
    my $point = ++$self->{'url_count'};
    $self->{"url_$point"} = $url;
    return $self->{'url_count'};
}

# Returns the URL at the cursor and advances the cursor.
sub url_get {
    my ($self) = @_;
    my $point = $self->{'url_point'};
    return undef unless $point <= $self->{'url_count'};
    $self->{'url_point'}++;
    return $self->{"url_$point"};
}

#---------------------------------------------------------------------
# meta list: fields name, content
#---------------------------------------------------------------------

# Forgets all meta entries.
sub meta_reset {
    my ($self) = @_;
    $self->{'meta_point'} = 1;
    $self->{'meta_count'} = 0;
}

# Makes the next meta_get start from the first entry.
sub meta_rewind {
    my ($self) = @_;
    $self->{'meta_point'} = 1;
}

# Appends a (name, content) pair; the content is stored with HTML entities
# decoded.  Returns the new number of meta entries.
sub meta_add {
    my ($self, $meta_name, $meta_content) = @_;
    my $point = ++$self->{'meta_count'};

    $self->{"meta_${point}_name"}    = $meta_name;
    $self->{"meta_${point}_content"} =
        HTML::Entities::decode_entities($meta_content);

    # special for robots meta-tag: also kept (as passed in) under the
    # single-valued key 'metarobots'
    if ( $meta_name eq "robots" ) {
        $self->{metarobots} = $meta_content;
    }

    return $self->{'meta_count'};
}

# Returns (name, content) of the entry at the cursor and advances it.
sub meta_get {
    my ($self) = @_;
    my $point = $self->{'meta_point'};
    return undef unless $point <= $self->{'meta_count'};
    $self->{'meta_point'}++;

    return (
        $self->{"meta_${point}_name"},
        $self->{"meta_${point}_content"},
    );
}

#---------------------------------------------------------------------
# robot list: fields name, content
#---------------------------------------------------------------------

# Forgets all robot entries.
sub robot_reset {
    my ($self) = @_;
    $self->{'robot_point'} = 1;
    $self->{'robot_count'} = 0;
}

# Makes the next robot_get start from the first entry.
sub robot_rewind {
    my ($self) = @_;
    $self->{'robot_point'} = 1;
}

# Appends a (name, content) pair; returns the new number of robot entries.
sub robot_add {
    my ($self, $robot_name, $robot_content) = @_;
    my $point = ++$self->{'robot_count'};

    $self->{"robot_${point}_name"}    = $robot_name;
    $self->{"robot_${point}_content"} = $robot_content;

    return $self->{'robot_count'};
}

# Returns (name, content) of the entry at the cursor and advances it.
sub robot_get {
    my ($self) = @_;
    my $point = $self->{'robot_point'};
    return undef unless $point <= $self->{'robot_count'};
    $self->{'robot_point'}++;

    return (
        $self->{"robot_${point}_name"},
        $self->{"robot_${point}_content"},
    );
}

#---------------------------------------------------------------------
# topic list: fields cls, absscore, relscore, terms, algorithm
#---------------------------------------------------------------------

# Forgets all topic entries.
sub topic_reset {
    my ($self) = @_;
    $self->{'topic_point'} = 1;
    $self->{'topic_count'} = 0;
}

# Makes the next topic_get start from the first entry.
sub topic_rewind {
    my ($self) = @_;
    $self->{'topic_point'} = 1;
}

# Appends one classification result; returns the new number of entries.
sub topic_add {
    my ($self, $topic_cls, $topic_absscore, $topic_relscore, $terms,
        $algorithm) = @_;
    my $point = ++$self->{'topic_count'};

    $self->{"topic_${point}_cls"}       = $topic_cls;
    $self->{"topic_${point}_absscore"}  = $topic_absscore;
    $self->{"topic_${point}_relscore"}  = $topic_relscore;
    $self->{"topic_${point}_terms"}     = $terms;
    $self->{"topic_${point}_algorithm"} = $algorithm;

    return $self->{'topic_count'};
}

# Returns (cls, absscore, relscore, terms, algorithm) of the entry at the
# cursor and advances it.
sub topic_get {
    my ($self) = @_;
    my $point = $self->{'topic_point'};
    return undef unless $point <= $self->{'topic_count'};
    $self->{'topic_point'}++;

    return (
        $self->{"topic_${point}_cls"},
        $self->{"topic_${point}_absscore"},
        $self->{"topic_${point}_relscore"},
        $self->{"topic_${point}_terms"},
        $self->{"topic_${point}_algorithm"},
    );
}

#---------------------------------------------------------------------
# xmeta list: fields name, content, scheme, lang, group
#---------------------------------------------------------------------

# Forgets all xmeta entries.
sub xmeta_reset {
    my ($self) = @_;
    $self->{'xmeta_point'} = 1;
    $self->{'xmeta_count'} = 0;
}

# Makes the next xmeta_get start from the first entry.
sub xmeta_rewind {
    my ($self) = @_;
    $self->{'xmeta_point'} = 1;
}

# Appends one extended meta entry; returns the new number of entries.
sub xmeta_add {
    my ($self, $meta_name, $meta_content, $meta_scheme, $meta_lang,
        $meta_group) = @_;
    my $point = ++$self->{'xmeta_count'};

    $self->{"xmeta_${point}_name"}    = $meta_name;
    $self->{"xmeta_${point}_content"} = $meta_content;
    $self->{"xmeta_${point}_scheme"}  = $meta_scheme;
    $self->{"xmeta_${point}_lang"}    = $meta_lang;
    $self->{"xmeta_${point}_group"}   = $meta_group;

    return $self->{'xmeta_count'};
}

# Returns (name, content, scheme, lang, group) of the entry at the cursor
# and advances it.
sub xmeta_get {
    my ($self) = @_;
    my $point = $self->{'xmeta_point'};
    return undef unless $point <= $self->{'xmeta_count'};
    $self->{'xmeta_point'}++;

    # Name and content are read from the xmeta_* keys written by
    # xmeta_add (not from the unrelated meta_* list).
    return (
        $self->{"xmeta_${point}_name"},
        $self->{"xmeta_${point}_content"},
        $self->{"xmeta_${point}_scheme"},
        $self->{"xmeta_${point}_lang"},
        $self->{"xmeta_${point}_group"},
    );
}

#---------------------------------------------------------------------
# heading list: one field per entry (the heading text)
#---------------------------------------------------------------------

# Forgets all headings.
sub heading_reset {
    my ($self) = @_;
    $self->{'heading_point'} = 1;
    $self->{'heading_count'} = 0;
}

# Makes the next heading_get start from the first heading.
sub heading_rewind {
    my ($self) = @_;
    $self->{'heading_point'} = 1;
}

# Appends $heading with HTML entities decoded; returns the new number of
# headings.
sub heading_add {
    my ($self, $heading) = @_;
    my $point = ++$self->{'heading_count'};
    $self->{"heading_$point"} = HTML::Entities::decode_entities($heading);
    return $self->{'heading_count'};
}

# Returns the heading at the cursor and advances the cursor.
sub heading_get {
    my ($self) = @_;
    my $point = $self->{'heading_point'};
    return undef unless $point <= $self->{'heading_count'};
    $self->{'heading_point'}++;
    return $self->{"heading_$point"};
}

#---------------------------------------------------------------------
# link list: fields urlstr, netlocid, urlid, text, type
#---------------------------------------------------------------------

# Forgets all links.
sub link_reset {
    my ($self) = @_;
    $self->{'link_point'} = 1;
    $self->{'link_count'} = 0;
}

# Makes the next link_get start from the first link.
sub link_rewind {
    my ($self) = @_;
    $self->{'link_point'} = 1;
}

# Appends one link; the anchor text is stored with HTML entities decoded.
# Returns the new number of links.
sub link_add {
    my ($self, $link_urlstr, $link_netlocid, $link_urlid, $link_text,
        $link_type) = @_;
    my $point = ++$self->{'link_count'};

    $self->{"link_${point}_text"}     =
        HTML::Entities::decode_entities($link_text);
    $self->{"link_${point}_urlstr"}   = $link_urlstr;
    $self->{"link_${point}_netlocid"} = $link_netlocid;
    $self->{"link_${point}_urlid"}    = $link_urlid;
    $self->{"link_${point}_type"}     = $link_type;

    return $self->{'link_count'};
}

# Returns (urlstr, netlocid, urlid, text, type) of the link at the cursor
# and advances it.
sub link_get {
    my ($self) = @_;
    my $point = $self->{'link_point'};
    return undef unless $point <= $self->{'link_count'};
    $self->{'link_point'}++;

    return (
        $self->{"link_${point}_urlstr"},
        $self->{"link_${point}_netlocid"},
        $self->{"link_${point}_urlid"},
        $self->{"link_${point}_text"},
        $self->{"link_${point}_type"},
    );
}

1;
__END__

=head1 NAME

XWI.pm - class for internal representation of a document record

=head1 SYNOPSIS

    use Combine::XWI;
    $xwi = Combine::XWI->new;

    #single value record variables
    $xwi->server($server);

    my $server = $xwi->server();

    #original content
    $xwi->content(\$html);

    my $text = ${$xwi->content()};

    #multiple value record variables
    $xwi->meta_add($name1,$value1);
    $xwi->meta_add($name2,$value2);

    $xwi->meta_rewind;
    my ($name,$content);
    while (1) {
        ($name,$content) = $xwi->meta_get;
        last unless $name;
    }

=head1 DESCRIPTION

Provides methods for storing and retrieving structured records
representing crawled documents.

=head1 METHODS

=head2 new()

Creates an empty record with all multiple value lists reset.

=head2 XXX($val)

Saves $val using AUTOLOAD. Can later be retrieved, eg

    $xwi->MyVar('My value');
    $t = $xwi->MyVar;

will set $t to 'My value'

Note that a false value (0, '' or undef) is not stored; such a call
just returns the current value.

=head2 *_reset()

Forget all values.

=head2 *_rewind()

*_get will start with the first value.

=head2 *_add

stores values into the datastructure

=head2 *_get

retrieves values from the datastructure. Returns undef when there are
no more values.

=head2 meta_reset() / meta_rewind() / meta_add() / meta_get()

Stores the content of Meta-tags

Takes/Returns 2 parameters: Name, Content

    $xwi->meta_add($name1,$value1);
    $xwi->meta_add($name2,$value2);

    $xwi->meta_rewind;
    my ($name,$content);
    while (1) {
        ($name,$content) = $xwi->meta_get;
        last unless $name;
    }

=head2 xmeta_reset() / xmeta_rewind() / xmeta_add() / xmeta_get()

Extended information from Meta-tags. Not used.

=head2 url_remove() / url_reset() / url_rewind() / url_add() / url_get()

Stores all URLs (i.e. if multiple URLs for the same page) for this record

Takes/Returns 1 parameter: URL

=head2 heading_reset() / heading_rewind() / heading_add() / heading_get()

Stores headings from HTML documents

Takes/Returns 1 parameter: Heading text

=head2 link_reset() / link_rewind() / link_add() / link_get()

Stores links from documents

Takes/Returns 5 parameters: URL, netlocid, urlid, Anchor text, Link type

=head2 robot_reset() / robot_rewind() / robot_add() / robot_get()

Stores calculated information, like genre, language, etc

Takes/Returns 2 parameters: Name, Value. Both are strings with max length
Name: 15, Value: 20

=head2 topic_reset() / topic_rewind() / topic_add() / topic_get()

Stores result of topic classification.

Takes/Returns 5 parameters: Class, Absolute score, Normalized score, Terms,
Algorithm id

Class, Terms, and Algorithm id are strings with max lengths Class: 50, and
Algorithm id: 25

Absolute score, and Normalized score are integers

Normalized score and Terms are optional and may be replaced with 0, and ''
respectively

=head1 SEE ALSO

Combine focused crawler main site L<http://combine.it.lth.se/>

=head1 AUTHOR

Yong Cao v0.05 1997-03-13

Anders Ardö, E<lt>anders.ardo@it.lth.seE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at
L<http://combine.it.lth.se/>

=cut