A APPENDIX

A.1 Simple installation test

The following simple script is available in the doc/InstallationTest.pl file. It must be run as 'root' and tests that the basic functions of the Combine installation work.

Basically it creates and initializes a new jobname, crawls one specific test page and exports it as XML. This XML is then compared to a known-good XML record for that page.
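Since the script uses require './t/defs.pm', it is assumed here to be run from the top directory of the unpacked source distribution (so that ./t/defs.pm can be found), for example:

  sudo perl doc/InstallationTest.pl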

A.1.1 InstallationTest.pl
use strict;  
if ( $> != 0 ) {  
    die("You have to run this test as root");  
}  
 
my $orec='';  
while (<DATA>) { chop; $orec .= $_; }  
 
$orec =~ s|<checkedDate>.*</checkedDate>||;  
$orec =~ tr/\n\t //d;  
 
my $olen=length($orec);  
my $onodes=0;  
while ( $orec =~ m/</g ) { $onodes++; }  
print "ORIG Nodes=$onodes; Len=$olen\n";  
 
our $jobname;  
require './t/defs.pm';  
 
system("combineINIT --jobname $jobname --topic /etc/combine/Topic_carnivor.txt > /dev/null");  
 
system("combine --jobname $jobname --harvest http://combine.it.lth.se/CombineTests/InstallationTest.html");  
open(REC,"combineExport --jobname $jobname |");  
my $rec='';  
while (<REC>) { chop; $rec .= $_; }  
close(REC);  
$rec =~ s|<checkedDate>.*</checkedDate>||;  
$rec =~ tr/\n\t //d;  
 
my $len=length($rec);  
my $nodes=0;  
while ( $rec =~ m/</g ) { $nodes++; }  
print "NEW Nodes=$nodes; Len=$len\n";  
 
my $OK=0;  
 
if ($onodes == $nodes) { print "Number of XML nodes match\n"; }  
else { print "Number of XML nodes does NOT match\n"; $OK=1; }  
if ($olen == $len) {  
  print "Size of XML match\n";  
} else {  
  $orec =~  s|<originalDocument.*</originalDocument>||s;  
  $rec =~  s|<originalDocument.*</originalDocument>||s;  
  if (length($orec) == length($rec)) { print "Size of XML match (after removal of 'originalDocument')\n";}  
  else { print "Size of XML does NOT match\n"; $OK=1; }  
}  
 
if (($OK == 0) && ($orec eq $rec)) { print "All tests OK\n"; }  
else { print "There might be some problem with your Combine Installation\n"; }  
 
__END__  
<?xml version="1.0" encoding="UTF-8"?>  
<documentCollection version="1.1" xmlns="http://alvis.info/enriched/">  
<documentRecord id="80AC707F96BC57DFEF78C815F6FABD57">  
<acquisition>  
<acquisitionData>  
<modifiedDate>2006-12-05 13:20:25</modifiedDate>  
<checkedDate>2006-10-03 9:06:42</checkedDate>  
<httpServer>Apache/1.3.29 (Debian GNU/Linux) PHP/4.3.3</httpServer>  
<urls>  
    <url>http://combine.it.lth.se/CombineTests/InstallationTest.html</url>  
  </urls>  
</acquisitionData>  
<originalDocument mimeType="text/html" compression="gzip" encoding="base64" charSet="UTF-8">  
H4sIAAAAAAAAA4WQsU7DMBCG9zzF4bmpBV2QcDKQVKJSKR2CEKObXBSrjm3sSyFvT0yCQGJgusG/  
//u+E1flU1G9HrfwUD3u4fh8v98VwFLOXzYF52VVzg+b9Q3n2wPLE9FRr+NA2UyDFGnMdyaQ1FqS  
sgYIA0FrPRS2PymDgs+hRPRIEozsMWNnHN+tbwKD2hpCQxkrpDfqYr0dAjgtDYUVlN4G9HIFB3RT  
qMPAvns6Ipfi26Au09e5I61Gh78aCT+IR947qDvpA1I2UJvexg6+CJxsM0ad6/8kpkQiXB5XSWUC  
BNsj/GGG4LBWrarhSw+0OiOIidZjmzGPeh15WL6ICS7zFUjT/AiuBXeRbwHj870/AeRYaTupAQAA  
</originalDocument>  
<canonicalDocument>  
  <section>  
    <section title="Installation test for Combine">  
      <section>Installation test for Combine</section>  
      <section>Contains some Carnivorous plant specific words like <ulink url="rel.html">Drosera </ulink>, and Nepenthes.</section></section></section></canonicalDocument>  
<metaData>  
    <meta name="title">Installation test for Combine</meta>  
    <meta name="dc:format">text/html</meta>  
    <meta name="dc:format">text/html; charset=iso-8859-1</meta>  
    <meta name="dc:subject">Carnivorous plants</meta>  
    <meta name="dc:subject">Drosera</meta>  
    <meta name="dc:subject">Nepenthes</meta>  
  </metaData>  
<links>  
    <outlinks>  
      <link type="a">  
        <anchorText>Drosera</anchorText>  
        <location>http://combine.it.lth.se/CombineTests/rel.html</location>  
      </link>  
    </outlinks>  
  </links>  
<analysis>  
<property name="topLevelDomain">se</property>  
<property name="univ">1</property>  
<property name="language">en</property>  
<topic absoluteScore="1000" relativeScore="110526">  
    <class>ALL</class>  
  </topic>  
<topic absoluteScore="375" relativeScore="41447">  
    <class>CP.Drosera</class>  
    <terms>drosera</terms>  
  </topic>  
<topic absoluteScore="375" relativeScore="41447">  
    <class>CP.Nepenthes</class>  
    <terms>nepenthe</terms>  
  </topic>  
<topic absoluteScore="250" relativeScore="27632">  
    <class>CP</class>  
    <terms>carnivorous plant</terms>  
    <terms>carnivor</terms>  
  </topic>  
</analysis>  
</acquisition>  
</documentRecord>  
 
</documentCollection>

A.2 Example topic filter plug-in

This example gives more details on how to write a topic filter Plug-In.

A.2.1 classifyPlugInTemplate.pm
#Template for writing a classify PlugIn for Combine  
#See documentation at http://combine.it.lth.se/documentation/  
 
package classifyPlugInTemplate; #Change to your own module name  
 
use Combine::XWI; #Mandatory  
use Combine::Config; #Optional if you want to use the Combine configuration system  
 
#API:  
#  a subroutine named 'classify' taking an XWI object as input parameter  
#    return values: 0/1  
#        0: record fails to meet the classification criteria, ie ignore this record  
#        1: record is OK and should be stored in the database, and links followed by the crawler  
sub classify {  
  my ($self,$xwi) = @_;  
 
  #utility routines to extract information from the XWI-object  
  #URL (can be several):  
   # $xwi->url_rewind;  
   # my $url_str="";  
   # my $t;  
   # while ($t = $xwi->url_get) { $url_str .= $t . ", "; }  
 
  #Metadata:  
   #  $xwi->meta_rewind;  
   #  my ($name,$content);  
   #  while (1) {  
   #    ($name,$content) = $xwi->meta_get;  
   #    last unless $name;  
   #    next if ($name eq 'Rsummary');  
   #    next if ($name =~ /^autoclass/);  
   #    $meta .= $content . " ";  
   #  }  
 
  #Title:  
   #  $title = $xwi->title;  
 
  #Headings:  
   #  $xwi->heading_rewind;  
   #  my $this;  
   #  while (1) {  
   #    $this = $xwi->heading_get or last;  
   #    $head .= $this . " ";  
   #  }  
 
  #Text:  
   #  $this = $xwi->text;  
   #  if ($this) {  
   #    $text = $$this;  
   #  }  
 
###############################  
#Apply your classification algorithm here  
#  assign $result a value (0/1)  
###############################  
 
  #utility routines for saving detailed results (optional) in the database. These data may appear  
  # in exported XML-records  
 
  #Topic takes 5 parameters  
  # $xwi->topic_add(topic_class_notation, topic_absolute_score, topic_normalized_score, topic_terms, algorithm_id);  
  #  topic_class_notation, topic_terms, and algorithm_id are strings  
  #    max length topic_class_notation: 50, algorithm_id: 25  
  #  topic_absolute_score, and topic_normalized_score are integers  
  #  topic_normalized_score and topic_terms are optional and may be replaced with 0, '' respectively  
 
  #Analysis takes 2 parameters  
  # $xwi->robot_add(name,value);  
  # both are strings with max length name: 15, value: 20  
 
    # return true (1) if you want to keep the record  
    # otherwise return false (0)  
 
  return $result;  
}  
 
1;
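
As an illustration, here is a minimal plug-in in the spirit of the template above. It is only a sketch: the module name, the word list and the scoring are invented for this example. It keeps a record if the title or text contains any of a few carnivorous-plant terms and stores the result with topic_add.

#Minimal example plug-in (sketch only; module name and word list are made up)
package classifyPlugInExample;

use strict;
use Combine::XWI; #Mandatory

sub classify {
  my ($self,$xwi) = @_;

  #Collect title and plain text from the XWI object
  my $text = '';
  $text .= $xwi->title . ' ' if $xwi->title;
  my $this = $xwi->text;
  $text .= $$this if $this;

  #Count occurrences of a few example topic terms
  my $score = 0;
  foreach my $term ('drosera', 'nepenthes', 'carnivorous plant') {
    $score++ while ($text =~ /$term/gi);
  }

  if ($score > 0) {
    #Save details (optional); may appear in exported XML records
    $xwi->topic_add('CP.example', $score, 0, '', 'examplePlugIn');
    return 1; #keep the record
  }
  return 0; #ignore the record
}

1;

To activate such a plug-in, point the configuration variable classifyPlugIn at the module and enable topic classification (doCheckRecord), as described in A.3.1.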

A.3 Default configuration files

A.3.1 Global
#@#Default configuration values Combine system  
 
#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)  
#@#ZebraHost = NoDefaultValue  
ZebraHost =  
 
#Direct connection to Solr indexing  
#@#SolrHost = NoDefaultValue  
SolrHost =  
 
#Enable(1)/disable(0) fulltext-index in MySQL table search  
MySQLfulltext = 0  
 
#Use a proxy server if this is defined (default no proxy)  
#@#httpProxy = NoDefaultValue  
httpProxy =  
 
#Enable(1)/disable(0) automatic recycling of new links  
AutoRecycleLinks = 1  
 
#User agent handles redirects (1) or treat redirects as new links (0)  
UserAgentFollowRedirects = 0  
 
#Number of pages to process before restarting the harvester  
HarvesterMaxMissions = 500  
 
#Logging level (0 (least) - 10 (most))  
Loglev = 0  
 
#Enable(1)/disable(0) analysis of genre, language  
doAnalyse = 1  
analysePlugin =  
relTextPlugin =  
 
#How long the summary should be. Use 0 to disable the summarization code  
SummaryLength   = 0  
 
#Store(1)/do not store(0) the raw HTML in the database  
saveHTML = 1  
 
#Use(1)/do not use(0) Tidy to clean the HTML before parsing it  
useTidy = 0  
 
#Use(1)/do not use(0) OAI record status keeping in SQL database  
doOAI = 1  
 
#Extract(1)/do not extract(0) links from plain text  
extractLinksFromText = 1  
 
#Enable(1)/disable(0) topic classification (focused crawling)  
#Generated by combineINIT based on --topic parameter  
doCheckRecord = 0  
 
#Which topic classification PlugIn module algorithm to use  
#Combine::Check_record and Combine::PosCheck_record included by default  
#NEW SVM classifier: Combine::classifySVM  
#see classifyPlugInTemplate.pm and documentation to write your own  
classifyPlugIn = Combine::Check_record  
 
#Filename for the SVM model  
#@#SVMmodel = NoDefaultValue  
SVMmodel =  
 
###Parameters for Std topic classification algorithm  
###StdTitleWeight = 10 #  
###StdMetaWeight = 4 #  
###StdHeadingsWeight = 2 #  
###StdCutoffRel = 10 #Class score must be above this % to be counted  
###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score  
###StdCutoffTot = 90 #non normalised cutoff for summed total score  
 
###Parameters for Pos topic classification algorithm  
###PosCutoffRel = 1 #Class score must be above this % to be counted  
###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score  
###PosCutoffTot = 1 #non normalised cutoff for summed total score  
 
HarvestRetries                  = 5  
SdqRetries                      = 5  
 
#Maximum length of a URL; longer will be silently discarded  
maxUrlLength = 250  
 
#Time in seconds to wait for a server to respond  
UAtimeout = 30  
 
#If we have seen this page before use Get-If-Modified (1) or not (0)  
UserAgentGetIfModifiedSince = 1  
 
WaitIntervalExpirationGuaranteed = 315360000  
WaitIntervalHarvesterLockNotFound = 2592000  
WaitIntervalHarvesterLockNotModified = 2592000  
WaitIntervalHarvesterLockRobotRules = 2592000  
WaitIntervalHarvesterLockUnavailable = 86400  
WaitIntervalRrdLockDefault = 86400  
WaitIntervalRrdLockNotFound = 345600  
WaitIntervalRrdLockSuccess = 345600  
 
#Time in seconds after successful download before allowing a page to be downloaded again (around 11 days)  
WaitIntervalHarvesterLockSuccess = 1000000  
 
#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready queue  
WaitIntervalSchedulerGetJcf = 20  
 
#Minimum time between accesses to the same host. Must be positive  
WaitIntervalHost = 60  
 
#URL scheduling algorithm  
SchedulingAlgorithm = default  
 
#Identifies MySQL database name, user and host  
MySQLdatabase   = NoDefaultValue  
 
#Base directory for configuration files; initialized by Config.pm  
#@#baseConfigDir = /etc/combine  
 
#Directory for job specific configuration files; taken from 'jobname'  
#@#configDir = NoDefaultValue  
 
<binext>  
#Extensions of binary files  
arff  
au  
avi  
class  
exe  
fig  
gif  
gz  
hqx  
ica  
jpeg  
jpg  
mat  
mdb  
mov  
mp3  
mpeg  
mpg  
msi  
pcx  
pdb  
psd  
ram  
rar  
raw  
rmd  
rmx  
sav  
sdd  
shar  
tar  
tga  
tgz  
tif  
tiff  
vo  
wav  
wmv  
wmz  
xbm  
xpm  
z  
zip  
</binext>  
 
<converters>  
#Configure which converters can be used to produce a XWI object  
#Format:  
#  1 line per entry  
#  each entry consists of 3 ';'-separated fields  
#  
#Entries are processed in order and the first match is executed  
#  external converters have to be found via PATH and executable to be considered a match  
#  the external converter command should take a filename as parameter and convert that file  
#   the result should come on STDOUT  
#  
# mime-type   ;   External converter command ; Internal converter  
 
text/html ; ; GuessHTML  
#Check this  
www/unknown ; ; GuessHTML  
text/plain ; ; GuessText  
text/x-tex ;  tth -g -w1 -r <  ; TeXHTML  
application/x-tex ;  tth -g -w1 -r < ; TeXHTML  
text/x-tex ; untex -a -e -giso ; TeXText  
application/x-tex ; untex -a -e -giso ; TeXText  
text/x-tex ;  ; TeX  
application/x-tex ; ; TeX  
application/pdf ; pdftohtml -i -noframes -nomerge -nodrm -stdout ; HTML  
application/pdf ; pstotext ; Text  
application/postscript ; pstotext ; Text  
application/msword ; antiword -t ; Text  
application/vnd.ms-excel ; xlhtml -fw ; HTML  
application/vnd.ms-powerpoint ; ppthtml ; HTML  
application/rtf ; unrtf --nopict --html ; HTML  
image/gif ; ; Image  
image/jpeg ; ; Image  
image/tiff ; ; Image  
</converters>  
 
<url>  
  <exclude>  
    #Exclude URLs or hostnames that match these regular expressions  
    #Malformed hostnames  
    HOST: http:\/\/\.  
    HOST: \@  
  </exclude>  
</url>

A.3.2 Job specific
#Please change  
Operator-Email      = "YourEmailAdress@YourDomain"  
 
#Password not used yet. (Please change)  
Password    = "XxXxyYzZ"  
 
<converters>  
#Configure which converters can be used to produce a XWI object  
#Format:  
#  1 line per entry  
#  each entry consists of 3 ';'-separated fields  
#  
#Entries are processed in order and the first match is executed  
#  external converters have to be found via PATH and executable to be considered a match  
#  the external converter command should take a filename as parameter and convert that file  
#   the result should come on STDOUT  
#  
# mime-type   ;   External converter command ; Internal converter  
 
application/pdf ; MYpdftohtml -i -noframes -nomerge -stdout ; HTML  
</converters>  
 
<url>  
#Server names that are aliases are listed in the file ./config_serveralias  
#    (automatically updated by other programs)  
#use one server per line  
#example  
#www.100topwetland.com  www.100wetland.com  
#  means that www.100wetland.com is replaced by www.100topwetland.com during URL normalization  
<serveralias>  
<<include config_serveralias>>  
</serveralias>  
 
#use either URL or HOST: (note the ':') to match regular expressions to  
# either the full URL or the HOST part of a URL.  
<allow>  
#Allow crawl of URLs or hostnames that match these regular expressions  
HOST: .*$  
</allow>  
 
<exclude>  
#Exclude URLs or hostnames that match these regular expressions  
# default: CGI and maps  
URL cgi-bin|htbin|cgi|\?|\.map$|_vti_  
 
# default: binary files  
URL \.exe$|\.zip$|\.tar$|\.tgz$|\.gz$|\.hqx$|\.sdd$|\.mat$|\.raw$  
URL \.EXE$|\.ZIP$|\.TAR$|\.TGZ$|\.GZ$|\.HQX$|\.SDD$|\.MAT$|\.RAW$  
 
# default: Unparsable documents  
URL \.shar$|\.rmx$|\.rmd$|\.mdb$|\.sav$  
URL \.SHAR$|\.RMX$|\.RMD$|\.MDB$|\.SAV$  
 
# default: images  
URL \.gif$|\.jpg$|\.jpeg$|\.xpm$|\.tif$|\.tiff$|\.mpg$|\.mpeg$|\.mov$|\.wav$|\.au$|\.pcx$|\.xbm$|\.tga$|\.psd$  
URL \.GIF$|\.JPG$|\.JPEG$|\.XPM$|\.TIF$|\.TIFF$|\.MPG$|\.MPEG$|\.MOV$|\.WAV$|\.AU$|\.PCX$|\.XBM$|\.TGA$|\.PSD$  
 
# default: other binary formats  
URL \.pdb$|\.class$|\.ica$|\.ram$|\.wmz$|\.arff$|\.rar$|\.vo$|\.fig$|\.mp3$|\.wmv$|\.avi$|\.msi$  
URL \.PDB$|\.CLASS$|\.ICA$|\.RAM$|\.WMZ$|\.ARFF$|\.RAR$|\.VO$|\.FIG$|\.MP3$|\.WMV$|\.AVI$|\.MSI$  
 
#more excludes in the file config_exclude (automatically updated by other programs)  
<<include config_exclude>>  
</exclude>  
<sessionids>  
#patterns to recognize and remove sessionids in URLs  
sessionid  
lsessionid  
jsessionid  
SID  
PHPSESSID  
SessionID  
BV_SessionID  
</sessionids>  
#url is just a container for all URL-related configuration patterns  
</url>

A.4 SQL database

A.4.1 Create database

DROP DATABASE IF EXISTS $database;
CREATE DATABASE $database DEFAULT CHARACTER SET utf8;
USE $database;

A.4.2 Creating MySQL tables

All tables use UTF-8

Summary of tables ('^'=primary key, '*'=key):
TABLE hdb: recordid^, type, dates, server, title, ip, ...
TABLE links: recordid*, mynetlocid*, urlid*, netlocid*, linktype, anchor  (netlocid for urlid!!)
TABLE meta: recordid*,  name, value
TABLE html: recordid^, html
TABLE analys: recordid*, name, value
TABLE topic: recordid*, notation*, absscore, relscore, terms, algorithm
TABLE localtags: netlocid, urlid, name, value
TABLE search: recordid^, stext*

(TABLE netlocalias: netlocid*, netlocstr^)
(TABLE urlalias: urlid*, urlstr^)
TABLE topichierarchy: node^, father*, notation*, caption, level
TABLE netlocs: netlocid^, netlocstr^, retries
TABLE urls: netlocid*, urlid^, urlstr^, path
TABLE urldb: netlocid*, urlid^, urllock, harvest*, retries, netloclock
TABLE newlinks: urlid^, netlocid
TABLE recordurl: recordid*, urlid^, lastchecked, md5*, fingerprint*^
TABLE admin: status, queid, schedulealgorithm
TABLE log: pid, id, date, message
TABLE que: queid^, urlid, netlocid
TABLE robotrules: netlocid*, rule, expire
TABLE oai: recordid, md5^, date*, status
TABLE exports: host, port, last
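
The tables are linked by recordid and urlid. As a sketch of how they can be queried directly (the database name, user and password below are placeholders; the real values come from the job configuration and MySQLdatabase), the following Perl snippet prints the title and URL of every stored record:

use strict;
use DBI;

#Placeholder connection data; the real values come from the job configuration
my $dbh = DBI->connect('DBI:mysql:database=MyJobDB;host=localhost',
                       'MyDbUser', 'MyPassword', { RaiseError => 1 });

#hdb holds one row per record, recordurl maps records to urlids,
#urls holds the URL strings
my $sth = $dbh->prepare(q{
    SELECT hdb.recordid, hdb.title, urls.urlstr
      FROM hdb, recordurl, urls
     WHERE hdb.recordid = recordurl.recordid
       AND recordurl.urlid = urls.urlid
});
$sth->execute;
while (my ($recordid, $title, $url) = $sth->fetchrow_array) {
    print "$recordid\t$title\t$url\n";
}
$dbh->disconnect;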

A.4.3 Data tables
CREATE TABLE hdb (  
  recordid int(11) NOT NULL default '0',  
  type varchar(50) default NULL,  
  title text,  
  mdate timestamp NOT NULL,  
  expiredate datetime default NULL,  
  length int(11) default NULL,  
  server varchar(50) default NULL,  
  etag varchar(25) default NULL,  
  nheadings int(11) default NULL,  
  nlinks int(11) default NULL,  
  headings mediumtext,  
  ip mediumblob,  
  PRIMARY KEY  (recordid)  
) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;

CREATE TABLE html (  
  recordid int(11) NOT NULL default '0',  
  html mediumblob,  
  PRIMARY KEY  (recordid)  
) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;

CREATE TABLE links (  
  recordid int(11) NOT NULL default '0',  
  mynetlocid int(11) default NULL,  
  urlid int(11) default NULL,  
  netlocid int(11) default NULL,  
  anchor text,  
  linktype varchar(50) default NULL,  
  KEY recordid (recordid),  
  KEY urlid (urlid),  
  KEY mynetlocid (mynetlocid),  
  KEY netlocid (netlocid)  
) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;

CREATE TABLE meta (  
  recordid int(11) NOT NULL default '0',  
  name varchar(50) default NULL,  
  value text,  
  KEY recordid (recordid)  
) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;

CREATE TABLE analys (  
  recordid int(11) NOT NULL default '0',  
  name varchar(100) NOT NULL,  
  value varchar(100),  
  KEY recordid (recordid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE topic (  
  recordid int(11) NOT NULL default '0',  
  notation varchar(50) default NULL,  
  abscore int(11) default NULL,  
  relscore int(11) default NULL,  
  terms text default NULL,  
  algorithm varchar(25),  
  KEY notation (notation),  
  KEY recordid (recordid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE localtags (  
  netlocid int(11) NOT NULL DEFAULT '0',  
  urlid int(11) NOT NULL DEFAULT '0',  
  name varchar(100) NOT NULL,  
  value varchar(100) NOT NULL,  
  PRIMARY KEY tag (netlocid,urlid,name(100),value(100))  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE search (  
  recordid int(11) NOT NULL default '0',  
  stext mediumtext,  
  PRIMARY KEY (recordid),  
  FULLTEXT (stext)  
) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;

A.4.4 Administrative tables
CREATE TABLE netlocalias (  
  netlocid int(11),  
  netlocstr varchar(150) NOT NULL,  
  KEY netlocid (netlocid),  
  PRIMARY KEY netlocstr (netlocstr)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE urlalias (  
  urlid int(11),  
  urlstr tinytext,  
  KEY urlid (urlid),  
  PRIMARY KEY urlstr (urlstr(255))  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

The topichierarchy table has to be initialized manually.

CREATE TABLE topichierarchy (  
  node int(11) NOT NULL DEFAULT '0',  
  father int(11) DEFAULT NULL,  
  notation varchar(50) NOT NULL DEFAULT '',  
  caption varchar(255) DEFAULT NULL,  
  level int(11) DEFAULT NULL,  
  PRIMARY KEY node (node),  
  KEY father (father),  
  KEY notation (notation)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE netlocs (  
  netlocid int(11) NOT NULL auto_increment,  
  netlocstr varchar(150) NOT NULL,  
  retries int(11) NOT NULL DEFAULT 0,  
  PRIMARY KEY (netlocstr),  
  UNIQUE INDEX netlockid (netlocid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE urls (  
  netlocid int(11) NOT NULL DEFAULT '0',  
  urlid int(11) NOT NULL auto_increment,  
  urlstr tinytext,  
  path tinytext,  
  PRIMARY KEY urlstr (urlstr(255)),  
  INDEX netlocid (netlocid),  
  UNIQUE INDEX urlid (urlid)  
) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;

CREATE TABLE urldb (  
  netlocid int(11) NOT NULL default '0',  
  netloclock int(11) NOT NULL default '0',  
  urlid int(11) NOT NULL default '0',  
  urllock int(11) NOT NULL default '0',  
  harvest tinyint(1) NOT NULL default '0',  
  retries int(11) NOT NULL default '0',  
  score int(11) NOT NULL default '0',  
  PRIMARY KEY  (urlid),  
  KEY netlocid (netlocid),  
  KEY harvest (harvest)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE newlinks (  
  urlid int(11) NOT NULL,  
  netlocid int(11) NOT NULL,  
  PRIMARY KEY  (urlid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE recordurl (  
  recordid int(11) NOT NULL auto_increment,  
  urlid int(11) NOT NULL default '0',  
  lastchecked timestamp NOT NULL,  
  md5 char(32),  
  fingerprint char(50),  
  KEY md5 (md5),  
  KEY fingerprint (fingerprint),  
  PRIMARY KEY (urlid),  
  KEY recordid (recordid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE admin (  
  status enum('closed','open','paused','stopped') default NULL,  
  schedulealgorithm enum('default','bigdefault','advanced') default 'default',  
  queid int(11) NOT NULL default ’0’  
) ENGINE=MEMORY DEFAULT CHARACTER SET=utf8;

'advanced' means: use the config variable SchedulingAlgorithm.
Initialise admin to 'open' status:
INSERT INTO admin VALUES ('open','default',0);

CREATE TABLE log (  
  pid int(11) NOT NULL default '0',  
  id varchar(50) default NULL,  
  date timestamp NOT NULL,  
  message varchar(255) default NULL  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE que (  
  netlocid int(11) NOT NULL default '0',  
  urlid int(11) NOT NULL default '0',  
  queid int(11) NOT NULL auto_increment,  
  PRIMARY KEY  (queid)  
) ENGINE=MEMORY DEFAULT CHARACTER SET=utf8;

CREATE TABLE robotrules (  
  netlocid int(11) NOT NULL default '0',  
  expire int(11) NOT NULL default '0',  
  rule varchar(255) default '',  
  KEY netlocid (netlocid)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE oai (  
  recordid int(11) NOT NULL default '0',  
  md5 char(32),  
  date timestamp,  
  status enum('created', 'updated', 'deleted'),  
  PRIMARY KEY (md5),  
  KEY date (date)  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

CREATE TABLE exports (  
  host varchar(30),  
  port int,  
  last timestamp DEFAULT '1999-12-31'  
) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;

A.4.5 Create user dbuser with required privileges
GRANT SELECT,INSERT,UPDATE,DELETE,CREATE,CREATE TEMPORARY TABLES,  
   ALTER,LOCK TABLES ON $database.* TO $dbuser;

GRANT SELECT,INSERT,UPDATE,DELETE,CREATE,CREATE TEMPORARY TABLES,  
   ALTER,LOCK TABLES ON $database.* TO $dbuser\@localhost;

A.5 Manual pages

A.5.1 combineExport

NAME combineExport - export records in XML from Combine database

SYNOPSIS combineExport --jobname <name> [--profile alvis|dc|combine --charset utf8|isolatin --number <n> --recordid <n> --md5 <MD5> --incremental --xsltscript ...]

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

--profile

Three profiles: alvis, dc, and combine. alvis and combine are similar XML formats.

The 'alvis' profile format is defined by the Alvis enriched document format DTD. It uses the charset UTF-8 by default.

'combine' is more compact with less redundancy.

'dc' is XML-encoded Dublin Core data.

--charset

Selects a specific character set, either UTF-8 or iso-latin-1. Overrides the --profile setting.

--collapseinlinks

Skip inlinks with duplicate anchor-texts (ie just one inlink per unique anchor-text).

--nooutlinks

Do not include any outlinks in the exported records.

--ZebraIndex

ZebraIndex sends XML records directly to the Zebra server defined in the Combine configuration variable 'ZebraHost'. It uses the default Zebra configuration (profile=combine, nooutlinks, collapseinlinks) and is compatible with the direct Zebra indexing done during harvesting when 'ZebraHost' is defined in the Combine configuration. Requires that the Zebra server is running.

--SolrIndex

SolrIndex sends XML records directly to the Solr server defined in the Combine configuration variable 'SolrHost'. It uses the default Solr configuration (profile=combine, nooutlinks, collapseinlinks) and is compatible with the direct Solr indexing done during harvesting when 'SolrHost' is defined in the Combine configuration. Requires that the Solr server is running.

--xsltscript

Generates records in Combine native format and converts them using this XSLT script before output. See example scripts in /etc/combine/*.xsl

--number

The maximum number of records to be exported.

--recordid

Export just the one record with this recordid

--md5

Export just the one record with this MD5 checksum

--pipehost, --pipeport

Specifies the server-name and port to connect to and export data using the Alvis Pipeline. Exports incrementally, ie all changes since last call to combineExport with the same pipehost and pipeport.

--incremental

Exports incrementally, ie all changes since the last call to combineExport using --incremental.

DESCRIPTION

EXAMPLES

 Export all records in Alvis XML-format to the file recs.xml  
   combineExport --jobname atest > recs.xml

 Export 10 records to STDOUT  
   combineExport --jobname atest --number 10

 Export all records in UTF-8 using Combine native format  
   combineExport --jobname atest --profile combine --charset utf8 > Zebrarecs.xml

 Incremental export of all changes from last call using localhost at port 6234 using the  
 default profile (Alvis)  
   combineExport --jobname atest --pipehost localhost --pipeport 6234
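
 Export one record, identified by its MD5 checksum, as Dublin Core (the checksum value is only a placeholder)  
   combineExport --jobname atest --profile dc --md5 80AC707F96BC57DFEF78C815F6FABD57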

SEE ALSO Combine configuration documentation in /usr/share/doc/combine/.

Alvis XML schema (--profile alvis) at http://project.alvis.info/alvis_docs/enriched-document.xsd

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005 - 2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

 See the file LICENCE included in the distribution at  
 L<http://combine.it.lth.se/>

__________________________________________________________________________

A.5.2 combineCtrl

NAME combineCtrl - controls a Combine crawling job

SYNOPSIS combineCtrl <action> --jobname <name>

where action can be one of start, kill, load, recyclelinks, reharvest, stat, howmany, records, hosts, initMemoryTables, open, stop, pause, continue

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

Actions starting/killing crawlers

start

takes an optional switch --harvesters n, where n is the number of crawler processes to start

kill

kills all active crawlers (and their associated combineRun monitors) for jobname

Actions loading or recycling URLs for crawling

load

Reads a list of URLs from STDIN (one per line) and schedules them for crawling

recyclelinks

Schedules all newly found links (since the last invocation of recyclelinks) in crawled pages for crawling

reharvest

Schedules all pages in the database for crawling again (in order to check if they have changed)

Actions for controlling scheduling of URLs

open

opens the database for URL scheduling (e.g. after a stop)

stop

stops URL scheduling

pause

pauses URL scheduling

continue

continues URL scheduling after a pause

Misc actions

stat

prints out rudimentary status of the ready queue (ie eligible now) of URLs to be crawled

howmany

prints out rudimentary status of all URLs to be crawled

records

prints out the number of records in the SQL database

hosts

prints out rudimentary status of all hosts that have URLs to be crawled

initMemoryTables

initializes the administrative MySQL tables that are kept in memory

DESCRIPTION Implements various control functionality to administer a crawling job, like starting and stopping crawlers, injecting URLs into the crawl queue, scheduling newly found links for crawling, controlling scheduling, etc.

This is the preferred way of controlling a crawl job.

EXAMPLES

echo 'http://www.yourdomain.com/' | combineCtrl load --jobname aatest

Seed the crawling job aatest with a URL

combineCtrl start --jobname aatest --harvesters 3

Start 3 crawling processes for job aatest

combineCtrl recyclelinks --jobname aatest

Schedule all new links for crawling

combineCtrl stat --jobname aatest

See how many URLs are eligible for crawling right now.
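
combineCtrl records --jobname aatest

Print the number of records stored in the SQL database (an additional example of the records action described above).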

SEE ALSO combine

Combine configuration documentation in /usr/share/doc/combine/.

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.3 combineRun

NAME combineRun - starts, monitors and restarts a combine harvesting process

SYNOPSIS combineRun <pidfile> <combine command to run>

DESCRIPTION Starts a program and monitors it in order to make sure there is always a copy running. If the program dies it will be restarted with the same parameters. Used by combineCtrl when starting combine crawling.

SEE ALSO combineCtrl

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.4 combineReClassify

NAME combineReClassify - main program that reanalyses records in a Combine database

Algorithm:

 select relevant records based on the cls parameter  
 for each record:  
   get the record from the database  
   delete the analysis info from the record  
   analyse the record  
   if still relevant, save it in the database

_________________________________________________________________________________________________________________

A.5.5 combineSVM

NAME combineSVM - generate an SVM model from good and bad examples

SYNOPSIS combineSVM --jobname <name> [--good <good-file>] [--bad <bad-file>] [--train <model-file>] [--help]

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

good is the name of a file with good URLs, one per line. Default 'goodURL.txt'

bad is the name of a file with bad URLs, one per line. Default 'badURL.txt'

train is the name of the file where the trained SVM model will be stored. Default 'SVMmodel.txt'

DESCRIPTION Takes two files, one with positive examples (good) and one with negative examples (bad), and trains an SVM classifier using these. The resulting model is stored in the file <train>.

The example files should contain one URL per line and nothing else.
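
For example (the file names below are the documented defaults), a model could be trained with:

  combineSVM --jobname aatest --good goodURL.txt --bad badURL.txt --train SVMmodel.txt

The resulting model file can then be referenced by the configuration variable SVMmodel and used together with classifyPlugIn = Combine::classifySVM (see A.3.1).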

SEE ALSO combine

Combine configuration documentation in /usr/share/doc/combine/.

AUTHOR Ignacio Garcia Dorado, Anders Ardö <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.6 combineRank

NAME combineRank - calculates various Ranks for a Combine crawled database

SYNOPSIS combineRank <action> --jobname <name> --verbose

where action can be one of PageRank, PageRankBL, NetLocRank, and exportLinkGraph. Results on STDOUT.

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

verbose enables printing of ranks to STDOUT as SQL INSERT statements

Actions calculating variants of PageRank

PageRank

calculate standard PageRank

PageRankBL

calculate PageRanks with backlinks added for each link

NetLocRank

calculate SiteRank for each site and a local DocRank for documents within each site. Global ranks are then calculated as SiteRank * DocRank

Actions exporting link data

exportLinkGraph

export linkgraph from Combine database

DESCRIPTION Implements calculation of different variants of PageRank.

Results are written to STDOUT and can be huge for large databases.

The link graph is exported in ASCII as a sparse matrix, one row per line. The first integer is the ID (urlid) of a page with links; the rest of the integers on the line are the IDs of the pages it links to. For example, the line '121 5624 23416 51423 267178' means that page 121 links to pages 5624, 23416, 51423 and 267178.
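
As a sketch of how an exported link graph can be read back in (the file name linkgraph.txt is arbitrary and assumes the output of exportLinkGraph was redirected to it), the following Perl snippet builds an adjacency list keyed on the source urlid:

use strict;

#Read a link graph produced by: combineRank --jobname ... exportLinkGraph > linkgraph.txt
my %links; #source urlid => list of target urlids
open(my $fh, '<', 'linkgraph.txt') or die "cannot open linkgraph.txt: $!";
while (<$fh>) {
    chomp;
    my ($from, @to) = split;
    next unless defined $from;
    push @{ $links{$from} }, @to;
}
close($fh);
printf "%d pages with outgoing links\n", scalar keys %links;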

EXAMPLES

combineRank --jobname aatest --verbose PageRankBL

calculate PageRank with backlinks, result on STDOUT

combineRank --jobname aatest --verbose exportLinkGraph

export the linkgraph to STDOUT

SEE ALSO combine

Combine configuration documentation in /usr/share/doc/combine/.

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.7 combineUtil

NAME combineUtil - various operations on the Combine database

SYNOPSIS combineUtil <action> --jobname <name>

where action can be one of stats, termstat, classtat, sanity, all, serveralias, resetOAI, restoreSanity, deleteNetLoc, deletePath, deleteMD5, deleteRecordid, addAlias

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

Actions listing statistics

stats

Global statistics about the database

termstat

generates statistics about the terms from topic ontology matched in documents (can be long output)

classtat

generates statistics about the topic classes assigned to documents

Actions for sanity controls

sanity

Performs various sanity checks on the database

restoreSanity

Deletes records which sanity checks finds insane

resetOAI

Removes all history (ie ’deleted’ records) from the OAI table. This is done by removing the OAI table and recreating it from the existing database.

Action all does the actions: stats, sanity, classtat, termstat

Actions for deleting records

deleteNetLoc

Deletes all records matching the ','-separated list of server net-locations (server names, optionally with port) in the switch --netlocstr. Net-locations can include SQL wild cards ('%').

deletePath

Deletes all records matching the ','-separated list of URL paths (excluding net-locations) in the switch --pathsubstr. Paths can include SQL wild cards ('%').

deleteMD5

Delete the record which has the MD5 in switch --md5

deleteRecordid

Delete the record which has the recordid in switch --recordid

Actions for handling server aliases

serverAlias

Detect server aliases in the current database and do an 'addAlias' on each detected alias.

addAlias

Manually add a serveralias to the system. Requires the switches --aliases and --preferred

DESCRIPTION Generates various statistics and performs sanity checks on the database.

EXAMPLES

combineUtil termstat --jobname aatest

Generate matched term statistics
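
combineUtil deleteNetLoc --jobname aatest --netlocstr www.example.com

Delete all records from one server (an additional example; the server name is a placeholder, and SQL wild cards such as 'www.%' are also accepted).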

SEE ALSO combine

Combine configuration documentation in /usr/share/doc/combine/.

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.8 combine

NAME Combine - Focused Web crawler framework

SYNOPSIS combine --jobname <name> --logname <id>

OPTIONS AND ARGUMENTS jobname is used to find the appropriate configuration (mandatory)

logname is used as identifier in the log (in MySQL table log)

DESCRIPTION Does crawling, parsing, optional topic-checking, and storage in a MySQL database. It is normally started with the combineCtrl command. Briefly, it gets a URL from the MySQL database, which acts as a common coordinator for a Combine job. The Web page is fetched, provided it passes the robot exclusion protocol. The HTML is cleaned using Tidy and parsed into metadata, headings, text, links and link anchors. Then it is stored (optionally provided a topic-check is passed, to keep the crawler focused) in the MySQL database in a structured form.

A simple workflow for a trivial crawl job might look like:

    Initialize database and configuration  
  combineINIT --jobname aatest  
    Enter some seed URLs from a file with a list of URLs  
  combineCtrl  load --jobname aatest < seedURLs.txt  
    Start 2 crawl processes  
  combineCtrl  start --jobname aatest --harvesters 2

    For some time occasionally schedule new links for crawling  
  combineCtrl recyclelinks --jobname aatest  
    or look at the size of the ready queue  
  combineCtrl stat --jobname aatest

    When satisfied kill the crawlers  
  combineCtrl kill --jobname aatest  
    Export data records in a highly structured XML format  
  combineExport --jobname aatest

For more complex jobs you have to edit the job configuration file.

SEE ALSO combineINIT, combineCtrl

Combine configuration documentation in /usr/share/doc/combine/.

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.9 Combine::PosMatcher

NAME PosMatcher

DESCRIPTION This is a module in the DESIRE automatic classification system. Copyright 1999.

Exported routines:

1. Fetching text: These routines all extract text from a document (either a Combine record, a Combine XWI datastructure or a WWW page identified by a URL). They all return: $meta, $head, $text, $url, $title, $size
   $meta: Metadata from document
   $head: Important text from document
   $text: Plain text from document
   $url: URL of the document
   $title: HTML title of the document
   $size: The size of the document

   Common input parameters:  
        $DoStem: 1=do stemming; 0=no stemming  
        $stoplist: object pointer to a LoadTermList object with a stoplist loaded  
        $simple: 1=do simple loading; 0=advanced loading (might induce errors)

 getTextXWI  
     parameters: $xwi, $DoStem, $stoplist, $simple  
       $xwi is a Combine XWI datastructure

 getTextURL  
    parameters: $url, $DoStem, $stoplist, $simple  
       $url is the URL for the page to extract text from

2. Term matching: Match accepts a text as a (reference) parameter and matches each term in the term list against the text. Matches are recorded in an associative array with class as key and summed weight as value.
   Match parameters: $text, $termlist
   $text: text to match against the termlist
   $termlist: object pointer to a LoadTermList object with a termlist loaded
   Output: %score: an associative array with classifications as keys and scores as values

3. Heuristics: cleanEiTree sums scores down the classification tree to the leaves.
   cleanEiTree parameters: %res - an associative array from Match
   Output: %res - the same array

AUTHOR Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.10 Combine::selurl

NAME selurl - Normalise and validate URIs for harvesting

INTRODUCTION Selurl selects and normalises URIs on the basis of both general practice (hostname lowercasing, port number substitution etc.) and Combine-specific handling (applying config_allow, config_exclude, config_serveralias and other relevant config settings).

The Config settings catered for currently are:

maxUrlLength - the maximum length of an unnormalised URL
allow - Perl regular expressions to identify allowed URLs
exclude - Perl regular expressions to exclude URLs from harvesting
serveralias - aliases of server names
sessionids - list of sessionid markers to be removed

A selurl object can hold a single URL and has methods to obtain its subparts as defined in URI.pm, plus some methods to normalise and validate it in Combine context.

BUGS Currently, the only schemes supported are http, https and ftp. Others may or may not work correctly. For one thing, we assume the scheme has an internet hostname/port.

clone() will only return a copy of the real URI object, not a new selurl.

URI URI-escapes the strings fed into it by new() once. Existing percent signs in the input are left untouched, which implicates that:

(a) there is no risk of double-encoding; and

(b) if the original contained an inadvertent sequence that could be interpreted as an escape sequence, uri_unescape will not render the original input (e.g. url_with_%66_in_it goes whoop). If you know that the original has not yet been escaped and wish to safeguard potential percent signs, you'll have to escape them (and only them) once before you offer it to new().

A problem with URI is that its object is not a hash we can piggyback our data on, so I had to resort to AUTOLOAD to emulate inheritance. I find this ugly, but well, this *is* Perl, so what'd you expect?

_________________________________________________________________________________________________________________

A.5.11 Combine::XWI

NAME XWI.pm - class for internal representation of a document record

SYNOPSIS

 use Combine::XWI;  
 $xwi = new Combine::XWI;

 #single value record variables  
 $xwi->server($server);

 my $server = $xwi->server();

 #original content  
 $xwi->content(\$html);

 my $text = ${$xwi->content()};

 #multiple value record variables  
 $xwi->meta_add($name1,$value1);  
 $xwi->meta_add($name2,$value2);

 $xwi->meta_rewind;  
 my ($name,$content);  
 while (1) {  
  ($name,$content) = $xwi->meta_get;  
  last unless $name;  
 }

DESCRIPTION Provides methods for storing and retrieving structured records representing crawled documents.

METHODS

new()

XXX($val) Saves $val using AUTOLOAD. Can later be retrieved, eg

    $xwi->MyVar('My value');  
    $t = $xwi->MyVar;

will set $t to 'My value'

*_reset() Forget all values.

*_rewind() *_get will start with the first value.

*_add stores values into the datastructure

*_get retrieves values from the datastructure

meta_reset() / meta_rewind() / meta_add() / meta_get() Stores the content of Meta-tags

Takes/Returns 2 parameters: Name, Content

 $xwi->meta_add($name1,$value1);  
 $xwi->meta_add($name2,$value2);

 $xwi->meta_rewind;  
 my ($name,$content);  
 while (1) {  
  ($name,$content) = $xwi->meta_get;  
  last unless $name;  
 }

xmeta_reset() / xmeta_rewind() / xmeta_add() / xmeta_get() Extended information from Meta-tags. Not used.

url_remove() / url_reset() / url_rewind() / url_add() / url_get() Stores all URLs (ie if multiple URLs for the same page) for this record

Takes/Returns 1 parameter: URL

heading_reset() / heading_rewind() / heading_add() / heading_get() Stores headings from HTML documents

Takes/Returns 1 parameter: Heading text

link_reset() / link_rewind() / link_add() / link_get() Stores links from documents

Takes/Returns 5 parameters: URL, netlocid, urlid, Anchor text, Link type

robot_reset() / robot_rewind() / robot_add() / robot_get() Stores calculated information, like genre, language, etc

Takes/Returns 2 parameters Name, Value. Both are strings with max length Name: 15, Value: 20

topic_reset() / topic_rewind() / topic_add() / topic_get() Stores result of topic classification.

Takes/Returns 5 parameters: Class, Absolute score, Normalized score, Terms, Algorithm id

Class, Terms, and Algorithm id are strings with max lengths Class: 50, and Algorithm id: 25

Absolute score, and Normalized score are integers

Normalized score and Terms are optional and may be replaced with 0 and '' respectively
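
For example (the algorithm id is invented; the class, scores and property values mirror the sample record in A.1):

 $xwi->topic_add('CP.Drosera', 375, 0, 'drosera', 'ExampleAlg');  
 $xwi->robot_add('language', 'en');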

SEE ALSO Combine focused crawler main site http://combine.it.lth.se/

AUTHOR Yong Cao <tsao@munin.ub2.lu.se> v0.05 1997-03-13

Anders Ardö, <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.12 Combine::Matcher

NAME Matcher

DESCRIPTION This is a module in the DESIRE automatic classification system (Copyright 1999), modified in the ALVIS project (Copyright 2004).

Exported routines:

1. Fetching text: These routines all extract text from a document (either a Combine XWI datastructure or a WWW page identified by a URL). They all return: $meta, $head, $text, $url, $title, $size
   $meta: Metadata from document
   $head: Important text from document
   $text: Plain text from document
   $url: URL of the document
   $title: HTML title of the document
   $size: The size of the document

   Common input parameters:  
        $DoStem: 1=do stemming; 0=no stemming  
        $stoplist: object pointer to a LoadTermList object with a stoplist loaded  
        $simple: 1=do simple loading; 0=advanced loading (might induce errors)

 getTextXWI  
     parameters: $xwi, $DoStem, $stoplist, $simple  
       $xwi is a Combine XWI datastructure

 getTextURL  
    parameters: $url, $DoStem, $stoplist, $simple  
       $url is the URL for the page to extract text from

2. Term matching: Match accepts a text as a (reference) parameter and matches each term in the term list against the text. Matches are recorded in an associative array with class as key and summed weight as value.
   Match parameters: $text, $termlist
   $text: text to match against the termlist
   $termlist: object pointer to a LoadTermList object with a termlist loaded
   Output: %score: an associative array with classifications as keys and scores as values

AUTHOR Anders Ardö <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.13 Combine::FromTeX

NAME Combine::FromTeX.pm - TeX parser in combine package

AUTHOR

 Anders Ardö 2000-06-11

__________________________________________________________________________

A.5.14 Combine::utilPlugIn

NAME utilPlugIn

DESCRIPTION Utilities for:

 * extracting text from XWIs
 * SVM classification
 * language and country identification

AUTHOR Ignacio Garcia Dorado, Anders Ardö <anders.ardo@eit.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.15 Combine::SD_SQL

NAME SD_SQL

DESCRIPTION Reimplementation of sd.pl, SD.pm and SDQ.pm using MySQL; contains both the recyc and guard functions.

The basic idea is to have a table (urldb) that contains most URLs ever inserted into the system, together with a lock (the guard function) and a boolean harvest flag. Also in this table is the host part, together with its lock. URLs are selected from this table based on urllock, netloclock and harvest, and inserted into a queue (table que). URLs from this queue are then given out to harvesters.

The queue is implemented as follows. The admin table can be used to generate sequence numbers like this:

 mysql> update admin set queid=LAST_INSERT_ID(queid+1);

and used to extract the next URL from the queue:

 mysql> select host,url from que where queid=LAST_INSERT_ID();

When the queue is empty it is filled from table urldb. Several different algorithms can be used to fill it (round-robin, most urls, longest time since harvest, ...). Since the harvest flag and guard lock are not updated until the actual harvest is done, it is OK to delete the queue and regenerate it at any time.

Questions, ideas, TODOs, etc.:

Split table urldb into 2 tables - one for urls and one for hosts??? Less efficient when filling que; more efficient when updating netloclock.

Data structure TABLE hosts:

 create table hosts(
   host varchar(50) not null default '',
   netloclock int not null,
   retries int not null default 0,
   ant int not null default 0,
   primary key (host),
   key (ant),
   key (netloclock) );

Handle too many retries?

    algorithm takes a URL from the host that was accessed longest ago  
    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE  
         hosts.hostlock < UNIX_TIMESTAMP() AND  
         hosts.host=urls.host AND  
         urls.urllock < UNIX_TIMESTAMP() AND  
         urls.harvest=1 ORDER BY hostlock LIMIT 1;

    algorithm takes a URL from the host with most URLs  
    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE  
         hosts.hostlock < UNIX_TIMESTAMP() AND  
         hosts.host=urls.host AND  
         urls.urllock < UNIX_TIMESTAMP() AND  
         urls.harvest=1 ORDER BY hosts.ant DESC LIMIT 1;

    algorithm takes a URL from any available host  
    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE  
         hosts.hostlock < UNIX_TIMESTAMP() AND  
         hosts.host=urls.host AND  
         urls.urllock < UNIX_TIMESTAMP() AND  
         urls.harvest=1 LIMIT 1;

AUTHOR Anders Ardö <anders.ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.16 Combine::FromHTML

NAME Combine::FromHTML.pm - HTML parser in combine package

AUTHOR Yong Cao <tsao@munin.ub2.lu.se> v0.06 1997-03-19

Anders Ardö 1998-07-18
 - added <AREA ... HREF=link ...>
 - fixed <A ... HREF=link ...> regexp to be more general

Anders Ardö 2002-09-20
 - added 'a' as a tag not to be replaced with space
 - added removal of Cntrl-chars and some punctuation marks from IP
 - added <style>...</style> as something to be removed before processing
 - beefed up compression of sequences of blanks to include \240 (non-breakable space)
 - changed 'remove head' before text extraction to handle multiline matching (which can be introduced by decoding html entities)
 - added compress blanks and remove CRs to metadata content

Anders Ardö 2004-04
 - changed extraction process dramatically

_________________________________________________________________________________________________________________

A.5.17 Combine::RobotRules

NAME RobotRules.pm

AUTHOR Anders Ardö, version 1.0 2004-02-19

_________________________________________________________________________________________________________________

A.5.18 Combine::HTMLExtractor

NAME HTMLExtractor

DESCRIPTION Adapted from HTML::LinkExtractor - Extract links from an HTML document, by D.H (PodMaster)

AUTHOR Anders Ardö, D.H (PodMaster)

LICENSE Copyright (c) 2003 by D.H. (PodMaster). All rights reserved.

This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The LICENSE file contains the full text of the license.

_________________________________________________________________________________________________________________

A.5.19 Combine::LoadTermList

NAME LoadTermList

DESCRIPTION This a module in the DESIRE automatic classification system. Copyright 1999.

LoadTermList - A class for loading and storing a stoplist with single words and a termlist with classifications and weights

 Subroutines:  
   LoadStopWordList(StopWordListFileName)  
      loads a list of stopwords, one per line, from  
      the file StopWordListFileName.

   EraseStopWordList  
      clears the stopword list

 Subroutines:  
  LoadTermList(TermListFileName) - loads TermClass from file  
  LoadTermListStemmed(TermListFileName) - same plus stems terms

 Input: A formatted term-list including weights and classifications  
  Format:  <weight>: <term_reg_exp>=[<classification>, ]+  
  weight can be a positive or negative number  
  term_reg_exp can be words, phrases, boolean expressions (with @and  
     as operator) on term_reg_exp or Perl regular expressions
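
A hypothetical term list in this format (weights and classes invented for illustration, loosely following the carnivorous-plant classes used in A.1) might look like:

  10: drosera=CP.Drosera
  10: nepenthe=CP.Nepenthes
  5: carnivorous plant=CP
  -5: plastic plant=CP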

AUTHOR Anders Ardö <Anders.Ardo@it.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2005,2006 Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________

A.5.20 Combine::classifySVM

NAME classifySVM

DESCRIPTION Classification plugin module using SVM (implementation SVMLight)

Uses an SVM model loaded from the file pointed to by the configuration variable 'SVMmodel'

AUTHOR Ignacio Garcia Dorado, Anders Ardö <anders.ardo@eit.lth.se>

COPYRIGHT AND LICENSE Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution at http://combine.it.lth.se/

_________________________________________________________________________________________________________________