# MySql replacement for hdb # AA0 2002-09-30 # Modified Open to return DBI connection and HDB table name package Combine::MySQLhdb; use strict; use Combine::XWI; use HTTP::Date; use Encode; my $sv; # holds the mysql connection my $table = ''; # holds the hdb table name my $savehtml; my $doOAI; sub Open { #needed?? use Combine::Config; $sv = Combine::Config::Get('MySQLhandle'); $savehtml = Combine::Config::Get('saveHTML'); $doOAI = Combine::Config::Get('doOAI'); my $hdbd = 'hdb'; return ($sv,$hdbd); } sub Close { # print "MySQLhdb::Close\n"; $sv->disconnect ; } sub DESTROY { print STDERR "MySQLhdb::DESTROY\n"; $sv->disconnect ; } sub Write { my ($xwi) = @_; return undef unless $xwi; if (!defined($sv)) { Open(); } #Init $sv CHANGE? my $md5 = $xwi->md5; my $recordid = $xwi->recordid; #Set by DataBase.pm #OAI if ($doOAI) { $sv->prepare("REPLACE INTO oai SET status='created', recordid=?, md5=?")->execute($recordid, $md5); } #OAI # $xwi->url_rewind; MORE THAN one URL?? # my $url = $xwi->url_get; my $urlid = $xwi->urlid; my $my_netlocid = $xwi->netlocid; my $type = $xwi->type; my $title = $xwi->title; #checkedDate is inserted/updated in DataBase.pm and harvpars.pl my $modifiedDate = $xwi->modifiedDate; if ( ! $modifiedDate) { $modifiedDate = $xwi->checkedDate; } my $expiryDate = $xwi->expiryDate; # if ($expiryDate) { $expiryDate = str2time($expiryDate) ; } # else { $expiryDate = 'NULL'; } my $length = $xwi->length; my $server = $xwi->server; my $etag = $xwi->etag; my $nheadings = $xwi->heading_count; my $headings=''; # headings $xwi->heading_rewind; while (1) { my $this = $xwi->heading_get or last; $headings .= $this . '; '; } my $nlinks = $xwi->link_count; my $this = $xwi->text; my $ip; if ($this) { $this = $$this; if ($xwi->truncated()) { # IMPORTANT! This document was truncated. Therefore: # # 1) Discard it if no space characters in it, because then it # could be binary. # # 2) If a space is found, then truncate after the last space, # so as to avoid erroneous indexing (since the truncation # most likely cut a word). my $last_blank = rindex($this,' '); if ($last_blank > 0) { $ip = substr($this, 0, $last_blank) ; } } else { $ip = $this ; } } #?? if (length($ip)>250000) {$ip = substr($ip, 0, 250000);} $sv->prepare("REPLACE INTO hdb VALUES (?, ?, ?, FROM_UNIXTIME( ? ), FROM_UNIXTIME( ? ), ?, ?, ?, ?, ?, ?, COMPRESS(?))")->execute( $recordid, $type, Encode::encode('utf8',$title), $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, Encode::encode('utf8',$headings), Encode::encode('utf8',$ip)); if ( $savehtml == 1 ) { my $html = $xwi->content; $sv->prepare("REPLACE INTO html SET html=COMPRESS(?), recordid=?")->execute(Encode::encode('utf8',$$html),$recordid); } my $res; #save links my ( $urlstr, $anchor, $ltype); $xwi->link_rewind; my $link_count = 1; my $netlocid; $res = $sv->do(qq{DELETE FROM links WHERE recordid='$recordid';}); #needed? while(1) { #links ($urlstr, $netlocid, $urlid, $anchor, $ltype) = $xwi->link_get; if (defined($urlstr)) { #Convert urlstr to urlid,netlocid if needed if ( ($netlocid <= 0) || ($urlid <= 0) ) { if ( $urlstr eq '') { print STDERR "ERR MySQLhdb, save links, no info\n"; } ## sanity check -> log error use Combine::selurl; my $u; if ( $u = new Combine::selurl($urlstr) ) { $urlstr = $u->normalise(); my $netlocstr = $u->authority; my $path_query = $u->path_query; my $lsth = $sv->prepare(qq{SELECT netlocid,urlid FROM urls WHERE urlstr=?;}); $lsth->execute($urlstr); ($netlocid,$urlid) = $lsth->fetchrow_array; if ( !defined($urlid) ) { $sv->prepare(qq{INSERT IGNORE INTO netlocs SET netlocstr=?;})->execute($netlocstr); # ($netlocid) = $sv->selectrow_array(qq{SELECT netlocid FROM netlocs WHERE netlocstr='$netlocstr';}); my $nlsth = $sv->prepare(qq{SELECT netlocid FROM netlocs WHERE netlocstr=?;}); $nlsth->execute($netlocstr); ($netlocid) = $nlsth->fetchrow_array(); $sv->prepare(qq{INSERT IGNORE INTO urls SET urlstr=?, netlocid=?, path=?;})->execute($urlstr,$netlocid,$path_query); $lsth->execute($urlstr); ($netlocid,$urlid) = $lsth->fetchrow_array; } $sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype); } } else { $sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype); } } else { last; } last if ($link_count++ >= 500); # limit on number of links } #save metadata $xwi->meta_rewind; $res = $sv->do(qq{DELETE FROM meta WHERE recordid='$recordid';}); #needed? my ($name,$content); while (1) { ($name,$content) = $xwi->meta_get; last unless $name; $sv->prepare("INSERT INTO meta VALUES (?, ?, ?)")->execute($recordid, Encode::encode('utf8',$name), Encode::encode('utf8',$content)); } #OLD #save URLs # $xwi->url_rewind; # $res = $sv->do(qq{DELETE FROM urls WHERE recordid='$recordid';}); # while (1) { # $this = $xwi->url_get or last; ## $res = $sv->do(qq{INSERT INTO urls VALUES ('$recordid','$this');}); # my $machine = $this; # $machine =~ s|http://([^:/]+)[:/]?.*|$1|; # $sv->prepare("INSERT INTO urls VALUES (?, ?, ?)")->execute($recordid, $this, $machine); # } #save robot data in analys table (uses that URL is stored) $xwi->robot_rewind; $res = $sv->do(qq{DELETE FROM analys WHERE recordid='$recordid';}); #needed? while (1) { ($name,$content) = $xwi->robot_get; last unless $name; $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, $name, Encode::encode('utf8',$content)); } ## my $alinks = calclinks($recordid,$machine); #? #What if link-stats are inserted double after a Get and following write? my $sth = $sv->prepare(qq{SELECT COUNT(DISTINCT(links.recordid)), COUNT(DISTINCT(mynetlocid)) FROM links,recordurl WHERE recordurl.recordid= ? AND links.urlid = recordurl.urlid AND mynetlocid<>links.netlocid;}); $sth->execute($recordid); my ($inlinks,$hostinlinks)=$sth->fetchrow_array; $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'inlinks', $inlinks); $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'hostinlinks', $hostinlinks); $sth = $sv->prepare(qq{SELECT count(distinct(netlocid)) FROM links WHERE recordid=?;}); $sth->execute($recordid); my ($outlinks)=$sth->fetchrow_array; $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'outlinks', $outlinks); #save topic, ie result of autoclassification $xwi->topic_rewind; $res = $sv->do(qq{DELETE FROM topic WHERE recordid='$recordid';}); #needed? my ($cls,$absscore, $relscore, $terms, $alg); while (1) { ($cls,$absscore, $relscore,$terms, $alg) = $xwi->topic_get; last unless $cls; $sv->prepare("INSERT INTO topic VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid, Encode::encode('utf8',$cls), $absscore, $relscore, Encode::encode('utf8',$terms), $alg); } if (my $zh = Combine::Config::Get('ZebraHost')) { require Combine::Zebra; Combine::Zebra::update($zh,$xwi); } } sub Delete { #Used?? my ($xwi) = @_; return undef unless $xwi; my $recordid = $xwi->recordid; #print "MySQLhdb::DeleteMD5 $recordid\n"; DeleteKey($recordid); } sub DeleteKey { my ($key) = @_; if (!defined($sv)) { Open(); } #Init $sv CHANGE? #OAI if ($doOAI) { # $sv->prepare("REPLACE INTO oai SET status='deleted', recordid=?, md5=?")->execute($key,$md5); $sv->prepare("REPLACE INTO oai SELECT recordid,md5,NOW(),'deleted' FROM recordurl WHERE recordid=?")->execute($key); } #OAI #print "MySQLhdb::DeleteKey $key\n"; my $res = $sv->do(qq{DELETE FROM hdb WHERE recordid=$key;}); $res = $sv->do(qq{DELETE FROM html WHERE recordid=$key;}); $res = $sv->do(qq{DELETE FROM meta WHERE recordid=$key;}); $res = $sv->do(qq{DELETE FROM analys WHERE recordid=$key}); $res = $sv->do(qq{DELETE FROM links WHERE recordid=$key;}); $res = $sv->do(qq{DELETE FROM topic WHERE recordid=$key;}); $res = $sv->do(qq{DELETE FROM recordurl WHERE recordid=$key;}); } sub Get { my ($key) = @_; #should return an initalized xwi-object if (!defined($sv)) { Open(); } #Init $sv CHANGE? my ($type, $title, $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, $headings, $ip) = $sv->selectrow_array(qq{SELECT type,title, UNIX_TIMESTAMP(mdate),IF(expiredate,UNIX_TIMESTAMP(expiredate),0), length,server,etag,nheadings,nlinks,headings,UNCOMPRESS(ip) FROM hdb WHERE recordid='$key';}); my $xwi = new Combine::XWI ; $xwi->recordid($key); #url Relies on that all urls are in table urls $xwi->type($type); $xwi->title(Encode::decode('utf8',$title)); $xwi->modifiedDate($modifiedDate); if ($expiryDate>0) {$xwi->expiryDate($expiryDate)}; $xwi->length($length); $xwi->server($server); $xwi->etag($etag); $xwi->nheadings($nheadings); $xwi->nlinks($nlinks); $headings =~ s/; $//; $xwi->heading_add(Encode::decode('utf8',$headings)) ; my $ip1=Encode::decode('utf8',$ip); $xwi->text(\$ip1); my ($html1) = $sv->selectrow_array(qq{SELECT UNCOMPRESS(html) FROM html WHERE recordid='$key';}); my $html = Encode::decode('utf8',$html1); $xwi->content(\$html); my ($urlpath) = $sv->selectrow_array(qq{SELECT path FROM urls,recordurl WHERE recordid='$key' AND recordurl.urlid=urls.urlid;}); $xwi->urlpath($urlpath); my ($url,$anchor,$lty,$name,$value,$heading); #links my $sth = $sv->prepare(qq{SELECT urlid,netlocid,anchor,linktype from links WHERE recordid='$key';}); $sth->execute; my ($urlid,$netlocid,$checkedDate,$md5,$fingerprint,$cls,$absscore,$relscore,$terms,$alg); while (($urlid,$netlocid,$anchor,$lty)=$sth->fetchrow_array) { $xwi->link_add('', $netlocid, $urlid, Encode::decode('utf8',$anchor), $lty) ; #no URLstr add? } #meta $sth = $sv->prepare(qq{SELECT name,value from meta WHERE recordid='$key';}); $sth->execute; while (($name,$value)=$sth->fetchrow_array) { $xwi->meta_add(Encode::decode('utf8',$name),Encode::decode('utf8',$value)) ; } # analys -> robot $sth = $sv->prepare(qq{SELECT name,value FROM analys WHERE recordid='$key';}); $sth->execute; while (($name,$value)=$sth->fetchrow_array) { $xwi->robot_add($name,Encode::decode('utf8',$value)) ; } # topic $sth = $sv->prepare(qq{SELECT notation,abscore,relscore,terms,algorithm FROM topic WHERE recordid='$key';}); $sth->execute; while (($cls,$absscore,$relscore,$terms,$alg)=$sth->fetchrow_array) { $xwi->topic_add(Encode::decode('utf8',$cls),$absscore,$relscore,Encode::decode('utf8',$terms),$alg) ; } #recordurl $sth = $sv->prepare(qq{SELECT urlid,UNIX_TIMESTAMP(lastchecked),md5,fingerprint FROM recordurl WHERE recordid='$key';}); $sth->execute; while (($urlid,$checkedDate,$md5,$fingerprint)=$sth->fetchrow_array) { $xwi->urlid($urlid); $xwi->checkedDate($checkedDate); $xwi->md5($md5); $xwi->fingerprint($fingerprint); } return $xwi; } 1;