#!/usr/bin/perl -w =head1 NAME cpanest - generate an Hyper Estraier index for CPAN =head1 SYNOPSIS B [B<-clean>] [B<-noclean>] [B<-cpan> I] [B<-node> I] [B<-force>] [B<-noforce>] [B<-keep> I] [B<-match> I] [B<-test> I] [B<-trust_mtime>] [B<-notrust_mtime>] =head1 DESCRIPTION This is a port of C from L perl search engine to node API of Hyper Estraier. All the hard work was done by Ulrich Pfeifer who wrote all parsers and formatters. I just added support for Hyper Estraier back-end after. B =head1 OPTIONS =over 5 =item B<-clean> / B<-noclean> Clean the table befor indexing. Default is B. =item B<-cpan> I Default directory or URL for indexing. If an URL is given, there currently must be a file F relative to it which contains the output of C. Default is F. =item B<-node> I Specify node URI =item B<-force> Force reindexing, even if B thinks files are up to date. Default is B =item B<-keep> I If fetching from a remote server, keep files in I. Default is F. =item B<-match> I Limit to patches matching I. Default is F. =item B<-test> I Set test level, were B<0> means normal operation, B<1> means, don't really index and B<2> means, don't even get archives and examine them. =item B<-trust_mtime> / B<-notrust_mtime> If B, the files mtimes are used to decide, which version of an archive is the newest. If b, the version extracted is used (beware, there are far more version numbering schemes than B can parse). =back =head1 AUTHORS Ulrich Pfeifer EFE Dobrica Pavlinusic EFE =head1 COPYRIGHT Copyright (c) 1996-1997, Ulrich Pfeifer Copyright (c) 2005, Dobrica Pavlinusic =cut use strict; use File::Path; use Getopt::Long; use File::Find; use File::Basename; use IO::File; use IO::Zlib; use POSIX qw/strftime/; use lib '/data/wait/lib'; use WAIT::Parse::Base; use WAIT::Parse::Pod; use WAIT::Document::Tar; use WAIT::Document::Find; sub fname($); # maximum number of archives to index (set to -1 for unlimited) my $max = -1; my %OPT = ( node => 'http://localhost:1978/node/cpan', clean => 0, remove => [], force => 0, # cpan => '/usr/src/perl/CPAN/sources', cpan => '/rest/cpan/CPAN/', trust_mtime => 1, match => 'authors/id/', test => 0, # cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN', # cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN', keep => '/tmp/CPAN/', ); GetOptions(\%OPT, 'node=s', 'cpan=s', 'keep=s', 'match=s', 'clean!', 'test=i', # test level 0: normal # 1: don't change db # 2: don't look at archives even 'remove=s@', 'force!', # force indexing even if seen 'trust_mtime!', # use mtime instead of version number 'max=i', 'debug!', ) || die "Usage: ...\n"; if ($OPT{max}) { $max = $OPT{max}; print STDERR "processing just first $max modules\n"; } # FIXME #clean_node( # node => $OPT{node}, # ) if $OPT{clean}; my $tb = new HyperEstraier::WAIT::Table( uri => $OPT{node}, attr => ['docid', 'headline', 'source', 'size', 'parent', 'version'], key => 'docid', invindex => [ qw/name synopsis bugs description text environment example author/ ], debug => $OPT{debug}, ) or die "Could not open node '$OPT{node}'"; my $layout= new WAIT::Parse::Pod; # Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version # considerations. Value *must* match common prefix. Aliasing should be # used if CPAN contains serveral distributions with different name but # same root directory. # We still have a problem if there are different root directories! my %ALIAS = (# tar name real (root) name 'Games-Scrabble' => 'Games', 'HTML-ParseBrowser' => 'HTML', 'iodbc_ext' => 'iodbc-ext-0.1', 'sol-inst' => 'Solaris', 'WebService-Validator-CSS-223C' => 'WebService-Validator-CSS-W3C-0.02', 'MPEG-ID3212Tag' => 'MPEG-ID3v2Tag-0.36', 'WebService-GoogleHack' => 'WebService', 'Db-Mediasurface-ReadConfig' => 'ReadConfig', 'Tie-Array-RestrictUpdates' => 'Tie', 'HTML-Lister' => 'HTML', 'Net-253950-AsyncZ' => 'Net-Z3950-AsyncZ-0.08', 'ChildExit_0' => 'ChildExit-0.1', 'Tie-TieConstant' => 'TieConstant.pm', 'Crypt-OpenSSL-23509' => 'Crypt-OpenSSL-X509-0.2', 'subclustv' => 'blib', 'finance-yahooquote' => 'Finance-YahooQuote-0.20', 'HPUX-FS' => 'FS', 'Business-DE-Konto' => 'Business', 'Digest-MD5-124p' => 'Digest-MD5-M4p-0.01', 'AKDB_Okewo_de' => 'AKDB', 'ExtUtils-0577' => 'ExtUtils-F77-1.14', 'LispFmt' => 'Lisp::Fmt-0.00', 'Acme-Stegano' => 'Acme', 'Acme-RTB' => 'Acme', 'WWW-Search-PRWire' => 'work', 'Video-Capture-214l' => 'Video-Capture-V4l-0.224', 'Tie-DirHandle' => 'Tie', 'DB2' => 'DBD-DB2-0.71a', 'Tie-Scalar-RestrictUpdates' => 'Tie', 'Math-MVPoly' => 'MVPoly', 'PlugIn' => 'PlugIn.pm', 'Lingua-ID-Nums2Words' => 'Nums2Words-0.01', 'chronos-1.' => 'Chronos', 'jp_beta' => 'jperl_beta_r1', 'Bundle-223C-Validator' => 'Bundle-W3C-Validator-0.6.5', 'Text-199' => 'Text-T9-1.0', 'Games-Literati' => 'Games', 'VMS-IndexedFile' => 'VMS', 'authen-rbac' => 'Authen', 'Graphics-EPS' => 'EPS.pm', 'new.spirit-2.' => 'new.spirit', 'Tk-MListbox' => 'MListbox-1.11', 'DBD-SQLrelay' => 'SQLRelay.pm', 'Tie-RDBM-Cached' => 'RDBM', 'PDL_IO_HDF' => 'HDF', 'HPUX-LVM' => 'LVM', 'Parse-Nibbler' => 'Parse', 'Digest-Perl-MD4' => 'MD4', 'Crypt-Imail' => 'Imail', 'ubertext' => 'Text-UberText-0.95', 'MP3-123U' => 'M3U', 'Qmail-Control' => 'Qmail', 'T-LXS' => 'Text-LevenshteinXS-0.02', 'HTML-Paginator' => 'HTML', 'swig' => 'SWIG1.1p5', 'MIDI-Realtime' => 'MIDI', 'sparky-public' => 'Sparky-Public-1.06', 'Chemistry-MolecularMass' => 'Chemistry', 'Net-253950-SimpleServer' => 'Net-Z3950-SimpleServer-0.08', 'NewsClipper-OpenSource' => 'NewsClipper-1.32-OpenSource', 'Win32API-Resources' => 'Resources.pm', 'Unicode-Collate-Standard-2131_1' => 'Unicode-Collate-Standard-V3_1_1-0.1', 'Net-026Term' => 'Net-C6Term-0.11', 'BitArray1' => 'BitArray', 'Audio-Radio-214L' => 'Audio-Radio-V4L-0.01', 'Devel-AutoProfiler' => 'Devel', 'Brasil-Checar-CGC' => 'Brasil', 'AI-NeuralNet-SOM' => 'SOM.pm', 'Net-BitTorrent-File-fix' => 'Net-BitTorrent-File-1.01', 'VMS-FindFile' => 'VMS', 'LoadHtml.' => 'README', 'Time-Compare' => 'Time', 'ShiftJIS-230213-MapUTF' => 'ShiftJIS-X0213-MapUTF-0.21', 'Image-WMF' => 'Image', 'sdf-2.0.eta' => 'sdf-2.001beta1', 'Math-Expr-LATEST' => 'Math-Expr-0.4', 'MP3-Player-PktConcert' => 'MP3', 'Apache-OWA' => 'OWA', 'Audio-Gramofile' => 'Audio', 'DBIx-Copy' => 'Copy', 'P4-024' => 'P4-C4-2.021', 'Disassemble-2386' => 'Disassemble-X86-0.13', 'Proc-Swarm' => 'Swarm-0.5', 'Smil' => 'perlysmil', 'Net-SSH-2232Perl' => 'Net-SSH-W32Perl-0.05', 'Win32-SerialPort' => 'SerialPort-0.19', 'Lingua-ID-Words2Nums' => 'Words2Nums-0.01', 'Parse-Text' => 'Text', 'DBIx-HTMLView-LATEST' => 'DBIx-HTMLView-0.9', 'Apache-NNTPGateway' => 'NNTPGateway-0.9', 'XPathToXML' => 'XPathToXML.pm', 'XML-WMM-ASX' => 'XML', 'CGISession' => 'CGI', 'Net-SMS-142' => 'Net-SMS-O2-0.019', 'Search-253950' => 'Search-Z3950-0.05', 'Date-Christmas' => 'Christmas', 'Win32-InternetExplorer-Window' => 'Win32', 'Apache-WAP-MailPeek' => 'MailPeek', 'Statistics-Table-F' => 'Statistics', 'BerkeleyDB_Locks' => 'BerkeleyDB-Locks-0_2', 'HookPrePostCall' => 'PrePostCall-1.2', 'Oak-AAS-Service-DBI_13_PAM' => 'Oak-AAS-Service-DBI_N_PAM-1.8', 'Math-Vector' => 'Vector.pm', 'Audio-124pDecrypt' => 'Audio-M4pDecrypt-0.04', 'libao-perl_0.03' => 'libao-perl-0.03', 'CGI-EZForm' => 'EZForm', 'Data-Locations-fixed' => 'Data-Locations-5.2-fixed', 'HTML-Template-Filter-Dreamweaver' => 'Dreamweaver', 'LineByLine' => 'LineByLine.pm', 'Geo-0400' => 'Geo-E00-0.05', 'WebService-Validator-HTML-223C' => 'WebService-Validator-HTML-W3C-0.03', 'DateTime-Format-223CDTF' => 'DateTime-Format-W3CDTF-0.04', 'DBD_SQLFLEX' => 'DBD-Sqlflex', 'Text-Number' => 'Number', 'DBIx-DataLookup' => 'DBIx', 'MP3-ID3211Tag' => 'MP3-ID3v1Tag-1.11', 'Text-Striphigh' => 'Striphigh-0.02', 'Tie-SortHash' => 'SortHash', 'Apache-AccessAbuse' => 'AccessAbuse', 'MP3-123U-Parser' => 'MP3-M3U-Parser', 'Net-253950' => 'Net-Z3950-0.44', 'Net-RBLClient' => 'RBLCLient-0.2', 'CGI-EasyCGI' => 'CGI', 'http-handle' => 'HTTP::Handle', 'JPEG-Comment' => 'JPEG', 'router-lg' => 'Router', 'Db-Mediasurface' => 'Mediasurface', 'Text-BarGraph' => 'bargraph', 'TL' => 'Text-Levenshtein-0.04', 'Config-Vars' => 'Config-0.01', 'Tie-PerfectHash' => 'Tie', 'DNS-TinyDNS' => 'DNS', 'DesignPattern-Factory' => 'Factory', 'WWW-01_Rail' => 'WWW-B_Rail-0.01', 'Win32-Exchange' => 'blib', 'Math-RPN' => 'Math', 'Db-Mediasurface-Cache' => 'Cache', 'perl_archie.' => 'Archie.pm', 'Acme-PGPSign' => 'Acme', 'HTML-Widget-sideBar' => 'HTML-Widget-SideBar-1.00', 'log' => 'Games', 'File-List' => 'File', 'Schedule-Cronchik' => 'Schedule', 'Curses-Devkit' => 'Cdk', 'Pod-PalmDoc' => 'Pod', 'Easy-WML' => 'Easy WML 0.1', 'Interval.' => 'Date', 'Brasil-Checar-CPF' => 'Brasil', 'Apache-WAP-AutoIndex' => 'AutoIndex', 'SOM.pm' => 'SOM.pm', 'PlugIn.pm' => 'PlugIn.pm', 'XPathToXML.pm' => 'XPathToXML.pm', 'Vector.pm' => 'Vector.pm', 'LineByLine.pm' => 'LineByLine.pm', 'Archie.pm' => 'Archie.pm', 'TieConstant.pm' => 'TieConstant.pm', 'EPS.pm' => 'EPS.pm', 'SQLRelay.pm' => 'SQLRelay.pm', 'Resources.pm' => 'Resources.pm', 'README' => 'README', ); my %NEW_ALIAS; # found in this pass # Map module names to pathes. Generated by wanted() doing alisaing. my %ARCHIVE; # Map module names to latest version. Generated by wanted() my %VERSION; # Mapping for modules with common root not matching modules name that # are not aliased. This is just for prefix stripping and not strictly # necessary. Takes effect after version considerations. my %TR = (# tar name root to strip 'Net_SSLeay.pm' => 'SSLeay/', 'EventDrivenServer' => 'Server/', 'bio_lib.pl.' => '', 'AlarmCall' => 'Sys/', 'Cdk-ext' => 'Cdk/', 'Sx' => '\d.\d/', 'DumpStack' => 'Devel/', 'StatisticsDescriptive'=> 'Statistics/', 'Term-Gnuplot' => 'Gnuplot/', 'iodbc_ext' => 'iodbc-ext-\d.\d/', 'UNIVERSAL' => '', 'Term-Query' => 'Query/', 'SelfStubber' => 'Devel/', 'CallerItem' => 'Devel/', ); my $LWP; # FIXME my $DIR = '/rest/estseek/cpan/'; my $DATA = $DIR . '/data'; if (@{$OPT{remove}}) { my $pod; for $pod (@{$OPT{remove}}) { unless (-e $pod) { $pod = "$DIR/$pod"; } index_pod(file => $pod, remove => 1) if -f $pod; unlink $pod or warn "Could not unlink '$pod': $!\n"; } exit; } # Now get the beef if ($OPT{cpan} =~ /^(http|ftp):/) { $LWP = 1; require LWP::Simple; LWP::Simple->import(); mkpath($DATA,1,0755) or die "Could not generate '$DATA/': $!" unless -d $DATA; if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) { my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz"); if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { # we could use Net:FTP here ... die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n"; } } my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |"; die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh; my $line; while (defined ($line = <$fh>)) { chomp($line); my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11]; next if defined $is_link; my $mtime = mtime($mon, $mday, $time); $file =~ s:^\./::; ($_) = fileparse($file); $File::Find::name = $file; wanted($mtime); } } else { find(sub {&wanted((stat($_))[9])}, $OPT{cpan}); } ARCHIVE: for my $tar (sort keys %ARCHIVE) { next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o; my $base = (split /\//, $ARCHIVE{$tar})[-1]; my $parent; my %attr; # logging if ($OPT{trust_mtime}) { $attr{'@mdate'} = strftime('%Y-%m-%dT%H:%M:%S+00:00', gmtime($VERSION{$tar})); $parent->{'@mdate'} = $attr{'@mdate'}; printf "%-20s %10s %s\t", $tar, $attr{'@mdate'}, $base; } else { $attr{'version'} = $VERSION{$tar}; printf "%-20s %10.5f %s\t", $tar, $attr{'version'}, $base; } # Remember the archive # We should have an extra table for the tar file data ... if (!$OPT{force} and $tb->have(docid => $base)) { print "skipping\n"; next ARCHIVE; } else { $parent->{_id} = $tb->insert(docid => $base, headline => $ARCHIVE{$tar}, %attr ) unless $OPT{test}; print "indexing\n"; } next ARCHIVE if $OPT{test} > 1; my $TAR = myget($tar); next ARCHIVE unless $TAR; # not able to fetch it my %tar; tie (%tar, 'WAIT::Document::Tar', sub { $_[0] =~ /\.(pm|pod|PL)$/i or $_[0] =~ /readme/i}, #sub { $_[0] !~ m:/$: }, $TAR) or warn "Could not tie '$TAR'\n"; my $sloppy; my ($key, $val); FILE: while (($key, $val) = each %tar) { my $file = fname($key); # don't index directories next if $file =~ /\/$/; # is it a POD file? next FILE unless $file =~ /readme/i or $val =~ /\n=head/; # remove directory prefix unless ($sloppy # no common root or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias or ($TR{$tar} # common root, not aliased and $file =~ s:^\Q$TR{$tar}\E::) ) { # try to determine an alias warn "Bad directory prefix: '$file'\n"; my ($prefix) = split /\//, $file; while ($key = (tied %tar)->NEXTKEY) { my $file = fname($key); next if $file =~ /\/$/; unless ($file =~ m:^$prefix/: or $file eq $prefix) { warn "Archive contains different prefixes: $prefix,$file\n"; $prefix = ''; last; } } if ($prefix) { print "Please alias '$tar' to '$prefix' next time!\n"; print "See alias table later.\n"; $NEW_ALIAS{$tar} = $prefix; $tb->delete_by_key($parent->{_id}); next ARCHIVE; } else { print "Assuming that tar file name $tar is a valid prefix\n"; $sloppy = 1; # We may reset too much here! But that this is not exact # science anyway. Maybe we should ignore using 'next ARCHIVE'. $key = (tied %tar)->FIRSTKEY; redo FILE; } } # remove /lib prefix $file =~ s:^lib/::; # generate new path my $path = "$DATA/$tar/$file"; my ($sbase, $sdir) = fileparse($path); my $fh; unless ($OPT{test}) { if (-f $path) { index_pod(file => $path, remove => 1); unlink $path or warn "Could not unlink '$path' $!\n"; } elsif (!-d $sdir) { mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n"; } # $fh = new IO::File "> $path"; $fh = new IO::Zlib "$path.gz","wb"; die "Could not write '$path': $!\n" unless $fh; } if ($file =~ /readme|install/i) { # make READMEs verbatim pods $val =~ s/\n/\n /g; $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val" unless $val =~ /^=head/m; } else { # remove non-pod stuff my $nval = $val; $val = ''; my $cutting = 1; for (split /\n/, $nval) { if (/^=cut|!NO!SUBS!/) { $cutting = 1; } elsif ($cutting and /^=head/) { $cutting = 0; } unless ($cutting) { $val .= $_ . "\n"; } } } unless ($OPT{test}) { $fh->print($val); index_pod(file => $path, parent => $parent, text => $val, source => $ARCHIVE{$tar}, ); } } if ($LWP and !$OPT{keep}) { unlink $TAR or warn "Could not unlink '$TAR': $!\n"; } } if (%NEW_ALIAS) { print "\%ALIAS = (\n"; for (keys %NEW_ALIAS) { print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n"; } print "\t);\n"; } exit; sub fname ($) { my $key = shift; my ($ntar, $file) = split $;, $key; # remove leading './' - shudder $file =~ s/^\.\///; return($file); } sub myget { my $tar = shift; my $TAR; if ($LWP) { # fetch the archive if ($OPT{keep}) { $TAR = "$OPT{keep}/$ARCHIVE{$tar}"; print "Keeping in '$TAR'\n" unless -e $TAR; my ($base, $path) = fileparse($TAR); unless (-d $path) { mkpath($path,1,0755) or die "Could not mkpath($path)\n"; } } else { $TAR = "/tmp/$tar.tar.gz"; } unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case? print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n"; my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR); if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n"; return; } } } else { $TAR = $ARCHIVE{$tar}; } $TAR; } sub index_pod { my %parm = @_; my $did = $parm{file}; my $rel_did = $did; my $abs_did = $did; if ($rel_did =~ s:$DIR/::) { $abs_did = "$DIR/$rel_did"; } undef $did; # check for both variants if ($tb->have('docid' => $rel_did)) { $did = $rel_did; } elsif ($tb->have('docid' => $abs_did)) { $did = $abs_did; } if ($did) { # have it version if (!$parm{remove} and !$OPT{force}) { warn "duplicate: $did\n"; return; } } else { # not seen yet $did = $rel_did; if ($parm{remove}) { print "missing: $did\n"; return; } } $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did); unless (defined $parm{'text'}) { print "unavailable: $did\n"; return; } my $record = $layout->split($parm{'text'}); if (! $record) { print "empty pod: $did\n"; return; } $record->{size} = length($parm{'text'}); my $headline = $record->{name} || $did; # additional fields for Hyper Estraier $record->{'@mdate'} = $parm{'mdate'} if ($parm{'mdate'}); $headline =~ s/^$DATA//o; # $did $headline =~ s/\s+/ /g; $headline =~ s/^\s+//; printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70); if ($parm{remove}) { $tb->delete('docid' => $did, headline => $headline, %{$record}); } else { foreach (keys %{$parm{parent}}) { next if (/^_/); $record->{$_} = $parm{parent}->{$_} if ($parm{parent}->{$_}); } $tb->insert('docid' => $did, headline => $headline, source => $parm{source}, parent => $parm{parent}->{_id}, %{$record}); } } # This *must* remove the version in *any* case. It should compute a # resonable version number - but usually mtimes should be used. sub version { local ($_) = @_; # remove alpha/beta postfix s/([-_\d])(a|b|alpha|beta|src)$/$1/; # jperl1.3@4.019.tar.gz s/@\d.\d+//; # oraperl-v2.4-gk.tar.gz s/-v(\d)/$1/; # lettered versions - shudder s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei; s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei; # thanks libwww-5b12 ;-) s/(\d+)b/($1-1).'.'/e; s/(\d+)a/($1-2).'.'/e; # replace '-pre' by '0.' s/-pre([\.\d])/-0.$1/; s/\.\././g; s/(\d)_(\d)/$1$2/g; # chop '[-.]' and thelike s/\W$//; # ram's versions Storable-0.4@p s/\@/./; if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) { return($_, $1 + "0.$2" + $3 / 1000000); } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) { return($_, $1 + $2/1000 + $3 / 1000000); } elsif (s/[-_]?(\d+\.[\d_]+)$//) { return($_, $1); } elsif (s/[-_]?([\d_]+)$//) { return($_, $1); } elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide return($_, $1); } else { if ($_ =~ /\d/) { # smells like an unknown scheme warn "Odd version Numbering: '$File::Find::name'\n"; return($_, undef); } else { # assume version 0 warn "No version Numbering: '$File::Find::name'\n"; return($_, 0); } } } sub wanted { my $mtime = shift; # called by parse_file_ls(); return if (! $max); $max--; return unless /^(.*)\.(tar\.(gz|Z)|tgz)$/; my ($archive, $version) = version($1); unless (defined $version) { warn "Skipping $1\n"; return; } # Check for file alias $archive = $ALIAS{$archive} if $ALIAS{$archive}; # Check for path alias. if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) { if ($ALIAS{$1}) { $archive = $ALIAS{$1}; } } if ($OPT{trust_mtime}) { $version = $mtime; } else { $version =~ s/(\d)_/$1/; $version ||= $mtime; # mtime } if (!exists $ARCHIVE{$archive} or $VERSION{$archive} < $version) { $ARCHIVE{$archive} = $File::Find::name; $VERSION{$archive} = $version; } } my %MON; my $YEAR; BEGIN { my $i = 1; for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) { $MON{$_} = $i++; } $YEAR = (localtime(time))[5]; } # We could/should use Date::GetDate here use Time::Local; sub mtime { my ($mon, $mday, $time) = @_; my ($hour, $min, $year, $monn) = (0,0); if ($time =~ /(\d+):(\d+)/) { ($hour, $min) = ($1, $2); $year = $YEAR; } else { $year = $time; } $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'"; my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year); if ($guess > time) { $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1); } $guess; } package HyperEstraier::WAIT::Table; use Search::Estraier; use Text::Iconv; =head1 NAME HyperEstraier::WAIT::Table =head1 DESCRIPTION This is a mode that emulates C functionality somewhat. There are some limitations and only one key attribute is supported (and used for C<@uri>). =head2 Porting from WAIT to this module. Since only one key is supported (and used as C<@uri> attribute), use first parametar of C as C. Full text index is specified as C, but you need just name of fields. You will probably need to add use WAIT::Parse::Base; to your code after you remove C and C. =head1 METHODS =head2 new my $tb = new HyperEstraier::WAIT::Table( uri => 'http://localhost:1978/node/cpan', attr => qw/docid headline source size parent/, key => 'docid', invindex => qw/name synopsis bugs description text environment example author/, ); =cut sub new { my $class = shift; my $self = {@_}; bless($self, $class); foreach my $p (qw/uri attr key invindex/) { die "need $p" unless ($self->{$p}); } $self->{'iso2utf'} = Text::Iconv->new('ISO-8859-1','UTF-8'); my $node = Search::Estraier::Node->new( url => $self->{'uri'}, user => 'admin', passwd => 'admin', create => 1, ); $self->{'node'} = $node; $self ? return $self : return undef; } =head2 have if ( $tb->have(docid => $something) ) ... =cut sub have { my $self = shift; my $args = {@_}; my $key = $self->{'key'} || die "no key in object"; my $key_v = $args->{$key} || die "no key $key in data"; my $id = $self->{'node'}->uri_to_id('file://' . $key_v); return unless($id); return ($id == -1 ? undef : $id); } =head2 insert my $key = $tb->insert( docid => $base, headline => 'Something', ... ); =cut sub insert { my $self = shift; my $args = {@_}; my $uri = 'file://'; $uri .= $args->{'docid'} or die "no docid"; my $doc = Search::Estraier::Document->new; $doc->add_attr('@uri', $uri); $doc->add_attr('@title', $args->{'headline'}) if ($args->{'headline'}); $doc->add_attr('@size', $args->{'size'}) if ($args->{'size'}); my @attr = $self->{'attr'} || die "no attr in object"; my @invindex = $self->{'invindex'} || die "no invindex in object"; foreach my $attr (keys %{$args}) { if (grep(/^$attr$/, @{ $self->{'attr'} }) or $attr =~ m/^@/o) { $doc->add_attr($attr, $args->{$attr}); } if (grep(/^$attr$/, @{ $self->{'invindex'} })) { $doc->add_text($args->{$attr}); } } print STDERR $doc->dump_draft if ($self->{'debug'}); my $id; unless ($self->{'node'}->put_doc($doc)) { printf STDERR "ERROR: %d\n", $self->{'node'}->status; } else { $id = $self->{'node'}->uri_to_id( $uri ); if ($id != -1) { print STDERR "id: $id\n" if ($self->{'debug'}) } else { print STDERR "ERROR: can't find id for newly insrted document $uri\n"; } } return $id; } =head2 delete_by_key $tb->delete_by_key($key); =cut sub delete_by_key { my $self = shift; my $key_v = shift || die "no key?"; unless ($self->{'node'}->out_doc_by_uri( 'file://' . $key_v )) { print STDERR "WARNING: can't delete document $key_v\n"; } } =head2 delete $tb->delete( docid => $did, ... ); =cut sub delete { my $self = shift; my $args = {@_}; my $key = $self->{'key'} || die "no key in object"; die "no $key in data" unless (my $key_v = $args->{$key}); $self->delete_by_key($key_v); }