#!/usr/local/perl5.005_56.Mar06/bin/perl -w ######################### -*- Mode: Perl -*- ######################### ## ## $Basename: cpanwait $ ## $Revision: 1.7 $ ## ## Author : Ulrich Pfeifer ## Created On : Sat Jan 4 18:09:28 1997 ## ## Last Modified By : Ulrich Pfeifer ## Last Modified On : Sun Nov 22 18:44:36 1998 ## ## Copyright (c) 1996-1997, Ulrich Pfeifer ## ## ###################################################################### eval 'exec perl -S $0 "$@"' if 0; use strict; use File::Path; use DB_File; use Getopt::Long; use File::Find; use File::Basename; use IO::File; require WAIT::Config; require WAIT::Database; require WAIT::Parse::Pod; require WAIT::Document::Tar; my %OPT = (database => 'DB', dir => $WAIT::Config->{WAIT_home} || '/tmp', table => 'cpan', clean => 0, remove => [], force => 0, cpan => '/usr/src/perl/CPAN/sources', trust_mtime => 1, match => 'authors/id/', test => 0, # cpan => 'ftp://ftp.rz.ruhr-uni-bochum.de/pub/CPAN', cpan => 'ftp://ftp.uni-hamburg.de:/pub/soft/lang/perl/CPAN', keep => '/app/unido-i06/src/share/lang/perl/96a/CPAN/sources', ); GetOptions(\%OPT, 'database=s', 'dir=s', 'cpan=s', 'table=s', 'keep=s', 'match=s', 'clean!', 'test=i', # test level 0: normal # 1: don't change db # 2: don't look at archives even 'remove=s@', 'force!', # force indexing even if seen 'trust_mtime!', # use mtime instead of version number ) || die "Usage: ...\n"; clean_database( database => $OPT{database}, dir => $OPT{dir}, table => $OPT{table}, ) if $OPT{clean}; my $db = WAIT::Database->open( name => $OPT{database}, 'directory' => $OPT{dir}, ) || WAIT::Database->create( name => $OPT{database}, 'directory' => $OPT{dir}, ) or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@"; my $layout= new WAIT::Parse::Pod; my $tb = $db->table(name => $OPT{table}) || create_table(db => $db, table => $OPT{table}, layout => $layout); # Map e.g. '.../latest' to 'perl'. Used in wanted(). Effects version # considerations. Value *must* match common prefix. Aliasing should be # used if CPAN contains serveral distributions with different name but # same root directory. # We still have a problem if there are different root directories! my %ALIAS = (# tar name real (root) name 'latest' => 'perl', 'perl5db-kit' => 'DB', 'SGI-FM' => 'FM', 'net-ext' => 'Net', 'VelocisSQL' => 'Velocis', 'Net-ext' => 'Net', 'Curses-DevKit' => 'Cdk', 'PostgresPerl' => 'Postgres', 'perlpdf' => 'PERLPDF', 'Des-perl' => 'Des', 'SGI-GL' => 'GL', 'DBD-DB2' => 'DB2', ); my %NEW_ALIAS; # found in this pass # Map module names to pathes. Generated by wanted() doing alisaing. my %ARCHIVE; # Map module names to latest version. Generated by wanted() my %VERSION; # Mapping for modules with common root not matching modules name that # are not aliased. This is just for prefix stripping and not strictly # necessary. Takes effect after version considerations. my %TR = (# tar name root to strip 'Net_SSLeay.pm' => 'SSLeay/', 'EventDrivenServer' => 'Server/', 'bio_lib.pl.' => '', 'AlarmCall' => 'Sys/', 'Cdk-ext' => 'Cdk/', 'Sx' => '\d.\d/', 'DumpStack' => 'Devel/', 'StatisticsDescriptive'=> 'Statistics/', 'Term-Gnuplot' => 'Gnuplot/', 'iodbc_ext' => 'iodbc-ext-\d.\d/', 'UNIVERSAL' => '', 'Term-Query' => 'Query/', 'SelfStubber' => 'Devel/', 'CallerItem' => 'Devel/', ); my $DIR = $tb->dir; my $DATA = $tb->dir . "/data"; my $LWP; if (@{$OPT{remove}}) { my $pod; for $pod (@{$OPT{remove}}) { unless (-e $pod) { $pod = "$DIR/$pod"; } index_pod(file => $pod, remove => 1) if -f $pod; unlink $pod or warn "Could not unlink '$pod': $!\n"; #$tb->sync; } $tb->close; $db->close; exit; } # Now get the beef if ($OPT{cpan} =~ /^(http|ftp):/) { $LWP = 1; require LWP::Simple; LWP::Simple->import(); mkpath($DATA,1,0755) or die "Could not generate '$DATA/': $!" unless -d $DATA; if (! -f "$DATA/find-ls.gz" or -M "$DATA/find-ls.gz" > 0.5) { my $status = mirror("$OPT{cpan}/indices/find-ls.gz", "$DATA/find-ls.gz"); if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { # we could use Net:FTP here ... die "Was unable to mirror '$OPT{cpan}/indices/find-ls.gz'\n"; } } my $fh = new IO::File "gzip -cd $DATA/find-ls.gz |"; die "Could not open 'gzip -cd $DATA/find-ls.gz': !$\n" unless $fh; my $line; while (defined ($line = <$fh>)) { chomp($line); my ($mon, $mday, $time, $file, $is_link) = (split ' ', $line)[7..11]; next if defined $is_link; my $mtime = mtime($mon, $mday, $time); $file =~ s:^\./::; ($_) = fileparse($file); $File::Find::name = $file; wanted($mtime); } } else { find(sub {&wanted((stat($_))[9])}, $OPT{cpan}); } ARCHIVE: for my $tar (sort keys %ARCHIVE) { next if $OPT{match} and $ARCHIVE{$tar} !~ /$OPT{match}/o; my $base = (split /\//, $ARCHIVE{$tar})[-1]; my $parent; # logging if ($OPT{trust_mtime}) { printf "%-20s %10s %s\t", $tar, substr(scalar(localtime($VERSION{$tar})),0,10), $base; } else { printf "%-20s %10.5f %s\t", $tar, $VERSION{$tar}, $base; } # Remember the archive # We should have an extra table for the tar file data ... if (!$OPT{force} and $tb->have(docid => $base)) { print "skipping\n"; next ARCHIVE; } else { $parent = $tb->insert(docid => $base, headline => $ARCHIVE{$tar}) unless $OPT{test}; print "indexing\n"; } next ARCHIVE if $OPT{test} > 1; my $TAR = myget($tar); next ARCHIVE unless $TAR; # not able to fetch it my %tar; tie (%tar, 'WAIT::Document::Tar', sub { $_[0] =~ /\.(pm|pod|PL)$/ or $_[0] =~ /readme/i}, #sub { $_[0] !~ m:/$: }, $TAR) or warn "Could not tie '$TAR'\n"; my $sloppy; my ($key, $val); FILE: while (($key, $val) = each %tar) { my $file = fname($key); # don't index directories next if $file =~ /\/$/; # is it a POD file? next FILE unless $file =~ /readme/i or $val =~ /\n=head/; # remove directory prefix unless ($sloppy # no common root or $file =~ s:^\Q$tar\E[^/]*/:: # common root, maybe alias or ($TR{$tar} # common root, not aliased and $file =~ s:^\Q$TR{$tar}\E::) ) { # try to determine an alias warn "Bad directory prefix: '$file'\n"; my ($prefix) = split /\//, $file; while ($key = (tied %tar)->NEXTKEY) { my $file = fname($key); next if $file =~ /\/$/; unless ($file =~ m:^$prefix/: or $file eq $prefix) { warn "Archive contains different prefixes: $prefix,$file\n"; $prefix = ''; last; } } if ($prefix) { print "Please alias '$tar' to '$prefix' next time!\n"; print "See alias table later.\n"; $NEW_ALIAS{$tar} = $prefix; $tb->delete_by_key($parent); next ARCHIVE; } else { print "Assuming that tar file name $tar is a valid prefix\n"; $sloppy = 1; # We may reset too much here! But that this is not exact # science anyway. Maybe we should ignore using 'next ARCHIVE'. $key = (tied %tar)->FIRSTKEY; redo FILE; } } # remove /lib prefix $file =~ s:^lib/::; # generate new path my $path = "$DATA/$tar/$file"; my ($sbase, $sdir) = fileparse($path); my $fh; unless ($OPT{test}) { if (-f $path) { index_pod(file => $path, remove => 1); unlink $path or warn "Could not unlink '$path' $!\n"; } elsif (!-d $sdir) { mkpath($sdir,1,0755) or die "Could not mkpath($sdir): $!\n"; } $fh = new IO::File "> $path"; die "Could not write '$path': $!\n" unless $fh; } if ($file =~ /readme|install/i) { # make READMEs verbatim pods $val =~ s/\n/\n /g; $val = "=head1 NAME\n\n$tar $file\n\n=head1 DESCRIPTION\n\n $val" unless $val =~ /^=head/m; } else { # remove non-pod stuff my $nval = $val; $val = ''; my $cutting = 1; for (split /\n/, $nval) { if (/^=cut|!NO!SUBS!/) { $cutting = 1; } elsif ($cutting and /^=head/) { $cutting = 0; } unless ($cutting) { $val .= $_ . "\n"; } } } unless ($OPT{test}) { $fh->print($val); index_pod(file => $path, parent => $parent, text => $val, source => $ARCHIVE{$tar}); } } if ($LWP and !$OPT{keep}) { unlink $TAR or warn "Could not unlink '$TAR': $!\n"; } } if (%NEW_ALIAS) { print "\%ALIAS = (\n"; for (keys %NEW_ALIAS) { print "\t'$_'\t=> '$NEW_ALIAS{$_}',\n"; } print "\t);\n"; } # we are done $db->close(); exit; sub fname ($) { my $key = shift; my ($ntar, $file) = split $;, $key; # remove leading './' - shudder $file =~ s/^\.\///; return($file); } sub myget { my $tar = shift; my $TAR; if ($LWP) { # fetch the archive if ($OPT{keep}) { $TAR = "$OPT{keep}/$ARCHIVE{$tar}"; print "Keeping in '$TAR'\n" unless -e $TAR; my ($base, $path) = fileparse($TAR); unless (-d $path) { mkpath($path,1,0755) or die "Could not mkpath($path)\n"; } } else { $TAR = "/tmp/$tar.tar.gz"; } unless (-e $TAR) { # lwp mirror seems to fetch ftp: in any case? print "Fetching $OPT{cpan}/$ARCHIVE{$tar}\n"; my $status = mirror("$OPT{cpan}/$ARCHIVE{$tar}", $TAR); if ($status != &RC_OK and $status != &RC_NOT_MODIFIED) { warn "Was unable to mirror '$ARCHIVE{$tar}, skipping'\n"; return; } } } $TAR; } sub index_pod { my %parm = @_; my $did = $parm{file}; my $rel_did = $did; my $abs_did = $did; if ($rel_did =~ s:$DIR/::) { $abs_did = "$DIR/$rel_did"; } undef $did; # check for both variants if ($tb->have('docid' => $rel_did)) { $did = $rel_did; } elsif ($tb->have('docid' => $abs_did)) { $did = $abs_did; } if ($did) { # have it version if (!$parm{remove}) { warn "duplicate: $did\n"; return; } } else { # not seen yet $did = $rel_did; if ($parm{remove}) { print "missing: $did\n"; return; } } $parm{'text'} ||= WAIT::Document::Find->FETCH($abs_did); unless (defined $parm{'text'}) { print "unavailable: $did\n"; return; } my $record = $layout->split($parm{'text'}); $record->{size} = length($parm{'text'}); my $headline = $record->{name} || $did; $headline =~ s/^$DATA//o; # $did $headline =~ s/\s+/ /g; $headline =~ s/^\s+//; printf "%s %s\n", ($parm{remove})?'-':'+', substr($headline,0,70); if ($parm{remove}) { $tb->delete('docid' => $did, headline => $headline, %{$record}); } else { $tb->insert('docid' => $did, headline => $headline, source => $parm{source}, parent => $parm{parent}, %{$record}); } } # This *must* remove the version in *any* case. It should compute a # resonable version number - but usually mtimes should be used. sub version { local ($_) = @_; # remove alpha/beta postfix s/([-_\d])(a|b|alpha|beta|src)$/$1/; # jperl1.3@4.019.tar.gz s/@\d.\d+//; # oraperl-v2.4-gk.tar.gz s/-v(\d)/$1/; # lettered versions - shudder s/([-_\d\.])([a-z])([\d\._])/sprintf "$1%02d$3", ord(lc $2) - ord('a') /ei; s/([-_\d\.])([a-z])$/sprintf "$1%02d", ord(lc $2) - ord('a') /ei; # thanks libwww-5b12 ;-) s/(\d+)b/($1-1).'.'/e; s/(\d+)a/($1-2).'.'/e; # replace '-pre' by '0.' s/-pre([\.\d])/-0.$1/; s/\.\././g; s/(\d)_(\d)/$1$2/g; # chop '[-.]' and thelike s/\W$//; # ram's versions Storable-0.4@p s/\@/./; if (s/[-_]?(\d+)\.(0\d+)\.(\d+)$//) { return($_, $1 + "0.$2" + $3 / 1000000); } elsif (s/[-_]?(\d+)\.(\d+)\.(\d+)$//) { return($_, $1 + $2/1000 + $3 / 1000000); } elsif (s/[-_]?(\d+\.[\d_]+)$//) { return($_, $1); } elsif (s/[-_]?([\d_]+)$//) { return($_, $1); } elsif (s/-(\d+.\d+)-/-/) { # perl-4.019-ref-guide return($_, $1); } else { if ($_ =~ /\d/) { # smells like an unknown scheme warn "Odd version Numbering: '$File::Find::name'\n"; return($_, undef); } else { # assume version 0 warn "No version Numbering: '$File::Find::name'\n"; return($_, 0); } } } sub wanted { my $mtime = shift; # called by parse_file_ls(); return unless /^(.*)\.tar(\.gz|\.Z)$/; my ($archive, $version) = version($1); unless (defined $version) { warn "Skipping $1\n"; return; } # Check for file alias $archive = $ALIAS{$archive} if $ALIAS{$archive}; # Check for path alias. if ($File::Find::name =~ m(/CPAN/(?:source/)?(.*\Q$archive\E))) { if ($ALIAS{$1}) { $archive = $ALIAS{$1}; } } if ($OPT{trust_mtime}) { $version = $mtime; } else { $version =~ s/(\d)_/$1/; $version ||= $mtime; # mtime } if (!exists $ARCHIVE{$archive} or $VERSION{$archive} < $version) { $ARCHIVE{$archive} = $File::Find::name; $VERSION{$archive} = $version; } } sub clean_database { my %parm = @_; my $db = WAIT::Database->open( name => $parm{database}, 'directory' => $parm{dir}, ) or die "Could not open database '$parm{dir}/$parm{database}': $@"; my $tbl = $db->table(name => $parm{table}); if ($tbl) { $tbl->drop or die "Could not open table '$parm{tabel}': $@"; } $db->close; } sub create_table { my %parm = @_; my $access = bless {}, 'WAIT::Document::Find'; my $stem = [{ 'prefix' => ['isotr', 'isolc'], 'intervall' => ['isotr', 'isolc'], }, 'isotr', 'isolc', 'split2', 'stop', 'Stem']; my $text = [{ 'prefix' => ['isotr', 'isolc'], 'intervall' => ['isotr', 'isolc'], }, 'isotr', 'isolc', 'split2', 'stop']; my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],; my $tb = $parm{db}->create_table (name => $parm{table}, attr => ['docid', 'headline', 'source', 'size', 'parent'], keyset => [['docid']], layout => $parm{layout}, access => $access, invindex => [ 'name' => $stem, 'synopsis' => $stem, 'bugs' => $stem, 'description' => $stem, 'text' => $stem, 'environment' => $text, 'example' => $text, 'example' => $stem, 'author' => $sound, 'author' => $stem, ] ); die "Could not create table '$parm{table}'" unless $tb; $tb; } my %MON; my $YEAR; BEGIN { my $i = 1; for (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) { $MON{$_} = $i++; } $YEAR = (localtime(time))[5]; } # We could/should use Date::GetDate here use Time::Local; sub mtime { my ($mon, $mday, $time) = @_; my ($hour, $min, $year, $monn) = (0,0); if ($time =~ /(\d+):(\d+)/) { ($hour, $min) = ($1, $2); $year = $YEAR; } else { $year = $time; } $monn = $MON{$mon} || $MON{ucfirst lc $mon} || warn "Unknown month: '$mon'"; my $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year); if ($guess > time) { $guess = timelocal(0,$min,$hour,$mday,$monn-1,$year-1); } $guess; } __END__ ## ################################################################### ## pod ## ################################################################### =head1 NAME cpan - generate an WAIT index for CPAN =head1 SYNOPSIS B [B<-clean>] [B<-noclean>] [B<-cpan> I] [B<-database> I] [B<-dir> I] [B<-force>] [B<-noforce>] [B<-keep> I] [B<-match> I] [B<-table> I] [B<-test> I] [B<-trust_mtime>] [B<-notrust_mtime>] =head1 DESCRIPTION TBS =head1 OPTIONS =over 5 =item B<-clean> / B<-noclean> Clean the table befor indexing. Default is B. =item B<-cpan> I Default directory or URL for indexing. If an URL is given, there currently must be a file F relative to it which contains the output of C. Default is F. =item B<-database> I Specify database name. Default is F. =item B<-dir> I Alternate directory were databases are located. Default is the directory specified during configuration of WAIT. =item B<-force> Force reindexing, even if B thinks files are up to date. Default is B =item B<-keep> I If fetching from a remote server, keep files in I. Default is F. =item B<-match> I Limit to patches matching I. Default is F. =item B<-table> I
Specify an alternate table name. Default is C. =item B<-test> I Set test level, were B<0> means normal operation, B<1> means, don't really index and B<2> means, don't even get archives and examine them. =item B<-trust_mtime> / B<-notrust_mtime> If B, the files mtimes are used to decide, which version of an archive is the newest. If b, the version extracted is used (beware, there are far more version numbering schemes than B can parse). =head1 AUTHOR Ulrich Pfeifer EFE