#!/usr/bin/perl use strict; $|++; my $VERSION = '0.20'; #---------------------------------------------------------------------------- =head1 NAME cpanstats-verify - script to verify the contents of the cpanstats database. =head1 SYNOPSIS perl cpanstats-verify [-c|-m] [-v|-s --file=$file] [--start=0] [--end=0] =head1 DESCRIPTION Reads the cpanstats database and verifies the contents. Three kinds of verification can be applied; Check (-c), Missing (-m) and Verify (-v). The start and end counts for the NNTP server can be specified, otherwise values within the database and the script will be assumed. =head1 OPTIONS =over 4 =item -c | --check Looks up the entries stored in the database between the START and END NNTP ids, checking the NNTP server for the subjects of any that are missing. Also highlights any subjects that have been marked as bad during processing. =item -m | --missing Looks up the entries stored in the database between the START and END NNTP ids, checking each to ensure all the fields are complete for each type. =item -v | --verify Looks up the entries stored in the database as stored in the named file, and either prints the entry or highlights that it is missing. =item -s | --search Looks up the NNTP entries directly as listed in the named file. =item --database Specify the exact path to the cpanstats database if not ./cpanstats.db. =item --file Named file used when verifying a list of NNTP ids. =item --start --end Start and end NNTP ids. =item -h | --help Print the help screen =back =cut # ------------------------------------- # Library Modules use lib qw(./lib ../lib); use DBI; use Email::Simple; use Net::NNTP; use Getopt::ArgvFile default=>1; use Getopt::Long; use IO::File; use CPAN::WWW::Testers::Generator::Database; # ------------------------------------- # Variables my (%log,%options); my $PROGRESS = 0; use constant DATABASE => 'cpanstats.db'; use constant NNTPSTART => 872391; # ------------------------------------- # Program ##### INITIALISE ##### progress("init"); init_options(); $options{database} ||= DATABASE; my $dbi = CPAN::WWW::Testers::Generator::Database->new(database => $options{database}, AutoCommit => 1); print STDERR "Cannot connect to database [$options{database}]\n" unless($dbi); $options{start} ||= NNTPSTART; $options{end} ||= get_lastid(); my $nntp; unless($options{localonly}) { $nntp = Net::NNTP->new("nntp.perl.org") || die "Cannot connect to nntp.perl.org"; $nntp->group("perl.cpan.testers"); } ##### MAIN ##### _clear_log(); _log("FROM: $options{start} - $options{end}") if($options{missing} || $options{check}); #load_log(); progress("start"); check_stats() if($options{check}); missing_stats() if($options{missing}); verify_stats() if($options{verify}); search_stats() if($options{search}); progress("finish"); # ------------------------------------- # Subroutines =item get_lastid Returns the last NNTP id recorded in the database. =cut sub get_lastid { my @rows = $dbi->get_query("SELECT MAX(id) FROM cpanstats"); return $rows[0]->[0]; } =item missing_stats Report on the database entries with missing field values. =cut # insert_report($id,$state,$date,$from,$dist,$version,$platform,$perl); sub missing_stats { progress("start - missing stats"); $" = ","; my $count = 0; my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end}"); while(my $row = $iterator->()) { next if($row->[1] =~ /^na|pass|fail|unknown|cpan$/); _log("BADPARSE:") unless($count++); _log("@$row"); } $count = 0; $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('na','pass','fail','unknown') AND id >= $options{start} AND id <= $options{end}"); while(my $row = $iterator->()) { next if( defined $row->[2] && defined $row->[3] && defined $row->[4] && defined $row->[5] && defined $row->[6]); _log("\nBADREPORTS:") unless($count++); _log("@$row"); } $count = 0; $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE state in ('cpan') AND id >= $options{start} AND id <= $options{end}"); while(my $row = $iterator->()) { next if( defined $row->[2] && defined $row->[4] && defined $row->[5]); _log("\nBADUPLOADS:") unless($count++); _log("@$row"); } } =item check_stats Report on the database entries which are either missing, or have reported bad processing. =cut sub check_stats { progress("start - check stats"); my $count = $options{start}; my $iterator = $dbi->get_query_iterator("SELECT * FROM cpanstats WHERE id >= $options{start} AND id <= $options{end} ORDER BY id"); while(my $row = $iterator->()) { #progress("$count - $row->[0]"); # missing entries (ignore human replies) while($count < $row->[0]) { my ($subj,$from) = get_subject($count); _log("$count,$from,missing - $subj") unless($subj =~ /(?:Re:|Fw:|Ab:|mirror update)/i || $subj =~ /\.(?:readme|pl|cgi|pdf|html?|doc|txt|ppd|asc|yml|jpg|png|gif|rtf|css|pod|sig|diff)$/i); # note that in the above two regexes we ignore conversation # threads, mirror updates and a whole host of uploads that # are nothing to do with a distribution upload. The caveats # to this are the entries that are potentially bad uploads # (bad archive naming or uploading a .pm file) and any # reports that are in error. $count++; } # badly parsed entries if($row->[1] =~ /bad/) { my ($subj,$from) = get_subject($row->[0]); _log("$row->[0],$from,$row->[1] - $subj"); # print join(",",@$row) . "\n"; } # missing fields elsif($row->[1] =~ /(na|pass|fail|unknown)/) { unless( defined $row->[2] && defined $row->[3] && defined $row->[4] && defined $row->[5] && defined $row->[6]) { my ($subj,$from) = get_subject($row->[0]); _log("$row->[0],$from,$row->[1] - $subj"); } } $count++; } } =item verify_stats Report on the given database entries highlighting those which are missing. =cut sub verify_stats { progress("start - verify stats"); $" = ","; my @list = get_list(); for my $id (@list) { my @rows = $dbi->get_query("SELECT * FROM cpanstats WHERE id=$id"); if(@rows) { my $row = $rows[0]; _log("@$row"); } else { _log("$id,missing"); } } } =item search_stats Report on the given database entries highlighting those which are missing. =cut sub search_stats { progress("start - search stats"); $" = ","; my @list = get_list(); for my $id (@list) { my ($subj,$from) = get_subject($id); _log("$id,$from,$subj"); } } =item get_subject Access the NNTP server to get the real subject recorded for the article, unless we can short cut the network by accessing the information from the project log file. =cut sub get_subject { my $id = shift; # can we short cut? find_id($id) unless($log{$id}); return($log{$id}{subject},$log{$id}{from}) if($log{$id}); return "" if($options{localonly}); # talk NNTP my $article = join "", @{$nntp->article($id) || []}; return "" unless($article); # parse the resulting headers my $mail = Email::Simple->new($article); return($mail->header("Subject"),$mail->header("From")); } =item get_list Returns the list of NNTP ids from the named file. =cut sub get_list { my @list; my $file = $options{file} || die "--file not specified"; die "file [$file] not found" unless(-f $file); my $fh = IO::File->new($file) or die "Cannot open file [$file]: $!"; while(<$fh>) { chomp; my ($num) = (m/^(\d+)/); push @list, $num; } $fh->close; return @list; } =item load_log Load log file, the output of cpanstats.pl. =cut sub load_log { my $fh = IO::File->new($options{log},'r') or die "Cannot read log file [$options{log}]: $!\n"; while(<$fh>) { next unless(/^ID \[(\d+)\] \[([^\]]+)\] (.*?)\s*$/); next unless($1 >= $options{start}); $log{$1} = {from => $2, subject => $3}; } } # note read the whole file in case it has been reparsed sub find_id { my $id = shift || return; my $fh = IO::File->new($options{log},'r') or die "Cannot read log file [$options{log}]: $!\n"; while(<$fh>) { next unless(/^ID \[$id\] \[([^\]]+)\] (.*?)\s*$/); $log{$id} = {from => $1, subject => $2}; } return; } sub _clear_log { my $fh = IO::File->new($options{out},'w') or die "Cannot write to file [$options{out}]: $!\n"; print $fh ''; $fh->close; } sub _log { my $msg = shift; my $fh = IO::File->new($options{out},'a+') or die "Cannot write to file [$options{out}]: $!\n"; print $fh "$msg\n"; $fh->close; } =item progress Simple audit logging function. =cut my $lasttime = time; sub progress { return unless($PROGRESS); my $msg = shift; my $time = time; my @localtime = localtime($time); my $secs = $time - $lasttime; printf STDERR "%02d:%02d:%02d\t%03d\t%s\n", $localtime[2], $localtime[1], $localtime[0], $secs, $msg; $lasttime = $time; } sub init_options { GetOptions( \%options, 'database=s', 'localonly|l', 'missing|m', 'check|c', 'verify|v', 'search|s', 'start=i', 'end=i', 'file=s', 'log=s', 'out=s', 'help|h', 'version|V' ); _help(1) if($options{help}); _help(0) if($options{version}); unless($options{database} && -f $options{database}) { print "No database specified\n\n"; _help(1); } unless($options{log} && -f $options{log}) { print "No cpanstats.log file specified\n\n"; _help(1); } unless($options{out}) { print "No results output file specified\n\n"; _help(1); } } sub _help { my $full = shift; if($full) { print <] [-c] [-m] [-v] [-s] \\ [--file=] [--start=n] [--end=n] \\ [--log=] [-out=] [-h] [-V] \\ [--localonly] --database= use a named database -m check for missing entries -c check existing entries for bad parsing -v provide a verification report -s search stats providing id + subject --file file of IDs to reference --start start id for -m or -c --end end id for -m or -c --log log file for shortcut reference for -m and -c --out results output file -h this help screen -V program version HERE } print "$0 v$VERSION\n"; exit(0); } __END__ =back =head1 BUGS, PATCHES & FIXES There are no known bugs at the time of this release. However, if you spot a bug or are experiencing difficulties, that is not explained within the POD documentation, please send bug reports and patches to the RT Queue (see below). Fixes are dependant upon their severity and my availablity. Should a fix not be forthcoming, please feel free to (politely) remind me. RT Queue - http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-WWW-Testers-Generator =head1 SEE ALSO L, L F, F, F =head1 AUTHOR Barbie, for Miss Barbell Productions . =head1 COPYRIGHT AND LICENSE Copyright (C) 2005-2008 Barbie for Miss Barbell Productions. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut