#!/usr/bin/perl

use strict;
use warnings;

# $Id: google-gather,v 1.5 2004/04/05 04:41:26 lem Exp $

use Pod::Usage;
use Getopt::Std;
use WWW::Mechanize;

# Build the version string from the digit groups of the CVS revision keyword:
# the first group is followed by a dot, and every remaining group is appended
# zero-padded to three digits. The format string begins with a blank, so the
# resulting string carries a leading space.
our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf " %d."."%03d" x $#r, @r };

=pod

=head1 NAME

google-gather - Leverage the power of Google(tm) to catch abuse complaints

=head1 SYNOPSIS

    google-gather [-h] [-v] [-d delimiter] [-c count] [-m max] -s search

=head1 DESCRIPTION

One very important source of abuse complaints is the news group
News.Admin.Net-Abuse.Sightings or NANAS. At NANAS, many activists
place samples of abuse that are captured at remote sites and that,
ideally, serve as an indication that corrections are needed.

C<google-gather> fetches complaints from NANAS and produces an output
suitable as input to C<abuso> through the
C<Mail::Abuse::Reader::GoogleGroups> module.

=for comment
NOTE(review): the program and module names inside the formatting codes of
this section were lost from the text as received. "google-gather" follows
from the NAME section; "abuso" and "Mail::Abuse::Reader::GoogleGroups" are
presumed from the Mail::Abuse distribution -- confirm against upstream.

=head2 Do not abuse this script

In general, it is not polite to send large numbers of queries to a
host, as this might be interpreted as an attack. This is true, even if
said host is Google. Use this script judiciously and avoid long and
repeated queries.

The following options are recognized:

=over

=item B<-h>

Outputs this documentation.

=item B<-v>

Be verbose about progress.

=item B<-s search>

What to search for. Typically, this should be your domain name.

=item B<-d delimiter>

Text string used to separate reports. See
C<Mail::Abuse::Reader::GoogleGroups> for an application of this
delimiter. Defaults to B<___END_OF_REPORT___>.

=item B<-c count>

How many reports to extract from NANAS. No more than this number of
reports will be processed. Defaults to 20.

=item B<-m max count>

The maximum number of articles that will be spidered by this
script. Defaults to 1000.

=cut

# Getopt::Std stores each switch in a package global named $opt_<letter>;
# declaring them with `our` keeps `use strict` satisfied.
our ($opt_c, $opt_d, $opt_h, $opt_m, $opt_v, $opt_s);

# Percent-encode every character outside the RFC 3986 "unreserved" set so the
# search phrase can be embedded in a query string without altering it.
# Command-line arguments are treated as bytes here, so each escaped character
# yields exactly one %XX triplet.
sub _uri_escape {
    my ($text) = @_;
    $text =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord($1))/ge;
    return $text;
}

# An unknown switch is a usage error rather than something to ignore.
getopts('c:d:hvs:m:') or pod2usage(-verbose => 1);

# -h is honoured first, so the full documentation is shown even when the
# otherwise mandatory -s is missing.
pod2usage(-verbose => 2) if $opt_h;
pod2usage(-verbose => 1) unless defined $opt_s and length $opt_s;

# Fall back to the documented defaults. The tests use defined/length and a
# digit check so that a literal "0" delimiter is accepted and a non-numeric
# count does not trigger a numeric-comparison warning.
$opt_d = '___END_OF_REPORT___' unless defined $opt_d and length $opt_d;
$opt_c = 20   unless defined $opt_c and $opt_c =~ /\A\d+\z/ and $opt_c > 0;
$opt_m = 1000 unless defined $opt_m and $opt_m =~ /\A\d+\z/ and $opt_m > 0;

# $VERSION is formatted with a leading blank; remove whitespace so the
# User-Agent product token is not split in two.
(my $release = $VERSION) =~ s/\s+//g;

# autocheck is switched off so that a failed request comes back as a response
# object and is handled below, instead of aborting the whole run.
my $ua = WWW::Mechanize->new(
    agent     => "google-gather/$release",
    autocheck => 0,
);

# Everything but the result offset is constant across result pages.
my $search_url = 'http://groups.google.com/groups?q=' . _uri_escape($opt_s)
    . '&group=news.admin.net-abuse.sightings'
    . '&sa=G&scoring=d&start=';

# Submit the search as we want it
warn "Issuing GET request\n" if $opt_v;

my $offset   = 0;    # index of the first result on the page being requested
my $spidered = 0;    # articles requested so far (bounded by -m)
my $reports  = 0;    # articles fetched and printed so far (bounded by -c)

PAGE:
while (1) {
    my $page = $ua->get($search_url . $offset);
    die "Failed GET request: ", $page->status_line, "\n"
        unless $page->is_success;
    warn "GET succeeded\n" if $opt_v;

    # Only links carrying a "selm" parameter are followed.
    my @links = grep { $_->url =~ m/\&selm=/ } $ua->links;
    unless (@links) {
        warn "No more interesting links\n" if $opt_v;
        last PAGE;
    }
    warn scalar @links, " links recovered\n" if $opt_v;

    for my $link (@links) {
        # Hitting either limit is a normal way to finish: report it on
        # STDERR and leave the loops so the script exits with status 0.
        if ($reports >= $opt_c) {
            warn "Reached maximum report count ($opt_c)\n";
            last PAGE;
        }
        if ($spidered >= $opt_m) {
            warn "Absolute maximum number of reports processed ($opt_m)\n";
            last PAGE;
        }
        ++$spidered;

        my $url   = $link->url;
        my $title = defined $link->text ? $link->text : '';

        my $res = $ua->get($url . '&output=gplain');
        unless ($res->is_success) {
            warn "Failed to obtain ", $title, ": ", $res->status_line, "\n";
            next;
        }

        my $date = scalar localtime;

        # NOTE(review): the body of this heredoc was lost from the source as
        # received (only "print <" and "content, ..." survive). The header
        # lines below are a reconstruction built from the three variables the
        # original computed ($date, $title, $url); confirm the exact text
        # against the upstream distribution before relying on it.
        print <<"EOF";
Date: $date
Subject: $title
X-Google-Gather-URL: $url

EOF
        print $res->content, "\n$opt_d\n";
        ++$reports;
    }

    # Advance the start offset by ten results for the next page.
    $offset += 10;
}

__END__

=pod

=back

=for comment
NOTE(review): the contents of the two C formatting codes in the next
paragraph, and of the B formatting code in the HISTORY item, were lost from
the text as received and could not be recovered. Restore them from the
upstream distribution.

The complaint should be fed through C, as the output of C would.

=head1 HISTORY

=over

=item B

First version of this code.

=back

=head1 LICENSE AND WARRANTY

This code and all accompanying software comes with NO WARRANTY. You
use it at your own risk.

This code and all accompanying software can be used freely under the
same terms as Perl itself.

=head1 AUTHOR

Luis E. MuE<ntilde>oz

=head1 SEE ALSO

=for comment
NOTE(review): the two names below were lost from the text as received; they
are presumed from the Mail::Abuse distribution -- confirm against upstream.

perl(1), C<abuso>, C<Mail::Abuse::Reader::GoogleGroups>

=cut