#!/usr/bin/perl

#------------------------------------------------------------------------------
#
# Standard pragmas
#
#------------------------------------------------------------------------------

use strict;
use warnings;

require v5.6.0;

use LWP::Simple;
use Getopt::Long;

use HTTPD::Log::Filter;

use vars qw( $opt_exclusions_file $opt_url );

sub usage() 
{ 
    die <<EOF;
Usage: $0
    -url <robot exclusions URL>
    [ -exclusions_file <exclusions file> ]
    [ <httpd log file> ... ]
EOF
}

usage unless GetOptions qw( exclusions_file=s url=s );
usage unless $opt_url;

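# fetch the robot exclusions list - one lowercase agent string per line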
my $agent_list = get( $opt_url ) or die "can't get $opt_url\n";
# build a case-insensitive alternation of the literal agent strings;
# skip empty lines so the regex can't contain a match-all alternative
my $agent_re =
    '(?i:' .
    join( '|',
        map { quotemeta } grep { length } split( /[\n\r]+/, $agent_list )
    ) .
    ')'
;

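# construct the logfile filter; invert => 1 inverts the sense of the
# agent_re match, so entries whose agent field matches the robot regex
# are excluded (and written to the exclusions file, if one was given)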
my $filter = HTTPD::Log::Filter->new(
    exclusions_file     => $opt_exclusions_file,
    format              => 'XLF',
    agent_re            => $agent_re,
    invert              => 1,
);

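# filter each line of the logfile(s) given on the command line (or
# STDIN): filter() returns undef for lines that don't parse as log
# entries, a defined-but-false value for excluded (robot) entries,
# and the line itself for entries that pass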
while( <> )
{
    my $line = $filter->filter( $_ );
    die "Badly formatted line at line $.\n" unless defined $line;
    print $line if $line;
}

#------------------------------------------------------------------------------
#
# Start of POD
#
#------------------------------------------------------------------------------

=head1 NAME

exclude_robot.pl - a simple filter script to remove robot entries from httpd logfiles

=head1 SYNOPSIS

    exclude_robot.pl
        -url <robot exclusions URL>
        [ -exclusions_file <exclusions file> ]
        <httpd log file>
    
    OR

    cat <httpd log file> | exclude_robot.pl -url <robot exclusions URL>

=head1 DESCRIPTION

This script filters httpd log files to exclude entries that correspond to
known web robots, spiders, and other undesirables.  The script requires a URL
as a command-line option, which should point to a text file containing a
linebreak-separated list of lowercase strings that identify robot user agents.
This is based on the format used by ABC
(L<http://www.abc.org.uk/exclusionss/exclude.html>).

The script filters httpd logfile entries either from a filename specified on
the command line, or from STDIN. It outputs filtered entries to STDOUT.
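
For example, a minimal robot exclusions list (these agent strings are
purely illustrative) might contain:

    googlebot
    infoseek
    scooter

Each string is escaped with C<quotemeta> and joined into a single
case-insensitive alternation, which the filter matches against the agent
field of each log entry.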

=head1 OPTIONS

=over 4

=item -url <robot exclusions URL>

Specify the URL of the file to fetch containing the list of agents to
exclude. This option is REQUIRED.

=item -exclusions_file <exclusions file>

Specify a file in which to save the excluded logfile entries. This option is
OPTIONAL.

=back

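=head1 EXAMPLE

A typical invocation (the URL and filenames here are illustrative) fetches
the exclusions list, saves the robot entries to F<robots.log>, and prints
the remaining entries to STDOUT:

    exclude_robot.pl -url http://www.example.com/exclude.txt \
        -exclusions_file robots.log access_log > clean_log
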
=head1 AUTHOR

Ave Wrigley <Ave.Wrigley@itn.co.uk>

=head1 COPYRIGHT

Copyright (c) 2001 Ave Wrigley. All rights reserved. This program is free
software; you can redistribute it and/or modify it under the same terms as Perl
itself.

=cut

#------------------------------------------------------------------------------
#
# End of POD
#
#------------------------------------------------------------------------------