#!/usr/bin/perl

#------------------------------------------------------------------------------
#
# exclude_robot.pl
#
# Reads httpd logfile entries (from files named on the command line, or from
# STDIN), drops every entry whose user agent matches one of the strings in a
# list fetched from a URL, and prints the surviving entries to STDOUT.
#
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------
#
# Standard pragmas
#
#------------------------------------------------------------------------------

use 5.006;
use strict;
use warnings;

use LWP::Simple;
use Getopt::Long;
use HTTPD::Log::Filter;

#------------------------------------------------------------------------------
#
# Command line handling
#
#------------------------------------------------------------------------------

# Option values; filled in by GetOptions below.
my $opt_exclusions_file;
my $opt_log_file;
my $opt_url;

# Print a usage summary on STDERR and exit with a non-zero status.
sub usage {
    die <<"EOF";
Usage: $0 -url <url> [ -exclusions_file <filename> ] [ -log_file <filename> ] [ <httpd log file> ... ]
EOF
}

GetOptions(
    'log_file=s'        => \$opt_log_file,
    'exclusions_file=s' => \$opt_exclusions_file,
    'url=s'             => \$opt_url,
) or usage();

# -url is the only mandatory option.
usage() unless $opt_url;

# -log_file is simply another way of naming an input file: queue it in front
# of any remaining arguments so that the <> loop below reads it.
unshift @ARGV, $opt_log_file if defined $opt_log_file;

#------------------------------------------------------------------------------
#
# Build the user agent regex from the remote list
#
#------------------------------------------------------------------------------

my $agent_list = get( $opt_url );
die "can't get $opt_url\n" unless defined $agent_list;

# One agent string per line. Blank / whitespace-only lines must be discarded:
# an empty alternative in the regex would match every user agent and cause
# every log entry to be excluded.
my @agents = grep { /\S/ } split /[\n\r]+/, $agent_list;
die "no agent strings found in $opt_url\n" unless @agents;

# Case-insensitive alternation of the literal (quotemeta'd) agent strings.
my $agent_re = '(?i:' . join( '|', map { quotemeta } @agents ) . ')';

#------------------------------------------------------------------------------
#
# Filter the log
#
#------------------------------------------------------------------------------

my $filter = HTTPD::Log::Filter->new(
    exclusions_file => $opt_exclusions_file,
    format          => 'XLF',
    agent_re        => $agent_re,
    invert          => 1,    # keep the entries that do NOT match agent_re
);

while ( my $entry = <> ) {
    my $line = $filter->filter( $entry );

    # undef is treated as a malformed entry; any other false value means the
    # entry was filtered out and is silently skipped.
    die "Badly formatted line at line $.\n" unless defined $line;
    print $line if $line;
}

#------------------------------------------------------------------------------
#
# Start of POD
#
#------------------------------------------------------------------------------

=head1 NAME

exclude_robot.pl - a simple filter script to filter robots out of logfiles

=head1 SYNOPSIS

    exclude_robot.pl
        -url <url>
        [ -exclusions_file <filename> ]
        <httpd log file>

    OR

    cat <httpd log file> | exclude_robot.pl -url <url>

=head1 DESCRIPTION

This script filters HTTP log files to exclude entries that correspond to known
webbots, spiders, and other undesirables. The script requires a URL as a
command line option which should point to a text file containing a linebreak
separated list of lowercase strings to match on for bots. This is based on the
format used by ABC.

The script filters httpd logfile entries either from a filename specified on
the command line, or from STDIN. It outputs filtered entries to STDOUT.

=head1 OPTIONS

=over 4

=item -url <url>

Specify the URL of file to grab which contains the list of agents to exclude.
The option is REQUIRED.

=item -exclusions_file <filename>

Specify a file to save excluded entries from the logfile. This option is
OPTIONAL.

=item -log_file <filename>

Specify a logfile to read. This is equivalent to naming the file as a trailing
command line argument. This option is OPTIONAL.

=back

=head1 AUTHOR

Ave Wrigley

=head1 COPYRIGHT

Copyright (c) 2001 Ave Wrigley. All rights reserved. This program is free
software; you can redistribute it and/or modify it under the same terms as Perl
itself.

=cut

#------------------------------------------------------------------------------
#
# End of POD
#
#------------------------------------------------------------------------------