# Sequin v1.1.2 # # by Peter Sergeant # # A module for extracting and parsing search engine URLs from # server referrer files. Proper usage information is in the # README file # Magic Package Stuff require 5.005; use strict; require Exporter; package URI::Sequin; use vars qw(@ISA $VERSION @EXPORT_OK %log_types); @ISA = qw(Exporter); @EXPORT_OK = qw(se_extract log_extract %log_types key_extract); $VERSION = '1.2'; # &log_extract v1.0 # =-=-=-=-=-=- =-=- # # # The purpose of this subroutine is to allow raw log files lines to be # handled. The subroutine accepts a log line, plus some information on # how it should be analysed, and returns a scalar value: the referring # URL. # # The subroutine knows a certain number of log types, and keeps regexs # with which to handle these logs in a globally accessable hash below, # called '%log_types'. If your log type is not already in the array it # can be added and used. # # Examples: # --------- # # Adding a new regex to %log_types: # => $log_types{'MyWebServer'} = '.+? Referer:(.+?) '; # # > It's worth pointing out that the subroutine uses $1 straight # > after the match has taken place to get the referrer. Because # > of this, you should make sure the part of string to be taken # > is enclosed in ()'s. If you're still unsure, this is clearly # > demonstrated below, where %log_types is set. # # Parsing a Log Entry # => $referrer = &log_extract($log_line, 'NCSA'); # # > As I hope is clear, $log_line is the log-file line that needs # > to be parsed, and 'Apache' refers to the relevant regex below # > in the %log_types hash. # %log_types = ( # Microsoft IIS 3.0 and 2.0 'IIS1' => '(http:.+?),', # Microsoft IIS4.0 (W3SVC format) 'IIS2' => '(http:.+?)$', # NCSA (Apache, Netscape) 'NCSA' => '"(http:.+?)"', # O'Reilly WebSite format 'ORW' => ' (http:.+?) ', # General (works for most logtypes) 'General' => '(?:\s|"|,|^)(http:.+?)(\s|"|,|$)', ); sub log_extract { my $log_file_line = $_[0]; my $log_file_type = $_[1] || 'General'; chomp($log_file_line); # Check that the $log_file_type contains a valid regex by using # (eval) on it to see if we crash the regex engine, and by also # checking if there is a regex in $log_types{$log_file_type} my $re = eval { qr/$log_types{$log_file_type}/ }; warn "Bad re: '$log_types{$log_file_type}' ($@)\n" if $@; unless (defined $log_types{$log_file_type}) { warn "Unknown Logtype - \"$log_file_type\"\n"; } # Return what we found if ($log_file_line =~ m/$log_types{$log_file_type}/i) { return $1 }; return; } # &se_extract v1.1 # =-=-=-=-=-= =-=- # # The purpose of this subroutine is to break down the referring URL in # to an array, containing the $search_engine_name and the # $search_engine_url. # # Example: # => ($name, $url) = @{&se_extract($url)}; # sub se_extract { my $input_url = $_[0]; chomp($input_url); # Break down the $input_url into two more useful variables, so # that we can check if there is information in the query # string, and if there is, we just get on with life. my ($location, $query_string) = split(/\?/, $input_url); return [] unless $query_string; my $search_engine_name; my $search_engine_url; # This is a scary regex. It picks out with suprising accuracy # the main part of a URL - the 'MSN' part of: # http://biteme15.search.cgi.msn.com.uk/?asdfasdf if ($location =~ m!(http://)?(\d+\.\d+\.\d+\.\d+(\:\d+)?)/!) { return ["Unknown (IP)", $2]; } if ($location =~ m!^(.+?\. ([^\.]+) \. (com|net|org|int|mil|\w\w| (gov|mil|com|net|org|\w\w)\.\w\w ) (?:/|:\d+/) )!x) { $search_engine_url = $1; $search_engine_name = "\u$2"; } elsif ($location =~ m!^(http://)?((\w+)\.\w+(\:\d+/?)?)!) { $search_engine_url = $2; $search_engine_name = "\u$3"; } elsif ($location =~ m!^(http://)?((\w+)([^\.\w:]|(\:\d+/?)?))!) { $search_engine_url = $2; $search_engine_name = "\u$3"; } # This has allowed us to quite accurately get the name and URL # of any given search-engine. However, in the interests of # total accuracy, we have a list of search-engines that we know # so we can provide even more information, and make sure it's # correct. # Define this list: my @search_engine_array = ( ['Altavista', 'http://www.av.com', '(altavista|av)'], ['HotBot', 'http://www.hotbot.com', 'hotbot\.lycos'], ['Infoseek', 'http://www.infoseek.com', 'infoseek\.go'], ['Magellan', 'http://magellan.excite.com', 'magellan\.excite'], ['Ask Jeeves', 'http://www.aj.com', '(aj|askjeeves)'], ['CNET Search', 'http://www.search.com', '(cnet|search\.com|savysearch)'], ); # Cycle through the list for (@search_engine_array) { my ($se_name, $se_url, $se_regex) = @{$_}; if ($location =~ m/$se_regex/) { $search_engine_url = $se_url; $search_engine_name = $se_name; } } # Return what we know. # jm: allow HTTPS search engines too ;) if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) { $search_engine_url =~ s!^!http://!; } return [$search_engine_name, $search_engine_url]; } # &key_extract v1.1 # =-=-=-=-=-= =-=- # # The purpose of this subroutine is to break down the referring URL in # to a string containing the search terms. # # Example: # => $terms = &key_extract($url); # sub key_extract { my $input_url = $_[0]; chomp($input_url); # Break down the $input_url in to two more useful variables my ($location, $query_string) = split(/\?/, $input_url); return unless $query_string; # Google Caching ... What a bitch... This will deal with it, # how Google currently works... if ($query_string =~ m!q=cache\:.+/(.+?)&!i) { $_ = $1; tr/+/ /; s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; s/\s+/ /gs; s/^\s+//g; s/\s+$//g; return $_; } # There are a number of ways in which we now try and determine # what the search terms are. The first is quite clever, IMHO. # We search for spaces in any of the submitted fields that # isn't called 'next' or 'submit' or 'col' or 'btnG' (blame # google). if ($query_string =~ m/(? .+$'; $url = &log_extract($line_from_log_file, 'MyLogType'); $keyword_string = &key_extract($url); ($search_engine_name, $search_engine_url) = @{&se_extract($url)}; =head1 DESCRIPTION This module provides three tools to aid people trying to analyse Search-Engine URLs. It’s meant mainly for those who want to analyse referrer logs and pick out key information about site visitors, such as which Search-Engine and keywords they used to find the site. The functions and globals provided (and exported by default) from this module are: =over =item log_extract($log_line, 'Type') This will pick out the referring URL from a line of a logfile. The 'type' can be one of the built in types or can be a user-created one. For more information, see %log_types below. This subroutine accepts a scalar, and returns a scalar. =item key_extract($url) This will try and determine the keywords used in $url. It accepts a scalar and returns a scalar. Should nothing be found, it returns an undefined value. =item se_extract($url) This will try and determine the name of the Search-Engine used and its URL. It accepts a scalar, and returns an array containing firstly the Search- Engine’s name and secondly the Search-Engine’s URL. Should the URL appear not to be from a Search Query, it returns a reference to an empty array. =item %log_types There are five built-in logfile types already in this hash. They are: =over 4 =item * IIS1 - Microsoft IIS 3.0 and 2.0 =item * IIS2 - Microsoft IIS4.0 (W3SVC format) =item * NCSA - For APACHE, NETSCAPE and any other NCSA format logs =item * ORW - O'Reilly WebSite format =item * General - A generalised one that will work with most logfiles =back It’s easy to add another one. Simply add a key to the hash, with a value that is a regex. Parenthesise the part that is the referring URL, as the script uses $1 to obtain the URL. (see the example in the Synopsis section). I have only one request for people who use this module. *Please* tell me where and how you've used it, and if you have any thoughts or suggestions on it, tell me! =back =head1 BUGS Doesn't like the Amnesi Search Engine. But then, neither do I. Also, the 'General' log type needs to be used with discretion ... be sure that none of the URLs contain literal " if you use it. =head1 AUTHOR Peter Sergeant Epete@grou.chE =head1 COPYRIGHT Copyright 2001 Peter Sergeant. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut