package Regexp::Log::BlueCoat;

use strict;
use Carp;
use Regexp::Log 0.01;
use base qw( Regexp::Log );
use vars qw( $VERSION %DEFAULT %FORMAT %REGEXP %UFS );

$VERSION = 0.03;

=head1 NAME

Regexp::Log::BlueCoat - A regexp builder to parse BlueCoat log files

=head1 SYNOPSIS

    my $blue = Regexp::Log::BlueCoat->new(
        format  => '%g %e %a %w/%s %b %m %i %u %H/%d %c',
        capture => [qw( host code )],
    );

    # the format() and capture() methods can be used to set or get
    $blue->format('%g %e %a %w/%s %b %m %i %u %H/%d %c %f %A');
    $blue->capture(qw( host code ));
    $blue->ufs( 'smartfilter' );

    # this is necessary to know in which order
    # we will receive the captured fields from the regex
    my @fields = $blue->capture;

    # the all-powerful capturing regex :-)
    my $re = $blue->regex;

    while (<>) {
        my %data;
        @data{@fields} = /$re/;

        # do something with the fields
    }

=head1 DESCRIPTION

Regexp::Log::BlueCoat is a module that computes custom regular
expressions to parse log files generated by the BlueCoat Systems
I<Port 80 Security Appliance>.

See the Regexp::Log documentation for a description of the standard
Regexp::Log interface.

=head2 Streaming media logs

This version of Regexp::Log::BlueCoat does not support streaming related
logs. You will have to add the following line at the beginning of the
log parsing loop in your scripts, if your BlueCoat appliance is
configured to log those events.

    next if /^(?:Windows_Media|<RealMedia>)/;

This may or may not be faster than having the regular expression
generated by the regexp() method fail on each streaming log line.

=cut

# Building blocks interpolated into several of the field definitions below.
my $IP   = '\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}';
my $HOST = '[-.\\S]+';

# define the BlueCoat specific stuff
#
# %REGEXP maps each %-escape of the Blue Coat format string to a regular
# expression fragment. A fragment is wrapped in a pair of regexp comments,
# (?#=name) ... (?#!name), carrying the name of the field; these markers
# are consumed by the Regexp::Log base class (see its documentation).
# Escapes documented as "[Not used.]" map to the empty string, and escapes
# that only appear in a comment below have no definition at all.
# The one-line descriptions (and their trailing Yes/No column) are copied
# from Blue Coat's documentation.
%REGEXP = (

    # %% - Denotes '%' character -
    '%%' => '%',

    # %a c-ip Client IP address. Yes
    '%a' => "(?#=c-ip)$IP(?#!c-ip)",

    # %b sc-bytes Number of bytes returned by the server (or the Cache). Yes
    '%b' => '(?#=sc-bytes)-|\\d+(?#!sc-bytes)',

    # %c cs (content-type) The type of object. Usually the MIME-type. No
    '%c' =>
      '(?#=cs-content-type)-|UNKNOWN|\\S+(?:/\\S+)?(?#!cs-content-type)',

    # %d cs-supplier-name SUPPLIER NAME - Name or IP address of the
    #    server/cache from which the object was received. Yes
    '%d' => "(?#=cs-supplier-name)-|$HOST(?#!cs-supplier-name)",

    # %e time-taken Number of milliseconds request took to process. Yes
    '%e' => '(?#=time-taken)\\d+(?#!time-taken)',

    # %f sc-filter-category Filtering reason. Why it was denied
    #    (such as sex or business) No
    # this is handled in _postprocess(), which replaces the inner %f
    # with the alternation of the UFS category names
    '%f' => '(?#=sc-filter-category)%f(?#!sc-filter-category)',

    # %g timestamp UNIX type timestamp. Yes
    '%g' => '(?#=timestamp)\\d+\\.\\d+(?#!timestamp)',

    # %h c-ip Client Hostname (uses IP to avoid reverse DNS)
    #    - same as %a Yes
    '%h' => "(?#=c-hostname)-|$HOST(?#!c-hostname)",

    # %i cs-uri The requested URI. Note: Web trends expects this to be
    #    only cs-uri-stem + cs-uri-query No
    '%i' => '(?#=cs-uri)-|\\S+://\\S+|.*?(?#!cs-uri)',

    # %j - [Not used.] -
    '%j' => '',

    # %l - Client Identification string. (User Login name remote).
    #    - always '-' Yes

    # %m cs-method HTTP method. HTTP methods include GET, PUT, POST,
    #    and so on. Yes
    '%m' =>
      '(?#=cs-method)-|OPTIONS|GET|HEAD|POST|PUT|DELETE|TRACE|CONNECT(?#!cs-method)',

    # %n - [Not used.] -
    '%n' => '',

    # %o - [Not used.] -
    '%o' => '',

    # %p r-port Port fetched from on host - origin server port Yes
    '%p' => '(?#=r-port)\\d+(?#!r-port)',

    # %q - [Not used.] -
    '%q' => '',

    # %r cs-request-line First line of the request No

    # %s sc-status The code returned by the cache to the client
    #    (HTTP code). Yes
    '%s' => '(?#=sc-status)\\d{1,4}(?#!sc-status)',

    # %t gmttime GMT date and time of the user request, in the format
    #    [DD/MM/YYYY:hh:mm:ss GMT] Yes
    # The minute and second sub-fields used to be empty groups, so the
    # expression could never match an actual timestamp: they now match
    # two digits each, like the hour.
    '%t' => '(?#=gmttime)-|\\['
      . '(?#=gmtday)\\d\\d(?#!gmtday)/'
      . '(?#=gmtmonth)\\d\\d(?#!gmtmonth)/'
      . '(?#=gmtyear)\\d\\d\\d\\d(?#!gmtyear):'
      . '(?#=gmthour)\\d\\d(?#!gmthour):'
      . '(?#=gmtminute)\\d\\d(?#!gmtminute):'
      . '(?#=gmtsecond)\\d\\d(?#!gmtsecond)'
      . ' GMT\\](?#!gmttime)',

    # %u cs-username Authenticated user ID. Yes
    # Placeholder only: _preprocess() rewrites %u into %u-username or
    # %u-ldap (defined at the end of this table) when a supported login
    # type is set.
    # NOTE(review): the trailing unbalanced ')' looks deliberate --
    # presumably it makes the final regexp fail to compile, displaying
    # this message, when %u is used without a login type. Confirm before
    # "fixing" it.
    '%u' => '(?### You must define \'login\' to use %u in format ###))',

    # %v cs-host Name of host sourcing the object. Yes

    # %w s-action What type of action did the CM take to process this
    #    request. NOTE: 'cached' is used by ELFF but has int value. Yes
    '%w' =>
      '(?#=s-action)(?:TCP_(?:CLIENT_REFRESH|DENIED|ERR_MISS|HIT|M(?:EM_HIT|ISS)|NC_MISS|PARTIAL_MISS|REFRESH_(?:HIT|MISS)|S(?:PLASHED|WAPFAIL)|TUNNELED)?|UDP_(?:DENIED|HIT|INVALID|MISS(?:_NOFETCH)?)?)(?#!s-action)',

    # %x date Date in YYYY-MM-DD format Yes
    '%x' => '(?#=date)'
      . '(?#=year)\\d\\d\\d\\d(?#!year)-'
      . '(?#=month)\\d\\d(?#!month)-'
      . '(?#=day)\\d\\d(?#!day)'
      . '(?#!date)',

    # %y time GMT time in HH:MM:SS format No
    '%y' => '(?#=time)'
      . '(?#=hour)\\d\\d(?#!hour):'
      . '(?#=minute)\\d\\d(?#!minute):'
      . '(?#=second)\\d\\d(?#!second)'
      . '(?#!time)',

    # %z - [Not used.] -
    '%z' => '',

    # %A cs (user-agent) User agent No
    '%A' => '(?#=user-agent).*(?#!user-agent)',

    # %B cs-bytes The number of bytes received by the server Yes
    # This entry used to be keyed '%b': being the later duplicate, it
    # silently replaced the sc-bytes definition above and left %B
    # without any definition.
    '%B' => '(?#=cs-bytes)\\d+(?#!cs-bytes)',

    # %C cs (cookie) Cookie data No

    # %D s-supplier-ip SUPPLIER IP - IP address of server/cache from
    #    which the object was received. Yes

    # %E s-Policy-Message Policy enforcement message Yes

    # %F - [Not used.] -
    '%F' => '',

    # %G - [Not used.] -
    '%G' => '',

    # %H s-hierarchy How and where the object was retrieved from the
    #    cache hierarchy (DIRECT from the server, PARENT_HIT = from the
    #    parent cache, and so on) No
    '%H' =>
      '(?#=s-hierarchy)DIRECT|NONE|(?:PARENT|SIBLING)_HIT|FIRST_PARENT_MISS(?#!s-hierarchy)',

    # %I s-ip Server IP, the IP address of the server on which the log
    #    entry was generated Yes

    # %J - [Not used.] -
    '%J' => '',

    # %K - [Not used.] -
    '%K' => '',

    # %L localtime Local date and time of the user request in format:
    #    [DD/MMM/YYYY:hh:mm:ss +nnnn] Yes
    # NOTE(review): the documented format shows a three-letter month and
    # a signed offset, while this expression expects a two-digit month
    # and a '+' offset only -- verify against real log lines.
    '%L' => '\\[(?#=localtime)'
      . '(?#=localday)\\d\\d(?#!localday)/'
      . '(?#=localmonth)\\d\\d(?#!localmonth)/'
      . '(?#=localyear)\\d\\d\\d\\d(?#!localyear):'
      . '(?#=localhour)\\d\\d(?#!localhour):'
      . '(?#=localminute)\\d\\d(?#!localminute):'
      . '(?#=localsecond)\\d\\d(?#!localsecond)'
      . ' \\+\\d\\d\\d\\d(?#!localtime)\\]',

    # %M - [Not used.] -
    '%M' => '',

    # %N s-computername Server name, the name of the server on which
    #    the log entry was generated Yes
    '%N' => "(?#=s-computername)$HOST(?#!s-computername)",

    # %O - [Not used.] -
    '%O' => '',

    # %P s-port Server port, the port number the client is connected
    #    to. Yes
    '%P' => '(?#=s-port)\\d+(?#!s-port)',

    # %Q cs-uri-query The URI query portion of the URL No

    # %R cs (Referer) Request referrer No

    # %S s-sitename Internet service and instance number running on
    #    client computer Yes

    # %T duration Elapsed time, seconds Yes
    '%T' => '(?#=duration)\\d+(?#!duration)',

    # %U cs-uri-stem Object path from request URL Yes

    # %V cs-version The protocol (HTTP, FTP) version used by the
    #    client. Yes

    # %W sc-filter-result UFS event (May differ between Websense or
    #    SmartFilter or others). No
    # this is handled in _postprocess() and is unsupported yet
    '%W' => '',

    # %X cs (X-Forwarded-For) The IP address of the device which sent
    #    the HTTP request. No

    # %Y - [Not used.] -
    '%Y' => '',

    # %Z - [Not used.] -
    '%Z' => '',

    # UFS specific
    # Smartfilter

    # Login specific
    '%u-username' => '(?#=cs-username)[-.\\w]+(?#!cs-username)',

    # The final attribute name now accepts several letters, like the
    # repeated leading components do: "dc=com" failed to match before.
    '%u-ldap' =>
      '(?#=cs-username)-|(?:[A-Za-z]+=[^,]*,)*[A-Za-z]+=[^,]*?(?#!cs-username)',
);

=head1 METHODS

Regexp::Log::BlueCoat is a standard Regexp::Log object, and therefore
supports all the standard Regexp::Log methods.
=pod

Regexp::Log::BlueCoat's constructor accepts several BlueCoat specific
arguments:

    ufs   - URL Filtering Service
    login - The type of username information

Note: Though BlueCoat supports SmartFilter, Websense and others,
Regexp::Log::BlueCoat only supports the I<SmartFilter> UFS in this version.

The appropriate accessors are defined for them (if used to set, they
return the new value for the attribute).

=over 4

=item ufs( [$ufs] )

Get/set the URL Filter System type (C<%f> and C<%W>).
Only C<smartfilter> is supported in this version.

=cut

# Accessor for the {ufs} attribute: stores the first argument when at
# least one argument is given, and returns the current value either way.
sub ufs {
    my ( $self, @args ) = @_;

    # The presence of an argument triggers the assignment, not its
    # value: an explicit undef is stored as well.
    if (@args) {
        $self->{ufs} = $args[0];
    }

    return $self->{ufs};
}

=item ufs_category( category => string, [...] )

This method lets you override the default category names in your UFS.

For example, I<SmartFilter> allows to configure the name of the
categories; Regexp::Log::BlueCoat supports the default category names,
but lets you override them if needed. The changes are applied on the
object's current C<ufs>.

    $log->ufs('smartfilter');
    $log->ufs_category( hm => 'FunStuff' );   # change the Humor category

See L</SmartFilter> for details about the category names.

When called without arguments, ufs_category() will return the whole
category list for the instance.

=item ufs_category( ufs_name, category => string, [...] )

This method can also be called as a class method.

If you'd rather change the UFS category names for every
Regexp::Log::BlueCoat that will be created, you can use the method as a
class method. You'll need to tell ufs_category() on which UFS to apply
these modifications.

    Regexp::Log::BlueCoat->ufs_category(
        'smartfilter',
        hm => 'Fun',      # change the Humor category
        mp => 'Music',    # change the MP3 category
    );

These changes will be on for any new Regexp::Log::BlueCoat object
you'll create.

When called with a single argument, ufs_category() will return the whole
category list for the specified UFS for the class.
=pod

=cut

# Get or set UFS category names, per object or for the whole class.
#
#   $obj->ufs_category()                      merged list for the object
#   $obj->ufs_category( key => name, ... )    override for this object
#   Class->ufs_category( $ufs )               class-wide list for $ufs
#   Class->ufs_category( $ufs, key => name )  class-wide override
#
# Returns the resulting key/name list (the class defaults from %UFS,
# overlaid with the object's own entries when called on an object).
# An unselected or unknown UFS yields an empty list instead of dying on
# an undefined hash reference.
sub ufs_category {
    my $self = shift;

    # instance method
    if ( ref $self ) {
        my $ufs = defined $self->{ufs} ? $self->{ufs} : '';

        if (@_) {
            my %override = @_;

            # The object's {_ufs} structure originates from %DEFAULT.
            # The base class constructor is not visible from this file:
            # if it copies %DEFAULT shallowly, all the objects share one
            # and the same structure, and writing into it would change
            # the categories of every object. Detach a private copy
            # before the first write (harmless when nothing is shared).
            if (   ref $DEFAULT{_ufs}
                && ref $self->{_ufs}
                && $self->{_ufs} == $DEFAULT{_ufs} )
            {
                $self->{_ufs} = {
                    map { ( $_ => { %{ $DEFAULT{_ufs}{$_} || {} } } ) }
                      keys %{ $DEFAULT{_ufs} }
                };
            }

            @{ $self->{_ufs}{$ufs} }{ keys %override } = values %override;
        }

        # instance entries come last, so that they win over the defaults
        return ( %{ $UFS{$ufs} || {} }, %{ $self->{_ufs}{$ufs} || {} } );
    }

    # class method
    my $ufs = shift;
    return unless defined $ufs;

    if (@_) {
        my %override = @_;
        @{ $UFS{$ufs} }{ keys %override } = values %override;
    }

    return %{ $UFS{$ufs} || {} };
}

=item login()

Get/set the user login type (C<%u>).

This version supports C<username> (standard bareword) and C<ldap>
(standard LDAP distinguished name form).

=cut

# Accessor for the {login} attribute: stores the first argument when at
# least one argument is given, and returns the current value either way.
sub login {
    my ( $self, @args ) = @_;

    if (@args) {
        $self->{login} = $args[0];
    }

    return $self->{login};
}

=back

=head1 PREDEFINED FORMATS

Regexp::Log::BlueCoat supports several standards log formats.
These can be set up by using their short name as the format string,
with the format() method.

    Description               Name    Format String
    -----------               ----    -------------
    Squid log format          :squid  %g %e %a %w/%s %b %m %i %u %H/%d %c
    NCSA common log format    :clf    %h %l %u %t "%r" %s %b
    NCSA extended log format  :elf    %h %l %u %L "%r" %s %b "%R" "%A"
    Microsoft IIS format      :iis    %a, -, %x, %y, %S, %N, %I, %e, %b, %B,
                                      %s, 0, %m, %U, -

=cut

# Short names accepted in place of a format string.
%FORMAT = (
    ':squid' => '%g %e %a %w/%s %b %m %i %u %H/%d %c',
    ':clf'   => '%h %l %u %t "%r" %s %b',
    ':elf'   => '%h %l %u %L "%r" %s %b "%R" "%A"',
    ':iis'   => '%a, -, %x, %y, %S, %N, %I, %e, %b, %B, %s, 0, %m, %U, -',
);

=head1 FIELDS

=head2 Blue Coat custom format

Not all C<%>-escapes are supported in this version of
Regexp::Log::BlueCoat. ELFF is not supported yet.

Multiple consecutive spaces in the format string are compressed to a
single space.

The following list is straight from Blue Coat's documentation.

    Name ELFF                  Description
    ---- ----                  -----------
    %    -                     Denotes an expansion field.
    %%   -                     Denotes '%' character.
    %a   c-ip                  Client IP address.
    %b   sc-bytes              Number of bytes returned by the server
                               (or the Cache).
    %c   cs (content-type)     The type of object. Usually the MIME-type.
    %d   cs-supplier-name      SUPPLIER NAME - Name or IP address of the
                               server/cache from which the object was
                               received.
    %e   time-taken            Number of milliseconds request took to process.
    %f   sc-filter-category    Filtering reason. Why it was denied
                               (such as sex or business)
    %g   timestamp             UNIX type timestamp.
    %h   c-ip                  Client Hostname (uses IP to avoid reverse DNS)
                               - same as %a
    %i   cs-uri                The requested URI. Note: Web trends expects
                               this to be only cs-uri-stem + cs-uri-query
    %j   -                     [Not used.]
    %l   -                     Client Identification string.
                               (User Login name remote). - always '-'
    %m   cs-method             HTTP method. HTTP methods include GET, PUT,
                               POST, and so on.
    %n   -                     [Not used.]
    %o   -                     [Not used.]
    %p   r-port                Port fetched from on host - origin server port
    %q   -                     [Not used.]
    %r   cs-request-line       First line of the request
    %s   sc-status             The code returned by the cache to the client
                               (HTTP code).
    %t   gmttime               GMT date and time of the user request,
                               in the format [DD/MM/YYYY:hh:mm:ss GMT]
    %u   cs-username           Authenticated user ID.
    %v   cs-host               Name of host sourcing the object.
    %w   s-action              What type of action did the CM take to
                               process this request. NOTE: 'cached' is
                               used by ELFF but has int value.
    %x   date                  Date in YYYY-MM-DD format
    %y   time                  GMT time in HH:MM:SS format
    %z   -                     [Not used.]
    %A   cs (user-agent)       User agent
    %B   cs-bytes              The number of bytes received by the server
    %C   cs (cookie)           Cookie data
    %D   s-supplier-ip         SUPPLIER IP - IP address of server/cache from
                               which the object was received.
    %E   s-Policy-Message      Policy enforcement message
    %F   -                     [Not used.]
    %G   -                     [Not used.]
    %H   s-hierarchy           How and where the object was retrieved from
                               the cache hierarchy (DIRECT from the server,
                               PARENT_HIT = from the parent cache, and so on)
    %I   s-ip                  Server IP, the IP address of the server on
                               which the log entry was generated
    %J   -                     [Not used.]
    %K   -                     [Not used.]
    %L   localtime             Local date and time of the user request in
                               format: [DD/MMM/YYYY:hh:mm:ss +nnnn]
    %M   -                     [Not used.]
    %N   s-computername        Server name, the name of the server on which
                               the log entry was generated
    %O   -                     [Not used.]
    %P   s-port                Server port, the port number the client is
                               connected to.
    %Q   cs-uri-query          The URI query portion of the URL
    %R   cs (Referer)          Request referrer
    %S   s-sitename            Internet service and instance number running
                               on client computer
    %T   duration              Elapsed time, seconds
    %U   cs-uri-stem           Object path from request URL
    %V   cs-version            The protocol (HTTP, FTP) version used by the
                               client.
    %W   sc-filter-result      UFS event (May differ between Websense or
                               SmartFilter or others).
    %X   cs (X-Forwarded-For)  The IP address of the device which sent the
                               HTTP request.
    %Y   -                     [Not used.]
    %Z   -                     [Not used.]

=head1 URL FILTERING SYSTEMS

The BlueCoat Systems Port 80 Security Appliance supports two URL
Filtering Systems (UFS): I<SmartFilter> and I<Websense>.

Since I only had access to log files generated with a BlueCoat +
SmartFilter combination, this version of Regexp::Log::BlueCoat only
supports the I<SmartFilter> UFS. Patches welcome!

=head2 SmartFilter

When C<ufs> is set to C<smartfilter>, the computed regular expression
matches the default SmartFilter category names. These can be changed in
SmartFilter's configuration (furthermore one can create one's own
categories, with user-defined names).

So we need to be able to modify the category names, either in an object
instance, or in class data (shared by all instances).

To compute a regular expression that matches your specific fields,
there are several possibilities:

=over 4

=item Make the changes in your object instance

The method ufs_category() lets you replace any standard category by
your own, and even add new "categories" (text that will be matched by
the C<%f> fields). These changes are valid for the object only.

See ufs_category() for details.

=item Change the Regexp::Log::BlueCoat class itself

ufs_category() can be used as a class method.

One can also be adventurous and access %Regexp::Log::BlueCoat::UFS
directly, but you'll need to read the source to understand the details.
Here's an example:

    $Regexp::Log::BlueCoat::UFS{smartfilter} = { simple => '[-\\w]+' };

=back

I<SmartFilter>'s default categories are:

    Key            Default value                   Category
    ---            -------------                   --------
    sx             "sex"                           Sex
    dr             "drugs"                         Drugs
    hs             "hate speech"                   Hate Speech
    cs             "crim. skills"                  Criminal Skills
    nd             "nudity"                        Nudity
    os             "on-line sales"                 Online Sales
    gb             "gambling"                      Gambling
    pp             "personal pages"                Personal Pages
    js             "job search"                    Job Search
    sp             "sports"                        Sports
    gm             "games"                         Games
    hm             "humor"                         Humor
    mp             "MP3 sites"                     MP3 Sites
    et             "entertainment"                 Entertainment
    ls             "lifestyle"                     Lifestyle
    ex             "extreme"                       Extreme
    ch             "chat"                          Chat
    in             "investing"                     Investing
    nw             "general news"                  General News
    po             "politics, opinion, religion"   Politics, Opinion, Religion
    mm             "dating"                        Dating
    ac             "art/culture"                   Art/Culture
    na             "usenet news access"            Usenet News Access
    oc             "cults/occult"                  Cults/Occult
    na             "Usenet News"                   Usenet News
    sh             "self help"                     Self-Help
    tr             "travel"                        Travel
    mt             "mature"                        Mature
    wm             "webmail"                       Webmail
    ps             "portal sites"                  Portal Sites
    an             "anonymizer/translator"         Anonymizer/Translator
    u0             "user defined category 0"       First User-defined Category
    u1             "user defined category 1"       Second User-defined Category
    u2             "user defined category 2"       Third User-defined Category
    u3             "user defined category 3"       Fourth User-defined Category
    u4             "user defined category 4"       Fifth User-defined Category
    u5             "user defined category 5"       Sixth User-defined Category
    u6             "user defined category 6"       Seventh User-defined Category
    u7             "user defined category 7"       Eighth User-defined Category
    u8             "user defined category 8"       Ninth User-defined Category
    u9             "user defined category 9"       Tenth User-defined Category

Note that the key C<na> is listed twice: since the categories are stored
in a hash, only the later entry ("Usenet News") is actually used.

Regexp::Log::BlueCoat adds the following three categories:

    Key                 Default value                 Category
    ---                 -------------                 --------
    none                "-"                           None
    uncategorized       "uncategorized"               Uncategorized
    filter_not_applied  "content_filter_not_applied"  Filter not applied

=head2 Websense

I<Websense> is not supported yet. Patches and log file excerpts are
welcome.
=pod

=cut

# Class-wide category names, per URL Filtering System.
# The keys are short identifiers used to override a category with
# ufs_category(); the values are the texts matched by the %f field.
%UFS = (
    smartfilter => {
        none               => '-',
        uncategorized      => 'uncategorized',
        filter_not_applied => 'content_filter_not_applied',
        sx                 => "sex",
        dr                 => "drugs",
        hs                 => "hate speech",
        cs                 => "crim. skills",
        nd                 => "nudity",
        os                 => "on-line sales",
        gb                 => "gambling",
        pp                 => "personal pages",
        js                 => "job search",
        sp                 => "sports",
        gm                 => "games",
        hm                 => "humor",
        mp                 => "MP3 sites",
        et                 => "entertainment",
        ls                 => "lifestyle",
        ex                 => "extreme",
        ch                 => "chat",
        in                 => "investing",
        nw                 => "general news",
        po                 => "politics, opinion, religion",
        mm                 => "dating",
        ac                 => "art/culture",

        # NOTE(review): the key 'na' appears twice in this list; the
        # later entry ("Usenet News") wins, so "usenet news access" is
        # never part of the generated regexp. Which of the two needs a
        # different key cannot be told from this file -- check against
        # the SmartFilter documentation.
        na => "usenet news access",
        oc => "cults/occult",
        na => "Usenet News",
        sh => "self help",
        tr => "travel",
        mt => "mature",
        wm => "webmail",
        ps => "portal sites",
        an => "anonymizer/translator",
        u0 => "user defined category 0",
        u1 => "user defined category 1",
        u2 => "user defined category 2",
        u3 => "user defined category 3",
        u4 => "user defined category 4",
        u5 => "user defined category 5",
        u6 => "user defined category 6",
        u7 => "user defined category 7",
        u8 => "user defined category 8",
        u9 => "user defined category 9",
    },
    websense => {},
);

# Default attribute values; {_ufs} holds one (initially empty) hash of
# per-object category overrides for each UFS known to %UFS.
%DEFAULT = (
    format  => '',
    capture => [],
    ufs     => '',
    login   => '',
    _ufs    => { map { ( $_, {} ) } keys %UFS },
);

# Rewrite the working format string held in $self->{_regexp}:
# every %u becomes %u-ldap or %u-username when the login attribute is one
# of those two types (any other value leaves %u alone), then runs of
# spaces are squeezed into a single space.
sub _preprocess {
    my $self  = shift;
    my $login = $self->{login};

    # Login specific regexps
    # (\A and \z rather than ^ and $: a trailing newline in the login
    # type must not end up inside the escape name)
    $self->{_regexp} =~ s/%u/%u-$login/g
      if defined $login && $login =~ /\A(?:ldap|username)\z/;

    # Multiple consecutive spaces are compressed to a single space
    $self->{_regexp} =~ s/ +/ /g;

    return;
}

# Replace every %f left in $self->{_regexp} by the alternation of the
# category names of the selected UFS: the class-wide names from %UFS,
# overridden by the object's own entries from $self->{_ufs}.
# Nothing is done when no UFS is selected. Croaks when the selected UFS
# has neither class-wide nor per-object categories (this used to die
# with "Can't use an undefined value as a HASH reference").
sub _postprocess {
    my $self = shift;
    my $ufs  = $self->{ufs};

    # UFS specific regexps
    return unless defined $ufs and $ufs ne '';

    my $class_names  = $UFS{$ufs};
    my $object_names = ref $self->{_ufs} ? $self->{_ufs}{$ufs} : undef;
    croak "Unknown URL Filtering System '$ufs'"
      unless $class_names || $object_names;

    # object entries come last, so that they win over the class ones
    my %categories = ( %{ $class_names || {} }, %{ $object_names || {} } );

    # The names are inserted as they are: they are regexp fragments,
    # not literal strings.
    my $categories = join '|', sort values %categories;
    $self->{_regexp} =~ s/%f/$categories/g;

    return;
}

=head1 TODO

Support streaming logs: Windows Media and RealMedia.

Support the W3C Extended Log File Format (ELFF), which is a subset of
the Blue Coat format where each field is described using a text string.
Have a look at the entries that produce multi-line logs.

=head1 BUGS

Most of the development has been done when I was trying to process
logs created with the following format:
C<%g %e %a %w/%s %b %m %i %u %H/%d %c %f %A>. Which means that the
regular expressions that this module produces do not cover every
possible format.

If Regexp::Log::BlueCoat's regular expressions do not match some of the
log that you are trying to munge, please use the F script and send the
resulting file to me.

=head1 REFERENCES

Blue Coat Systems Port 80 Security Appliance,
I<Configuration and Management Guide>:
http://www.bluecoat.com/downloads/manuals/BC_Config_Mgmt_Guide.pdf

Secure Computing SmartFilter documentation, version 3.1.2:
http://www.securecomputing.com/pdf/SFConfig312_IC_RevE.pdf

=head1 THANKS

Thanks to Jarkko Hietaniemi for Regex::PreSuf.

=head1 AUTHOR

Philippe 'BooK' Bruhat E<lt>book@cpan.orgE<gt>.

=head1 LICENCE

This module is free software; you can redistribute it or modify it
under the same terms as Perl itself.

=cut

1;