package Log::Statistics;
use warnings;
use strict;
# $Id: Statistics.pm 47 2006-04-11 23:34:19Z wu $
our $VERSION = sprintf "0.%03d", q$Revision: 47 $ =~ /(\d+)/g;
#
#_* Libraries
#
use Data::Dumper;
use IO::File;
use Date::Manip;
use YAML;
use XML::Writer;
# logging
use Log::Log4perl qw(:resurrect :easy);
###l4p my $logger = get_logger( 'default' );
#
#_* new
#
sub new
{
my ( $class, $data ) = @_;
my $objref = ( { data => $data } );
###l4p $logger->info( "creating new object: $class" );
bless $objref, $class;
return $objref;
}
sub register_field {
my ( $self, $name, $column ) = @_;
unless ( defined( $name ) && defined( $column ) ) {
###l4p $logger->logconfess( "attempted to register field without specifying name or column" );
die "attempted to register field without specifying name or column";
}
if ( $name =~ m|\-| ) {
###l4p $logger->logconfess( "column name may not contain a dash" );
die "column name may not contain a dash";
}
# provide an easy way to look up a column by field name
###l4p $logger->debug( "Adding index for field $name => $column" );
$self->{'field_index'}->{ $name } = $column;
return 1;
}
sub add_field {
my ( $self, $column, $name, $threshold ) = @_;
unless ( defined( $column ) && defined( $name ) ) {
###l4p $logger->logconfess( "not enough arguments" );
die "not enough arguments";
}
###l4p $logger->info( "adding field: $column -> $name" );
###l4p $logger->info( "threshold: $threshold" ) if $threshold;
unless ( $name eq "duration" ) {
push @{ $self->{'field_column'} }, $column;
push @{ $self->{'field_name'} }, $name;
my @thresholds;
if ( $threshold ) {
@thresholds = split /\|/, $threshold;
unshift @thresholds, 0 unless $thresholds[0] eq "0";
$self->{'threshold_index'}->{ $name } = \@thresholds;
}
push @{ $self->{'thresholds'} }, \@thresholds;
}
$self->register_field( $name, $column );
return 1;
}
# specify to keep data for the intersection of field1 and field2. For
# example, if name is transaction name and name2 is the location
# field, data will be kept about transaction breakdown per location.
sub add_group {
my ( $self, $names, $threshold ) = @_;
unless ( scalar( @$names ) > 1 ) {
###l4p $logger->logconfess( "not enough fields specified" );
die "not enough fields specified";
}
my $group_name = join "-", @$names;
# data field name
push @{ $self->{'group'}->{'name'} }, $group_name;
push @{ $self->{'group'}->{'names'} }, $names;
my @group_columns;
for my $name ( @$names ) {
unless ( defined( $self->{'field_index'}->{ $name } ) ) {
###l4p $logger->logconfess( "error: no index found for field $name" );
die "error: no index found for field $name";
}
push @group_columns, $self->{'field_index'}->{ $name };
}
# which columns to track
push @{ $self->{'group'}->{'column'} }, \@group_columns;
my @thresholds;
if ( $threshold ) {
@thresholds = split /\|/, $threshold;
$self->{'threshold_index'}->{ $group_name } = \@thresholds;
}
push @{ $self->{'group'}->{'thresholds'} }, \@thresholds;
return 1;
}
# add a regexp to parse the time field
sub add_time_regexp {
my ( $self, $time_regexp ) = @_;
$self->{'time_regexp'} = $time_regexp;
}
# add a custom regexp to split entire entry into fields
sub add_line_regexp {
my ( $self, $line_regexp ) = @_;
$self->{'line_regexp'} = $line_regexp;
}
sub parse_text {
my ( $self, $text ) = @_;
for my $line ( split /\n/, $text ) {
$self->parse_line( $line );
}
return $self->{'data'};
}
sub save_data {
my ( $self, $file ) = @_;
###l4p $logger->info( "Saving data to yaml file: $file" );
unless ( $self->{'data'} ) {
###l4p $logger->logconfess( "No data specified" );
die "No data to save";
}
unless ( $file ) {
###l4p $logger->logconfess( "No file specified" );
die "No file specified";
}
YAML::DumpFile( $file, $self->{'data'} );
}
sub read_data {
my ( $self, $file ) = @_;
###l4p $logger->info( "Reading yaml data from $file" );
# Load
$self->{'data'} = {
%{ YAML::LoadFile( $file ) || {} },
};
}
sub parse_line {
my ( $self, $line ) = @_;
return unless $line;
return if $line =~ m|^\#|;
$line =~ s|\s+$||;
$line =~ s|^\s+||;
my @values;
if ( $self->{'line_regexp'} ) {
$line =~ m|$self->{'line_regexp'}|;
@values = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );
}
else {
@values = split /\s*,\s*/, $line;
}
return unless scalar @values;
my $duration = defined $self->{'field_index'}->{'duration'} ? $values[ $self->{'field_index'}->{'duration'} ] : undef;
# for each line, total counters are incremented
$self->{'data'}->{'total'}->{'duration'} += $duration if $duration;
$self->{'data'}->{'total'}->{'count'} ++;
if ( $self->{'time_regexp'} && $self->{'field_index'}->{'time'} ) {
if ( $values[ $self->{'field_index'}->{'time'} ] ) {
$values[ $self->{'field_index'}->{'time'} ] =~ s|^.*(?:$self->{'time_regexp'}).*$|$1|;
}
else {
###l4p $logger->debug( "no time in entry: $line" );
return;
}
}
# for each column being parsed, collect summary data
for my $index ( 0 .. $#{ $self->{'field_column'} } ) {
# get the name of this field
my $name = $self->{'field_name'}->[$index];
my $value = $values[ $self->{'field_index'}->{$name} ] || "null";
# increment counters for this name/value pair
$self->{'data'}->{'fields'}->{ $name }->{ $value }->{'count'}++;
if ( defined $duration ) {
$self->{'data'}->{'fields'}->{ $name }->{ $value }->{'duration'} += $duration;
THRESHOLD:
for my $threshold_idx ( reverse( 0 .. $#{ (@{$self->{'thresholds'}})[$index] } ) ) {
my $threshold = $self->{'thresholds'}->[$index]->[$threshold_idx];
if ( $duration >= $threshold ) {
#print "dur:$duration th:$threshold\n";
$self->{'data'}->{'fields'}->{ $self->{'field_name'}->[$index] }->
{ $values[ $self->{'field_column'}->[$index] ] }->{"th_$threshold_idx"}++;
last THRESHOLD;
}
}
}
}
# for each group, collect summary data
if ( $self->{'group'} ) {
for my $index ( 0 .. $#{ $self->{'group'}->{'name'} } ) {
my $name = (@{$self->{'group'}->{'name'}})[$index];
my @names = @{ (@{$self->{'group'}->{'names'}})[$index] };
my @group_values;
unless ( defined ( $self->{'data'}->{ 'groups' }->{ $name } ) ) {
$self->{'data'}->{ 'groups' }->{ $name } = {};
}
# walk down the data structure moving the pointer along.
# must be done since the depth of the hash depends on the
# number of fields in the group
my $group_pointer = $self->{'data'}->{ 'groups' }->{ $name };
for my $name_idx ( 0 .. $#names ) {
my $value_idx = $self->{'group'}->{'column'}->[$index]->[ $name_idx ];
my $value = $values[ $value_idx ] || "null";
push @group_values, $value;
unless ( defined( $group_pointer->{ $value } ) ) {
$group_pointer->{ $value } = {};
}
$group_pointer = $group_pointer->{ $value };
}
my ( $value1, $value2 ) = @group_values;
$group_pointer->{'count'} += 1;
if ( defined $duration ) {
$group_pointer->{'duration'} += $duration;
THRESHOLD:
for my $threshold_idx ( reverse( 0 .. $#{ $self->{'group'}->{'thresholds'}->[$index] } ) ) {
my $threshold = $self->{'group'}->{'thresholds'}->[$index]->[$threshold_idx];
if ( $duration >= $threshold ) {
$group_pointer->{"th_$threshold_idx"}++;
last THRESHOLD;
}
}
}
}
}
}
#
#__* date parsing fu
#
# since date parsing is expensive, dates are cached
sub get_utime_from_string
{
my ( $self, $string ) = @_;
if ( $self->{'date_cache'}->{ $string } )
{
return $self->{'date_cache'}->{ $string };
}
my $date = &UnixDate(ParseDate($string),"%s");
$self->{'date_cache'}->{ $string } = $date;
return $date;
}
#
#__* XML in/out
#
sub get_xml {
my ( $self ) = @_;
#print Dumper $self->{'data'};
my $xml = "";
my $writer = new XML::Writer( OUTPUT => \$xml, DATA_MODE => 1, DATA_INDENT => 2 );
$writer->xmlDecl( "", 1 );
$writer->startTag( "log-statistics" );
for my $field ( sort keys %{ $self->{'data'}->{'fields'} } ) {
$writer->startTag( "fields", "name" => $field );
$self->_xmlify_fields( $self->{'data'}->{'fields'}->{ $field }, $writer, [ $field ] );
$writer->endTag( "fields" );
}
for my $field ( sort keys %{ $self->{'data'}->{'groups'} } ) {
$writer->startTag( "groups", "name" => $field );
my @names = split /\-/, $field;
$self->_xmlify_groups( $self->{'data'}->{'groups'}->{ $field }, $writer, [ $field ], \@names );
$writer->endTag( "groups" );
}
$writer->endTag( "log-statistics" );
$writer->end();
return $xml;
}
sub _xmlify_fields {
my ( $self, $data, $writer, $fields_a ) = @_;
unless ( ref $data eq "HASH" ) {
die "Error, xmlify_fields called on non-hash";
}
if ( $data->{'count'} ) {
# leaf
my $name = $fields_a->[-1];
my $level = $fields_a->[-2];
my $count = $data->{'count'};
pop @$fields_a;
my @data = ( 'name' => $name, 'count' => $count );
@data = $self->_xmlify_duration( $data, @data );
$writer->emptyTag( $level, @data );
}
else {
for my $field ( sort keys %{ $data } ) {
push @{$fields_a}, $field;
$self->_xmlify_fields( $data->{ $field }, $writer, $fields_a );
}
}
}
sub _xmlify_groups {
my ( $self, $data, $writer, $fields_a, $names_a, $depth ) = @_;
$depth = defined $depth ? $depth + 1 : 0;
unless ( ref $data eq "HASH" ) {
die "Error, xmlify_fields called on non-hash";
}
my $name = $names_a->[$depth];
for my $field ( sort keys %{ $data } ) {
push @{$fields_a}, $field;
if ( $data->{ $field }->{ 'count' } ) {
# leaf
my $field_name = $fields_a->[-1];
my $field_id = $fields_a->[-2];
my $count = $data->{ $field }->{'count'};
pop @$fields_a;
my @data = ( 'name' => $field_name, 'count' => $count );
@data = $self->_xmlify_duration( $data->{ $field }, @data );
$writer->emptyTag( $name, @data );
}
else {
$writer->startTag( $name, "name" => $field );
$self->_xmlify_groups( $data->{ $field }, $writer, $fields_a, $names_a, $depth );
$writer->endTag( $name );
}
}
}
sub _xmlify_duration {
my ( $self, $data, @data ) = @_;
if ( $data->{'duration'} ) {
push @data, 'duration';
push @data, $data->{'duration'};
push @data, 'duration_average';
push @data, sprintf( "%0.4f", $data->{'duration'} / $data->{'count'} );
for my $index ( 0 .. 5 ) {
if ( $data->{ "th_$index" } ) {
push @data, "th_$index";
push @data, $data->{ "th_$index" };
}
}
}
return @data;
}
1;
__END__
#
#_* POD
#
=head1 NAME
Log::Statistics - near-real-time statistics from log files
=head1 SYNOPSIS
use Log::Statistics;
my $log = Log::Statistics->new();
# field 3 in the log contains the duration. registering a
# duration field causes duration information to be added to all
# summary data.
$log->register_field( "duration", 2 );
# field 1 in the log contains transaction name. add this field to
# the list of fields for which a summary report will be generated
$log->add_field( "transaction", 0 );
# field 2 in the log contains the log status entry (e.g. 404).
# don't generate a report on this field, but add it to the list of
# defined fields.
$log->register_field( "status", 1 );
# collect data about transaction and status grouped together.
# this will result in a break-down of all transactions by status.
# note this is different than all statuses by transaction.
$log->add_group( [ "transaction", "status" ] );
# add a regular expression to capture the year, month, day, hour,
# and minute from the time field.
my $time_regexp = ^(\d{4}-\d{2}-\d{2}\s\d{2}\:\d{2})
$log->add_time_regexp( $time_regexp );
# track overall response times per minute. time is in field 6 in
# the log
$log->add_field( "time", 5 );
# parse data in the log file
$log->parse_text( $log_entries );
=head1 DESCRIPTION
Log::Statistics is a module for collecting summary data from a log file.
For examples of what can be done with Log::Statistics, see the code and
documentation in scripts/logstatsd. logstatsd contains a prototype
implementation of several features which will eventually be migrated
from scripts/logstatsd.
The basic usage is to begin by creating a new Log::Statistics object.
Next, register each field name that you want to collect data about,
indicating which column that data is in. Next, add fields or groups
of fields for which you wish to collect statistics. Finally, use
parse_text to add multiple entries or parse_line to a single entry.
This module is alpha quality code, and is still under development. A
number of the features currently implemented in logstatsd will
eventually find their way back here.
=head1 SUBROUTINES/METHODS
=over 4
=item $log->new()
Create a new Log::Statistics object.
=item $log->register_field( $name, $column )
Define a field in the log, and indicate the column in which the field
exists. Once a field has been registered, it can be used again later
with add_group or add_field without having to re-specify the column
number.
Registering a field does not automatically include the field in the
report, except for the duration field. When a duration field has been
defined, all data collected will contain information about durations.
=item $log->add_field( $column, $name, [ $threshold1, $threshold2, ... ] )
Collect summary data about the specified field. The column can be
undef if the field has previously been registered using
register_field().
For each field added to the report, summary data will be collected for
each unique entry in the field. So for example, if a transaction
field is added, then summary data will be collected about each unique
transaction found in the log (e.g. the number of hits, total response
times, etc).
Thresholds will only be honored if a duration field has been defined
in the log (see THRESHOLDS below).
=item $log->add_group( [ $field1, $field, ...], [$threshold1, $threshold2, ... ]
Collect summary data about two or more fields grouped together. The
columns must have previously been defined either by using add_field or
else register_field.
For each group added to the report summary data will be collected for
each unique combination of entries in the fields. For example, if a
group is defined with "transaction" and "status", then summary
information will be collected about each transaction broken down by
the transaction status.
Note that a group for "transaction","status" is slightly different
from "status","transaction". The former builds a data structure for
each transaction that contains a hash with the summary data for each
status. The latter builds a data structure for each status that
contains a hash with the summary data for each transaction. Dumping
the two data structure to xml using XML::Simple will result in
different output. For more readable output, it is generally
recommended that you use the field which has the least number of
possible unique values first.
Thresholds will only be honored if a duration field has been defined
in the log (see THRESHOLDS below).
=item $log->add_time_regexp( $regexp )
Define a regular expression which can be used to parse the time field.
The regular expression should capture time to the resolution at which
data should be collected. If you are parsing a log with many days
data, you may want to generate a report which summarized by each day.
On the other hand, if your log contains many transactions over a short
time period, you might want to break down the summary by activity per
second.
=item $log->add_line_regexp( $regexp )
Define a regular expression which can be used to parse the entire log
entry and divide it up into a series of fields. This only needs to be
defined if the entries are not single-line comma delimited.
=item $log->parse_text( $text )
Generate summary data about the log entries contained in $text.
If no fields or groups have been defined, only overall total data will
be collected.
=item $log->parse_line( $line )
Similar to parse_text, except that only a single log entry is passed.
=item $log->save_data( $file )
Save the data collected to the specified file. Data will be stored in
the YAML format.
=item $log->read_data( $file )
Load the data collected from the specified store file. Data can been
stored using save_data.
=item $log->get_utime_from_string
Given a plain text date string from a log, convert it to unix time. A
cache is built up in RAM of the previously seen time strings to reduce
the overhead of using Date::Manip.
=item $log->get_xml
Get XML report for log entries that have been processed.
=back
=head1 Example XML Output
Here are some examples of the XML generated by Log::Statistics:
# time field and duration field defined
# group of status:transaction
=head1 THRESHOLDS
Thresholds allow monitoring the number of long response times. For
example, a given transaction might be expected to be complete within 5
seconds. In addition to measuring the average response time of the
transaction, you may also wish to measure how many transactions are
not completed within 5 seconds. You may define any number of
thresholds, so you could measure those that you consider to be fast
(under 3 seconds), good (under 5 seconds), slow (over 10 seconds), and
very slow (over 20 seoncds).
NOTE: If a duration field was not defined, then response times
thresholds statistics can not be calculated.
=head1 DEPENDENCIES
YAML - back end storage for log summary data
Date::Manip - for converting log times to unix time.
=head1 SEE ALSO
http://www.geekfarm.org/wu/muse/LogStatistics.html
=head1 BUGS AND LIMITATIONS
Specifying a duplicate field or group definition will cause all values
for the duplicated group(s) to be counted twice.
Please report problems to VVu@geekfarm.org
Patches are welcome.
=head1 AUTHOR
VVu@geekfarm.org
=head1 LICENCE AND COPYRIGHT
Copyright (c) 2005, VVu@geekfarm.org
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of geekfarm.org nor the names of its contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.