The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Statistics::Burst;

use 5.008004;
use strict;
use warnings;

require Exporter;
use AutoLoader qw(AUTOLOAD);

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows declaration	use Statistics::Burst ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
	
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw(
	
);

our $VERSION = '0.01';


# Preloaded methods go here.

# Autoload methods go after =cut, and are processed by the autosplit program.

1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

Statistics::Burst -  Perl Implementation of Kleinberg's Word Busts algorithm

=head1 SYNOPSIS

  use Statistics::Burst;
  my $burstObj=Burst::new();
  $burstObj->generateStates(3,.111,2);
  $burstObj->gamma(.5);
  $burstObj->setData(\@gap_space);
  $burstObj->process();
  $statesUsed=$burstObj->getStatesUsed();

=head1 DESCRIPTION

This Burst Module is an implementation of Kleinberg's Word Bursts algorithm.
The paper describing the algorithm is located at http://www.cs.cornell.edu/home/kleinber/bhs.pdf.

What this algorithm implementation does is after a few parameters are set the driver code will pass it a list of numbers. The list of numbers are the time diferences between different arrivals.  So if you were modelling the arrivals of people into a store entrance and they arrive at times 8:10,8:20,8:25,8:30 then the list of numbers you would pass to Burst will be (10,5,5). 

Bursts can be used to model the popularity of words by defining what is the arrival and rate for words.  For instance you could monitor RSS titles from Slashdot.  If the word appears in a title you could consider that an arrival of that word.  Your arrival rate could be defined as how many arrivals of a paticular word in a day or how many times a word appears per a heading (This would usually be less than 1)

With this information you would build the seperation lists  that will be used processed by the Burst function.



=cut

=head2 new

  $burst=Statistics::Burst::new();

Returns a burst object.

=cut
	

sub new
{
	my $self={states=>[],gamma=>undef, statesUsed=>[], data=>[]};
	bless $self;
	return $self;
}


=head2 setState($lamba, [$index])

     $burst->setState(.112);
     $burst->setState(.24,1);

Allows you to set or change the lamba of a paticular state. If $index is not specified then it creates a new state.

=cut
	
sub setState
{
	my ($self, $lamba, $index)=@_;
	if (defined($index))
	{
		return undef if (scalar(@{$self->{states}}) < $index);
		$self->{states}->[$index]=$lamba;
	}
	else
	{
		push @{$self->{states}}, $lamba;
	}
}

=head2 generateStates($count,$rate, $sigma)

     $burst->generateStates(4,.123,1);

Allows developer to generate the states programatically.  You specify the number of states to create [$count], The initial rate for state 0 [$rate], and sigma the parameter that defines how much the state changes.

The higher the sigma the larger the difference between states.

=cut

sub generateStates
{
	my ($self, $count, $rate, $sigma)=@_;
	for (my $i=0;$i<$count;$i++)
	{
	   $self->setState($rate*($sigma**$i));
	}

}

=head2 gamma($gamma)

     $burst->gamma(2,);

A parameter for the transistion cost.  The larger the gamma value the more expensive it is to move to a higher state.

=cut

sub gamma
{
	my ($self, $gamma)=@_;
	$self->{gamma}=$gamma;
}

=head2 setData($gap_space)

     $burst->setData([4,5,6,3,4,2]);

Sets the data that will be processed.

=cut

sub setData
{
	my ($self, $gap_space)=@_;
	$self->{data}=[];
	@{$self->{data}}=@$gap_space;
}

=head2 process

     $burst->process();

Triggers the calculation of the bursts.

=cut

sub process
{
	my ($self)=@_;
	my $result={};
	$self->calcCost(11);
}
##################################################

sub calcCost
{
   my ($self,$pos)=@_;
	my @lamba=@{$self->{states}};
   my $stateCount=scalar(@lamba);
   if ($pos==0)
   {
     $self->{statesUsed}->[$pos]=0;
     return {state=>0,minCosts=>0};

   }
   my ($trans, $pc, $cost);
   my $minCosts=99999;
   my $state=-1;
	my @gap_space=@{$self->{data}};

	my $costImprove=1;

   for(my $tryState=0;($tryState<$stateCount) && ($costImprove) ;$tryState++)
   {
       my $prevresult={};
       $prevresult=$self->calcCost($pos-1,$prevresult);

       my $trans= $self->transistion($prevresult->{state},$tryState,
				 $gap_space[$pos]);
       my $pc=  $prevresult->{minCosts} ;
       my $alignment=-1*log(func($lamba[$tryState],$gap_space[$pos])) ;
       my $cost= $alignment + $pc + $trans;
#print "$pos: $tryState ->",$cost,
#          " = a:$alignment + prev:$pc +trans:$trans \n";
      if ($cost<$minCosts)
      {
         $minCosts=$cost;
         $state=$tryState;
         $self->{statesUsed}->[$pos]=$state;
      }
		else
		{  #If the costs didnt improve (the tryState should be >=1 by now)
			#Then there is no use in trying higher states.
			$costImprove=0;
		}
   }
   return {state=>$state,minCosts=>$minCosts};
}

sub transistion
{ 
   my ($self,$prev,$curr,$pos)=@_;
   if ($prev >= $curr)
   {
      return 0;
   }
   else
   {
      return ( $self->{gamma}*($curr-$prev) *log($pos));
   }
}

sub func($$)
{
  my ($lamba, $x)=@_;
  my $e=2.718;
  my $value=$lamba*($e ** ($lamba *$x *-1));
  return $value
}

=head2 getStatesUsed
  
  $array_ref=$burst->getStatesUsed();
  
Returns the states of the automaton for each step in the data set.

=cut

sub getStatesUsed
{
	my ($self)=@_;
	return $self->{statesUsed};
}

=head1 AUTHOR

Copyright 2004-2005, Tommie M. Jones All Rights Reserved.
This library is free software; you can redistribute it and/or modify it
       under the same terms as Perl itself.

=cut