package Algorithm::NaiveBayes;

# Front-end class for Naive Bayes categorizers.  This package holds the
# bookkeeping (labels, instance count, training data, persistence) and
# delegates the actual maths to a model subclass through the hook methods
# do_add_instance(), do_train() and do_predict(), which are not defined here.

use strict;
use warnings;
use Storable;

our $VERSION = '0.04';

# new(%args) - construct a categorizer.
#
# Recognised arguments visible in this file: 'purge' (default 1),
# 'model_type' (default 'Frequency') and 'model_class'.  When called on this
# base package, construction is forwarded to the model class chosen by
# _load_model_class(); when called on a subclass, the object is returned
# directly.
sub new {
    my $package = shift;
    my $self = bless {
        version       => $VERSION,
        purge         => 1,
        model_type    => 'Frequency',
        @_,
        # Listed after @_ so callers cannot override the internal state.
        instances     => 0,
        training_data => {},
    }, $package;

    if ($package eq __PACKAGE__) {
        # Bless into the proper subclass
        return $self->_load_model_class->new(@_);
    }

    return $self;
}

# _load_model_class($self) - work out which model package backs $self, load
# it if it does not already provide new(), and return the package name.
#
# Only reads $self->{model_class} and $self->{model_type}, so it can be
# called as a plain function as well as a method.  Dies if model_class names
# this base package, if the name is not a syntactically valid package name,
# or if loading the package fails.
sub _load_model_class {
    my $self = shift;

    die "model_class cannot be set to " . __PACKAGE__
        if ($self->{model_class} || '') eq __PACKAGE__;

    my $package = $self->{model_class}
        || (__PACKAGE__ . "::Model::" . $self->{model_type});

    # The name is interpolated into a string eval below, so refuse anything
    # that is not a plain Package::Name to avoid executing arbitrary code.
    die "Invalid model class name '$package'"
        unless $package =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;

    unless ($package->can('new')) {
        eval "use $package; 1" or die($@ || "Failed to load $package");
    }

    return $package;
}

# save_state($path) - serialise the whole object to $path with
# Storable::nstore (network byte order).  Returns nstore's result.
sub save_state {
    my ($self, $path) = @_;
    return Storable::nstore($self, $path);
}

# restore_state($path) - class method; read back an object written by
# save_state() and make sure its model class is loaded.  Dies if Storable
# cannot retrieve the file.
#
# NOTE(review): Storable::retrieve can instantiate arbitrary objects; only
# restore state files from trusted sources.
sub restore_state {
    my ($pkg, $path) = @_;
    my $self = Storable::retrieve($path)
        or die "Can't restore state from $path: $!";

    # Called as a function, not a method: the retrieved object is blessed
    # into the model class, which may not be loaded yet in this process, so
    # method resolution on it could fail before the class gets loaded.
    _load_model_class($self);

    return $self;
}

# add_instance(attributes => ..., label => STRING|ARRAYREF) - register one
# training instance.  Dies if either parameter is missing.  A scalar label
# is wrapped in an array reference before being recorded and handed to
# do_add_instance() together with the current training data.
sub add_instance {
    my ($self, %params) = @_;

    for my $required ('attributes', 'label') {
        die "Missing required '$required' parameter"
            unless exists $params{$required};
    }

    my $labels = ref($params{label}) ? $params{label} : [ $params{label} ];

    # Hash slice assignment: remember every label ever seen as a key.
    @{ $self->{labels} }{@$labels} = ();

    $self->{instances}++;
    return $self->do_add_instance(
        $params{attributes}, $labels, $self->{training_data},
    );
}

# labels() - the distinct labels seen so far (key count in scalar context).
sub labels {
    my $self = shift;
    return keys %{ $self->{labels} };
}

# instances() - number of instances added through add_instance().
sub instances {
    my $self = shift;
    return $self->{instances};
}

# training_data() - the training data structure passed to the model hooks.
sub training_data {
    my $self = shift;
    return $self->{training_data};
}

# train() - build the model via do_train() and store it in $self->{model};
# afterwards discard the training data when the 'purge' flag is true.
sub train {
    my $self = shift;
    $self->{model} = $self->do_train($self->{training_data});
    $self->do_purge if $self->purge;
}

# do_purge() - drop the accumulated training data from the object.
sub do_purge {
    my $self = shift;
    delete $self->{training_data};
}

# purge([$flag]) - get, or set when an argument is given, the 'purge' flag.
sub purge {
    my $self = shift;
    $self->{purge} = shift if @_;
    return $self->{purge};
}

# predict(attributes => ...) - pass the stored model and the given
# attributes to do_predict() and return its result.  Dies when 'attributes'
# is missing or false.
sub predict {
    my ($self, %params) = @_;
    my $newattrs = $params{attributes}
        or die "Missing 'attributes' parameter for predict()";
    return $self->do_predict($self->{model}, $newattrs);
}

1;

=head1 NAME

Algorithm::NaiveBayes - Bayesian prediction of categories

=head1 SYNOPSIS

  use Algorithm::NaiveBayes;
  my $nb = Algorithm::NaiveBayes->new;

  $nb->add_instance
    (attributes => {foo => 1, bar => 1, baz => 3},
     label => 'sports');

  $nb->add_instance
    (attributes => {foo => 2, blurp => 1},
     label => ['sports', 'finance']);

  ... repeat for several more instances, then:
  $nb->train;

  # Find results for unseen instances
  my $result = $nb->predict
    (attributes => {bar => 3, blurp => 2});

=head1 DESCRIPTION

This module implements the classic "Naive Bayes" machine learning
algorithm. It is a well-studied probabilistic algorithm often used in
automatic text categorization. Compared to other algorithms (kNN, SVM,
Decision Trees), it's pretty fast and reasonably competitive in the
quality of its results.

A paper by Fabrizio Sebastiani provides a really good introduction to
text categorization:
L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf>

=head1 METHODS

=over 4

=item new()

Creates a new C<Algorithm::NaiveBayes> object and returns it. The
following parameters are accepted:

=over 4

=item purge

If set to a true value, the C<do_purge()> method will be invoked during
C<train()>. The default is true. Set this to a false value if you'd like
to be able to add additional instances after training and then call
C<train()> again.

=back

=item add_instance( attributes =E<gt> HASH, label =E<gt> STRING|ARRAY )

Adds a training instance to the categorizer. The C<attributes> parameter
contains a hash reference whose keys are string attributes and whose
values are the weights of those attributes. For instance, if you're
categorizing text documents, the attributes might be the words of the
document, and the weights might be the number of times each word occurs
in the document. The C