#!/usr/bin/perl

# This script creates a Categorizer and runs several of its methods on
# a corpus, reporting the results.
#
# Copyright 2002 Ken Williams, under the same license as the
# AI::Categorizer distribution.

use strict;

use AI::Categorizer;
use Benchmark;

# YAML is optional.  It is only needed for --config_file and for dumping
# the categorizer's parameters to the results file.  Every YAML function
# in this file is called fully qualified, so a plain require is enough.
my $HAVE_YAML = eval { require YAML; 1 };

my ($opt, $do_stage, $outfile) = parse_command_line(@ARGV);

# Drop the stage selectors (-1, -2, ...) from @ARGV.  run_section() re-runs
# this script with "@ARGV -$stage", so any selector left in here would be
# passed on to the child as well.  The pattern matches the one used by
# parse_command_line().
@ARGV = grep { !/^-\d+$/ } @ARGV;

my $c = eval { AI::Categorizer->new(%$opt) };
if ($@ and $@ =~ /^The following parameter/) {
  die "$@\nPlease see the AI::Categorizer documentation for a description of parameters accepted.\n";
}
die $@ if $@;

# No stage given on the command line means "run all five stages".
%$do_stage = map { ($_ => 1) } 1 .. 5 unless keys %$do_stage;

my $out_fh;
if ($outfile) {
  # Results are appended, so earlier runs stay in the file.
  open $out_fh, '>>', $outfile or die "Can't create $outfile: $!";

  # Turn on autoflush for $out_fh, then restore the previously selected handle.
  select((select($out_fh), $| = 1)[0]);

  # Only the multi-stage (parent) invocation writes the run header.
  if (keys(%$do_stage) > 1) {
    print {$out_fh} "~~~~~~~~~~~~~~~~", scalar(localtime), "~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    if ($HAVE_YAML) {
      print {$out_fh} YAML::Dump($c->dump_parameters);
    } else {
      warn "More detailed parameter dumping is available if you install the YAML module from CPAN.\n";
    }
  }
}

run_section('scan_features',     1, $do_stage);
run_section('read_training_set', 2, $do_stage);
run_section('train',             3, $do_stage);
run_section('evaluate_test_set', 4, $do_stage);

# Stage 5 does not go through run_section(): it is always run in this process.
if ($do_stage->{5}) {
  my $result = $c->stats_table;
  print $result if $c->verbose;
  print {$out_fh} $result if $out_fh;
}

# run_section($section, $stage, $do_stage)
#
# Runs one stage of the demo, if that stage is selected.
#
#   $section  - name of the AI::Categorizer method to call on $c
#   $stage    - the stage number; checked against $do_stage
#   $do_stage - hash reference whose keys are the selected stage numbers
#
# When more than one stage is selected, the stage is not run here.  The
# script re-invokes itself as "$0 @ARGV -$stage" and dies if the child
# exits with a nonzero status.  When exactly one stage is selected, the
# method is called directly, and its timing and memory usage are printed
# to STDOUT (if $c->verbose) and to the results file.
sub run_section {
  my ($section, $stage, $do_stage) = @_;
  return unless $do_stage->{$stage};

  if (keys(%$do_stage) > 1) {
    print " % $0 @ARGV -$stage\n" if $c->verbose;
    die "$0 is not executable, please change its execution permissions"
      unless -x $0;
    # List form of system(): the arguments are not passed through a shell.
    system($0, @ARGV, "-$stage") == 0
      or die "$0 returned nonzero status, \$?=$?";
    return;
  }

  my $start = Benchmark->new;
  $c->$section();
  my $end = Benchmark->new;

  my $summary = timestr(timediff($end, $start));
  my ($rss, $vsz) = memory_usage();
  print "$summary (memory: rss=$rss, vsz=$vsz)\n" if $c->verbose;
  print {$out_fh} "Stage $stage: $summary (memory: rss=$rss, vsz=$vsz)\n" if $out_fh;
}
# parse_command_line(@args)
#
# Parses the script's command-line arguments.  Recognized forms:
#
#   -N                    select stage N (N is one or more digits)
#   --config_file FILE    merge in parameters loaded from a YAML file
#                         (dies unless the YAML module is installed)
#   --name value          set parameter "name" to "value"
#
# Any other argument dies with the usage message.
#
# For parameters named "<something>_class", a value starting with "::" is
# expanded to "AI::Categorizer::<Something>::...".
#
# Returns a three-element list: (\%opt, \%do_stage, $outfile).  $outfile is
# taken from (and removed from) the "outfile" parameter.  If that is not
# set, it is "<progress_file>-results.txt", or "results.txt" when there is
# no progress_file either.
sub parse_command_line {
  my @args = @_;
  my (%opt, %do_stage);

  while (@args) {
    my $arg = shift @args;

    if ($arg =~ /^-(\d+)$/) {
      $do_stage{$1} = 1;

    } elsif ($arg eq '--config_file') {
      die "--config_file requires the YAML module from CPAN to be installed.\n"
        unless $HAVE_YAML;
      my $file = shift @args;
      die "--config_file requires a file name argument.\n"
        unless defined $file;
      my $href = YAML::LoadFile($file);
      @opt{keys %$href} = values %$href;

    } elsif ($arg =~ /^--/) {
      (my $key = $arg) =~ s/^--//;
      $opt{$key} = shift @args;

    } else {
      die usage();
    }
  }

  # Allow abbreviations
  for my $key (keys %opt) {
    next unless $key =~ /^(\w+)_class$/;
    my $name = $1;
    next unless defined $opt{$key};
    $opt{$key} =~ s/^::/AI::Categorizer::\u${name}::/;
  }

  my $outfile = delete $opt{outfile};
  unless ($outfile) {
    $outfile = $opt{progress_file} ? "$opt{progress_file}-results.txt" : "results.txt";
  }

  return (\%opt, \%do_stage, $outfile);
}

# usage()
#
# Returns the usage message as a string, with $0 interpolated as the
# script name.
#
# NOTE(review): the <...> placeholders in this text are a reconstruction;
# confirm the exact wording against the upstream distribution.
sub usage {
  return <<EOF;
 Usage:

  $0 --parameter_1 <value_1> --parameter_2 <value_2>

  # You may specify a YAML config file as follows:
  $0 --config_file <path> --parameter_3 <value_3>

  # Or, to run only step 3 (of 5)
  $0 --config_file <path> -3

 --parameter_1, --parameter_2, etc. are parameters accepted by
 AI::Categorizer objects' new() methods.

EOF
}

# memory_usage()
#
# Returns ($rss, $vsz) for the current process as reported by ps(1).  On
# systems other than darwin, linux and solaris it warns and returns two
# undefined values.  The values are also undefined if the ps output has
# no pair of numbers.
sub memory_usage {
  my ($rss, $vsz);

  if ($^O eq 'darwin' or $^O eq 'linux' or $^O eq 'solaris') {
    # Select this process only with -p.  Do not add -e: on Linux it selects
    # every process, so the first numeric pair would not be ours.
    ($rss, $vsz) = `ps -o rss,vsz -p $$` =~ /(\d+)\s+(\d+)/;
  } else {
    warn "Unknown system, can't get memory usage";
  }

  return ($rss, $vsz);
}