package Statistics::CalinskiHarabasz;
use 5.008005;
use strict;
use warnings;
require Exporter;
use AutoLoader qw(AUTOLOAD);
our @ISA = qw(Exporter);
our @EXPORT = qw( ch );
our $VERSION = '0.01';
# global variable
my @d = ();
my $g_mean = 0;
my $rcnt = 0;
sub ch
{
# Input params
my $matrixfile = shift;
my $clustmtd = shift;
my $K = shift;
my $i = 0;
my $j = 0;
# Read the matrix file into a 2 dimensional array.
my @inpmat = ();
open(INP,"<$matrixfile") || die "Error opening input matrix file!";
# Extract the number of rows from the first line in the file.
my $ccnt = 0;
my $line;
$line = ;
chomp($line);
$line=~s/\s+/ /;
($rcnt,$ccnt) = split(/\s+/,$line);
# Not a valid condition:
# If maximum number of clusters requested (k) is greater than the
# number of observations.
if($K > $rcnt)
{
print STDERR "The K value ($K) cannot be greater than the number of observations present in the input data ($rcnt). \n";
exit 1;
}
# Copy the complete matrix to a 2D array
while()
{
# remove the newline at the end of the input line
chomp;
# skip empty lines
if(m/^\s*\s*\s*$/)
{
next;
}
# remove leading white spaces
s/^\s+//;
# seperate individual values in a line
my @tmp = ();
@tmp = split(/\s+/);
# populate them into the 2D matrix
push @inpmat, [ @tmp ];
}
close INP;
my @row1 = ();
my @row2 = ();
my $acc = 0;
# Calculate all possible unique pairwise distances between the vectors
for($i = 0; $i < $rcnt; $i++)
{
# for all the rows in the cluster
for($j = $i+1; $j < $rcnt; $j++)
{
@row1 = @{$inpmat[$i]};
@row2 = @{$inpmat[$j]};
$d[$i][$j] = &dist_euclidean_sqr(\@row1, \@row2);
$acc += $d[$i][$j];
}
}
# Calculate general mean (d^2)
$g_mean = ($acc * 2)/($rcnt * ($rcnt - 1));
# Calculate mean for each cluster
# Calculate Ak
# Calculate VRC (Variance Ratio Criterion)
# For each K
my $k = 0;
my @VRC = ();
for($k=2; $k<=$K; $k++)
{
# avoid the case K = #ofContexts because then the denominator of VRC (n-k)
# become 0 and gives "division by 0" error.
if($k == $rcnt)
{
last;
}
my $lineNo = 0;
my %hash = ();
# Cluster the input dataset into k clusters
my $out_filename = "tmp.op" . $k . time();
my $status = 0;
$status = system("vcluster --clmethod $clustmtd $matrixfile $k >& $out_filename ");
die "Error running vcluster \n" unless $status==0;
# read the clustering output file
open(CO,"<$matrixfile.clustering.$k") || die "Error opening clustering output file.";
my $clust = 0;
while($clust = )
{
# hash on the cluster# and append the observation#
chomp($clust);
if(exists $hash{$clust})
{
$hash{$clust} .= " $lineNo";
}
else
{
$hash{$clust} = $lineNo;
}
# increment the line number
$lineNo++;
}
close CO;
# Calculate the "Within Cluster Dispersion Measure / Error Measure" Wk
# for given matrix and k value.
$VRC[$k] = &variance_ratio(\%hash,$k);
unlink "$out_filename","$matrixfile.clustering.$k";
}
# Calculate smallest k for which VRC is maximum
my $max = 0;
my $ans = 0;
for($k=2; $k<=$K; $k++)
{
# avoid the case K = #ofContexts because then the denominator of VRC (n-k)
# become 0 and gives "division by 0" error.
if($k == $rcnt)
{
last;
}
if($VRC[$k] > $max)
{
$max = $VRC[$k];
$ans = $k;
}
}
print "$ans\n";
}
sub dist_euclidean_sqr
{
# arguments
my @i = @{(shift)};
my @j = @{(shift)};
# local variables
my $a;
my $dist = 0;
my $retvalue = 0;
# Squared Euclidean measure
# summation on all j (xij - xi'j)^2 where i, i' are the rows indicies.
for $a (0 .. $#i)
{
$dist += (($i[$a] - $j[$a])**2);
}
$retvalue = sprintf("%.4f",$dist);
return $retvalue;
}
sub variance_ratio
{
# Input arguments
my %clustout = %{(shift)};
my $k = shift;
# Local variables
my $i;
my $j;
my @rownum;
my $key;
my $row1;
my $row2;
my $VRC = 0;
my @D = ();
my $tmp;
my $c_mean = ();
my $A = 0;
# For each cluster
foreach $key (sort keys %clustout)
{
$D[$key] = 0;
@rownum = split(/\s+/,$clustout{$key});
# for each instance in the cluster
for($i = 0; $i < $#rownum; $i++)
{
# for all the rows in the cluster
for($j = $i+1; $j <= $#rownum; $j++)
{
# find the distance between the 2 rows of the matrix.
$row1 = $rownum[$i];
$row2 = $rownum[$j];
# store the Dr value
if(exists $d[$row1][$row2])
{
$D[$key] += $d[$row1][$row2];
}
else
{
$D[$key] += $d[$row2][$row1];
}
}
}
# Calculate individual cluster mean
if($#rownum == 0)
{
$c_mean = 0;
}
else
{
$c_mean = ($D[$key] * 2)/(($#rownum + 1) * $#rownum);
}
$A += $#rownum * ($g_mean - $c_mean);
}
$A = $A/($rcnt - $k);
if($g_mean == $A)
{
$VRC = 99999;
}
else
{
$VRC = ( $g_mean + ($rcnt - $k) / ($k-1) * $A ) / ( $g_mean - $A );
}
return $VRC;
}
1;
__END__
# Below is stub documentation for your module. You'd better edit it!
=head1 NAME
Statistics::CalinskiHarabasz - Perl extension to the cluster stopping rule proposed by Calinski and Harabasz (C&H)
=head1 SYNOPSIS
use Statistics::CalinskiHarabasz;
&ch(InputFile, "agglo", 10);
Input file is expected in the "dense" format -
Sample Input file:
6 5
1 1 0 0 1
1 0 0 0 0
1 1 0 0 1
1 1 0 0 1
1 0 0 0 1
1 1 0 0 1
=head1 DESCRIPTION
C&H use the Variance Ratio Criterion which is analogous to
F-Statistics to estimate the number of clusters a given data
naturally falls into. They minimize Within Cluster/Group Sum
of Squares (WGSS) and maximize Between Cluster/Group Sum of
Squares (BGSS)
=head2 EXPORT
"ch" function by default.
=head1 INPUT
=head2 InputFile
The input dataset is expected in "dense" matrix format.
The input dense matrix is expected in a plain text file where the first
line in the file gives the dimensions of the dataset and then the
dataset in a matrix format should follow. The contexts / observations
should be along the rows and the features should be along the column.
eg:
6 5
1 1 0 0 1
1 0 0 0 0
1 1 0 0 1
1 1 0 0 1
1 0 0 0 1
1 1 0 0 1
The first line (6 5) gives the number of rows (observations) and the
number of columns (features) present in the following matrix.
Following each line records the frequency of occurrence of the feature
at the column in the given observation. Thus features1 (1st column) occurs
once in the observation1 and infact once in all the other observations too
while the feature3 does not occur in observation1.
=head2 ClusteringMethod
The Clustering Measures that can be used are:
1. rb - Repeated Bisections [Default]
2. rbr - Repeated Bisections for by k-way refinement
3. direct - Direct k-way clustering
4. agglo - Agglomerative clustering
5. graph - Graph partitioning-based clustering
6. bagglo - Partitional biased Agglomerative clustering
=head2 K value
This is an approximate upper bound for the number of clusters that may be
present in the dataset. Thus for a dataset that you expect to be seperated
into 3 clusters this value should be set some integer value greater than 3.
=head1 OUTPUT
A single integer number which is the estimate of number of clusters present in the input dataset.
=head1 PRE-REQUISITES
1. This module uses suite of C programs called CLUTO for clustering purposes.
Thus CLUTO needs to be installed for this module to be functional.
CLUTO can be downloaded from http://www-users.cs.umn.edu/~karypis/cluto/
=head1 SEE ALSO
1. T. Calinski and J. Harabasz. A dendrite method for cluster analysis. Communications in statistics, 3(1):1--27, 1974.
2. http://www-users.cs.umn.edu/~karypis/cluto/
=head1 AUTHOR
Anagha Kulkarni, University of Minnesota Duluth
kulka020 d.umn.edu
Guergana Savova, Mayo Clinic
savova.guergana mayo.edu
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005-2006, Guergana Savova and Anagha Kulkarni
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=cut