package Text::CSV::UniqueColumns;

use 5.008008;
use strict;
use warnings;

require Exporter;

our @ISA     = qw(Exporter);
our $VERSION = '0.3';

# All state lives on the object (see new); nothing is shared between
# instances, so several CSV files can be inspected side by side.

# new($file [, $cols])
#
# Constructor. $file is the path of the CSV file to inspect; $cols is an
# optional column specification that is merely stored under '_cols'.
# Dies when no file is given, when the file does not exist, or when no
# header could be read from its first line.
sub new {
    my $class = shift;
    my $sFile = shift;
    my $sCols = shift;

    die "Please provide csv file as argument\n"
        if !defined $sFile || $sFile eq '';
    die "$sFile not found" if !-e $sFile;

    my $self = bless {
        '_file'         => $sFile,
        '_cols'         => $sCols,
        '_headers'      => [],    # cleaned header names, in file order
        '_header_index' => {},    # header name => 1-based column number
    }, $class;

    getHeaders($self, $sFile);

    # '_headers' is always an array reference, so test its length rather
    # than its truth.
    die "Could not get headers\n" if !@{ $self->{'_headers'} };

    return $self;
}

# checkUniq($cols)
#
# $cols is a comma separated list of header names. Returns "1" when the
# combination of those columns is different on every line of the file and
# "0" when at least two lines share the same combination. The header line
# is compared like any other line and spaces are ignored.
#
# Problems are reported by returning a message string instead of 1/0:
# missing argument, unknown column, or unreadable file.
sub checkUniq {
    my $self  = shift;
    my $sCols = shift;

    return "Provide cols as arguments\n" if !$sCols;
    $self->{'_cols'} = $sCols;

    my @positions;
    foreach my $sCol (split /,/, $sCols) {
        chomp $sCol;

        # Header names are stored with all whitespace removed, so look the
        # requested name up in the same normalised form.
        (my $sName = $sCol) =~ s/\s+//g;

        my $iPosition = $self->{'_header_index'}{$sName};
        if (!$iPosition) {
            return "Column - $sCol not found\n INFO - Use \"-l\" option to list columns in file\n";
        }
        push @positions, $iPosition;
    }

    # A specification such as "," names no column at all.
    return "Provide cols as arguments\n" if !@positions;

    my $flags = _uniqueFlags($self->{'_file'}, \@positions);
    return "Cannot read $self->{'_file'} $! \n" if !$flags;

    return $flags->[0] ? "1" : "0";
}

# getColumnList()
#
# Returns the cleaned header names joined with commas.
sub getColumnList {
    my ($self) = @_;

    return join(',', @{ $self->{'_headers'} });
}

# getUniqCols()
#
# Returns a comma separated list of the headers whose column holds a
# different value on every line (header line included, spaces ignored),
# or an empty string when no column qualifies. The whole file is read
# once, whatever the number of columns. Dies if the file cannot be read.
sub getUniqCols {
    my ($self) = @_;

    my @headers = @{ $self->{'_headers'} };
    my @sets    = map { [$_] } 1 .. scalar @headers;

    my $flags = _uniqueFlags($self->{'_file'}, @sets);
    die "Cannot read $self->{'_file'} $! \n" if !$flags;

    return join(',', map { $headers[$_] } grep { $flags->[$_] } 0 .. $#headers);
}

# buildHeaderHash($self)
#
# Removes all whitespace from every stored header name (in place) and
# records the 1-based column number of each name on the object. When a
# name occurs more than once the last occurrence wins.
sub buildHeaderHash {
    my ($self) = @_;

    my %index_of;
    my $iColNo = 1;
    foreach my $sCol (@{ $self->{'_headers'} }) {
        $sCol =~ s/\s+//g;    # $sCol aliases the array element
        $index_of{$sCol} = $iColNo++;
    }
    $self->{'_header_index'} = \%index_of;

    return;
}

# getHeaders($self, $file)
#
# Reads the first line of $file, splits it on commas and stores the
# resulting names on the object, then builds the name => position index.
# An unreadable or empty file leaves the header list empty.
sub getHeaders {
    my ($self, $sFile) = @_;

    my $sFirstLine = '';
    if (open(my $fh, '<', $sFile)) {
        local $/ = "\n";
        my $sLine = <$fh>;
        $sFirstLine = $sLine if defined $sLine;
        close($fh);
    }
    $sFirstLine =~ s/[\r\n]+\z//;

    # Limit -1 keeps trailing empty columns so positions stay aligned.
    my @headers = split /,/, $sFirstLine, -1;
    $self->{'_headers'} = \@headers;
    buildHeaderHash($self);

    return;
}

# cleanUp()
#
# Kept so existing callers do not break. The module creates no scratch
# files, so there is nothing to remove.
sub cleanUp {
    return;
}

# _uniqueFlags($file, @sets)
#
# Each element of @sets is an array reference of 1-based column numbers.
# Reads $file once and returns an array reference holding, for every set,
# 1 when the comma-joined values of its columns never repeat and 0
# otherwise. Returns nothing (with $! set) when the file cannot be opened.
#
# Comparison rules:
#   * every line counts, the header line included;
#   * spaces are removed before comparing;
#   * a column missing from a line counts as an empty value;
#   * a line without any comma supplies the whole line for every column.
sub _uniqueFlags {
    my ($sFile, @sets) = @_;

    open(my $fh, '<', $sFile) or return;
    local $/ = "\n";

    my @seen      = map { +{} } @sets;
    my @is_unique = (1) x @sets;
    my $iPending  = @sets;          # sets not yet proven non-unique

    LINE:
    while (defined(my $sLine = <$fh>)) {
        chomp $sLine;
        my $bHasComma = index($sLine, ',') >= 0;
        my @fields    = $bHasComma ? split(/,/, $sLine, -1) : ();

        foreach my $i (0 .. $#sets) {
            next if !$is_unique[$i];

            my @parts;
            foreach my $iPos (@{ $sets[$i] }) {
                my $sValue = $bHasComma ? $fields[ $iPos - 1 ] : $sLine;
                push @parts, defined $sValue ? $sValue : '';
            }
            my $sKey = join(',', @parts);
            $sKey =~ tr/ //d;

            next if !$seen[$i]{$sKey}++;

            # First repeat found: the verdict is final, release the memory.
            $is_unique[$i] = 0;
            $seen[$i]      = {};
            $iPending--;
        }

        last LINE if !$iPending;
    }
    close($fh);

    return \@is_unique;
}

1;
__END__

=head1 NAME

Text::CSV::UniqueColumns - Perl extension for finding columns with unique values in a CSV

=head1 SYNOPSIS

  use Text::CSV::UniqueColumns;

  # Create an object of the module
  $Obj = Text::CSV::UniqueColumns->new('check.csv');

  # To get the list of columns (comma separated)
  $list = $Obj->getColumnList();

  # To check if column 'col1' has unique values.
  # Returns 1 if unique, 0 if not
  $Int = $Obj->checkUniq('col1');

  # To get a list of columns (comma separated) having unique values
  $Uniq = $Obj->getUniqCols();

=head1 DESCRIPTION

Finds the columns of a CSV file that hold unique values.

The file is split on plain commas; quoted fields are not interpreted.
The header line is compared like any other line, and spaces are ignored
when values are compared.

Functions and their usage:

=over 4

=item getColumnList

Lists the names of all columns in the CSV, comma separated.

=item checkUniq

Checks whether the values of one column ('Col1') or of a combination of
columns ('Col1,Col2,Col3') are unique. Returns 1 if they are, 0 if they
are not, and a message string when the request cannot be served (no
column given, unknown column, unreadable file).

=item getUniqCols

Gives the list of unique columns in a CSV, comma separated.

=back

The module is implemented in pure Perl: it runs no external commands and
creates no temporary files.

=head1 AUTHOR

Tushar, Etushar@cpan.org

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2010 by Tushar Murudkar

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

=cut