package File::Find::Duplicates;

=head1 NAME

File::Find::Duplicates - Find duplicate files

=head1 SYNOPSIS

  use File::Find::Duplicates;

  my @dupes = find_duplicate_files('/basedir1', '/basedir2');

  foreach my $dupeset (@dupes) {
    printf "Files %s (of size %d) hash to %s\n",
      join(", ", @{$dupeset->files}), $dupeset->size, $dupeset->md5;
  }

=head1 DESCRIPTION

This module provides a way of finding duplicate files on your system.

=head1 FUNCTIONS

=head2 find_duplicate_files

  my @dupes = find_duplicate_files('/basedir1', '/basedir2');

When passed a base directory (or list of such directories) it returns
a list of objects with the following methods:

=head2 files

A listref of the names of the duplicate files.

=head2 size

The size of the duplicate files.

=head2 md5

The md5 sum of the duplicate files.

Sets are returned largest file size first. Only regular files are
considered, files that cannot be opened for reading are silently
skipped, and a file that is reachable through more than one of the
given base directories is only counted once.

=head1 TODO

Check the contents of tars, zipfiles etc to ensure none of these also
exist elsewhere (if so requested).

=head1 SEE ALSO

L<File::Find>.

=head1 AUTHOR

Tony Bowden

=head1 BUGS and QUERIES

Please direct all correspondence regarding this module to:
  bug-File-Find-Duplicates@rt.cpan.org

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2001-2005 Tony Bowden.

  This program is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License; either version 2 of the License,
  or (at your option) any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  FOR A PARTICULAR PURPOSE.

=cut

use strict;
use warnings;

use File::Find;
use Digest::MD5;

require Exporter;
use vars qw($VERSION @ISA @EXPORT);
@ISA     = qw/Exporter/;
@EXPORT  = qw/find_duplicate_files/;
$VERSION = '1.00';

# Result object: one instance per set of byte-identical files.
use Class::Struct 'File::Find::Duplicates::Set' =>
  [ files => '@', size => '$', md5 => '$' ];

# find_duplicate_files(@base_dirs)
#
# Walks every directory in @base_dirs, groups the regular files found by
# size, and then MD5-hashes only those files that share a size with at least
# one other file.
#
# Returns a list of File::Find::Duplicates::Set objects (files, size, md5),
# one per group of two or more files with the same size and digest. Sets are
# ordered by descending size, then by digest.
sub find_duplicate_files {
    my (@dupes, %files_by_size, %seen);

    my $collect = sub {
        # -f stats $_ and leaves the result in the "_" buffer reused below.
        return unless -f;

        # Overlapping base directories would otherwise list the same path
        # twice and make a file look like a duplicate of itself.
        return if $seen{$File::Find::name}++;

        push @{ $files_by_size{ (stat(_))[7] } }, $File::Find::name;
    };
    find($collect, @_);

    foreach my $size (sort { $b <=> $a } keys %files_by_size) {
        my $candidates = $files_by_size{$size};
        next unless @$candidates > 1;    # a unique size cannot be a duplicate

        my %files_by_md5;
        foreach my $file (@$candidates) {
            # Three-arg open: names come straight from the filesystem, and a
            # two-arg open would strip surrounding whitespace and treat
            # characters such as a trailing "|" as open-mode syntax.
            # Unreadable files are deliberately skipped, not fatal.
            open(my $fh, '<', $file) or next;
            binmode($fh);
            my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
            close($fh);
            push @{ $files_by_md5{$digest} }, $file;
        }

        # Sorted so equal-size sets come back in a reproducible order.
        foreach my $digest (sort keys %files_by_md5) {
            my $group = $files_by_md5{$digest};
            next unless @$group > 1;
            push @dupes,
              File::Find::Duplicates::Set->new(
                files => $group,
                size  => $size,
                md5   => $digest,
              );
        }
    }

    return @dupes;
}

return q/
 dissolving ... removing ... there is water at the bottom of the ocean
/;