package Text::Same;

=head1 NAME

Text::Same - Look for similarities between files or arrays

=head1 SYNOPSIS

    use Text::Same;

    my $matchmap = compare({ ignore_whitespace => 1 }, "file_1", "file_2");

    my $matchmap = compare({ ignore_simple => 3 }, \@records1, \@records2);

or use the "psame" command:

    psame -a file_1 file_2 | more

=head1 DESCRIPTION

C<compare> compares two files or arrays of strings and returns a MatchMap
object holding the results.

=cut

=head1 FUNCTIONS

=cut

use warnings;
use strict;

use Carp;
use Exporter;

our @ISA     = qw( Exporter );
our @EXPORT  = qw( compare );
our $VERSION = '0.07';

use Text::Same::Match;
use Text::Same::ChunkPair;
use Text::Same::MatchMap;
use Text::Same::Cache;
use Text::Same::Util;

# Record the hits of one chunk of the first source against the second source.
#
# For every index in $matching_chunk_indexes_ref a ChunkPair is built.  A pair
# that is already a key of %$seen_pairs_ref is left alone.  Otherwise the pair
# either joins the Match that already holds the pair of previous chunks (when
# both previous chunks exist, hash to the same value and that previous pair
# has been seen), or it starts a new Match of its own.  Either way the Match
# is stored in %$seen_pairs_ref under the pair's packed id, so one Match
# object can be the value of several keys.
sub _process_hits
{
  my ($options, $this_chunk_indx, $seen_pairs_ref,
      $matching_chunk_indexes_ref,
      $this_chunked_source, $other_chunked_source) = @_;

 OTHER_CHUNK:
  for my $other_chunk_indx (@$matching_chunk_indexes_ref) {
    my $chunk_pair =
      Text::Same::ChunkPair->new($this_chunk_indx, $other_chunk_indx);
    my $pair_id = $chunk_pair->packed_pair();

    next OTHER_CHUNK if exists $seen_pairs_ref->{$pair_id};

    my $this_prev_chunk_indx =
      $this_chunked_source->get_previous_chunk_indx($options,
                                                    $this_chunk_indx);
    my $other_prev_chunk_indx =
      $other_chunked_source->get_previous_chunk_indx($options,
                                                     $other_chunk_indx);

    if (defined $this_prev_chunk_indx && defined $other_prev_chunk_indx) {
      my $this_prev_chunk_text =
        $this_chunked_source->get_chunk_by_indx($this_prev_chunk_indx);
      my $other_prev_chunk_text =
        $other_chunked_source->get_chunk_by_indx($other_prev_chunk_indx);

      my $this_prev_hash =
        Text::Same::ChunkedSource::hash($options, $this_prev_chunk_text);
      my $other_prev_hash =
        Text::Same::ChunkedSource::hash($options, $other_prev_chunk_text);

      if ($this_prev_hash eq $other_prev_hash) {
        my $prev_pair_id =
          Text::Same::ChunkPair::make_packed_pair($this_prev_chunk_indx,
                                                  $other_prev_chunk_indx);
        my $prev_match = $seen_pairs_ref->{$prev_pair_id};

        if (defined $prev_match) {
          # The previous chunks matched too: grow that Match instead of
          # creating a new one.
          $prev_match->add($chunk_pair);
          $seen_pairs_ref->{$pair_id} = $prev_match;
          next OTHER_CHUNK;
        }
      }
    }

    $seen_pairs_ref->{$pair_id} =
      Text::Same::Match->new(source1 => $this_chunked_source,
                             source2 => $other_chunked_source,
                             pairs   => [$chunk_pair]);
  }

  return;
}

# Walk the filtered chunk indexes of $source1, look each chunk up in $source2
# and hand any hits to _process_hits().  Returns a reference to a hash that
# maps packed chunk-pair ids to Match objects.
sub _find_matches
{
  my ($options, $source1, $source2) = @_;

  my $source1_chunk_indexes = $source1->get_filtered_chunk_indexes($options);

  my %seen_pairs = ();

  for my $this_chunk_indx (@$source1_chunk_indexes) {
    my $chunk_text = $source1->get_chunk_by_indx($this_chunk_indx);
    my @matching_chunk_indexes =
      $source2->get_matching_chunk_indexes($options, $chunk_text);

    if (@matching_chunk_indexes) {
      _process_hits($options, $this_chunk_indx, \%seen_pairs,
                    \@matching_chunk_indexes, $source1, $source2);
    }
  }

  return \%seen_pairs;
}

# Widen every match in @$matches, in place.  Each lower bound becomes the
# index just after the one returned by get_previous_chunk_indx() (or 0 when
# there is none) and each upper bound becomes the index just before the one
# returned by get_next_chunk_indx() (or the last chunk of the source when
# there is none).
# NOTE(review): presumably this pulls chunks skipped by the options into the
# neighbouring match - confirm against Text::Same::ChunkedSource.
sub _extend_matches
{
  my ($options, $matches, $source1, $source2) = @_;

  for my $match (@$matches) {
    my $prev1 = $source1->get_previous_chunk_indx($options, $match->min1());
    $match->set_min1(defined $prev1 ? $prev1 + 1 : 0);

    my $prev2 = $source2->get_previous_chunk_indx($options, $match->min2());
    $match->set_min2(defined $prev2 ? $prev2 + 1 : 0);

    my $next1 = $source1->get_next_chunk_indx($options, $match->max1());
    $match->set_max1(defined $next1
                       ? $next1 - 1
                       : $source1->get_all_chunks_count() - 1);

    my $next2 = $source2->get_next_chunk_indx($options, $match->max2());
    $match->set_max2(defined $next2
                       ? $next2 - 1
                       : $source2->get_all_chunks_count() - 1);
  }

  return;
}

=head2 compare

 Title   : compare
 Usage   : $matchmap = compare($options, $file1, $file2)
           or
           $matchmap = compare($options, \@array1, \@array2)
           then:
           my @all_matches = $matchmap->all_matches;
 Function: return a MatchMap object holding matches and non-matches between
           the two given files or arrays of strings
 Args    : $options - a reference to a hash of options, for example
                      { ignore_whitespace => 1 }; a false value means
                      "no options"
           the two things to compare - each is either a file name or a
                      reference to an array of strings
 Throws  : croaks if $options is a true value that is not a reference, or
           if either thing to compare is undefined

=cut

sub compare
{
  my $options = shift || {};
  my $data1 = shift;
  my $data2 = shift;

  # The options come first.  A plain string here almost always means the
  # caller passed the file names first, so say so instead of failing later.
  if (!ref $options) {
    croak "first argument to compare() must be a reference to a hash of "
      . "options, followed by the two files or arrays to compare";
  }

  if (!defined $data1 || !defined $data2) {
    croak "compare() needs two files or arrays to compare";
  }

  my $cache = Text::Same::Cache->new();

  my $source1;
  if (ref($data1) eq "ARRAY") {
    $source1 = Text::Same::ChunkedSource->new(name   => "array1",
                                              chunks => $data1);
  }
  else {
    $source1 = $cache->get($data1, $options);
  }

  my $source2;
  if (ref($data2) eq "ARRAY") {
    $source2 = Text::Same::ChunkedSource->new(name   => "array2",
                                              chunks => $data2);
  }
  else {
    $source2 = $cache->get($data2, $options);
  }

  my $seen_pairs_ref = _find_matches($options, $source1, $source2);

  # One Match can be stored under many pair ids, so key the matches by their
  # stringified form to keep a single copy of each.
  my %uniq_matches = ();
  for my $match (values %{$seen_pairs_ref}) {
    $uniq_matches{$match} = $match;
  }

  my @unique_matches = values %uniq_matches;

  _extend_matches($options, \@unique_matches, $source1, $source2);

  return Text::Same::MatchMap->new(options => $options,
                                   source1 => $source1,
                                   source2 => $source2,
                                   matches => \@unique_matches);
}

=head1 SEE ALSO

B<psame>

=head1 AUTHOR

Kim Rutherford

=head1 COPYRIGHT & LICENSE

Copyright 2005,2006 Kim Rutherford, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1;