package File::CachingFind; # # Copyright 2002 Thomas Dorner # # Author: see end of file # Created: 9. April 2002 # # This program is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. =head1 NAME File::CachingFind - find files within cached search paths (e.g. include files) =head1 SYNOPSIS use File::CachingFind; $includes = File::CachingFind->new(Path => ['/usr/local/include', '/usr/include']); $stdio = $includes->findFirstInPath('stdio.h'); =head1 DESCRIPTION C is useful for repeated file searches within a path of directories. It caches the contents of its search and supports two different methods of fuzzy search, a normalize function and regular expressions. See the different METHODS for details. =head1 METHODS =over 4 =cut ######################################################################### require 5.006; use strict; use warnings; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); require Exporter; @ISA = qw(Exporter); @EXPORT = qw(); @EXPORT_OK = qw(); $VERSION = '0.67'; use Carp; use Cwd 'abs_path'; use DirHandle; ######################################################################### =item B - create a new File::CachingFind object $obj = File::CachingFind->new(Path => $reference_to_list_of_directories, Normalize => $reference_to_function, Filter => $regular_expression, NoSoftlinks => $true_or_false); Example: $win32_includes = File::CachingFind->new (Path => ['.!', '/cygdrive/C/Programme/DevStudio/VC/include'], Normalize => sub{lc @_}, Filter => '\.h$'); This is the constructor for a cache to the filenames of one or more directories. It has one mandatory and three optional parameters. The cache build is a hash using the normalized filename without any directory parts in it as a key for retrieval. Each key of course can point to one or more real, full filenames. =over 4 =item B< Path> is the mandatory parameter. It must contain a reference to list of directories. Both relative and absolute paths are possible. Normally the directory itself and all its subdirectories are cached. If the directory name is followed by (ends with) an exclamation mark, the subdirectories are ignored. =item B< Normalize> is an optional code reference. The function referenced to must take exactly one string parameter (the filename withot its directory parts) as input and returns the string in a normalized fashion. If this result is not the empty string it's used as key for the cache (otherwise the filename is ignored). If no code reference is given, the unmodified filename is used as key for the cache. =item B< Filter> is an optional regular expression used for caching only certain files of the directories (those matching the regular expression). If no filter is given, every file is cached. =item B< NoSoftlinks> is an optional flag telling if the caching of softlinks should be inhibited. Normally the names of ordinary files as well as the name of softlinks are cached. Set the flag to true, if this is not wanted. =back =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub new { my $this = shift; my $class = ref($this) || $this; my %newObject = (); local $_; # clone object (if applicable): if (ref($this)) { $newObject{Path} = $this->{Path}; $newObject{Norm} = $this->{Norm}; $newObject{Filter} = $this->{Filter}; $newObject{NoLink} = $this->{NoLink}; } # analyze parameters: my %args = @_; foreach (keys %args) { if (/^Path$/i) { croak $_, ' is not a reference to an array' unless 'ARRAY' eq ref($args{$_}); $newObject{Path} = $args{$_}; } elsif (/^Normali[zs]e$/i) { croak $_, ' is not a reference to a function' unless 'CODE' eq ref($args{$_}); $newObject{Norm} = $args{$_}; } elsif (/^Filter$/i) { croak $_, ' is not scalar' unless '' eq ref($args{$_}); $newObject{Filter} = $args{$_}; } elsif (/^NoSoftlinks$/i) { croak $_, ' is not scalar' unless '' eq ref($args{$_}); $newObject{NoLink} = $args{$_}; } else { croak 'unknown parameter ', $_, ' passed to ', __PACKAGE__; } } # check for completeness: croak 'no path defined' unless defined $newObject{Path}; # cache files with full names and priorities in object: my %fullname = (); $newObject{Fullname} = \%fullname; my %priority = (); $newObject{Priority} = \%priority; my $priority = 0; foreach (@{$newObject{Path}}) { my $recursive = ! s/!$//; # handle no-recursive flag next unless -d $_; _parse_directory(\%newObject, abs_path($_), $recursive, ++$priority); } # now we're finished: bless \%newObject, $class; } ######################################################################### =item B - locate all files with a given (normalized) name @list = $obj->findInPath($a_file_name); Example: @time_h = $includes->findInPath('time.h'); This method returns all full filenames (including the directory parts) of all files in the cache of the object, which have the same normalized filename as the parameter passed to this method. The parameter itself will be normalized as well before comparizion. On a standard Unix system the list in aboves example should at least contain /usr/include/time.h and /usr/include/sys/time.h, provided $includes is similar to the one defined at the very beginning of this documentation. If no file is found, an empty list is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findInPath { my ($this, $name) = @_; # apply normalization: $name = &{$this->{Norm}}($name) if $this->{Norm}; # return list: if (! defined $this->{Fullname}->{$name}) { return (); } elsif ('' eq ref($this->{Fullname}->{$name})) { return ($this->{Fullname}->{$name}); } elsif ('ARRAY' eq ref($this->{Fullname}->{$name})) { return @{$this->{Fullname}->{$name}}; } else { confess('internal error in ', __PACKAGE__, '(please report this bug): unexpected reference type "', ref($this->{Fullname}->{$name}), '"'); } } ######################################################################### =item B - locate first file with a given (normalized) name @list = $obj->findFirstInPath($a_file_name); Example: $includes2 = File::CachingFind->new(Path => ['/usr/include!', '/usr/include/sys!']); $time_h = $includes2->findFirstInPath('time.h'); This method returns the first full filename (including the directory parts) of all files in the cache of the object. The search is similar to the one in the method B. The function will search the cache in the order of the paths given to the constructor (B). On a standard Unix system above example returns /usr/include/time.h. A call to C<$includes-EfindFirstInPath('time.h')> (see B) would return either /usr/include/time.h or /usr/include/sys/time.h (indeterministic). If no file is found, undef is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findFirstInPath { my ($this) = @_; my @list = findInPath(@_); return undef if 0 == @list; @list = sort {$this->{Priority}->{$a} <=> $this->{Priority}->{$b}} @list; return $list[0]; } ######################################################################### =item B - locate best file with a given (normalized) name @list = $obj->findBestInPath($a_file_name, $reference_to_comparison_function); Example: $time_h = $includes2->findBestInPath ('time.h', sub{ length($_[1]) <=> length($_[0]) }); This method returns the best full filename (including the directory parts) of all files in the cache of the object. The search is similar to the one in the method B. All files found are compared using the given comparision function (similar to comparision functions given to sort, except that it uses real parameters). If more than one file remains, the order of the paths given to the constructor (B) will be considered as well (as in B). On a standard Unix system above example returns /usr/include/sys/time.h as it has a longer full filename than /usr/include/time.h. If no file is found, undef is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findBestInPath { my ($this, $name, $rCompare) = @_; croak 'third parameter is not a reference to a function' unless 'CODE' eq ref($rCompare); my @list = findInPath($this, $name); return undef if 0 == @list; @list = sort { my $order = &$rCompare($a, $b); return $order != 0 ? $order : $this->{Priority}->{$a} <=> $this->{Priority}->{$b} } @list; return $list[0]; } ######################################################################### =item B - locate all files matching a regular expression @list = $obj->findMatch($regular_expression); Example: @std_h = $includes2->findMatch('^(?i:std)'); This method returns all full filenames (including the directory parts) of all files in the cache of the object, which match the given regular expression. Note, that the regular expression won't be normalized, I have to make sure that it matches the normalized filenames. On a standard Unix system the list in aboves example should at least contain /usr/include/stdio.h and /usr/include/stdlib.h, provided $includes2 is similar to the used in prior examples. Your mileage may vary, especially on different systems. Note that the example uses a case insensitive match. If no file is found, an empty list is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findMatch { my ($this, $regexp) = @_; my @result = (); # loop all files: while (my ($name, $files) = each %{$this->{Fullname}}) { next unless $name =~ m/$regexp/; if ('' eq ref($files)) { push @result, $files; } elsif ('ARRAY' eq ref($files)) { push @result, @{$files}; } else { confess('internal error in ', __PACKAGE__, '(please report this bug): unexpected reference type "', ref($files), '"'); } } return @result; } ######################################################################### =item B - locate first file matching a regular expression @list = $obj->findFirstMatch($regular_expression); Example: $std_h = $includes2->findFirstMatch('^std'); This method returns the first full filename (including the directory parts) of all files in the cache of the object matching the given regular expression. It works similar to B and will search the cache in the order of the paths given to the constructor (B). Thus it may be of limited use as the algorithm chosing between more than one file of the same path is indeterministic. B would be a better choice in most circumstances though it is a bit slower most of the times. On a standard Unix system above example returns /usr/include/stdio.h or /usr/include/stdlib.h or another matching file (indeterministic). If no file is found, undef is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findFirstMatch { my ($this) = @_; my @list = findMatch(@_); return undef if 0 == @list; @list = sort {$this->{Priority}->{$a} <=> $this->{Priority}->{$b}} @list; return $list[0]; } ######################################################################### =item B - locate best file matching a regular expression @list = $obj->findBestMatch($regular_expression, $reference_to_comparison_function); Example: $std_h = $includes2->findBestMatch ('^std', sub{ length($_[0]) <=> length($_[1]) }); This method returns the best full filename (including the directory parts) of all files in the cache of the object matching the given regular expression. As in B all files found are compared using the given comparision function followed by the order of the paths given to the constructor (B). On a standard Unix system above example returns /usr/include/stdio.h unless there is another include with an even shorter name beginning with /usr/include/std. If no file is found, undef is returned. =cut # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub findBestMatch { my ($this, $regexp, $rCompare) = @_; croak 'third parameter is not a reference to a function' unless 'CODE' eq ref($rCompare); my @list = findMatch($this, $regexp); return undef if 0 == @list; @list = sort { my $order = &$rCompare($a, $b); return $order != 0 ? $order : $this->{Priority}->{$a} <=> $this->{Priority}->{$b} } @list; return $list[0]; } ######################################################################### ######################################################################### ######### internal methods / functions following ######### ######################################################################### ######################################################################### ######################################################################### # call: (recursive, only used in new) # # _parse_directory($rNewObject, $directory, $recursive, # # $priority); # # parameters: # # $rNewObject reference to (yet) unblessed new object # # $dir directory (full absolute path!) to parse # # $recursive flag, if subdirectories should be parsed as well# # $priority priority of the current path # # description: # # The function parses the directory $directory and puts its # # relevant filenames and directories into $rNewObject->{Fullname}.# # The priority is cached in $rNewObject->{Priority}. # # global variables used: # # - # # returns: # # - # ######################################################################### sub _parse_directory { my ($rNewObject, $directory, $recursive, $priority) = @_; local $_; # loop directory: my $dirh = new DirHandle $directory; while (defined($_ = $dirh->read)) { next if m/^\.\.?$/o; # ignore . and .. my $fullname = $directory.'/'.$_; # handle directories: if (-d $fullname) { _parse_directory($rNewObject, $fullname, $recursive, $priority) if $recursive; next; } lstat $fullname; # filter non-files / non-links (if applicable): if (! -f _) { next if -l _ and $rNewObject->{NoLink}; } # apply filter: if (defined $rNewObject->{Filter}) { next unless m/$rNewObject->{Filter}/; } # apply normalization: $_ = &{$rNewObject->{Norm}}($_) if $rNewObject->{Norm}; # put filename/fullname in cache: if (! defined $rNewObject->{Fullname}->{$_}) { $rNewObject->{Fullname}->{$_} = $fullname; } elsif ('' eq ref($rNewObject->{Fullname}->{$_})) { $rNewObject->{Fullname}->{$_} = [ $rNewObject->{Fullname}->{$_}, $fullname ]; } elsif ('ARRAY' eq ref($rNewObject->{Fullname}->{$_})) { push @{$rNewObject->{Fullname}->{$_}}, $fullname; } else { confess('internal error in ', __PACKAGE__, '(please report this bug): unexpected reference type "', ref($rNewObject->{Fullname}->{$_}), '"'); } # cache priority: $rNewObject->{Priority}->{$fullname} = $priority; } } 1; __END__ =back =head1 KNOWN BUGS Directory names ending with an exclamation mark can't be handled yet! Softlinks creating a cyclic directory structure will cause an infinite loop. If the same file is found more than once using different paths in the constructor (B), it will be cached more than once! This is considered a feature, not a bug. =head1 SEE ALSO perl(1). =head1 AUTHOR Thomas Dorner, Edorner (AT) cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2007-2009 by Thomas Dorner This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.6.1 or, at your option, any later version of Perl 5 you may have available. =cut