package File::Find::Node; use 5.006; use strict; use warnings; use Carp; our $VERSION = '0.02'; # # constructor # use constant PATH => 0; use constant NAME => 1; use constant LEVEL => 2; use constant PRUNE => 3; use constant FOLLOW => 4; use constant PARENT => 5; use constant PROCESS => 6; use constant POSTPROC => 7; use constant FILTER => 8; use constant ERRPROC => 9; use constant STAT => 10; use constant ARG => 11; use constant USER => 12; use constant GROUP => 13; sub new { my ($class, $path) = @_; defined($path) or $path = "."; $path =~ s{/+}{/}g; $path =~ s{/$}{} if $path ne "/"; my $self = [ $path, # PATH $path, # NAME 0, # LEVEL 0, # PRUNE 0, # FOLLOW undef, # PARENT undef, # PROCESS undef, # POSTPROC undef, # FILTER undef, # ERRPROC undef, # STAT undef, # ARG {}, # USER cache for getpwuid() {} # GROUP cache for getgrgid() ]; $self->[NAME] =~ s{.*/}{}; bless($self); } # # private object methods # # _error calls error callback function or calls carp(). sub _error { my ($self, $what) = @_; if ($self->[ERRPROC]) { $self->[ERRPROC]->($self, $what); } else { my $path = $self->path; carp(__PACKAGE__, " - $what($path) - $!"); } } # _cycle returns true if this directory is in the parent chain sub _cycle { my $self = shift; my ($inum, $dev) = ($self->inum, $self->dev); for (my $p = $self->parent; $p; $p = $p->parent) { return 1 if $dev == $p->dev && $inum == $p->inum; } 0; } # # public object methods # sub process { my $self = shift; $self->[PROCESS] = shift; $self; } sub post_process { my $self = shift; $self->[POSTPROC] = shift; $self; } sub filter { my $self = shift; $self->[FILTER] = shift; $self; } sub error_process { my $self = shift; $self->[ERRPROC] = shift; $self; } sub arg { my $self = shift; $self->[ARG] or ($self->[ARG] = {}); } sub prune { my $self = shift; $self->[PRUNE] = 1; $self; } sub stop { my $self = shift; for (my $p = $self; $p; $p = $p->parent) { $p->[PRUNE] = 1; } $self; } sub follow { my $self = shift; $self->[FOLLOW] = (@_ == 0 || shift); $self; } sub path { shift->[PATH]; } sub name { shift->[NAME]; } sub parent { shift->[PARENT]; } sub level { shift->[LEVEL]; } # These methods return saved stat info sub stat { @{shift->[STAT]}; } sub dev { shift->[STAT]->[0]; } sub inum { shift->[STAT]->[1]; } sub ino { shift->[STAT]->[1]; } sub mode { shift->[STAT]->[2]; } sub perm { shift->[STAT]->[2] & 07777; } sub type { my $idx = (shift->[STAT]->[2] >> 12) & 017; ("?", "p", "c", "?", "d", "?", "b", "?", "f", "?", "l", "?", "s", "?", "?", "?")[$idx]; } sub links { shift->[STAT]->[3]; } sub nlink { shift->[STAT]->[3]; } sub uid { shift->[STAT]->[4]; } sub gid { shift->[STAT]->[5]; } sub user { my $self = shift; my $uid = $self->uid; if (exists($self->[USER]->{$uid})) { return $self->[USER]->{$uid}; } my $user = getpwuid($uid); $self->[USER]->{$uid} = defined($user) ? $user : $uid; } sub group { my $self = shift; my $gid = $self->gid; if (exists($self->[GROUP]->{$gid})) { return $self->[GROUP]->{$gid}; } my $group = getgrgid($gid); $self->[GROUP]->{$gid} = defined($group) ? $group : $gid; } sub rdev { shift->[STAT]->[6]; } sub size { shift->[STAT]->[7]; } sub atime { shift->[STAT]->[8]; } sub mtime { shift->[STAT]->[9]; } sub ctime { shift->[STAT]->[10]; } sub blksize { shift->[STAT]->[11]; } sub blocks { shift->[STAT]->[12]; } # empty returns true for an empty directory or a zero length regular file, # otherwise false. sub empty { my $self = shift; my $ftype = $self->type; if ($ftype eq "f") { return $self->size == 0; } elsif ($ftype eq "d") { my $dirh; if (!opendir($dirh, $self->path)) { $self->_error("opendir"); return 0; } my $ret = 1; while (my $name = readdir($dirh)) { if ($name ne "." && $name ne "..") { $ret = 0; last; } } closedir($dirh); return $ret; } 0; } # refresh calls stat() or lstat() to load saved stat info sub refresh { my $self = shift; my $path = $self->path; my @stat; if ($self->[FOLLOW]) { @stat = CORE::stat($path) or @stat = CORE::lstat($path); } else { @stat = CORE::lstat($path); } if (@stat) { $self->[STAT] = \@stat; } else { $self->_error("stat"); } $self; } # find performs the directory traversal sub find { no warnings "recursion"; my $self = shift; $self->refresh->[STAT] or return; # loads stat info # avoid cycles my $ftype = $self->type; return if $ftype eq "d" && $self->[FOLLOW] && $self->_cycle; # call process callback if ($self->[PROCESS]) { $self->[PROCESS]->($self); } # traverse directory if not pruned return if $ftype ne "d" || $self->[PRUNE]; my $path = $self->path; my $dirh; if (!opendir($dirh, $path)) { $self->_error("opendir"); return; } my @dirent = $self->[FILTER] ? $self->[FILTER]->(readdir($dirh)) : readdir($dirh); closedir($dirh); foreach my $name (@dirent) { next if $name eq "." || $name eq ".."; # build child object and traverse it my $child; @$child = @$self; $child->[PATH] = $path ne "/" ? "$path/$name" : "/$name"; $child->[NAME] = $name; $child->[PARENT] = $self; $child->[LEVEL]++; $child->[STAT] = undef; $child->[ARG] = undef; bless($child); $child->find; return if $self->[PRUNE]; } # call post_process callback if ($self->[POSTPROC]) { $self->[POSTPROC]->($self); } } 1; __END__ =head1 NAME File::Find::Node - Recursively traverses a directory tree and processes each item using callback functions supplied by the user. =head1 SYNOPSIS use File::Find::Node; my $f = File::Find::Node->new("path"); $f->process(sub { ... }); $f->post_process(sub { ... }); $f->find; =head1 DESCRIPTION The constructor File::Find::Node->new creates a top level File::Find::Node object for the specified path. The $f->process method takes a reference to a callback function that is called for each item in the traversal. The $f->post_process method takes a reference to a callback function that is called for each directory after it has been traversed. The $f->find method performs the traversal. Callback functions are passed a File::Find::Node object for the item being visited. This object provides many useful methods that return information about the item. Other methods allow access to the parent directory object and allow arbitrary data to be stored in objects. =head1 Constructor =over 4 =item File::Find::Node->new($path) Returns a top level File::Find::Node object for the specified path. Uses "." if no path is given. =back =head1 Methods for the Top Level Object The following methods are intended to be used with the top level object, but you can call them with child objects to dynamically alter the traversal while it is in progress. =over 4 =item $f->process(\&func) Takes a reference to a callback function that is called for each item visited in the traversal, including the top level object. Returns the object itself, which allows you to chain method calls such as $f->process(\&func)->find; The callback function is passed a single argument, which is a File::Find::Node object for the current item. When visiting a directory, the function is called before the directory is traversed. =item $f->post_process(\&func) Takes a reference to a callback function that is called for each directory after it has been traversed. Returns the object itself. The function is passed the File::Find::Node object for the directory. =item $f->filter(\&func) Takes a reference to a callback function that is called to filter a list of file names. Before descending into a directory, the function is passed the list of file names obtained with readdir() and the function returns a new list. The filter function can be used to sort and/or remove file names. $f->filter(sub {sort @_}); $f->filter(sub {grep($_ ne ".snapshot", @_)}); =item $f->error_process(\&func) Takes a reference to a callback function that is called whenever there is an error. Returns the object itself. The function is passed the File::Find::Node object that encountered the error and a string indicating the cause: "stat" stat() or lstat() failed "opendir" opendir() failed An error does not terminate the traversal, so the callback function may need to call $f->stop or exit() or die(). If the error is a failed stat() or lstat() call then the object passed to the callback function is incomplete, which breaks many object methods. The following methods (discussed later) are safe: $f->path $f->name $f->stop $f->parent $f->arg $f->level Beware that calling $f->refresh or $f->empty may result in an error and hence a recursive call to the callback function. $f->error_process(sub { my ($f, $what) = @_; my $path = $f->path; die("find error: $what($path) : $!"); }); If no $f->error_process function is specified then errors are reported with carp(). =item $f->follow($value) Sets a flag in the object that causes $f->find to follow symbolic links, which is off by default. $f->follow takes an optional argument and returns the object. If the argument is absent or true then symbolic links are followed, otherwise not. If follow is on, then cycles are possible and $f->find avoids them. If a symbolic link cannot be followed, then an object for the link itself is created rather than for what the link refers to. =item $f->find Performs the directory traversal. As it visits each item it creates a File::Find::Node object and passes it to the callback functions specified by the $f->process and $f->post_process methods. (If no callbacks are specified, then nothing useful happens.) $f->find does not change directory and it will probably fail if a callback function changes directory without changing back. =back =head1 Methods for Callback Functions The following methods should be used with the objects passed to callback functions. =over 4 =item $f->path Returns the full path name of the item, beginning with the path of the top level item. =item $f->name Returns the file name (base name) of the item. =item $f->parent Returns the object for the parent directory (or undef for the top level object, which has no parent). You can call methods with the parent object such as $f->parent->path. =item $f->arg Returns a hash reference that is stored in the object. The hash is a handy place for callback functions to store arbitrary data such as flags, counters, totals or other state information between calls. An object can access its parent's hash via $f->parent->arg. For example, you can count the number of regular files in a directory and total their sizes like this: if ($f->type eq "d") { $f->arg->{count} = $f->arg->{total} = 0; } elsif ($f->parent && $f->type eq "f") { $f->parent->arg->{count}++; $f->parent->arg->{total} += $f->size; } =item $f->level Returns the depth level of the object. Returns zero for the top level object and returns one for its immediate children, etc. =item $f->prune Sets a flag in the object that causes $f->find to not traverse the object and returns the object. For example, this code skips a directory if it contains a file called .skipme . if ($f->type eq "d" && -f($f->path . "/.skipme")) { $f->prune; return; } Calling $f->prune on a non-directory has no effect, but you can call $f->parent->prune if you want to prune the parent directory. If a directory is pruned, then $f->find does not call the $f->post_process function. =item $f->stop Prunes everything on the $f->parent chain, which causes $f->find to return. =item $f->empty Returns true if the item is an empty directory or if the item is a zero length regular file. Otherwise returns false. =back The $f->find method calls stat() or lstat() (depending on $f->follow) for each item and saves the information in the object. The following methods are convenient ways to access the saved stat information and are named according to the corresponding field in the Unix stat struct and/or the corresponding option in the Unix find command. =over 4 =item $f->dev Returns the device number of the filesystem containing the item. You can determine when you cross mount points with if ($f->parent && $f->dev != $f->parent->dev) { ... } =item $f->inum =item $f->ino Returns the inode number. =item $f->mode Returns the mode bits. =item $f->type Returns a lower case letter indicating the type of the item: "f" - regular file "d" - directory "l" - symbolic link "b" - block device file "c" - character device file "p" - named pipe (FIFO) "s" - socket "?" - unknown (probably an error) =item $f->perm Returns the permission bits ($f->mode masked with 07777). =item $f->links =item $f->nlink Returns the number of hard links. =item $f->uid Returns the user id number. =item $f->user Returns the user name or else returns the user id number if getpwuid() fails. Uses a cache to avoid extra calls to getpwuid(). =item $f->gid Returns the group id number. =item $f->group Returns the group name or else returns the group id number if getgrgid() fails. Uses a cache to avoid extra calls to getgrgid(). =item $f->rdev Returns the device number of a device file. =item $f->size Returns the size in bytes. =item $f->atime Returns the access time. =item $f->mtime Returns the modification time. =item $f->ctime Returns the inode change time. =item $f->blksize Returns the I/O block size of the containing filesystem. =item $f->blocks Returns the number of 512 byte blocks allocated for the item. =item $f->stat Returns the array of saved stat information. @stat = $f->stat; =item $f->refresh Calls stat() or lstat() (depending on $f->follow) to refresh the saved stat information. Returns the object. For example, you may want to call $f->refresh after changing the permissions of an object with chmod() or else $f->perm returns the old saved permissions. =back =head1 Efficiency File::Find::Node is both space and time efficient. Although it creates an object for each item in the traversal, at any given time the only objects that require memory are the current object and the linear chain of parent objects up to the top level. Because the stat information is saved in the object, extra calls to stat() and lstat() are avoided. =head1 Examples This example prints all path names in sorted order. use File::Find::Node; my $f = File::Find::Node->new($ARGV[0]); $f->process(sub { print(shift->path, "\n") }); $f->filter(sub{sort @_}); $f->find; This example recursively removes a directory tree. use File::Find::Node; my $f = File::Find::Node->new($ARGV[0]); $f->process(sub { my $f = shift; unlink($f->path) if $f->type ne "d"; }); $f->post_process(sub { rmdir(shift->path) }); $f->find; This example mimics the Unix "du -k" command. use File::Find::Node; sub proc { my $f = shift; my $blocks = $f->blocks; if ($f->type eq "d") { $f->arg->{blocks} = $blocks; } elsif ($f->parent) { $f->parent->arg->{blocks} += $blocks; } } sub postproc { my $f = shift; printf("%8d %s\n", $f->arg->{blocks} / 2, $f->path); if ($f->parent) { $f->parent->arg->{blocks} += $f->arg->{blocks}; } } my $f = File::Find::Node->new($ARGV[0]); $f->process(\&proc)->post_process(\&postproc)->find; This example outputs a line for each directory showing the number of regular files immediately contained by the directory, the total size of the files in Kbytes, and the name of the directory. use File::Find::Node; sub proc { my $f = shift; if ($f->type eq "d") { $f->arg->{count} = $f->arg->{total} = 0; } elsif ($f->parent && $f->type eq "f") { $f->parent->arg->{count}++; $f->parent->arg->{total} += $f->size; } } sub postproc { my $f = shift; printf("%5d %12.2f %s\n", $f->arg->{count}, $f->arg->{total} / 1024, $f->path); } my $f = File::Find::Node->new($ARGV[0]); $f->process(\&proc)->post_process(\&postproc); $f->filter(sub {sort @_})->find; This example outputs the N most recently modified regular files in a directory tree. use File::Find::Node; my ($N, $dir) = @ARGV; my @recent; sub proc { my $f = shift; return if $f->type ne "f"; if (@recent == $N) { return if $f->mtime <= $recent[-1]->mtime; pop(@recent); } @recent = sort { $b->mtime <=> $a->mtime } (@recent, $f); } my $f = File::Find::Node->new($dir); $f->process(\&proc)->find; foreach $f (@recent) { print(scalar(localtime($f->mtime)), " ", $f->path, "\n"); } =head1 SEE ALSO See the perl modules File::Find and File::stat. See the man page for the Unix find(1) command. =head1 AUTHOR Stephen C. Losen, University of Virginia, scl@virginia.edu =head1 COPYRIGHT AND LICENSE Copyright (C) 2007 by Stephen C. Losen This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.6 or, at your option, any later version of Perl 5 you may have available. =cut