#!/usr/bin/perl # KGlob2RE.pm # Based on glob2re.pl 1.5 1992/12/09 23:31:01 Tye McQueen # @(#)KGlob2RE.pm 1.2, 95/03/27 22:00:11 # Convert Unix file "glob" patterns to Perl regular expressions. require 5.000; package File::KGlob2RE; require Exporter; @ISA = (Exporter); @EXPORT = qw( &kglob2re ); # The supported features are (where x is a single character and X and Y are # strings): # # . matches . # \x matches x # [...] matches a single character falling into the described character class # ? matches any single character except / # * matches zero or more characters not including / # % matches zero or more directories (see technical notes at bottom) # {X,Y} matches either pattern X or Y (can list any number of patterns) # #Examples: # %s.* matches any file whose name starts with "s." in any directory # %*.o matches any file whose name ends with ".o" in any directory # %old/* matches any file in a directory called "old" # %new%* matches any file in a directory called "new" or a subdirectory # of a directory called "new" # /%* matches any absolute path name # {*,?*%*} matches any relative path name (this would be very inefficient # except that it is specifically optimized) # %X X%* these are also specifically optimized sub kglob2re { local( $glob )= @_; local( $re )= "^"; local( $quote, $bracket, $brace, $slash ); # Optimize special cases: return( "^[^/]?" ) if $glob =~ m-\{\?\*/?%/?\*,\*\}- # {?*/%/*,*} -> ^[^/] || $glob =~ m-\{\*,\?\*/?%/?\*\}-; # {*,?*/%/*} -> ^[^/] for( split( //, $glob ) ) { # Go through glob pattern 1 char at a time: $slash-- if $slash; # Was a / appended to $re last time? if( $quote ) { # Was the last character \ ? $re .= $_; # Don't interpret this character $quote= 0; # Don't quote next char too } elsif( '\\' eq $_ ) { # Is this character a \ ? $re .= $_; $quote= 1; # Quote next character } elsif( $bracket ) { # Are we still inside a [...] ? if( 1 == $bracket && "^" eq $_ ) { $bracket= 2; } else { if( 2 == $bracket ) { if( "-" eq $_ ) { $re .= "-"; # [^-...] -> [^-/...] $_= "/"; # (avoid [^-z] -> [^/-z]) } else { $re .= "/"; # [^...] -> [^/...] } } $bracket= 3; } $re .= $_; $bracket= 0 if "]" eq $_; # Unquoted ] ends a [...] } elsif( "[" eq $_ ) { # Start a [...]: $re .= $_; $bracket= 1; } elsif( "?" eq $_ ) { $re .= "."; } elsif( "{" eq $_ ) # } # Start an {X,Y}: { # ^ so % works in vi $re .= "("; # {X,Y} -> (X|Y) $brace++; # Remember how many started } elsif( "," eq $_ ) { # Inside a {X,Y}, comma -> | ... $re .= $brace ? "|" : "\\,"; # else comma -> \, (to be safe) } # { elsif( "}" eq $_ ) { if( $brace ) { # Completed an {X,Y} $re .= ")"; $brace--; } else { # { <- so % works in vi $re .= "\\}"; # { { warn "Unquoted, unmatched `}' will be treated as `\\}'\n"; } } elsif( /\s/ ) { # Quote white space to avoid warning warn qq-Unquoted white space in file glob pattern: "$glob"\n-; $re .= $_; # else I assume it is an accident } elsif( "*" eq $_ ) { # * won't match / $re .= "[^/]*"; # * -> [^/]* } elsif( "/" eq $_ ) { $re .= $_ unless $slash; # // -> / and %/ -> % $slash= 2; # So we know next time } elsif( "%" eq $_ ) { if( $slash ) { # Check this because.... $re .= "(|.*/)"; # (don't include another leading /) } elsif( "^" eq $re ) { # .../%X is different than %X $re= "(^|/)"; # %[/]X -> ^(|.*/)X$ -> (^|/)X$ } else { $re .= "/(|.*/)"; # X[/]%[/]Y -> ^X/(|.*/)Y$ } $slash= 2; # Don't include an extra tailing slash } elsif( /\w/ ) { # Any letter, number, or _ : $re .= $_; # stays the same } else { # Any other symbol, quote it: $re .= "\\" . $_; # Includes ' so m'...' works. } } if( $quote || $bracket || $brace ) { warn "Unexpected end of file glob pattern: $glob\n"; return undef; } if( $re !~ s-$NOQT/\(\|,\.\*/\)$-\1/- ) { # X/%* -> ^X/ $re .= '$'; } elsif( "" eq $re ) { # Since m// means something else: $re= "^"; # %/* -> anything } $re; } package main; require File::Basename; import File::Basename qw(basename); if( &basename( $0 ) eq &basename( __FILE__ ) ) { # Use `find ... -print | KGlob2RE.pm "pattern" [...]' to use as pipe or test eval <<'EXAMPLE'; import File::KGlob2RE qw(&kglob2re); sub quote { local($*)= 1; $_[1] =~ s/^$_[0]//g; $_[1]; } if( 0 == @ARGV ) { die "e( "\t*:\t", <<" ;" ), "\n"; : Usage: KGlob2RE.pm [-e] { "pattern" | -f file } [...] : Examples: : find . -print | KGlob2RE.pm "%*.c" | xargs grep -i "boogers" : \\ls | KGlob2RE.pm "*.dat *.idx" | xargs chmod ug=rw,o=r : Note that if only one argument is given and it contains one or : more spaces, then it is split into several patterns because : just using one set of quotes (") for the whole list is usually : much easier. This splitting is *not* done if two or more : arguments are given. "-f file" reads patterns, one per line, : from the specified file (trailing spaces, #-comments, and : blank lines in the file are ignored). Patterns begining with : "!" exclude matching files. "-e" causes exceptions (files : neither explicitly matched nor excluded) to generate a message : on STDERR noting this. ; } if( "-e" eq $ARGV[0] ) { $Warn= 1; shift( @ARGV ); } if( 1 == @ARGV && index($ARGV[0],' ') ) { @ARGV= split( ' ', $ARGV[0] ); } if( @ARGV < 2 ) { # Simpler example using single pattern: $re= &kglob2re( $ARGV[0] ); while( ) { # For each file name read from stdin: chop; # Take off the trailing newline $_ .= "/" if -d $_ && ! m-/$-; # Put / on end of dir names if( m/$re/o ) { print "$_\n"; # Only print names matching pattern } elsif( $Warn ) { warn "File $_ unmatches by any pattern.\n"; } } } else { while( @ARGV ) { $_= shift( @ARGV ); if( /^-f/ ) { if( "" eq ( $_= substr($_,2) ) ) { @ARGV || die "Required file name missing after -f.\n"; $_= shift( @ARGV ); } open( PAT, "<$_" ) || die "Can't read patterns from $_: $!\n"; push( @pats, grep( (chop,s=\s*#.*==,length), ) ); close( PAT ); } else { push( @pats, $_ ); } } for( @pats ) { if( /^!/ ) { $if .= "\treturn 0 if m'" . &kglob2re(substr($_,1)) . "'o;\n"; } else { $if .= "\treturn 1 if m'" . &kglob2re($_) . "'o;\n"; } } eval "sub matches {\n$if\t-1; }"; while( ) { chop; $_ .= "/" if -d $_ && ! m-/$-; $re= &matches; if( 1 == $re ) { print "$_\n"; } elsif( $Warn && -1 == $re ) { warn "File $_ unmatched by any pattern.\n"; } } } EXAMPLE chop $@; die $@ if $@; } #Technical notes: # # Items were listed in order of precedence. For example: \[ matches [; ?, *, # %, and { have no special meaning within [...]; \x within [...] matches x so # [\][] matches [ or ]; all, including {X,Y}, can be used within {X,Y}. # # % will match / or /.../. If % is the first character of a pattern, it will # also match the empty string. For sanity, /%/, /%, and %/ are equivalent to % # except that this will not cause % to be considered as the first character in # a pattern. So "/%/X" and "/%X" will match "/X" but not "X" (which is good). # # Note that {} and % interfere in the following ways: A % inside {} is never # considered as being the first character of a pattern, even when it probably # should be; If /'s or %'s (but not both) are nested in {} they will not be # treated as adjacent and so the (possibly redundant) / will not be removed, # even when it probably should be removed. # # Hint: If you have "/" appended to all directory file names, patterns ending # in "/" will only match directory names. A % at the end of a pattern will # never match unless you do this. # # You can use # m/$re/[o] # or # eval "m'" . $re . "'" # But other choices may not work. For example, # eval "m/" . $re . "/" # won't because I don't bother to quote /'s. And # eval 'm"' . $re . '"' # risks interpretation of $ in unexpected ways (I think). 1;