package Regexp::English;
use strict;
use warnings;
use Exporter 'import';
use vars qw( @export @EXPORT_OK %EXPORT_TAGS $VERSION );
$VERSION = '1.01';
use overload '""' => \&compile;
use Scalar::Util 'blessed';
# REGEX: storage for the raw regex
# STORE: storage for bound references (see remember())
# STACK: used to nest groupings
use constant REGEX => 0;
use constant STORE => 1;
use constant STACK => 2;
# the key is the name of the method to create
# symbol is the regex token this represents
# plural is the name of the shortcut method for $symbol+, i.e. \w+
# non is the name of the negated token, its shortcut, and a plural, if needed
my %chars = (
word_char =>
{
symbol => '\w',
plural => 'word_chars',
non => [ 'non_word_char', '\W', 'non_word_chars' ],
},
whitespace_char =>
{
symbol => '\s',
plural => 'whitespace_chars',
non => [ 'non_whitespace_char', '\S', 'non_whitespace_chars' ],
},
digit =>
{
symbol => '\d',
plural => 'digits',
non => [ 'non_digit', '\D', 'non_digits' ],
},
word_boundary =>
{
symbol => '\b',
non => [ 'non_word_boundary', '\B' ],
},
end_of_string =>
{
symbol => '\Z',
non => [ 'very_end_of_string', '\z' ],
},
beginning_of_string => { symbol => '\A', },
end_of_previous_match => { symbol => '\G', },
# XXX: non for these ?
tab =>
{
symbol => '\t',
plural => 'tabs',
non => [ 'non_tab', '[^\t]' ],
},
# implies /s modifier
newline =>
{
symbol => '\n',
plural => 'newlines',
non => [ 'non_newline', '(?s)[^\n]' ],
},
carriage_return =>
{
symbol => '\r',
plural => 'carriage_returns',
non => [ 'non_carriage_return', '[^\r]' ],
},
form_feed =>
{
symbol => '\f',
plural => 'form_feeds',
non => [ 'non_form_feed', '[^\f]' ],
},
'alarm' =>
{
symbol => '\a',
plural => 'alarms',
non => [ 'non_alarm', '[^\a]' ],
},
escape =>
{
symbol => '\e',
plural => 'escapes',
non => [ 'non_escape', '[^\e]' ],
},
start_of_line => { symbol => '^', },
end_of_line => { symbol => '$', },
);
sub _chars
{
my $symbol = shift;
return sub
{
# cannot use $_[0] here, as it trips the overload
# that can mess with remember/end groups
return $symbol unless @_;
my $self = shift;
$self = $self->new() unless blessed( $self );
$self->[REGEX] .= $symbol;
return $self;
};
}
my @char_tags;
for my $char ( keys %chars )
{
push @char_tags, $char;
_install( $char, _chars( $chars{$char}{symbol} ) );
if ( $chars{$char}{plural} )
{
_install( $chars{$char}{plural}, _chars( $chars{$char}{symbol} . '+' ));
push @char_tags, $chars{$char}{plural};
}
if ( $chars{$char}{non} )
{
my ( $nonname, $symbol, $pluralname ) = @{ $chars{$char}{non} };
_install( $nonname, _chars($symbol) );
push @char_tags, $nonname;
if ($pluralname)
{
_install( $pluralname, _chars( $symbol . '+' ) );
push @char_tags, $pluralname;
}
}
}
# tested in t/quantifiers
# XXX:
# the syntax for minimal/optional is slightly awkward
my %quantifiers =
(
zero_or_more => '*',
multiple => '+',
minimal => '?',
optional => '?',
);
for my $quantifier ( keys %quantifiers )
{
_install( $quantifier,
_standard( '(?:', '', $quantifiers{$quantifier} . ')' ), 1 );
}
# tested in t/groupings
my %groupings =
(
after => '(?<=',
group => '(?:',
comment => '(?#',
not_after => '(? '(?=',
not_followed_by => '(?!',
);
for my $group ( keys %groupings )
{
_install( $group, _standard( $groupings{$group}, '', '' ), 1 );
}
sub _standard
{
my ( $group, $sep, $symbol ) = @_;
$symbol ||= ')';
return sub
{
if ( eval { $_[0]->isa( 'Regexp::English' ) } )
{
my $self = shift;
$self->[REGEX] .= $group;
if (@_)
{
$self->[REGEX] .= join( "$sep", @_ ) . $symbol;
}
else
{
push @{ $self->[STACK] }, $symbol;
}
return $self;
}
return $group . join( $sep, @_ ) . $symbol;
};
}
# can't be used with standard because of quotemeta()
sub literal
{
my $self = shift;
$self->[REGEX] .= quotemeta( +shift );
return $self;
}
sub _install
{
my ( $name, $sub, $export ) = @_;
no strict 'refs';
*{$name} = $sub;
push @export, "&$name" if $export;
push @EXPORT_OK, "&$name";
}
_install(
'or',
sub {
if ( eval { $_[0]->isa( 'Regexp::English' ) } )
{
my $self = shift;
if (@_)
{
$self->[REGEX] .= '(?:' . join( '|', @_ ) . ')';
}
else
{
$self->[REGEX] .= '|';
}
return $self;
}
return '(?:' . join( '|', @_ ) . ')';
},
1
);
_install( 'class', _standard( '[', '', ']' ), 1 );
# XXX - not()
sub remember
{
my $self = shift;
$self = $self->new() unless blessed( $self );
# the first element may be a reference, so stick it in STORE
if ( ref( $_[0] ) eq 'SCALAR' )
{
push @{ $self->[STORE] }, shift;
}
# if there are other arguments, add them to REGEX
if (@_)
{
$self->[REGEX] .= '(' . join( '', @_ ) . ')';
# otherwise, this is the opening op of a multi-call remember block
# XXX: might store calling line for verbose debugging
}
else
{
$self->[REGEX] .= '(';
push @{ $self->[STACK] }, ')';
}
return $self;
}
sub end
{
my ( $self, $levels ) = @_;
$levels = 1 unless defined $levels;
unless ( @{ $self->[STACK] } )
{
require Carp;
Carp::confess( 'end() called without remember()' );
}
$self->[REGEX] .= pop @{ $self->[STACK] } for 1 .. $levels;
return $self;
}
sub new
{
bless( [ '', [], [] ], $_[0] );
}
sub match
{
my $self = shift;
$self->[REGEX] = $self->compile();
if ( @{ $self->[STORE] } )
{
return $self->capture( $_[0] =~ $self->[REGEX] );
}
else
{
if ( wantarray() )
{
return $_[0] =~ $self->[REGEX];
}
else
{
return ( $_[0] =~ $self->[REGEX] )[0];
}
}
}
sub capture
{
my $self = shift;
for my $ref ( @{ $self->[STORE] } )
{
$$ref = shift @_;
}
if ( wantarray() )
{
return map { $$_ } @{ $self->[STORE] };
}
else
{
return ${ ${ $self->[STORE] }[0] };
}
}
sub compile
{
my $self = shift;
if ( my $num = @{ $self->[STACK] } )
{
$self->end($num);
}
return qr/$self->[REGEX]/;
}
sub debug
{
my $self = shift;
return $self->[REGEX];
}
%EXPORT_TAGS =
(
all => [ @char_tags, @export ],
chars => \@char_tags,
standard => \@export,
);
1;
__END__
=head1 NAME
Regexp::English - Perl module to create regular expressions more verbosely
=head1 SYNOPSIS
use Regexp::English;
my $re = Regexp::English
-> start_of_line
-> literal('Flippers')
-> literal(':')
-> optional
-> whitespace_char
-> end
-> remember
-> multiple
-> digit;
while () {
if (my $match = $re->match($_)) {
print "$match\n";
}
}
=head1 DESCRIPTION
Regexp::English provides an alternate regular expression syntax, one that is
slightly more verbose than the standard mechanisms. In addition, it adds a few
convenient features, like incremental expression building and bound captures.
You can access almost every regular expression available in Regexp::English can
through a method, though some are also (or only) available as functions. These
methods fall into several categories: characters, quantifiers, groupings, and
miscellaneous. The division wouldn't be so rough if the latter had a better
name.
All methods return the Regexp::English object, so you can chain method calls as
in the example above. Though there is a C method, you can use any
character method, or C, to create an object.
To perform a match, use the C method. Alternately, if you use a
Regexp::English object as if it were a compiled regular expression, the module
will automatically compile it behind the scenes.
=head2 Characters
Character methods correspond to standard regular expression characters and
metacharacters, for the most part. As a little bit of syntactic sugar, most of
these methods have plurals, negations, and negated plurals. This is more clear
looking at them. Though the point of these is to be available as calls on a
new Regexp::English object while building up larger regular expressions, you
may also used them as class methods to access regular expression atoms which
you then use in larger regular expressions. This isn't entirely pretty, but it
ought to work just about everywhere.
=over 4
=item * C
Matches the provided literal string. This method passes C<$string> through
C automatically. If you receive strange results, it's probably
because of this.
=item * C
Creates and matches a character class of the provided C<@characters>. Note
that there is currently no validation of the character class, so you can create
an uncompilable regular expression if you're not careful.
=item * C
Matches any word character, respecting the current locale. By default, this
matches alphanumerics and the underscore, corresponding to the C<\w> token.
=item * C
Matches at least one word character.
=item * C
Matches any non-word character.
=item * C
Matches at least one non-word character.
=item * C
Matches any whitespace character, corresponding to the C<\s> token.
=item * C
Matches at least one whitespace characters.
=item * C
Matches a single non-whitespace character.
=item * C
Matches at least one non-whitespace characters.
=item * C
Matches any numeric digit, corresponding to the C<\d> token.
=item * C
Matches at least one numeric digits.
=item * C
Matches a character that is not a digit.
=item * C
Matches at least one non-digit characters.
=item * C
Matches a tab character (C<\t>)
=item * C
Matches at least one tab characters.
=item * C
Matches any character that is not a tab.
=item * C
Matches a newline character (C<\n>). This implies the C modifier.
=item * C
Matches at least one newline characters. This also implies the C modifier.
=item * C
Matches any character that is not a newline.
=item * C
Matches a carriage return character (C<\r>).
=item * C
Matches at least one carriage return characters.
=item * C
Matches any character that is not a carriage return.
=item * C
Matches a form feed character (C<\f>).
=item * C
Matches at least one form feed characters.
=item * C
Matches any character that is not a form feed character.
=item * C
Matches an alarm character (C<\a>).
=item * C
Matches more than one alarm character.
=item * C
Matches anything but an alarm character.
=item * C
Matches an escape character (C<\e>).
=item * C
Matches at least one escape characters.
=item * C
Matches a single non-escape character.
=item * C
Matches the start of a line, just like the C<^> anchor.
=item * C
Matches the beginning of a string, much like the C<^> anchor.
=item * C
Matches the end of a line, just like the C<$> anchor.
=item * C
Matches the end of a string, much like the C<$> anchor, treating newlines
appropriately depending on the C or C modifier.
=item * C
Matches the very end of a string, just as the C<\z> token. This does not
ignore a trailing newline (if it exists).
=item * C
Matches the point at which a previous match ended, in a C<\g>lobally-matched
regular expression. This corresponds to the C<\G> token and relates to
C.
=item * C
Matches the zero-width boundary between a word character and a non-word
character, corresponding to the C<\b> token.
=item * C
Matches anything that is not a word boundary.
=back
=head2 Quantifiers
Quantifiers provide a mechanism to specify how many items to expect, in general
or specific terms. You may have these exported into the calling package's
namespace with the C<:standard> argument to the C