package Regexp::Grammars; use warnings; use strict; use 5.010; use Scalar::Util qw< blessed >; use Data::Dumper qw< Dumper >; our $VERSION = 1.001_005; # Load the module... sub import { # Signal lexical scoping (active, unless something was exported)... $^H{'Regexp::Grammars::active'} = 1; # Process any regexes in module's active lexical scope... use overload; overload::constant( qr => sub { my ($raw, $cooked, $type) = @_; # In active scope and really a regex... if (_module_is_active() && $type =~ /qq?/) { return bless \$raw, 'Regexp::Grammars::Precursor'; } # Ignore everything else... else { return $cooked; } } ); } # Deactivate module's regex effect when it is "anti-imported" with 'no'... sub unimport { # Signal lexical (non-)scoping... $^H{'Regexp::Grammars::active'} = 0; } # Tidy up the hoopy user-defined pragma interface... sub _module_is_active { return (caller 1)[10]->{'Regexp::Grammars::active'}; } #=====[ COMPILE-TIME INTERIM REPRESENTATION OF GRAMMARS ]=================== { package Regexp::Grammars::Precursor; # Only translate precursors once... state %grammar_cache; use overload ( # Concatenation/interpolation just concatenates to the precursor... q{.} => sub { my ($x, $y, $reversed) = @_; if (ref $x) { $x = ${$x} } if (ref $y) { $y = ${$y} } if ($reversed) { ($y,$x) = ($x,$y); } $x .= $y//q{}; return bless \$x, 'Regexp::Grammars::Precursor'; }, # Using as a string (i.e. matching) preprocesses the precursor... q{""} => sub { my ($obj) = @_; use Scalar::Util qw< refaddr >; return $grammar_cache{ refaddr($obj) } //= Regexp::Grammars::_build_grammar( ${$obj} ); }, # Everything else, as usual... fallback => 1, ); } #=====[ SUPPORT FOR THE INTEGRATED DEBUGGER ]========================= # All messages go to STDERR by default... *Regexp::Grammars::LOGFILE = *STDERR{IO}; # Debugging levels indicate where to stop... our %DEBUG_LEVEL = ( off => 0, # No more debugging run => 1, continue => 1, # Run to completion of regex match match => 2, on => 2, # Run to next successful submatch step => 3, try => 3, # Run to next reportable event ); # Debugging levels can be abbreviated to one character during interactions... @DEBUG_LEVEL{ map {substr($_,0,1)} keys %DEBUG_LEVEL } = values %DEBUG_LEVEL; $DEBUG_LEVEL{o} = $DEBUG_LEVEL{off}; # Width of leading context field in debugging messages is constrained... my $MAX_CONTEXT_WIDTH = 20; my $MIN_CONTEXT_WIDTH = 6; # Rewrite a string currently being matched, to make \n and \t visible sub _show_metas { my $context_str = shift // q{}; # Quote newlines (\n -> \\n, without using a regex)... my $index = index($context_str,"\n"); while ($index >= 0) { substr($context_str, $index, 1, '\\n'); $index = index($context_str,"\n",$index+2); } # Quote tabs (\t -> \\t, without using a regex)... $index = index($context_str,"\t"); while ($index >= 0) { substr($context_str, $index, 1, '\\t'); $index = index($context_str,"\t",$index+2); } return $context_str; } # Minimize whitespace in a string... sub _squeeze_ws { my ($str) = @_; $str =~ tr/\n\t/ /; my $index = index($str,q{ }); while ($index >= 0) { substr($str, $index, 2, q{ }); $index = index($str,q{ },$index); } return $str; } # Prepare for debugging... sub _init_try_stack { our (@try_stack, $last_try_pos, $last_context_str); # Start with a representation of the entire grammar match... @try_stack = ({ subrule => '', height => 0, errmsg => ' \\FAIL ', }); # Initialize tracking of location and context... $last_try_pos = -1; $last_context_str = q{}; # Report... say {*Regexp::Grammars::LOGFILE} _debug_context('=>') . 'Trying from position ' . pos(); } # Create a "context string" showing where the regex is currently matching... sub _debug_context { my ($fill_chars) = @_; # Determine minimal sufficient width for context field... my $field_width = length(_show_metas($_)); if ($field_width > $MAX_CONTEXT_WIDTH) { $field_width = $MAX_CONTEXT_WIDTH; } elsif ($field_width < $MIN_CONTEXT_WIDTH) { $field_width = $MIN_CONTEXT_WIDTH; } # Get current matching position (and some additional trailing context)... my $context_str = substr(_show_metas(substr($_//q{},pos(),$field_width)),0,$field_width); # Build the context string, handling special cases... our $last_context_str; if ($fill_chars) { # If caller supplied a 1- or 2-char fill sequence, use that instead... my $last_fill_char = length($fill_chars) > 1 ? substr($fill_chars,-1,1,q{}) : $fill_chars ; $context_str = $fill_chars x ($field_width-1) . $last_fill_char; } else { # Make end-of-string visible in empty context string... if ($context_str eq q{}) { $context_str = '[eos]'; } # Don't repeat consecutive identical context strings... if ($context_str eq $last_context_str) { $context_str = q{ } x $field_width; } else { # If not repeating, remember for next time... $last_context_str = $context_str; } } # Left justify and return context string... return sprintf("%-*s ",$field_width,$context_str); } # Show a debugging message (mainly used for compile-time errors and info)... sub _debug_notify { # Single arg is a line to be printed with a null severity... my ($severity, @lines) = @_==1 ? (q{},@_) : @_; chomp @lines; # Formatting string for all lines... my $format = qq{%*s | %s\n}; # Track previous severity and avoid repeating the same level... state $prev_severity = q{}; if ($severity eq $prev_severity) { $severity = q{}; } else { $prev_severity = $severity; } # Display first line with severity indicator (unless same as previous)... printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, $severity, shift @lines; # Display first line without severity indicator for my $next_line (@lines) { printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, q{}, $next_line; } } # Handle user interactions during runtime debugging... sub _debug_interact { my ($stack_height, $leader, $curr_frame_ref, $min_debug_level) = @_; our $DEBUG; # ...stores current debug level within regex # Only interact with terminals, and if debug level is appropriate... if (-t *Regexp::Grammars::LOGFILE && defined $DEBUG && ($DEBUG_LEVEL{$DEBUG}//0) >= $DEBUG_LEVEL{$min_debug_level}) { INPUT: while (1) { my $cmd = <>; chomp $cmd; # Input of 'd' means 'display current result frame'... if ($cmd eq 'd') { print {*Regexp::Grammars::LOGFILE} join "\n", map { $leader . ($stack_height?'| ':q{}) . ' : ' . $_ } split "\n", q{ }x8 . substr(Dumper($curr_frame_ref),8); print "\t"; } # Any other (valid) input changes debugging level and continues... else { if (exists $DEBUG_LEVEL{$cmd}) { $DEBUG = $cmd; } last INPUT; } } } # When interaction not indicated, just complete the debugging line... else { print {*Regexp::Grammars::LOGFILE} "\n"; } } # Handle reporting of unsuccessful match attempts... sub _debug_handle_failures { my ($stack_height, $subrule, $in_match) = @_; our @try_stack; # Unsuccessful match attempts leave "leftovers" on the attempt stack... CLEANUP: while (@try_stack && $try_stack[-1]{height} >= $stack_height) { # Grab record of (potentially) unsuccessful attempt... my $error_ref = pop @try_stack; # If attempt was the one whose match is being reported, go and report... last CLEANUP if $in_match && $error_ref->{height} == $stack_height && $error_ref->{subrule} eq $subrule; # Otherwise, report the match failure... say {*Regexp::Grammars::LOGFILE} _debug_context(q{ }) . $error_ref->{errmsg}; } } # Handle attempts to call non-existent subrules... sub _debug_fatal { my ($naughty_construct) = @_; print {*Regexp::Grammars::LOGFILE} "_________________________________________________________________\n", "Fatal error: Entire parse terminated prematurely while attempting\n", " to call non-existent rule: $naughty_construct\n", "_________________________________________________________________\n"; $@ = "Entire parse terminated prematurely while attempting to call non-existent rule: $naughty_construct"; } # Print a message in context... sub _debug_logmsg { my ($stack_height, @msg) = @_; # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-1) . '|'; # Report the attempt... print {*Regexp::Grammars::LOGFILE} map { "$leader$_\n" } @msg; } # Print a message indicating a (sub)match attempt... sub _debug_trying { my ($stack_height, $curr_frame_ref, $subrule) = @_; # Clean up after any preceding unsuccessful attempts... _debug_handle_failures($stack_height, $subrule); # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-2); # Detect and report any backtracking prior to this attempt... our $last_try_pos; #...Stores the pos() of the most recent match attempt? my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . q{ and trying new match} ; } # Report the attempt... print {*Regexp::Grammars::LOGFILE} $leader, "|...Trying $subrule\t"; # Handle user interactions during debugging... _debug_interact($stack_height, $leader, $curr_frame_ref, 'step'); # Record the attempt, for later error handling in _debug_matched()... our @try_stack; push @try_stack, { height => $stack_height, subrule => $subrule, # errmsg should align under: |...Trying $subrule\t errmsg => q{| } x ($stack_height-2) . "| \\FAIL $subrule", }; $last_try_pos = pos(); } # Print a message indicating a successful (sub)match... sub _debug_matched { my ($stack_height, $curr_frame_ref, $subrule, $text) = @_; # Clean up any intervening unsuccessful attempts... _debug_handle_failures($stack_height, $subrule, 'in match'); # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-2); my $message = ($stack_height ? '| ' : q{}) . " \\_____$subrule matched '"; my $filler = $stack_height ? '| ' . q{ } x (length($message)-4) : q{ } x length($message); # Split multi-line match texts and indent them correctly... $text = join "\n$leader$filler", split "\n", $text; our $last_try_pos; #...Stores the pos() of the most recent match attempt? # Report if match required backtracking... my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching $subrule} ; } $last_try_pos = pos(); # Print match message... print {*Regexp::Grammars::LOGFILE} $leader . $message . $text . qq{'\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, $stack_height ? 'match' : 'run'); } # Print a message indicating a successful (sub)match... sub _debug_require { my ($stack_height, $condition, $succeeded) = @_; # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-1); my $message1 = ($stack_height ? '|...' : q{}) . "Testing condition: $condition" ; my $message2 = ($stack_height ? '| ' : q{}) . " \\_____" . ($succeeded ? 'Satisified' : 'FAILED') ; # Report if match required backtracking... our $last_try_pos; my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-1) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching} ; } # Remember where the condition was tried... $last_try_pos = pos(); # Print match message... say {*Regexp::Grammars::LOGFILE} $leader . $message1; say {*Regexp::Grammars::LOGFILE} $leader . $message2; } # Print a message indicating a successful store-result-of-code-block... sub _debug_executed { my ($stack_height, $curr_frame_ref, $subrule, $value) = @_; # Build message... my $leader = _debug_context() . q{| } x ($stack_height-2); my $message = "|...Action $subrule\n"; my $message2 = "| saved value: '"; $message .= $leader . $message2; my $filler = q{ } x length($message2); # Split multiline results over multiple lines (properly indented)... $value = join "\n$leader$filler", split "\n", $value; # Report the action... print {*Regexp::Grammars::LOGFILE} $leader . $message . $value . qq{'\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, 'match'); } # Create the code to be inserted into the regex to facilitate debugging... sub _build_debugging_statements { my ($debugging_active, $subrule, $extra_pre_indent) = @_; return (q{}, q{}) if ! $debugging_active;; $extra_pre_indent //= 0; $subrule = "q{$subrule}"; return ( qq{ Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], $subrule) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}; }, qq{ Regexp::Grammars::_debug_matched(\@Regexp::Grammars::RESULT_STACK+1, \$Regexp::Grammars::RESULT_STACK[-1], $subrule, \$^N) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}; }, ); } #=====[ SUPPORT FOR UPDATING THE RESULT STACK ]========================= # Create a clone of the current result frame with an new key/value... sub _extend_current_result_frame_with_scalar { my ($stack_ref, $key, $value) = @_; # Autovivify null stacks (only occur when grammar invokes no subrules)... if (!@{$stack_ref}) { $stack_ref = [{}]; } # Copy existing frame, appending new value so it overwrites any old value... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => $value, }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Create a clone of the current result frame with an additional key/value # (As above, but preserving the "listiness" of the key being added to)... sub _extend_current_result_frame_with_list { my ($stack_ref, $key, $value) = @_; # Copy existing frame, appending new value to appropriate element's list... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => [ @{$stack_ref->[-1]{$key}//[]}, $value, ], }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing)... sub _pop_current_result_frame { my ($stack_ref, $key, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Nest a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame}, $key => exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}, $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing) # (As above, but preserving listiness of key being added to)... sub _pop_current_result_frame_with_list { my ($stack_ref, $key, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Append a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame}, $key => [ @{$caller_frame->{$key}//[]}, exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value ], }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}[-1], $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } #=====[ MISCELLANEOUS CONSTANTS ]========================= # This code inserted at the start of every grammar regex # (initializes the result stack cleanly and backtrackably, via local)... my $PROLOGUE = q{((?{; @! = () if !pos; local @Regexp::Grammars::RESULT_STACK = (@Regexp::Grammars::RESULT_STACK, {}); local $Regexp::Grammars::DEBUG = 'off' }) }; # This code inserted at the end of every grammar regex # (grabs final result and stores it in %/. Also defines default rule)... my $EPILOGUE = q{ )(?{; $Regexp::Grammars::RESULT_STACK[-1]{""} //= $^N; local $Regexp::Grammars::match_frame = pop @Regexp::Grammars::RESULT_STACK; delete @{$Regexp::Grammars::match_frame}{ grep {substr($_,0,1) eq '_'} keys %{$Regexp::Grammars::match_frame} }; if (@Regexp::Grammars::RESULT_STACK) { $Regexp::Grammars::RESULT_STACK[-1]{'(?R)'} = $Regexp::Grammars::match_frame; } */ = $Regexp::Grammars::match_frame; })(?(DEFINE) (?(?:\\s*)) (?(?:\\S+)) ) }; #=====[ MISCELLANEOUS PATTERNS THAT MATCH USEFUL THINGS ]======== # Match an identifier... my $IDENT = q{[^\W\d]\w*+}; # Match balanced parentheses, taking into account \-escapes and []-escapes... my $PARENS = qr{ (?&PARENS) (?(DEFINE) (? \( (?: \\. | (?&PARENS) | (?&CHARSET) | [^][()\\]++)*+ \) ) (? \[ \^?+ \]?+ [^]]*+ \] ) ) }xms; #=====[ SUPPORT FOR TRANSLATING GRAMMAR-ENHANCED REGEX TO NATIVE REGEX ]==== my %REPETITION_DESCRIPTION_FOR = ( '+' => 'once or more', '*' => 'any number of times', '?' => 'if possible', '+?' => 'as few times as possible', '*?' => 'as few times as possible', '??' => 'if necessary', '++' => 'as many times as possible', '*+' => 'as many times as possible', '?+' => 'if possible', ); sub _translate_raw_regex { my ($regex, $debug_build) = @_; my $is_comment = substr($regex, 0, 1) eq q{#}; my $visible_regex = _squeeze_ws($regex); # Report how regex was interpreted, if requested to... if ($debug_build && $visible_regex ne q{} && $visible_regex ne q{ }) { _debug_notify( info => " |", " |...Treating '$visible_regex' as:", ($is_comment ? " | \\ a comment (which will be ignored)" : " | \\ normal Perl regex syntax" ), ); } return $is_comment ? q{} : $regex; } # Report and convert a debugging directive... sub _translate_debug_directive { my ($construct, $cmd, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Change run-time debugging mode to '$cmd'", ); } return qq{(?{; local \$Regexp::Grammars::DEBUG = q{$cmd}; }) }; } # Report and convert a directive... sub _translate_require_directive { my ($construct, $condition, $debug_build) = @_; $condition = substr($condition, 3, -2); # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Require that {$condition} is true", ); } my $quoted_condition = $condition; $quoted_condition =~ s{\$}{}xms; return qq{(?(?{;$condition}) (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 1) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}}) | (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 0) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}})(?!)) }; } # Report and convert a debugging directive... sub _translate_error_directive { my ($construct, $type, $msg, $debug_build) = @_; # Determine severity... my $severity = ($type eq 'error') ? 'fatal' : 'non-fatal'; # Unpack message... if (substr($msg,0,3) eq '(?{') { $msg = 'do'. substr($msg,2,-1); } elsif ($type ne 'log' && (lc(substr($msg,0,9) ) eq 'expected ' || lc(substr($msg,0,10)) eq 'expecting ')) { $msg = qq{q{$msg, but found '}.\$CONTEXT.q{' instead}}; } else { $msg = qq{q{$msg}}; } # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", ( $type eq 'log' ? " | \\ Log a message to the logfile" : " | \\ Append a $severity error message to \@!" ), ); } # Generate the regex... return $type eq 'log' ? qq{(?{Regexp::Grammars::_debug_logmsg(scalar \@Regexp::Grammars::RESULT_STACK,$msg) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG} })} : qq{(?:(?{;local \$Regexp::Grammar::_memopos=pos();}) (?>\\s*+((?-s).{0,$MAX_CONTEXT_WIDTH}+)) (?{; pos() = \$Regexp::Grammar::_memopos; push @!, $msg }) (?!)|} . ($severity eq 'fatal' ? q{(?!)} : q{}) . q{)} ; } sub _translate_subpattern { my ($construct, $alias, $subpattern, $savemode, $postmodifier, $debug_build, $debug_runtime) = @_; # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; my $is_codeblock = substr($subpattern,0,3) eq '(?{' || substr($subpattern,0,4) eq '(??{'; my $value_saved = $is_codeblock ? '$^R' : '$^N'; my $do_something_with = $is_codeblock ? 'execute the code block' : 'match the pattern'; my $result = $is_codeblock ? 'result' : 'matched substring'; my $description = $is_codeblock ? substr($subpattern,2,-1) : $subpattern; my $debug_construct = $is_codeblock ? '<' . substr($alias,1,-1) . '= (?{;' . substr($subpattern,3,-2) . '})>' : $construct ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? "each $result" : substr($postmodifier,0,1) eq '?' ? "any $result" : $postmodifier && !$is_noncapturing ? "only the final $result" : "the $result" ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | $do_something_with $description $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$debug_construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, $value_saved ), );} ; # Translate to standard regex code... return qq{(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre}) (?:($subpattern)(?{;$post_action$debug_post}))$postmodifier}; } sub _translate_hashmatch { my ($construct, $alias, $hashname, $savemode, $postmodifier, $debug_build, $debug_runtime) = @_; # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; # Convert hash to hash lookup... my $hash_lookup = '$' . substr($hashname, 1). '{$^N}'; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each matched key' : substr($postmodifier,0,1) eq '?' ? 'any matched key' : $postmodifier && !$is_noncapturing ? 'only the final matched key' : 'the matched key' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | match a key from the hash $hashname $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, \$^N ), );} ; # Translate to standard regex code... return qq{(?:(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre}) (?:((?&hk))(??{exists $hash_lookup ? q{} : q{(?!)}})(?{;$post_action$debug_post})))$postmodifier}; } # Convert a " ** " construct to pure Perl 5.10... sub _translate_separated_list { my ($term, $separator, $term_trans, $sep_trans, $ws, $debug_build, $debug_runtime) = @_; # Translate meaningful whitespace... $ws = length($ws) ? q{(?&ws)} : q{}; # Report how construct was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $term ** $separator as:", " | | repeatedly match the subrule $term", " | \\ as long as the matches are separated by matches of $separator", ); } # Translate to list-matching pattern... state $checkpoint = q{(?{;@Regexp::Grammars::RESULT_STACK = @Regexp::Grammars::RESULT_STACK;})}; return qq{(?:$ws$checkpoint$sep_trans$ws$term_trans)*}; } sub _translate_subrule_call { my ( $construct, $alias, $subrule, $savemode, $postmodifier, $debug_build, $debug_runtime, $valid_subrule_names_ref) = @_; # Shortcircuit if unknown subrule invoked... if (!$valid_subrule_names_ref->{$subrule}) { _debug_notify( error => qq{Found call to $construct, but no or}, qq{ was defined in the grammar}, qq{(Did you misspell the rule name or forget to define the rule?)}, q{}, ); return "(?{Regexp::Grammars::_debug_fatal('$construct')})(*COMMIT)(*FAIL)"; } # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; my $save_code = $is_noncapturing? q{ @Regexp::Grammars::RESULT_STACK[0..@Regexp::Grammars::RESULT_STACK-2] } : $is_listifying? qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame_with_list( \\\@Regexp::Grammars::RESULT_STACK, $alias, \$^N ), } : qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame( \\\@Regexp::Grammars::RESULT_STACK, $alias, \$^N ), } ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each match' : substr($postmodifier,0,1) eq '?' ? 'any match' : 'the match' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | match the subrule <$subrule> $repeatedly", ( $is_noncapturing ? " | \\ but don't save anything" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ), ); } # Generate post-match result-capturing code, if match captures... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime, $construct); # Translate to standard regex code... return qq{(?:(?{; local \@Regexp::Grammars::RESULT_STACK = (\@Regexp::Grammars::RESULT_STACK, {}); $debug_pre})((?&$subrule))(?{; local \@Regexp::Grammars::RESULT_STACK = ( $save_code );$debug_post }))$postmodifier}; } sub _translate_rule_def { my ($type, $qualifier, $name, $body, $objectify) = @_;; # Return object if requested... my $objectification = $objectify ? qq{(?{; local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; bless \$Regexp::Grammars::RESULT_STACK[-1], '$qualifier$name' })} : q{}; # Each rule or token becomes a DEFINE'd Perl 5.10 named capture... return qq{ (?(DEFINE) (?<$name> (?{\$Regexp::Grammars::RESULT_STACK[-1]{'!'}=\$#{!};}) (?:$body) $objectification (?{;\$#{!}=delete \$Regexp::Grammars::RESULT_STACK[-1]{'!'};}) ) ) }; } # Locate any valid <...> sequences and replace with native regex code... sub _translate_subrule_calls { my ($grammar_spec, $compiletime_debugging_requested, $runtime_debugging_requested, $pre_match_debug, $post_match_debug, $expectation, $subrule_names_ref, ) = @_; # Remember the preceding construct, so as to implement the ** operator... my $prev_construct = q{}; my $prev_translation = q{}; # Translate all other calls... $grammar_spec =~ s{ (? (? \s*+) \*\* (? \s*+) )? (? < (?: (? \. \s* (?(?&IDENT)) \s* ) | (? \s* (?(?&IDENT)) \s* ) | (? \[ \s* (?(?&IDENT)) \s* \] ) | (? (?(?&IDENT)) \s* = \s* (?(?&IDENT)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&IDENT)) \s* \] ) | (? \. (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)) \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)) \s* \] ) | (? (?(?&HASH)) \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* \] ) | (? require \s* : \s* (? (?&PARENCODE) ) \s* ) | (? debug \s* : \s* (? run | match | step | try | off | on) \s* ) | (? error \s*+ : \s*+ ) | (? (? log | error | warning ) \s*+ : \s*+ (? (?&PARENCODE) | .+? ) \s*+ ) ) > (? \s* (?! \*\* ) [?+*][?+]? | ) | (? \\. | (?&PARENS) | (?&CHARSET) | \# [^\n]*+ | [^][<>#\\]++ ) ) (?(DEFINE) (? \( (?: \\. | (?&PARENS) | (?&CHARSET) | [^][()\\<>]++ )*+ \) ) (? \{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \} ) (? \(\?\{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \}\) ) (? \% (?&IDENT) (?: :: (?&IDENT) )* ) (? \[ \^?+ \]?+ [^]]*+ \] ) (? [^\W\d]\w*+ ) ) }{ my $curr_construct = $+{construct}; my $alias = ($+{alias}//'MATCH') eq 'MATCH' ? q{'='} : qq{'$+{alias}'}; # Determine and remember the necessary translation... my $curr_translation = do{ if ($+{alias_parens_scalar}) { _translate_subpattern( $curr_construct, $alias, $+{pattern}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_parens_scalar_nocap}) { _translate_subpattern( $curr_construct, $alias, $+{pattern}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_parens_list}) { _translate_subpattern( $curr_construct, $alias, $+{pattern}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_hash_scalar}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_hash_scalar_nocap}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_hash_list}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested ); } elsif ($+{alias_subrule_scalar}) { _translate_subrule_call( $curr_construct, $alias, $+{subrule}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $subrule_names_ref, ); } elsif ($+{alias_subrule_list}) { _translate_subrule_call( $curr_construct, $alias, $+{subrule}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $subrule_names_ref, ); } elsif ($+{self_subrule_scalar_nocap}) { _translate_subrule_call( $curr_construct, qq{'$+{subrule}'}, $+{subrule}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $subrule_names_ref, ); } elsif ($+{self_subrule_scalar}) { _translate_subrule_call( $curr_construct, qq{'$+{subrule}'}, $+{subrule}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $subrule_names_ref, ); } elsif ($+{self_subrule_list}) { _translate_subrule_call( $curr_construct, qq{'$+{subrule}'}, $+{subrule}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $subrule_names_ref, ); } elsif ($+{raw_regex}) { _translate_raw_regex( $+{raw_regex}, $compiletime_debugging_requested ); } elsif ($+{require_directive}) { _translate_require_directive( $curr_construct, $+{condition}, $compiletime_debugging_requested ); } elsif ($+{debug_directive}) { _translate_debug_directive( $curr_construct, $+{cmd}, $compiletime_debugging_requested ); } elsif ($+{error_directive}) { _translate_error_directive( $curr_construct, $+{error_type}, $+{msg}, $compiletime_debugging_requested ); } elsif ($+{autoerror_directive}) { _translate_error_directive( $curr_construct, 'error', "Expected $expectation", $compiletime_debugging_requested ); } else { die qq{Internal error: this shouldn't happen!\nNear $curr_construct}; } }; # Handle the ** operator... if ($+{list_marker}) { my $ws = $+{ws1} . $+{ws2}; $curr_translation = _translate_separated_list( $prev_construct, $curr_construct, $prev_translation, $curr_translation, $ws, $compiletime_debugging_requested, $runtime_debugging_requested ); $curr_construct = qq{$prev_construct ** $curr_construct}; } # Finally, remember this latest translation, and return it... $prev_construct = $curr_construct; $prev_translation = $curr_translation;; }exmsg; # Translate magic hash accesses... $grammar_spec =~ s{\$MATCH (?= \s*\{) } # ...Access named entry in hash {\$Regexp::Grammars::RESULT_STACK[-1]}xmsg; # Translate magic scalars and hashes... state $translate_scalar = { q{%$MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{@$MATCH} => q{@{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{$MATCH} => q{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}, q{$CAPTURE} => q{$^N}, q{$CONTEXT} => q{$^N}, q{$DEBUG} => q{$Regexp::Grammars::DEBUG}, q{$INDEX} => q{${\\pos()}}, q{%MATCH} => q{$Regexp::Grammars::RESULT_STACK[-1]}, }; state $translatable_scalar = join '|', map {quotemeta $_} sort {length $b <=> length $a} keys %{$translate_scalar}; $grammar_spec =~ s{ ($translatable_scalar) (?! \s* (?: \[ | \{) ) } {$translate_scalar->{$1}}oxmsg; return $grammar_spec; } # Generate a "decimal timestamp" and insert in a template... sub _timestamp { my ($template) = @_; # Generate and insert any timestamp... if ($template =~ /%t/) { my ($sec, $min, $hour, $day, $mon, $year) = localtime; $mon++; $year+=1900; my $timestamp = sprintf("%04d%02d%02d.%02d%02d%02d", $year, $mon, $day, $hour, $min, $sec); $template =~ s{%t}{$timestamp}xms;; } return $template; } # Open (or re-open) the requested log file... sub _autoflush { my ($fh) = @_; my $originally_selected = select $fh; $|=1; select $originally_selected; } sub _open_log { my ($mode, $filename) = @_; # Special case: '-' --> STDERR if ($filename eq q{-}) { return *STDERR{IO}; } # Otherwise, just open the named file... elsif (open my $fh, $mode, $filename) { _autoflush($fh); return $fh; } # Otherwise, generate a warning and default to STDERR... else { local *Regexp::Grammars::LOGFILE = *STDERR{IO}; _debug_notify( warn => qq{Unable to open log file '$filename'}, qq{($!)}, qq{Defaulting to STDERR instead.}, ); _debug_notify( q{} => q{} ); return *STDERR{IO}; } } # Transform grammar-augmented regex into pure Perl 5.10 regex... sub _build_grammar { my ($grammar_spec) = @_; $grammar_spec .= q{}; # Check for dubious repeated constructs that throw away captures... my @dubious = $grammar_spec =~ m{ < (?! \[ ) ( $IDENT (?: = [^>]*)? ) > \s* ([+*][?+]?|\{.*\}[?+]?) }gxms; # Report dubiousities... while (@dubious) { my ($rule, $qual) = splice @dubious, 0, 2; _debug_notify( warn => qq{Repeated subrule <$rule>$qual will only capture its final match}, qq{(Did you mean <[$rule]>$qual instead?)}, q{}, ) } # Check for dubious non-backtracking constructs... @dubious = $grammar_spec =~ m{ < ( [^>]+ ) > \s* ([?+*][+]|\{.*\}[+]) }gxms; # Report dubiousities... while (@dubious) { my ($rule, $qual) = splice @dubious, 0, 2; my $safe_qual = substr($qual,0,-1); _debug_notify( warn => qq{Non-backtracking subrule <$rule>$qual not fully supported yet}, qq{(If grammar does not work try <$rule>$safe_qual instead)}, q{}, ) } # Check whether a log file was specified... my $compiletime_debugging_requested; local *Regexp::Grammars::LOGFILE = *Regexp::Grammars::LOGFILE; my $logfile = q{-}; $grammar_spec =~ s{ ^ [^#]* < logfile: \s* ([^>]+?) \s* > }{ $logfile = _timestamp($1); # Presence of implies compile-time logging... $compiletime_debugging_requested = 1; *Regexp::Grammars::LOGFILE = _open_log('>',$logfile); # Delete directive... q{}; }gexms; # Look ahead for any run-time debugging requests... my $runtime_debugging_requested = $grammar_spec =~ m{ ^ [^#]* < debug: \s* (run | match | step | try | on | off) \s* > | \$DEBUG (?! \s* (?: \[ | \{) ) }xms; # Standard actions set up and clean up any regex debugging... # Before entire match, set up a stack of attempt records and report... my $pre_match_debug = $runtime_debugging_requested ? qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile'); Regexp::Grammars::_init_try_stack(); })} : qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile'); })} ; # After entire match, report whether successful or not... my $post_match_debug = $runtime_debugging_requested ? qq{(?{;Regexp::Grammars::_debug_matched(0,\\%/,'',\$^N)}) |(?>(?{;Regexp::Grammars::_debug_handle_failures(0,''); }) (?!)) } : q{} ; # Subdivide into rule and token definitions, preparing to process each... my @defns = split m{ < (obj|)(rule|token) \s*+ : \s*+ ((?:${IDENT}::)*+)($IDENT) \s* > }xms, $grammar_spec; # Extract up list of names of defined rules/tokens... # (Name is every 4th item out of every five, skipping the first item) my %subrule_names = map { $_ => 1 } @defns[ map { $_ * 5 + 4 } 0 .. ((@defns-1)/5-1) ]; # Report how main regex was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Processing the main regex before any rule definitions", ); } # Any actual regex is processed first... my $regex = _translate_subrule_calls( shift @defns, $compiletime_debugging_requested, $runtime_debugging_requested, $pre_match_debug, $post_match_debug, 'valid input', # Expected...what? \%subrule_names, ); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of main regex}, q{}, ); } # Then iterate any following rule definitions... while (@defns) { # Grab details of each rule defn (as extracted by previous split)... my ($objectify, $type, $qualifier, $name, $body) = splice(@defns, 0, 5); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Defining a rule: <$name>", " |...Returns: " . ($objectify ? "an object of class '$qualifier$name'" : "a hash"), ); } # Translate any nested <...> constructs... $body = _translate_subrule_calls( $body, $compiletime_debugging_requested, $runtime_debugging_requested, $pre_match_debug, $post_match_debug, lc($name), # Expected...what? \%subrule_names, ); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of rule definition}, q{}, ); } # Rules make non-code literal whitespace match textual whitespace... if ($type eq 'rule') { state $CODE_OR_SPACE = qr{ \( \?\?? (?&BRACED) \) | (? \{ (?: \\. | (?&BRACED) | [^{}] )* \} ) ) }xms; $body =~ s{($CODE_OR_SPACE)} [ substr($1,0,3) eq '(?{' || substr($1,0,4) eq '(??{' ? $1 : '(?&ws)']exmsg; #} } $regex .= _translate_rule_def( $type, $qualifier, $name, $body, $objectify ); } # Insert checkpoints into any user-defined code block... $regex =~ s{ \( \?\?? \{ \K (?!;) }{ local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; }xmsg; # Check for any suspicious left-overs from the start of the regex... pos $regex = 0; # Report anything that starts like a subrule, but isn't... my %seen; while ($regex =~ m{( (?] )}gxms) { my $construct = $1; my $something = $2 ? 'directive' : 'subrule call'; # Only report potential problems once... next if $seen{$construct}++; # Also explain how to indicate the construct is intentional... _debug_notify( warn => qq{Possible invalid $something:}, qq{ $construct}, qq{(To silence this warning, use: \\$construct}, ); _debug_notify( q{} => q{} ); } # Aggregrate the final grammar... _complete_regex($regex, $pre_match_debug, $post_match_debug); } sub _complete_regex { my ($regex, $pre_match_debug, $post_match_debug) = @_; return qq{$pre_match_debug$PROLOGUE$regex$EPILOGUE$post_match_debug}; } 1; # Magic true value required at end of module __END__ =head1 NAME Regexp::Grammars - Add grammatical parsing features to Perl 5.10 regexes =head1 VERSION This document describes Regexp::Grammars version 1.001_005 =head1 SYNOPSIS use Regexp::Grammars; my $parser = qr{ (?: # Parse and save a Verb in a scalar <.ws> # Parse but don't save whitespace # Parse and save a Noun in a scalar 0.5 ? 'VN' : 'VerbNoun' }> # Save result of expression in a scalar | (?: <[Noun]> # Parse a Noun and save result in a list (saved under the key 'Noun') <[PostNoun=ws]> # Parse whitespace, save it in a list # (saved under the key 'PostNoun') )+ # Parse a Verb and save result in a scalar (saved under the key 'Verb') # Save a literal in a scalar | # Turn on the integrated debugger here <.Cmd= (?: mv? )> # Parse but don't capture a subpattern (name it 'Cmd' for debugging purposes) <[File]>+ # Parse 1+ Files and save them in a list (saved under the key 'File') # Turn off the integrated debugger here # Parse a File and save it in a scalar (saved under the key 'Dest') ) ################################################################ # Define a subrule named File <.ws> # - Parse but don't capture whitespace # - Parse the subpattern and capture # matched text as the result of the # subrule # Define a subrule named Noun cat | dog | fish # - Match an alternative (as usual) # Define a whitespace-sensitive subrule eats # - Match a literal (after any space) ? # - Parse optional subrule Noun and # save result under the key 'Object' | # Or else... # - Parse subrule AUX and save result # - Match a literal, save under 'part' # Define a whitespace-insensitive subrule (has | is) # - Match an alternative and capture (?{ $MATCH = uc $^N }) # - Use captured text as subrule result }; # Match the grammar against some text... if ($text =~ $parser) { # If successful, the hash %/ will have the hierarchy of results... process_data_in( %/ ); } =head1 QUICKSTART CHEATSHEET =head2 In your program... use Regexp::Grammars; Allow enhanced regexes in lexical scope %/ Result-hash for successful grammar match =head2 Defining rules in your grammar... Define rule with magic whitespace Define rule without magic whitespace Define rule returning blessed result-hash Define token returning blessed result-hash =head2 Matching rules in your grammar... Call named subrule, save result to $MATCH{RULENAME} <%HASH> Match longest possible key of hash Call subrule, save result in $MATCH{ALIAS} Match a hash key, save in $MATCH{ALIAS} Match pattern, save match in $MATCH{ALIAS} Execute code, save value in $MATCH{ALIAS} <.SUBRULE> Call any kind of subrule (as above), but don't save the result in %MATCH <[SUBRULE]> Call any kind of subrule (as above) but append result instead of overwriting it ** Match one or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 =head2 In your grammar's code blocks... $CAPTURE Alias for $^N (the most recent paren capture) $CONTEXT Another alias for $^N $INDEX Current index of next matching position in string %MATCH Current rule's result-hash $MATCH Magic override value (returned instead of result-hash) $DEBUG Current match-time debugging mode =head2 Debugging support... Fail if code evaluates false Change match-time debugging mode Queue text or value as an error message Change debugging log file (default: STDERR) Explicitly add a message to the log =head1 DESCRIPTION This module adds a small number of new regex constructs that can be used within Perl 5.10 patterns to implement complete recursive-descent parsing. Perl 5.10 already supports recursive=descent I, via the new C<< (?...) >> and C<< (?&name) >> constructs. For example, here is a simple matcher for a subset of the LaTeX markup language: $matcher = qr{ (?&File) (?(DEFINE) (? (?&Element)* ) (? \s* (?&Command) | \s* (?&Literal) ) (? \\ \s* (?&Literal) \s* (?&Options)? \s* (?&Args)? ) (? \[ \s* (?:(?&Option) (?:\s*,\s* (?&Option) )*)? \s* \]) (? \{ \s* (?&Element)* \s* \} ) (?