package Regexp::Grammars; use warnings; use strict; use 5.010; use Scalar::Util qw< blessed >; use Data::Dumper qw< Dumper >; our $VERSION = '1.014'; # Load the module... sub import { # Signal lexical scoping (active, unless something was exported)... $^H{'Regexp::Grammars::active'} = 1; # Process any regexes in module's active lexical scope... use overload; overload::constant( qr => sub { my ($raw, $cooked, $type) = @_; # In active scope and really a regex... if (_module_is_active() && $type =~ /qq?/) { return bless \$cooked, 'Regexp::Grammars::Precursor'; } # Ignore everything else... else { return $cooked; } } ); } # Deactivate module's regex effect when it is "anti-imported" with 'no'... sub unimport { # Signal lexical (non-)scoping... $^H{'Regexp::Grammars::active'} = 0; } # Encapsulate the hoopy user-defined pragma interface... sub _module_is_active { return (caller 1)[10]->{'Regexp::Grammars::active'}; } my $RULE_HANDLER; sub clear_rule_handler { undef $RULE_HANDLER; } { package Regexp; sub with_actions { my ($self, $handler) = @_; $RULE_HANDLER = $handler; return $self; } } #=====[ COMPILE-TIME INTERIM REPRESENTATION OF GRAMMARS ]=================== { package Regexp::Grammars::Precursor; # Only translate precursors once... state %grammar_cache; use overload ( # Concatenation/interpolation just concatenates to the precursor... q{.} => sub { my ($x, $y, $reversed) = @_; if (ref $x) { $x = ${$x} } if (ref $y) { $y = ${$y} } if ($reversed) { ($y,$x) = ($x,$y); } $x .= $y//q{}; return bless \$x, 'Regexp::Grammars::Precursor'; }, # Using as a string (i.e. matching) preprocesses the precursor... q{""} => sub { my ($obj) = @_; use Scalar::Util qw< refaddr >; return $grammar_cache{ refaddr($obj) } //= Regexp::Grammars::_build_grammar( ${$obj} ); }, # Everything else, as usual... fallback => 1, ); } #=====[ SUPPORT FOR THE INTEGRATED DEBUGGER ]========================= # All messages go to STDERR by default... *Regexp::Grammars::LOGFILE = *STDERR{IO}; # Debugging levels indicate where to stop... our %DEBUG_LEVEL = ( same => undef, # No change in debugging mode off => 0, # No more debugging run => 1, continue => 1, # Run to completion of regex match match => 2, on => 2, # Run to next successful submatch step => 3, try => 3, # Run to next reportable event ); # Debugging levels can be abbreviated to one character during interactions... @DEBUG_LEVEL{ map {substr($_,0,1)} keys %DEBUG_LEVEL } = values %DEBUG_LEVEL; $DEBUG_LEVEL{o} = $DEBUG_LEVEL{off}; # Not "on" $DEBUG_LEVEL{s} = $DEBUG_LEVEL{step}; # Not "same" # Width of leading context field in debugging messages is constrained... my $MAX_CONTEXT_WIDTH = 20; my $MIN_CONTEXT_WIDTH = 6; sub set_context_width { { package Regexp::Grammars::ContextRestorer; sub new { my ($class, $old_context_width) = @_; bless \$old_context_width, $class; } sub DESTROY { my ($old_context_width_ref) = @_; $MAX_CONTEXT_WIDTH = ${$old_context_width_ref}; } } my ($new_context_width) = @_; my $old_context_width = $MAX_CONTEXT_WIDTH; $MAX_CONTEXT_WIDTH = $new_context_width; if (defined wantarray) { return Regexp::Grammars::ContextRestorer->new($old_context_width); } } # Rewrite a string currently being matched, to make \n and \t visible sub _show_metas { my $context_str = shift // q{}; # Quote newlines (\n -> \\n, without using a regex)... my $index = index($context_str,"\n"); while ($index >= 0) { substr($context_str, $index, 1, '\\n'); $index = index($context_str,"\n",$index+2); } # Quote tabs (\t -> \\t, without using a regex)... $index = index($context_str,"\t"); while ($index >= 0) { substr($context_str, $index, 1, '\\t'); $index = index($context_str,"\t",$index+2); } return $context_str; } # Minimize whitespace in a string... sub _squeeze_ws { my ($str) = @_; $str =~ tr/\n\t/ /; my $index = index($str,q{ }); while ($index >= 0) { substr($str, $index, 2, q{ }); $index = index($str,q{ },$index); } return $str; } # Prepare for debugging... sub _init_try_stack { our (@try_stack, $last_try_pos, $last_context_str); # Start with a representation of the entire grammar match... @try_stack = ({ subrule => '', height => 0, errmsg => ' \\FAIL ', }); # Initialize tracking of location and context... $last_try_pos = -1; $last_context_str = q{}; # Report... say {*Regexp::Grammars::LOGFILE} _debug_context('=>') . 'Trying from position ' . pos(); } # Create a "context string" showing where the regex is currently matching... sub _debug_context { my ($fill_chars) = @_; # Determine minimal sufficient width for context field... my $field_width = length(_show_metas($_//q{})); if ($field_width > $MAX_CONTEXT_WIDTH) { $field_width = $MAX_CONTEXT_WIDTH; } elsif ($field_width < $MIN_CONTEXT_WIDTH) { $field_width = $MIN_CONTEXT_WIDTH; } # Get current matching position (and some additional trailing context)... my $context_str = substr(_show_metas(substr(($_//q{}).q{},pos()//0,$field_width)),0,$field_width); # Build the context string, handling special cases... our $last_context_str; if ($fill_chars) { # If caller supplied a 1- or 2-char fill sequence, use that instead... my $last_fill_char = length($fill_chars) > 1 ? substr($fill_chars,-1,1,q{}) : $fill_chars ; $context_str = $fill_chars x ($field_width-1) . $last_fill_char; } else { # Make end-of-string visible in empty context string... if ($context_str eq q{}) { $context_str = '[eos]'; } # Don't repeat consecutive identical context strings... if ($context_str eq $last_context_str) { $context_str = q{ } x $field_width; } else { # If not repeating, remember for next time... $last_context_str = $context_str; } } # Left justify and return context string... return sprintf("%-*s ",$field_width,$context_str); } # Show a debugging message (mainly used for compile-time errors and info)... sub _debug_notify { # Single arg is a line to be printed with a null severity... my ($severity, @lines) = @_==1 ? (q{},@_) : @_; chomp @lines; # Formatting string for all lines... my $format = qq{%*s | %s\n}; # Track previous severity and avoid repeating the same level... state $prev_severity = q{}; if ($severity !~ /\S/) { # Do nothing } elsif ($severity eq 'info' && $prev_severity eq 'info' ) { $severity = q{}; } else { $prev_severity = $severity; } # Display first line with severity indicator (unless same as previous)... printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, $severity, shift @lines; # Display first line without severity indicator for my $next_line (@lines) { printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, q{}, $next_line; } } # Handle user interactions during runtime debugging... sub _debug_interact { my ($stack_height, $leader, $curr_frame_ref, $min_debug_level) = @_; our $DEBUG; # ...stores current debug level within regex # Only interact with terminals, and if debug level is appropriate... if (-t *Regexp::Grammars::LOGFILE && defined $DEBUG && ($DEBUG_LEVEL{$DEBUG}//0) >= $DEBUG_LEVEL{$min_debug_level} ) { local $/ = "\n"; # ...in case some caller is being clever INPUT: while (1) { my $cmd = readline // q{}; chomp $cmd; # Input of 'd' means 'display current result frame'... if ($cmd eq 'd') { print {*Regexp::Grammars::LOGFILE} join "\n", map { $leader . ($stack_height?'| ':q{}) . ' : ' . $_ } split "\n", q{ }x8 . substr(Dumper($curr_frame_ref),8); print "\t"; } # Any other (valid) input changes debugging level and continues... else { if (defined $DEBUG_LEVEL{$cmd}) { $DEBUG = $cmd; } last INPUT; } } } # When interaction not indicated, just complete the debugging line... else { print {*Regexp::Grammars::LOGFILE} "\n"; } } # Handle reporting of unsuccessful match attempts... sub _debug_handle_failures { my ($stack_height, $subrule, $in_match) = @_; our @try_stack; # Unsuccessful match attempts leave "leftovers" on the attempt stack... CLEANUP: while (@try_stack && $try_stack[-1]{height} >= $stack_height) { # Grab record of (potentially) unsuccessful attempt... my $error_ref = pop @try_stack; # If attempt was the one whose match is being reported, go and report... last CLEANUP if $in_match && $error_ref->{height} == $stack_height && $error_ref->{subrule} eq $subrule; # Otherwise, report the match failure... say {*Regexp::Grammars::LOGFILE} _debug_context(q{ }) . $error_ref->{errmsg}; } } # Handle attempts to call non-existent subrules... sub _debug_fatal { my ($naughty_construct) = @_; print {*Regexp::Grammars::LOGFILE} "_________________________________________________________________\n", "Fatal error: Entire parse terminated prematurely while attempting\n", " to call non-existent rule: $naughty_construct\n", "_________________________________________________________________\n"; $@ = "Entire parse terminated prematurely while attempting to call non-existent rule: $naughty_construct"; } # Print a message in context... sub _debug_logmsg { my ($stack_height, @msg) = @_; # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-1) . '|'; # Report the attempt... print {*Regexp::Grammars::LOGFILE} map { "$leader$_\n" } @msg; } # Print a message indicating a (sub)match attempt... sub _debug_trying { my ($stack_height, $curr_frame_ref, $subrule) = @_; # Clean up after any preceding unsuccessful attempts... _debug_handle_failures($stack_height, $subrule); # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-2); # Detect and report any backtracking prior to this attempt... our $last_try_pos //= 0; #...Stores the pos() of the most recent match attempt? my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . q{ and trying new match} ; } # Report the attempt... print {*Regexp::Grammars::LOGFILE} $leader, "|...Trying $subrule\t"; # Handle user interactions during debugging... _debug_interact($stack_height, $leader, $curr_frame_ref, 'step'); # Record the attempt, for later error handling in _debug_matched()... if ($subrule ne 'next alternative') { our @try_stack; push @try_stack, { height => $stack_height, subrule => $subrule, # errmsg should align under: |...Trying $subrule\t errmsg => q{| } x ($stack_height-2) . "| \\FAIL $subrule", }; } $last_try_pos = pos(); } # Print a message indicating a successful (sub)match... sub _debug_matched { my ($stack_height, $curr_frame_ref, $subrule, $matched_text) = @_; # Clean up any intervening unsuccessful attempts... _debug_handle_failures($stack_height, $subrule, 'in match'); # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-2); my $message = ($stack_height ? '| ' : q{}) . " \\_____$subrule matched "; my $filler = $stack_height ? '| ' . q{ } x (length($message)-4) : q{ } x length($message); our $last_try_pos //= 0; #...Stores the pos() of the most recent match attempt? # Report if match required backtracking... my $backtrack_distance = $last_try_pos - (pos()//0); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching $subrule} ; } $last_try_pos = pos(); # Format match text (splitting multi-line texts and indent them correctly)... $matched_text = defined($matched_text) ? $matched_text = q{'} . join("\n$leader$filler", split "\n", $matched_text) . q{'} : q{}; # Print match message... print {*Regexp::Grammars::LOGFILE} $leader . $message . $matched_text . qq{\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, $stack_height ? 'match' : 'run'); } # Print a message indicating a successful (sub)match... sub _debug_require { my ($stack_height, $condition, $succeeded) = @_; # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-1); my $message1 = ($stack_height ? '|...' : q{}) . "Testing condition: $condition" ; my $message2 = ($stack_height ? '| ' : q{}) . " \\_____" . ($succeeded ? 'Satisified' : 'FAILED') ; # Report if match required backtracking... our $last_try_pos; my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-1) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching} ; } # Remember where the condition was tried... $last_try_pos = pos(); # Print match message... say {*Regexp::Grammars::LOGFILE} $leader . $message1; say {*Regexp::Grammars::LOGFILE} $leader . $message2; } # Print a message indicating a successful store-result-of-code-block... sub _debug_executed { my ($stack_height, $curr_frame_ref, $subrule, $value) = @_; # Build message... my $leader = _debug_context() . q{| } x ($stack_height-2); my $message = "|...Action $subrule\n"; my $message2 = "| saved value: '"; $message .= $leader . $message2; my $filler = q{ } x length($message2); # Split multiline results over multiple lines (properly indented)... $value = join "\n$leader$filler", split "\n", $value; # Report the action... print {*Regexp::Grammars::LOGFILE} $leader . $message . $value . qq{'\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, 'match'); } # Create the code to be inserted into the regex to facilitate debugging... sub _build_debugging_statements { my ($debugging_active, $subrule, $extra_pre_indent) = @_; return (q{}, q{}) if ! $debugging_active;; $extra_pre_indent //= 0; $subrule = "q{$subrule}"; return ( qq{ Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], $subrule) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}; }, qq{ Regexp::Grammars::_debug_matched(\@Regexp::Grammars::RESULT_STACK+1, \$Regexp::Grammars::RESULT_STACK[-1], $subrule, \$^N) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}; }, ); } sub _build_raw_debugging_statements { my ($debugging_active, $subpattern, $extra_pre_indent) = @_; return (q{}, q{}) if ! $debugging_active; $extra_pre_indent //= 0; if ($subpattern eq '|') { return ( q{}, qq{ (?{;Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], 'next alternative') if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};}) }, ); } else { return ( qq{ (?{;Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], q{subpattern /$subpattern/}) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};}) }, qq{ (?{;Regexp::Grammars::_debug_matched(\@Regexp::Grammars::RESULT_STACK+1, \$Regexp::Grammars::RESULT_STACK[-1], q{subpattern /$subpattern/}) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};}) }, ); } } #=====[ SUPPORT FOR AUTOMATIC TIMEOUTS ]========================= sub _test_timeout { our ($DEBUG, $TIMEOUT); return q{} if time() < $TIMEOUT->{'limit'}; my $duration = "$TIMEOUT->{duration} second" . ( $TIMEOUT->{duration} == 1 ? q{} : q{s} ); if (defined($DEBUG) && $DEBUG ne 'off') { my $leader = _debug_context(q{ }); say {*LOGFILE} $leader . '|'; say {*LOGFILE} $leader . "|...Invoking {duration}>"; say {*LOGFILE} $leader . "| \\_____No match after $duration"; say {*LOGFILE} $leader . '|'; say {*LOGFILE} $leader . " \\FAIL "; } if (! @!) { @! = "Internal error: Timed out after $duration (as requested)"; } return q{(*COMMIT)(*FAIL)}; } #=====[ SUPPORT FOR UPDATING THE RESULT STACK ]========================= # Create a clone of the current result frame with an new key/value... sub _extend_current_result_frame_with_scalar { my ($stack_ref, $key, $value) = @_; # Autovivify null stacks (only occur when grammar invokes no subrules)... if (!@{$stack_ref}) { $stack_ref = [{}]; } # Copy existing frame, appending new value so it overwrites any old value... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => $value, }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Create a clone of the current result frame with an additional key/value # (As above, but preserving the "listiness" of the key being added to)... sub _extend_current_result_frame_with_list { my ($stack_ref, $key, $value) = @_; # Copy existing frame, appending new value to appropriate element's list... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => [ @{$stack_ref->[-1]{$key}//[]}, $value, ], }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing)... sub _pop_current_result_frame { my ($stack_ref, $key, $original_name, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Remove "nocontext" marker... my $nocontext = delete $curr_frame->{'~'}; # Build a clone of the current frame... my $cloned_result_frame = exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value ; # Apply any appropriate handler... if ($RULE_HANDLER) { if ($RULE_HANDLER->can($original_name) || $RULE_HANDLER->can('AUTOLOAD')) { my $replacement_result_frame = $RULE_HANDLER->$original_name($cloned_result_frame); if (defined $replacement_result_frame) { $cloned_result_frame = $replacement_result_frame; } } } # Remove capture if not requested... if ($nocontext && ref $cloned_result_frame eq 'HASH' && keys %{$cloned_result_frame} > 1) { delete $cloned_result_frame->{q{}}; } # Nest a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame//{}}, $key => $cloned_result_frame, }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}, $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing) # (As above, but preserving listiness of key being added to)... sub _pop_current_result_frame_with_list { my ($stack_ref, $key, $original_name, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Remove "nocontext" marker... my $nocontext = delete $curr_frame->{'~'}; # Clone the current frame... my $cloned_result_frame = exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value ; # Apply any appropriate handler... if ($RULE_HANDLER) { if ($RULE_HANDLER->can($original_name) || $RULE_HANDLER->can('AUTOLOAD')) { my $replacement_result_frame = $RULE_HANDLER->$original_name($cloned_result_frame); if (defined $replacement_result_frame) { $cloned_result_frame = $replacement_result_frame; } } } # Remove capture if not requested... if ($nocontext && ref $cloned_result_frame eq 'HASH' && keys %{$cloned_result_frame} > 1) { delete $cloned_result_frame->{q{}}; } # Append a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame}, $key => [ @{$caller_frame->{$key}//[]}, $cloned_result_frame, ], }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}[-1], $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } #=====[ MISCELLANEOUS CONSTANTS ]========================= # Namespace in which grammar inheritance occurs... my $CACHE = 'Regexp::Grammars::_CACHE_::'; my $CACHE_LEN = length $CACHE; my %CACHE; #...for subrule tracking # This code inserted at the start of every grammar regex # (initializes the result stack cleanly and backtrackably, via local)... my $PROLOGUE = q{((?{; @! = () if !pos; local @Regexp::Grammars::RESULT_STACK = (@Regexp::Grammars::RESULT_STACK, {}); local $Regexp::Grammars::TIMEOUT = { limit => -1>>1 }; local $Regexp::Grammars::DEBUG = 'off' }) }; # This code inserted at the end of every grammar regex # (puts final result in %/. Also defines default , , etc.)... my $EPILOGUE = q{)(?{; $Regexp::Grammars::RESULT_STACK[-1]{q{}} //= $^N; local $Regexp::Grammars::match_frame = pop @Regexp::Grammars::RESULT_STACK; delete @{$Regexp::Grammars::match_frame}{ '~', grep {substr($_,0,1) eq '_'} keys %{$Regexp::Grammars::match_frame} }; if (exists $Regexp::Grammars::match_frame->{'='}) { if (ref($Regexp::Grammars::match_frame->{'='}) eq 'HASH') { $Regexp::Grammars::match_frame = $Regexp::Grammars::match_frame->{'='}; } } if (@Regexp::Grammars::RESULT_STACK) { $Regexp::Grammars::RESULT_STACK[-1]{'(?R)'} = $Regexp::Grammars::match_frame; } Regexp::Grammars::clear_rule_handler(); */ = $Regexp::Grammars::match_frame; })(?(DEFINE) (?(?:\\s*)) (?(?:\\S+)) (? (?{; $Regexp::Grammars::RESULT_STACK[-1]{"="} = pos; }) ) (? (?{; $Regexp::Grammars::RESULT_STACK[-1]{"="} = 1 + substr($_,0,pos) =~ tr/\n/\n/; }) ) ) }; my $EPILOGUE_NC = $EPILOGUE; $EPILOGUE_NC =~ s{ ; [^;]+ ;}{;}xms; #=====[ MISCELLANEOUS PATTERNS THAT MATCH USEFUL THINGS ]======== # Match an identifier... my $IDENT = qr{ [^\W\d] \w*+ }xms; my $QUALIDENT = qr{ (?: $IDENT :: )*+ $IDENT }xms; # Match balanced parentheses, taking into account \-escapes and []-escapes... my $PARENS = qr{ (?&PARENS) (?(DEFINE) (? \( (?: \\. | (?&PARENS) | (?&CHARSET) | [^][()\\]++)*+ \) ) (? \[ \^?+ \\?+ \]?+ [^]]*+ \] ) ) }xms; #=====[ UTILITY SUBS FOR ERROR AND WARNING MESSAGES ]======== sub _uniq { my %seen; return grep { defined $_ && !$seen{$_}++ } @_; } # Default translator for error messages... my $ERRORMSG_TRANSLATOR = sub { my ($errormsg, $rulename, $context) = @_; $rulename = 'valid input' if $rulename eq q{}; $context //= ''; # Unimplemented subrule when rulename starts with '-'... if (substr($rulename,0,1) eq '-') { $rulename = substr($rulename,1); return "Can't match subrule <$rulename> (not implemented)"; } # Empty message converts to a "Expected...but found..." message... if ($errormsg eq q{}) { $rulename =~ tr/_/ /; $rulename = lc($rulename); return "Expected $rulename, but found '$context' instead"; } # "Expecting..." messages get "but found" added... if (lc(substr($errormsg,0,6)) eq 'expect') { return "$errormsg, but found '$context' instead"; } # Everything else stays "as is"... return $errormsg; }; # Allow user to set translation... sub set_error_translator { { package Regexp::Grammars::TranslatorRestorer; sub new { my ($class, $old_translator) = @_; bless \$old_translator, $class; } sub DESTROY { my ($old_translator_ref) = @_; $ERRORMSG_TRANSLATOR = ${$old_translator_ref}; } } my ($translator_ref) = @_; die "Usage: set_error_translator(\$subroutine_reference)\n" if ref($translator_ref) ne 'CODE'; my $old_translator_ref = $ERRORMSG_TRANSLATOR; $ERRORMSG_TRANSLATOR = $translator_ref; return defined wantarray ? Regexp::Grammars::TranslatorRestorer->new($old_translator_ref) : (); } # Dispatch to current translator for error messages... sub _translate_errormsg { goto &{$ERRORMSG_TRANSLATOR}; } #=====[ SUPPORT FOR TRANSLATING GRAMMAR-ENHANCED REGEX TO NATIVE REGEX ]==== # Store any specified grammars... my %user_defined_grammar; my %REPETITION_DESCRIPTION_FOR = ( '+' => 'once or more', '*' => 'any number of times', '?' => 'if possible', '+?' => 'as few times as possible', '*?' => 'as few times as possible', '??' => 'if necessary', '++' => 'as many times as possible', '*+' => 'as many times as possible', '?+' => 'if possible', ); sub _translate_raw_regex { my ($regex, $debug_build, $debug_runtime) = @_; my $is_comment = substr($regex, 0, 1) eq q{#} || substr($regex, 0, 3) eq q{(?#}; my $visible_regex = _squeeze_ws($regex); # Report how regex was interpreted, if requested to... if ($debug_build && $visible_regex ne q{} && $visible_regex ne q{ }) { _debug_notify( info => " |", " |...Treating '$visible_regex' as:", ($is_comment ? " | \\ a comment (which will be ignored)" : " | \\ normal Perl regex syntax" ), ); } return q{} if $is_comment; # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_raw_debugging_statements($debug_runtime,$visible_regex, +1); # Replace negative lookahead with one that works under R::G... $regex =~ s{\(\?!}{(?!(?!)|}gxms; # ToDo: Also replace positive lookahead with one that works under R::G... # This replacement should be of the form: # $regex =~ s{\(\?!}{(?!(?!)|(?!(?!)|}gxms; # but need to find a way to insert the extra ) at the other end return $debug_runtime && $regex eq '|' ? $regex . $debug_post : $debug_runtime && $regex =~ /\S/ ? '(?:' .$debug_pre . $regex . $debug_post . ')' : $regex; } # Report and convert a debugging directive... sub _translate_debug_directive { my ($construct, $cmd, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Change run-time debugging mode to '$cmd'", ); } return qq{(?{; local \$Regexp::Grammars::DEBUG = q{$cmd}; }) }; } # Report and convert a timeout directive... sub _translate_timeout_directive { my ($construct, $timeout, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", ($timeout > 0 ? " | \\ Cause the entire parse to fail after $timeout second" . ($timeout==1 ? q{} : q{s}) : " | \\ Cause the entire parse to fail immediately" ), ); } return $timeout > 0 ? qq{(?{; local \$Regexp::Grammars::TIMEOUT = { duration => $timeout, limit => time() + $timeout }; }) } : qq{(*COMMIT)(*FAIL)}; } # Report and convert a directive... sub _translate_require_directive { my ($construct, $condition, $debug_build) = @_; $condition = substr($condition, 3, -2); # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Require that {$condition} is true", ); } my $quoted_condition = $condition; $quoted_condition =~ s{\$}{}xms; return qq{(?(?{;$condition}) (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 1) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}}) | (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 0) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}})(?!)) }; } # Report and convert a directive... sub _translate_minimize_directive { my ($construct, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Minimize result value if possible", ); } return q{(?{; if (1 == grep { $_ ne '!' && $_ ne '@' } keys %MATCH) { # ...single alnum key local %Regexp::Grammars::matches = %MATCH; delete @Regexp::Grammars::matches{'!', '@'}; local ($Regexp::Grammars::only_key) = keys %Regexp::Grammars::matches; local $Regexp::Grammars::array_ref = $MATCH{$Regexp::Grammars::only_key}; if (ref($Regexp::Grammars::array_ref) eq 'ARRAY' && 1 == @{$Regexp::Grammars::array_ref}) { $MATCH = $Regexp::Grammars::array_ref->[0]; } } })}; } # Report and convert a debugging directive... sub _translate_error_directive { my ($construct, $type, $msg, $debug_build, $subrule_name) = @_; $subrule_name //= 'undef'; # Determine severity... my $severity = ($type eq 'error') ? 'fail' : 'non-fail'; # Determine fatality (and build code to invoke it)... my $fatality = ($type eq 'fatal') ? '(*COMMIT)(*FAIL)' : q{}; # Unpack message... if (substr($msg,0,3) eq '(?{') { $msg = 'do'. substr($msg,2,-1); } else { $msg = quotemeta $msg; $msg = qq{qq{$msg}}; } # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", ( $type eq 'log' ? " | \\ Log a message to the logfile" : " | \\ Append a $severity error message to \@!" ), ); } # Generate the regex... return $type eq 'log' ? qq{(?{Regexp::Grammars::_debug_logmsg(scalar \@Regexp::Grammars::RESULT_STACK,$msg) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG} })} : qq{(?:(?{;local \$Regexp::Grammar::_memopos=pos();}) (?>\\s*+((?-s).{0,$MAX_CONTEXT_WIDTH}+)) (?{; pos() = \$Regexp::Grammar::_memopos; @! = Regexp::Grammars::_uniq( @!, Regexp::Grammars::_translate_errormsg($msg,q{$subrule_name},\$CONTEXT) ) }) (?!)|} . ($severity eq 'fail' ? q{(?!)} : $fatality) . q{)} ; } sub _translate_subpattern { my ($construct, $alias, $subpattern, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout, $backref) = @_; # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; my $is_codeblock = substr($subpattern,0,3) eq '(?{'; my $value_saved = $is_codeblock ? '$^R' : '$^N'; my $do_something_with = $is_codeblock ? 'execute the code block' : 'match the pattern'; my $result = $is_codeblock ? 'result' : 'matched substring'; my $description = $is_codeblock ? substr($subpattern,2,-1) : defined $backref ? $backref : $subpattern; my $debug_construct = $is_codeblock ? '<' . substr($alias,1,-1) . '= (?{;' . substr($subpattern,3,-2) . '})>' : $construct ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? "each $result" : substr($postmodifier,0,1) eq '?' ? "any $result" : $postmodifier && !$is_noncapturing ? "only the final $result" : "the $result" ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | $do_something_with $description $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \@{\$MATCH{$alias}}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$debug_construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, $value_saved ), );} ; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{$timeout_test(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre}) (?:($subpattern)(?{;$post_action$debug_post}))$postmodifier}; } sub _translate_hashmatch { my ($construct, $alias, $hashname, $keypat, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout) = @_; # Empty or missing keypattern defaults to <.hk>... if (!defined $keypat || $keypat !~ /\S/) { $keypat = '(?&hk)' } else { $keypat = substr($keypat, 1, -1); } # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; # Convert hash to hash lookup... my $hash_lookup = '$' . substr($hashname, 1). '{$^N}'; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each matched key' : substr($postmodifier,0,1) eq '?' ? 'any matched key' : $postmodifier && !$is_noncapturing ? 'only the final matched key' : 'the matched key' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | match a key from the hash $hashname $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, \$^N ), );} ; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{$timeout_test(?:(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre}) (?:($keypat)(??{exists $hash_lookup ? q{} : q{(?!)}})(?{;$post_action$debug_post})))$postmodifier}; } # Convert a " % " construct to pure Perl 5.10... sub _translate_separated_list { my ($term, $op, $separator, $term_trans, $sep_trans, $ws, $debug_build, $debug_runtime, $timeout) = @_; # This insertion ensures backtracking upwinds the stack correctly... state $CHECKPOINT = q{(?{;@Regexp::Grammars::RESULT_STACK = @Regexp::Grammars::RESULT_STACK;})}; # Translate meaningful whitespace... $ws = length($ws) ? q{(?&ws)} : q{}; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Report how construct was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $term $op $separator as:", " | | repeatedly match the subrule $term", " | \\ as long as the matches are separated by matches of $separator", ); } # One-or-more... return qq{$timeout_test(?:$ws$CHECKPOINT$sep_trans$ws$term_trans)*$+} if $op =~ m{ [*][*]() | [+]([+?]?) \s* % | \{ 1, \}([+?]?) \s* % }xms; # Zero-or-more... return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans)*$+)?$+} if $op =~ m{ [*]([+?]?) \s* % | \{ 0, \}([+?]?) \s* % }xms; # One-or-zero... return qq{?$+} if $op =~ m{ [?]([+?]?) \s* % | \{ 0,1 \}([+?]?) \s* % }xms; # Zero exactly... return qq{{0}$ws} if $op =~ m{ \{ 0 \}[+?]? \s* % }xms; # N exactly... if ($op =~ m{ \{ (\d+) \}([+?]?) \s* % }xms ) { my $min = $1-1; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){$min}$+)} } # Zero-to-N... if ($op =~ m{ \{ 0,(\d+) \}([+?]?) \s* % }xms ) { my $max = $1-1; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){0,$max}$+)?$+} } # M-to-N and M-to-whatever... if ($op =~ m{ \{ (\d+),(\d*) \} ([+?]?) \s* % }xms ) { my $min = $1-1; my $max = $2 ? $2-1 : q{}; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){$min,$max}$+)} } # Somehow we missed a case (this should never happen)... die "Internal error: missing case in separated list handler"; } sub _translate_subrule_call { my ( $grammar_name, $construct, $alias, $subrule, $args, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout, $valid_subrule_names_ref, $nocontext) = @_; # Translate arg list, if provided... my $arg_desc; if ($args eq q{}) { $args = q{()}; } elsif (substr($args,0,3) eq '(?{') { # Turn parencode into do block... $arg_desc = substr($args,3,-2); substr($args,1,1) = 'do'; } else { # Turn abbreviated format into a key=>value list... $args =~ s{ [(,] \s* \K : (\w+) (?= \s* [,)] ) }{$1 => \$MATCH{'$1'}}gxms; $arg_desc = substr($args,1,-1); } # Transform qualified subrule names... my $simple_subrule = $subrule; my $start_grammar = (($simple_subrule =~ s{(.*)::}{}xms) ? $1 : ""); if ($start_grammar !~ /^NEXT$|::/) { $start_grammar = caller(3).'::'.$start_grammar; } my @candidates = $start_grammar eq 'NEXT' ? _ancestry_of($grammar_name) : _ancestry_of($start_grammar); # Rename fully-qualified rule call, if to ancestor grammar... RESOLVING: for my $parent_class (@candidates) { my $inherited_subrule = $parent_class.'::'.$simple_subrule; if ($CACHE{$inherited_subrule}) { $subrule = $inherited_subrule; last RESOLVING; } } # Replace package separators, which regex engine can't handle... my $internal_subrule = $subrule; $internal_subrule =~ s{::}{_88_}gxms; # Shortcircuit if unknown subrule invoked... if (!$valid_subrule_names_ref->{$subrule}) { _debug_notify( error => qq{Found call to $construct, but no or}, qq{ was defined in the grammar}, qq{(Did you misspell the rule name or forget to define the rule?)}, q{}, ); return "(?{Regexp::Grammars::_debug_fatal('$construct')})(*COMMIT)(*FAIL)"; } # Determine save behaviour... my $is_noncapturing = $savemode =~ /noncapturing|lookahead/; my $is_listifying = $savemode eq 'list'; my $save_code = $is_noncapturing? q{ @Regexp::Grammars::RESULT_STACK[0..@Regexp::Grammars::RESULT_STACK-2] } : $is_listifying? qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame_with_list( \\\@Regexp::Grammars::RESULT_STACK, $alias, '$simple_subrule', \$^N ), } : qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame( \\\@Regexp::Grammars::RESULT_STACK, $alias, '$simple_subrule', \$^N ), } ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each match' : substr($postmodifier,0,1) eq '?' ? 'any match' : 'the match' ; my $do_something_with = $savemode eq 'neglookahead' ? 'lookahead for anything except' : $savemode eq 'poslookahead' ? 'lookahead for' : 'match' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | $do_something_with the subrule <$subrule> $repeatedly", (defined $arg_desc ? " | | passing the args: ($arg_desc)" : () ), ( $is_noncapturing ? " | \\ but don't save anything" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ), ); } # Generate post-match result-capturing code, if match captures... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime, $construct); # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{(?:$timeout_test(?{; local \@Regexp::Grammars::RESULT_STACK = (\@Regexp::Grammars::RESULT_STACK, {'\@'=>{$args}}); \$Regexp::Grammars::RESULT_STACK[-2]{'~'} = $nocontext if \@Regexp::Grammars::RESULT_STACK >= 2; $debug_pre})((?&$internal_subrule))(?{; local \@Regexp::Grammars::RESULT_STACK = ( $save_code );$debug_post }))$postmodifier}; } sub _translate_rule_def { my ($type, $qualifier, $name, $callname, $qualname, $body, $objectify, $local_ws) = @_; $qualname =~ s{::}{_88_}gxms; # Return object if requested... my $objectification = $objectify ? qq{(?{; local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; \$Regexp::Grammars::RESULT_STACK[-1] = '$qualifier$name'->can('new') ? '$qualifier$name'->new(\$Regexp::Grammars::RESULT_STACK[-1]) : bless \$Regexp::Grammars::RESULT_STACK[-1], '$qualifier$name'; })} : q{}; # Each rule or token becomes a DEFINE'd Perl 5.10 named capture... return qq{ (?(DEFINE) $local_ws (?<$qualname> (?<$callname> (?{\$Regexp::Grammars::RESULT_STACK[-1]{'!'}=\$#{!};}) (?:$body) $objectification (?{;\$#{!}=delete(\$Regexp::Grammars::RESULT_STACK[-1]{'!'})//0; delete(\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}); }) )) ) }; } # Locate any valid <...> sequences and replace with native regex code... sub _translate_subrule_calls { my ($grammar_name, $grammar_spec, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, $rule_name, $subrule_names_ref, $magic_ws, $nocontext, ) = @_; # Remember the preceding construct, so as to implement the +% etc. operators... my $prev_construct = q{}; my $prev_translation = q{}; # Translate all other calls (MAIN GRAMMAR FOR MODULE)... $grammar_spec =~ s{ (? (? \s*+) (? (?&SEPLIST_OP) ) (? \s*+) )? (? < (?: (? \. \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? (? \? | \! ) \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \[ \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* \] ) | (? (?(?&IDENT)) \s* = \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* \] ) | (? \s* : (?(?&QUALIDENT)) \s* ) | (? (?(?&IDENT)) \s* = \s* : (?(?&QUALIDENT)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* : (?(?&QUALIDENT)) \s* \] ) | (? \. (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* \] ) | (? (?(?&HASH)) \s* (?(?&BRACES))? \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* (?(?&BRACES))? \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* (?(?&BRACES))? \s* \] ) | (? \s* (? \\ | /) (? :? (?&QUALIDENT)) \s* ) | (? (?(?&IDENT)) \s* = \s* (? \\ | /) (? :? (?&QUALIDENT)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (? \\ | /) (? :? (?&QUALIDENT)) \s* \] ) | (? minimize \s* : \s* ) | (? require \s* : \s* (? (?&PARENCODE) ) \s* ) | (? debug \s* : \s* (? run | match | step | try | off | on) \s* ) | (? timeout \s* : \s* (? \d+) \s* ) | (? context \s* : \s* ) | (? nocontext \s* : \s* ) | (? [.][.][.] | [!][!][!] | [?][?][?] ) | (? (? error | fatal ) \s*+ : \s*+ ) | (? (? log | error | warning | fatal ) \s*+ : \s*+ (? (?&PARENCODE) | .+? ) \s*+ ) ) > (? \s* (?! (?&SEPLIST_OP) ) [?+*][?+]? | ) | (? \s++ | \\. | \(\?! | \(\?\# [^)]* \) # (?# -> old style inline comment) | (?&PARENCODE) | (?&PARENS) | (?&CHARSET) | \# [^\n]*+ | [^][\s()<>#\\]++ ) ) (?(DEFINE) (? \*\* | [*+?][+?]?\s*% | {\d+(,\d*)?}[+?]?\s*% ) (? \( (?: \\. | (?&PARENCODE) | (?&PARENS) | (?&CHARSET) | [^][()\\<>]++ )*+ \) ) (? \{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \} ) (? \(\?\{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \}\) ) (? \% (?&IDENT) (?: :: (?&IDENT) )* ) (? \[ \^?+ \\?+ \]?+ [^]]*+ \] ) (? [^\W\d]\w*+ ) (? (?: [^\W\d]\w*+ :: )* [^\W\d]\w*+ ) (? (?&NUMBER) | (?&STRING) | (?&VAR) ) (? [+-]? \d++ (?:\. \d++)? (?:[eE] [+-]? \d++)? ) (? ' [^\\']++ (?: \\. [^\\']++ )* ' ) (? (?&PARENCODE) | \( \s* (?&ARGS)? \s* \) | (?# NOTHING ) ) (? (?&ARG) \s* (?: , \s* (?&ARG) \s* )* ,? ) (? (?&VAR) | (?&KEY) \s* => \s* (?&LITERAL) ) (? : (?&IDENT) ) (? (?&IDENT) | (?&LITERAL) ) ) }{ my $curr_construct = $+{construct}; my $alias = ($+{alias}//'MATCH') eq 'MATCH' ? q{'='} : qq{'$+{alias}'}; # Determine and remember the necessary translation... my $curr_translation = do{ # Translate subrule calls of the form: ... if ($+{alias_parens_scalar}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif ($+{alias_parens_scalar_nocap}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif ($+{alias_parens_list}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } # Translate subrule calls of the form: ... elsif ($+{alias_hash_scalar}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif ($+{alias_hash_scalar_nocap}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif ($+{alias_hash_list}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } # Translate subrule calls of the form: ... elsif ($+{alias_subrule_scalar}) { _translate_subrule_call( $grammar_name, $curr_construct, $alias, $+{subrule}, $+{args}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ); } elsif ($+{alias_subrule_list}) { _translate_subrule_call( $grammar_name, $curr_construct, $alias, $+{subrule}, $+{args}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ); } # Translate subrule calls of the form: and ... elsif ($+{self_subrule_lookahead}) { # Determine type of lookahead, and work around capture problem... my ($type, $pre, $post) = ( 'neglookahead', '(?!(?!)|', ')' ); if ($+{sign} eq '?') { $type = 'poslookahead'; $pre x= 2; $post x= 2; } $pre . _translate_subrule_call( $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, $type, q{}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ) . $post; } elsif ($+{self_subrule_scalar_nocap}) { _translate_subrule_call( $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ); } elsif ($+{self_subrule_scalar}) { _translate_subrule_call( $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ); } elsif ($+{self_subrule_list}) { _translate_subrule_call( $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, $nocontext, ); } # Translate subrule calls of the form: ... elsif ($+{alias_argrule_scalar}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, $alias, $pattern, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } elsif ($+{alias_argrule_list}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, $alias, $pattern, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } # Translate subrule calls of the form: <:ARGNAME>... elsif ($+{self_argrule_scalar}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, qq{'$+{subrule}'}, $pattern, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } # Translate subrule calls of the form: <\IDENT> or ... elsif ($+{backref} || $+{alias_backref} || $+{alias_backref_list}) { # Use "%ARGS" if subrule names starts with a colon... my $subrule = $+{subrule}; if (substr($subrule,0,1) eq ':') { substr($subrule,0,1,"\@'}{'"); } my $backref = qq{\$Regexp::Grammars::RESULT_STACK[-1]{'$subrule'}}; my $quoter = $+{slash} eq '\\' ? "quotemeta($backref)" : "Regexp::Grammars::_invert_delim($backref)" ; my $pattern = qq{ (??{ defined $backref ? $quoter : q{(?!)}})}; my $type = $+{backref} ? 'noncapturing' : $+{alias_backref} ? 'scalar' : 'list' ; _translate_subpattern( $curr_construct, $alias, $pattern, $type, $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$MATCH{'$subrule'}" ); } # Translate raw regexes (leave as is)... elsif ($+{raw_regex}) { _translate_raw_regex( $+{raw_regex}, $compiletime_debugging_requested, ); } # Translate directives... elsif ($+{require_directive}) { _translate_require_directive( $curr_construct, $+{condition}, $compiletime_debugging_requested ); } elsif ($+{minimize_directive}) { _translate_minimize_directive( $curr_construct, $+{condition}, $compiletime_debugging_requested ); } elsif ($+{debug_directive}) { _translate_debug_directive( $curr_construct, $+{cmd}, $compiletime_debugging_requested ); } elsif ($+{timeout_directive}) { _translate_timeout_directive( $curr_construct, $+{timeout}, $compiletime_debugging_requested ); } elsif ($+{error_directive}) { _translate_error_directive( $curr_construct, $+{error_type}, $+{msg}, $compiletime_debugging_requested, $rule_name ); } elsif ($+{autoerror_directive}) { _translate_error_directive( $curr_construct, $+{error_type}, q{}, $compiletime_debugging_requested, $rule_name ); } elsif ($+{yadaerror_directive}) { _translate_error_directive( $curr_construct, ($+{yadaerror_directive} eq '???' ? 'warning' : 'error'), q{}, $compiletime_debugging_requested, -$rule_name ); } elsif ($+{context_directive}) { $nocontext = 0; q{}; # Remove the directive } elsif ($+{nocontext_directive}) { $nocontext = 1; q{}; # Remove the directive } # There shouldn't be any other possibility... else { die qq{Internal error: this shouldn't happen!\nNear '$curr_construct': }; } }; # Handle the **/*%/+%/{n,m}%/etc operators... if ($+{list_marker}) { my $ws = $magic_ws ? $+{ws1} . $+{ws2} : q{}; my $op = $+{op}; $curr_translation = _translate_separated_list( $prev_construct, $op, $curr_construct, $prev_translation, $curr_translation, $ws, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); $curr_construct = qq{$prev_construct $op $curr_construct}; } # Finally, remember this latest translation, and return it... $prev_construct = $curr_construct; $prev_translation = $curr_translation;; }exmsg; # Translate magic hash accesses... $grammar_spec =~ s{\$MATCH (?= \s*\{) } {\$Regexp::Grammars::RESULT_STACK[-1]}xmsg; $grammar_spec =~ s{\$ARG (?= \s*\{) } {\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}}xmsg; # Translate magic scalars and hashes... state $translate_scalar = { q{%$MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{@$MATCH} => q{@{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{$MATCH} => q{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}, q{%MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]}}, q{$CAPTURE} => q{$^N}, q{$CONTEXT} => q{$^N}, q{$DEBUG} => q{$Regexp::Grammars::DEBUG}, q{$INDEX} => q{${\\pos()}}, q{%ARG} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{'@'}}}, }; state $translatable_scalar = join '|', map {quotemeta $_} sort {length $b <=> length $a} keys %{$translate_scalar}; $grammar_spec =~ s{ ($translatable_scalar) (?! \s* (?: \[ | \{) ) } {$translate_scalar->{$1}}oxmsg; return $grammar_spec; } # Generate a "decimal timestamp" and insert in a template... sub _timestamp { my ($template) = @_; # Generate and insert any timestamp... if ($template =~ /%t/) { my ($sec, $min, $hour, $day, $mon, $year) = localtime; $mon++; $year+=1900; my $timestamp = sprintf("%04d%02d%02d.%02d%02d%02d", $year, $mon, $day, $hour, $min, $sec); $template =~ s{%t}{$timestamp}xms;; } return $template; } # Open (or re-open) the requested log file... sub _autoflush { my ($fh) = @_; my $originally_selected = select $fh; $|=1; select $originally_selected; } sub _open_log { my ($mode, $filename) = @_; # Special case: '-' --> STDERR if ($filename eq q{-}) { return *STDERR{IO}; } # Otherwise, just open the named file... elsif (open my $fh, $mode, $filename) { _autoflush($fh); return $fh; } # Otherwise, generate a warning and default to STDERR... else { local *Regexp::Grammars::LOGFILE = *STDERR{IO}; _debug_notify( warn => qq{Unable to open log file '$filename'}, qq{($!)}, qq{Defaulting to STDERR instead.}, ); _debug_notify( q{} => q{} ); return *STDERR{IO}; } } sub _invert_delim { my ($delim) = @_; $delim = reverse $delim; $delim =~ tr/<>[]{}()«»`'/><][}{)(»«'`/; return quotemeta $delim; } # Regex to detect if other regexes contain a grammar specification... my $GRAMMAR_DIRECTIVE = qr{ < grammar: \s* (? $QUALIDENT ) \s* > }xms; # Regex to detect if other regexes contain a grammar inheritance... my $EXTENDS_DIRECTIVE = qr{ < extends: \s* (? $QUALIDENT ) \s* > }xms; # Cache of rule/token names within defined grammars... my %subrule_names_for; # Build list of ancestors for a given grammar... sub _ancestry_of { my ($grammar_name) = @_; return () if !$grammar_name; use mro; return map { substr($_, $CACHE_LEN) } @{mro::get_linear_isa($CACHE.$grammar_name, 'c3')}; } # Detect and translate any requested grammar inheritances... sub _extract_inheritances { my ($regex, $compiletime_debugging_requested, $derived_grammar_name) = @_; # Detect and remove inheritance requests... while ($regex =~ s{$EXTENDS_DIRECTIVE}{}xms) { # Normalize grammar name and report... my $orig_grammar_name = $+{base_grammar_name}; my $grammar_name = $orig_grammar_name; if ($grammar_name !~ /::/) { $grammar_name = caller(2).'::'.$grammar_name; } if (exists $user_defined_grammar{$grammar_name}) { if ($compiletime_debugging_requested) { _debug_notify( info => "Processing inheritance request for $grammar_name...", q{}, ); } # Specify new relationship... no strict 'refs'; push @{$CACHE.$derived_grammar_name.'::ISA'}, $CACHE.$grammar_name; } else { my (undef, $file, $line) = caller(2); _debug_notify( fatal => "Inheritance from unknown grammar requested", "in ", "at $file line $line", ); } } # Retrieve ancestors (but not self) in C3 dispatch order... my (undef, @ancestors) = _ancestry_of($derived_grammar_name); # Extract subrule names and implementations for ancestors... my %subrule_names = map { %{$subrule_names_for{$_}} } @ancestors; $_ = -1 for values %subrule_names; my $implementation = join "\n", map { $user_defined_grammar{$_} } @ancestors; return $implementation, \%subrule_names; } # Pattern for directive within rules... my $WS_PATTERN = qr{]++ | $PARENS )*+) >}xms; # Transform grammar-augmented regex into pure Perl 5.10 regex... sub _build_grammar { my ($grammar_spec) = @_; $grammar_spec .= q{}; # Check for dubious repeated constructs that throw away captures... my @dubious = $grammar_spec =~ m{ < (?! \[ ) # not <[SUBRULE]> ( $IDENT (?: = [^>]*)? ) # but or > \s* ( # followed by a quantifier... [+*][?+]? # either symbolic | \{\d+(?:,\d*)?\}[?+]? # or numeric ) }gxms; # Report dubiousities... while (@dubious) { my ($rule, $qual) = splice @dubious, 0, 2; _debug_notify( warn => qq{Repeated subrule <$rule>$qual will only capture its final match}, qq{(Did you mean <[$rule]>$qual instead?)}, q{}, ) } # Check for dubious non-backtracking constructs... @dubious = $grammar_spec =~ m{ < ( [^>]+ ) > \s* ([?+*][+]|\{.*\}[+]) }gxms; # Report dubiousities... while (@dubious) { my ($rule, $qual) = splice @dubious, 0, 2; my $safe_qual = substr($qual,0,-1); _debug_notify( warn => qq{Non-backtracking subrule <$rule>$qual not fully supported yet}, qq{(If grammar does not work try <$rule>$safe_qual instead)}, q{}, ) } # Check whether a log file was specified... my $compiletime_debugging_requested; local *Regexp::Grammars::LOGFILE = *Regexp::Grammars::LOGFILE; my $logfile = q{-}; $grammar_spec =~ s{ ^ [^#]* < logfile: \s* ([^>]+?) \s* > }{ $logfile = _timestamp($1); # Presence of implies compile-time logging... $compiletime_debugging_requested = 1; *Regexp::Grammars::LOGFILE = _open_log('>',$logfile); # Delete directive... q{}; }gexms; # Look ahead for any run-time debugging or timeout requests... my $runtime_debugging_requested = $grammar_spec =~ m{ ^ [^#]* < debug: \s* (run | match | step | try | on | off | same ) \s* > | \$DEBUG (?! \s* (?: \[ | \{) ) }xms; my $timeout_requested = $grammar_spec =~ m{ ^ [^#]* < timeout: \s* \d+ \s* > }xms; # Standard actions set up and clean up any regex debugging... # Before entire match, set up a stack of attempt records and report... my $pre_match_debug = $runtime_debugging_requested ? qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile'); Regexp::Grammars::_init_try_stack(); })} : qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile'); })} ; # After entire match, report whether successful or not... my $post_match_debug = $runtime_debugging_requested ? qq{(?{;Regexp::Grammars::_debug_matched(0,\\%/,'',\$^N)}) |(?>(?{;Regexp::Grammars::_debug_handle_failures(0,''); }) (?!)) } : q{} ; # Subdivide into rule and token definitions, preparing to process each... # REWRITE THIS, USING (PROBABLY NEED TO REFACTOR ALL GRAMMARS TO REUSe # THESE COMPONENTS: # (? \( \s* (?&PARAMS)? \s* \) | (?# NOTHING ) ) # (? (?&PARAM) \s* (?: , \s* (?&PARAM) \s* )* ,? ) # (? (?&VAR) (?: \s* = \s* (?: (?&LITERAL) | (?&PARENCODE) ) )? ) # (? (?&NUMBER) | (?&STRING) | (?&VAR) ) # (? : (?&IDENT) ) my @defns = split m{ ^ [^#\n]*? \K < (obj|)(rule|token) \s*+ : \s*+ ((?:${IDENT}::)*+) (?: ($IDENT) \s*+ = \s*+ )?+ ($IDENT) \s* > }xms, $grammar_spec; # Extract up list of names of defined rules/tokens... # (Name is every 4th item out of every five, skipping the first item) my @subrule_names = @defns[ map { $_ * 6 + 5 } 0 .. ((@defns-1)/6-1) ]; my %subrule_names; # Build a look-up table of subrule names, checking for duplicates... for my $subrule_name (@subrule_names) { if (++$subrule_names{$subrule_name} == 2) { _debug_notify( warn => "Multiple definitions for <$subrule_name>", "(only the first definition will be used)", ); } } # Add the built-ins... @subrule_names{'ws', 'hk', 'matchpos', 'matchline'} = (1) x 4; # An empty main rule will never match anything... my $main_regex = shift @defns; if ($main_regex =~ m{\A (?: \s++ | \(\?\# [^)]* \) | \# [^\n]++ )* \z}xms) { _debug_notify( error => "No main regex specified before rule definitions.", "Grammar will never match anything.", "(Did you forget a specification?)", q{}, ); } # Compile the regex or grammar... my $regex = q{}; my $grammar_name; my $is_grammar; # Is this a grammar specification? if ($main_regex =~ $GRAMMAR_DIRECTIVE) { # Normalize grammar name and report... $grammar_name = $+{grammar_name}; if ($grammar_name !~ /::/) { $grammar_name = caller(1) . "::$grammar_name"; } $is_grammar = 1; # Add subrule definitions to namespace... for my $subrule_name (@subrule_names) { $CACHE{$grammar_name.'::'.$subrule_name} = 1; } } else { state $dummy_grammar_index = 0; $grammar_name = '______' . $dummy_grammar_index++; } # Extract any inheritance information... my ($inherited_rules, $inherited_subrule_names) = _extract_inheritances( $main_regex, $compiletime_debugging_requested, $grammar_name ); # Remove requests... $main_regex =~ s{ $EXTENDS_DIRECTIVE }{}gxms; # Add inherited subrule names to allowed subrule names; @subrule_names{ keys %{$inherited_subrule_names} } = values %{$inherited_subrule_names}; # Remove comments from top-level grammar... $main_regex =~ s{ \(\?\# [^)]* \) | (? }{}gxms) ? 1 : ($main_regex =~ s{ < context \s* : \s* > }{}gxms) ? 0 : 0; # If so, set up to save the grammar... if ($is_grammar) { # Normalize grammar name and report... if ($grammar_name !~ /::/) { $grammar_name = caller(1) . "::$grammar_name"; } if ($compiletime_debugging_requested) { _debug_notify( info => "Processing definition of grammar $grammar_name...", q{}, ); } # Remove the grammar directive... $main_regex =~ s{ $GRAMMAR_DIRECTIVE | < debug: \s* (run | match | step | try | on | off | same ) \s* > }{}gxms; # Check for anything else in the main regex... if ($main_regex =~ /\S/) { _debug_notify( warn => "Unexpected item before first subrule specification", "in definition of :", map({ " $_"} grep /\S/, split "\n", $main_regex), "(this will be ignored when defining the grammar)", ); } # Remember set of valid subrule names... $subrule_names_for{$grammar_name} = { map({ ($_ => 1) } keys %subrule_names), map({ ($grammar_name.'::'.$_ => 1) } grep { !/::/ } keys %subrule_names), }; } else { #...not a grammar specification # Report how main regex was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Processing the main regex before any rule definitions", ); } # Any actual regex is processed first... $regex = _translate_subrule_calls( $grammar_name, $main_regex, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, q{}, # Expected...what? \%subrule_names, 0, # Whitespace isn't magical $nocontext, ); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of main regex}, q{}, ); } } # Then iterate any following rule definitions... while (@defns) { # Grab details of each rule defn (as extracted by previous split)... my ($objectify, $type, $qualifier, $name, $callname, $body) = splice(@defns, 0, 6); $name //= $callname; my $qualified_name = $grammar_name.'::'.$callname; # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Defining a $type: <$callname>", " |...Returns: " . ($objectify ? "an object of class '$qualifier$name'" : "a hash"), ); } # Translate any nested <...> constructs... $body = _translate_subrule_calls( $grammar_name, $body, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, $callname, # Expected...what? \%subrule_names, $type eq 'rule', # Is whitespace magical? $nocontext, # Start with the global nocontextuality ); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of rule definition}, q{}, ); } # Make allowance for possible local whitespace definitions... my $local_ws_defn = q{}; my $local_ws_call = q{(?&ws)}; # Rules make non-code literal whitespace match textual whitespace... if ($type eq 'rule') { # Implement any local whitespace definition... if ($body =~ s{$WS_PATTERN}{}oxms) { my $defn = $1; if ($defn !~ m{\S}xms) { _debug_notify( warn => qq{Ignoring useless empty directive.}, qq{(Did you mean to use a token instead?)}, ); } state $ws_counter = 0; $ws_counter++; $local_ws_defn = qq{(?<__RG_ws_$ws_counter> $defn)}; $local_ws_call = qq{(?&__RG_ws_$ws_counter)}; } # Implement auto-whitespace... state $CODE_OR_SPACE = qr{ \( \?\?? (?&BRACED) \) | (? \{ (?: \\. | (?&BRACED) | [^{}] )* \} ) ) }xms; $body =~ s{($CODE_OR_SPACE)} [ substr($1,0,3) eq '(?{' || substr($1,0,4) eq '(??{' ? $1 : $local_ws_call ]exmsg; #} } elsif ($body =~ s{$WS_PATTERN}{}oxms) { _debug_notify( warn => qq{Ignoring useless directive in a token definition.}, qq{(Did you mean to use a rule instead?)}, ); } $regex .= _translate_rule_def( $type, $qualifier, $name, $callname, $qualified_name, $body, $objectify, $local_ws_defn ); } # Insert checkpoints into any user-defined code block... $regex =~ s{ \( \?\?? \{ \K (?!;) }{ local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; }xmsg; # Check for any suspicious left-overs from the start of the regex... pos $regex = 0; # Report anything that starts like a subrule, but isn't... my %seen = ( '' => 1, '' => 1, '' => 1, '' => 1); # autogenerated while ($regex =~ m{( (?] )}gxms) { my $construct = $1; my $something = $2 ? 'directive' : 'subrule call'; # Only report potential problems once... next if $seen{$construct}++; # Also explain how to indicate the construct is intentional... _debug_notify( warn => qq{Possible invalid $something:}, qq{ $construct}, qq{(To silence this warning, use: \\$construct}, ); _debug_notify( q{} => q{} ); } # If a grammar definition, save grammar and return a placeholder... if ($is_grammar) { $user_defined_grammar{$grammar_name} = $regex; return qq{(?{ warn "Can't match against \n"; })(*COMMIT)(?!)}; } # Otherwise, aggregrate the final grammar... else { return _complete_regex($regex.$inherited_rules, $pre_match_debug, $post_match_debug, $nocontext); } } sub _complete_regex { my ($regex, $pre_match_debug, $post_match_debug, $nocontext) = @_; return $nocontext ? qq{$pre_match_debug$PROLOGUE$regex$EPILOGUE_NC$post_match_debug} : qq{$pre_match_debug$PROLOGUE$regex$EPILOGUE$post_match_debug}; } 1; # Magic true value required at end of module __END__ =head1 NAME Regexp::Grammars - Add grammatical parsing features to Perl 5.10 regexes =head1 VERSION This document describes Regexp::Grammars version 1.014 =head1 SYNOPSIS use Regexp::Grammars; my $parser = qr{ (?: # Parse and save a Verb in a scalar <.ws> # Parse but don't save whitespace # Parse and save a Noun in a scalar 0.5 ? 'VN' : 'VerbNoun' }> # Save result of expression in a scalar | (?: <[Noun]> # Parse a Noun and save result in a list (saved under the key 'Noun') <[PostNoun=ws]> # Parse whitespace, save it in a list # (saved under the key 'PostNoun') )+ # Parse a Verb and save result in a scalar (saved under the key 'Verb') # Save a literal in a scalar | # Turn on the integrated debugger here <.Cmd= (?: mv? )> # Parse but don't capture a subpattern (name it 'Cmd' for debugging purposes) <[File]>+ # Parse 1+ Files and save them in a list (saved under the key 'File') # Turn off the integrated debugger here # Parse a File and save it in a scalar (saved under the key 'Dest') ) ################################################################ # Define a subrule named File <.ws> # - Parse but don't capture whitespace # - Parse the subpattern and capture # matched text as the result of the # subrule # Define a subrule named Noun cat | dog | fish # - Match an alternative (as usual) # Define a whitespace-sensitive subrule eats # - Match a literal (after any space) ? # - Parse optional subrule Noun and # save result under the key 'Object' | # Or else... # - Parse subrule AUX and save result # - Match a literal, save under 'part' # Define a whitespace-insensitive subrule (has | is) # - Match an alternative and capture (?{ $MATCH = uc $^N }) # - Use captured text as subrule result }x; # Match the grammar against some text... if ($text =~ $parser) { # If successful, the hash %/ will have the hierarchy of results... process_data_in( %/ ); } =head1 QUICKSTART CHEATSHEET =head2 In your program... use Regexp::Grammars; Allow enhanced regexes in lexical scope %/ Result-hash for successful grammar match =head2 Defining and using named grammars... Define a named grammar that can be inherited Current grammar inherits named grammar's rules =head2 Defining rules in your grammar... Define rule with magic whitespace Define rule without magic whitespace Define rule that blesses return-hash into class Define token that blesses return-hash into class Shortcut for above (rule name derived from class) Shortcut for above (token name derived from class) =head2 Matching rules in your grammar... Call named subrule (may be fully qualified) save result to $MATCH{RULENAME} Call named subrule, passing args to it Call subrule and fail if it matches (shorthand for (?!<.RULENAME>) ) <:IDENT> Match contents of $ARG{IDENT} as a pattern <\:IDENT> Match contents of $ARG{IDENT} as a literal Match closing delimiter for $ARG{IDENT} <%HASH> Match longest possible key of hash <%HASH {PAT}> Match any key of hash that also matches PAT <\IDENT> Match the literal contents of $MATCH{IDENT} Match closing delimiter for $MATCH{IDENT} Call subrule, save result in $MATCH{ALIAS} Match a hash key, save key in $MATCH{ALIAS} Match pattern, save match in $MATCH{ALIAS} Execute code, save value in $MATCH{ALIAS} Save specified string in $MATCH{ALIAS} Save specified number in $MATCH{ALIAS} Match /$MATCH{IDENT}/, save as $MATCH{ALIAS} Match '$MATCH{IDENT}', save as $MATCH{ALIAS} Match closing delim, save as $MATCH{ALIAS} <.SUBRULE> Call subrule (one of the above forms), but don't save the result in %MATCH <[SUBRULE]> Call subrule (one of the above forms), but append result instead of overwriting it + % Match one or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 ** Same (only for backwards compatibility) * % Match zero or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 =head2 In your grammar's code blocks... $CAPTURE Alias for $^N (the most recent paren capture) $CONTEXT Another alias for $^N $INDEX Current index of next matching position in string %MATCH Current rule's result-hash $MATCH Magic override value (returned instead of result-hash) %ARG Current rule's argument hash $DEBUG Current match-time debugging mode =head2 Directives... Fail if code evaluates false Fail if matching takes too long Change match-time debugging mode Change debugging log file (default: STDERR) Queue error message and fail parse Queue error message and backtrack Queue warning message and continue Explicitly add a message to debugging log Override automatic whitespace matching Simplify the result of a subrule match Switch on context substring retention Switch off context substring retention =head1 DESCRIPTION This module adds a small number of new regex constructs that can be used within Perl 5.10 patterns to implement complete recursive-descent parsing. Perl 5.10 already supports recursive=descent I, via the new C<< (?...) >> and C<< (?&name) >> constructs. For example, here is a simple matcher for a subset of the LaTeX markup language: $matcher = qr{ (?&File) (?(DEFINE) (? (?&Element)* ) (? \s* (?&Command) | \s* (?&Literal) ) (? \\ \s* (?&Literal) \s* (?&Options)? \s* (?&Args)? ) (? \[ \s* (?:(?&Option) (?:\s*,\s* (?&Option) )*)? \s* \]) (? \{ \s* (?&Element)* \s* \} ) (?