use 5.010; use strict; use warnings; package MarpaX::Repa::Lexer; our $VERSION = '0.01'; =head1 NAME MarpaX::Repa::Lexer - simplify lexing for Marpa parser =head1 DESCRIPTION Most details are in L. =head1 METHODS =head2 new Returns a new lexer instance. Takes named arguments. my $lexer = MyLexer->new( tokens => { word => qr{\b\w+\b}, }, store => 'array', recognizer => $recognizer, debug => 1, ); Possible arguments: =over 4 =item tokens Hash with names of terminals as keys and one of the following as values: =over 4 =item string Just a string to match. 'a token' => "matches this long string", =item regular expression A C compiled regexp. 'a token' => qr{"[^"]+"}, Note that regexp MUST match at least one character. At this moment look behind to look at chars before the current position is not supported. =item hash With hash you can define token specific options. At this moment 'store' option only (see below). Use C key to set what to match (string or regular expression). 'a token' => { match => "a string", store => 'hash', }, =back =item store What to store (pass to Marpa's recognizer). The following variants are supported: =over 4 =item hash (default) { token => 'a token', value => 'a value' } =item array [ 'a token', 'a value' ] =item scalar 'a value' =item undef undef is stored so later Repa's actions will skip it. =item a callback A function will be called with token name and reference to its value. Should return a reference or undef that will be passed to recognizer. =back =item recognizer L object or its subclass. =item debug If true then lexer prints debug log to STDERR. =item min_buffer Minimal size of the buffer (4*1024 by default). =back =cut sub new { my $proto = shift; my $self = bless { @_ }, ref $proto || $proto; return $self->init; } =head2 init Setups instance and returns C<$self>. Called from constructor. =cut sub init { my $self = shift; my $tokens = $self->{'tokens'}; foreach my $token ( keys %$tokens ) { my ($match, @rest); if ( ref( $tokens->{ $token } ) eq 'HASH' ) { $match = $tokens->{ $token }{'match'}; @rest = ($tokens->{ $token }{'store'}); } else { $match = $tokens->{ $token }; } $rest[0] ||= $self->{'store'} || 'hash'; my $type = ref $match ? 'RE' : length $match == 1 ? 'CHAR' : 'STRING'; $tokens->{ $token } = [ $type, $match, @rest ]; } $self->{'min_buffer'} //= 4*1024; $self->{'buffer'} //= ''; return $self; } =head2 recognize Takes a file handle and parses it. Dies on critical errors, not when parser lost its way. Returns recognizer that was passed to L. =cut sub recognize { my $self = shift; my $fh = shift; my $rec = $self->{'recognizer'}; my $buffer = $self->buffer; my $buffer_can_grow = $self->grow_buffer( $fh ); my $expected = $rec->terminals_expected; return $rec unless @$expected; while ( length $$buffer ) { say STDERR "Expect token(s): ". join(', ', map "'$_'", @$expected) if $self->{'debug'}; say STDERR "Buffer start: ". $self->dump_buffer .'...' if $self->{'debug'}; my $first_char = substr $$buffer, 0, 1; foreach my $token ( @$expected ) { REDO: my ($matched, $match, $length); my ($type, $what, $how) = @{ $self->{'tokens'}{ $token } || [] }; unless ( $type ) { say STDERR "Unknown token: '$token'" if $self->{'debug'}; next; } elsif ( $type eq 'RE' ) { if ( $$buffer =~ /^($what)/ ) { ($matched, $match, $length) = (1, $1, length $1); if ( $length == length $$buffer && $buffer_can_grow ) { $buffer_can_grow = $self->grow_buffer( $fh ); goto REDO; } } } elsif ( $type eq 'STRING' ) { $length = length $what; ($matched, $match) = (1, $what) if $what eq substr $$buffer, 0, $length; } elsif ( $type eq 'CHAR' ) { ($matched, $match, $length) = (1, $first_char, 1) if $what eq $first_char; } else { die "Unknown type $type"; } unless ( $matched ) { say STDERR "No '$token' in ". $self->dump_buffer if $self->{'debug'}; next; } unless ( $length ) { die "Token '$token' matched empty string. This is not supported."; } say STDERR "Token '$token' matched ". $self->dump_buffer( $length ) if $self->{'debug'}; if ( ref $how ) { $match = $how->( $token, \"$match" ); } elsif ( $how eq 'hash' ) { $match = \{ token => $token, value => $match }; } elsif ( $how eq 'array' ) { $match = \[$token, $match]; } elsif ( $how eq 'scalar' ) { $match = \"$match"; } elsif ( $how eq 'undef' ) { $match = \undef; } else { die "Unknown store variant - '$how'"; } $rec->alternative( $token, $match, $length ); } my $skip = 0; while (1) { # XXX: we are done, no way to advance further, we would love this # to be improved in Marpa if ( $rec->current_earleme == $rec->thin->furthest_earleme ) { return $rec; } $skip++; local $@; if ( defined (my $events = eval { $rec->earleme_complete }) ) { if ( $events && $rec->exhausted ) { substr $$buffer, 0, $skip, ''; return $rec; } $expected = $rec->terminals_expected; last if @$expected; } else { say STDERR "Failed to parse: $@" if $self->{'debug'}; return $rec; } } substr $$buffer, 0, $skip, ''; $buffer_can_grow = $self->grow_buffer( $fh ) if $buffer_can_grow && $self->{'min_buffer'} > length $$buffer; say STDERR '' if $self->{'debug'}; } return $rec; } =head2 buffer Returns reference to the current buffer. =cut sub buffer { \$_[0]->{'buffer'} } =head2 grow_buffer Called when L needs a re-fill with a file handle as argument. Returns true if there is still data to come from the handle. =cut sub grow_buffer { my $self = shift; local $/ = \($self->{'min_buffer'}*2); $self->{'buffer'} .= readline($_[0]) // return 0; return 1 && $self->{'min_buffer'}; } =head2 dump_buffer Returns first 20 chars of the buffer with everything besides ASCII encoded with C<\x{####}>. Use argument to control size, zero to mean whole buffer. =cut sub dump_buffer { my $self = shift; my $show = shift // 20; my $str = $show? substr( $self->{'buffer'}, 0, $show ) : $self->{'buffer'}; (my $res = $str) =~ s/([^\x20-\x7E])/'\\x{'. hex( ord $1 ) .'}' /ge; return $res; } 1;