package Math::Expression::Evaluator::Parser; =head1 NAME Math::Expression::Evaluator::Parser - Parse mathematical expressions =head1 SYNOPSIS use Math::Expression::Evaluator::Parser; my $exp = '2 + a * 4'; my $ast = Math::Expression::Evaluator::Parser::parse($exp, {}); # $ast is now something like this: # $ast = ['+', # 2, # ['*', # ['$', 'a'], # 4 # ] # ]; =head1 DESCRIPTION This module parses a mathematical expression in usual notation, and turns it into an Abstract Syntax Tree (AST). If you want to have a simple interface and want to evaluate these ASTs, use L. The AST is a tree that consists of nested array refs. The first item is a string (until now always a single character), and denotes the type of the node. The rest of the items in the array is a list of its arguments. For the mathematical symbols C<+>, C<->, C<*>, C, C<^> (exponentation) this is straight forward, but C and C<-> are always treated as prefix ops, so the string '2 - 3' is actually turned into C<['+', 2, ['-', 3]]>. Other AST nodes are =over 4 =item '$' C<['$', $var_name]> represents a variable. =item '{' C<['{', $expr1, $expr2, ... ]> represents a block, i.e. a list of expressions. =item '=' C<['=', $var, $expr]> represents an assignment, where C<$expr> is assigned to C<$var>. =item '&' C<['&', $name, @args]> is a function toll to the function called C<$name>. =back =head1 METHODS =over =item parse C takes a string and a hash ref, where the hash ref takes configuration parameters. Currently the only allowed option is C. If set to a true value, it forces statements to be forced by semicolons (so C<2 3> will be forbidden, C<2; 3> is still allowed). C throws an exception on parse errors. =back =cut use strict; use warnings; use Math::Expression::Evaluator::Lexer qw(lex); use Math::Expression::Evaluator::Util qw(is_lvalue); use Carp qw(confess); use Data::Dumper; my @input_tokens = ( ['ExpOp' => '\^'], ['MulOp' => qr{[*/%]}], ['AddOp' => '\+|-'], # This regex is 'stolen' from Regexp::Common, and a bit simplified # Copyright by Damian Conway and Abigail, 2001-2005 ['Float' => "[+-]?(?=[0-9]|[.])[0-9]*(?:[.][0-9]*)?(?:[eE](?:[+-]?[0-9]+)|)"], ['OpenParen' => '\('], ['ClosingParen' => '\)'], ['Colon' => ';'], ['Comma' => ','], ['AssignmentOp' => '='], ['Name' => '[a-zA-Z_][a-zA-Z_0-9]*'], ['Whitespace' => '\s+', sub {return undef}], ['Comment' => qr/\#.*?$/, sub {return undef}], ); my %token_description = ( ExpOp => 'Operator', MulOp => 'Operator', AddOp => 'Operator', AssignmentOp => 'Operator', Float => 'Term', Name => 'Term', ); sub parse { my ($text, $parse_opts) = @_; # note that this object is only used internally, to the # world outside we hide it. my $self = bless {}; $self->{config} = $parse_opts; $self->{tokens} = lex($text, \@input_tokens); $self->{token_pointer} = 0; return $self->_program(); } # checks if the next token is what you expected, for example # _is_next_token("AddOp") checks if the next token is a '+' or '-' sub _is_next_token { my $self = shift; my $cmp = shift; if (defined $self->_next_token() && $self->_next_token()->[0] eq $cmp){ return $self->_next_token->[1]; } } # basically the same _is_next_token, but does an arbitrary number of lookahead # steps. sub _lookahead { my $self = shift; my $i = 0; while (my $v = shift){ return undef unless($self->{tokens}[$self->{token_pointer}+$i]); my $ref = $self->{tokens}[$self->{token_pointer} + $i]->[0]; return undef unless($ref eq $v); $i++; } return 1; } # move the token pointer one step further. sub _proceed { my $self = shift; $self->{token_pointer}++; } # returns the next not-yet-parsed token sub _next_token { my $self = shift; return $self->{tokens}[$self->{token_pointer}]; } # program -> statement* # parse a program, e.g. a collection of statements. # The corrsponding AST looks like this: ['{', $s1, $s2, $s3, ... ] sub _program { my $self = shift; my @res = ('{'); while (defined $self->_next_token()){ push @res, $self->_statement(); } return _return_simplify(@res); } # generates an error message that something was expected but not found, # for example 'a + +' would warn that a value was expected, but an AddOp # was found. sub _expected { my $self = shift; if (scalar @_ > 1){ confess("Parse error: Expected $_[0]; got: '$_[1]'\n" . "near character " . $self->_next_token->[2] . "\n"); } else { confess("Parse error: Expected $_[0]\n" . "near character " . $self->_next_token->[2] . "\n"); } } # matches a specific token, and returns its text if successfull. Dies if # unsuccessfull. sub _match { my $self = shift; my $m = shift; my $val; confess("Expected $m, got EOF") unless ref $self->_next_token(); if ($self->_next_token()->[0] eq $m){ $val = $self->_next_token()->[1]; $self->_proceed(); return $val; } else { $self->_expected($m, $self->_next_token()->[0]); } } # -> | | # parses a single value: a float, a function call or a variable name # returns the corresponding AST. sub _value { my $self = shift; if ($self->_lookahead("Name", "OpenParen")){ return $self->_function_call(); } elsif ($self->_is_next_token("Name")){ return $self->_get_variable(); } else { return $self->_match("Float"); } } # -> '(' [ [',' ]* ]? ')' # parses a function call, the AST looks like this: ['&', $name, @args] sub _function_call { my $self = shift; my @res = ('&', $self->_match("Name")); $self->_match("OpenParen"); if ($self->_is_next_token("ClosingParen")){ $self->_proceed(); return \@res; } push @res, $self->_expression(); while ($self->_is_next_token("Comma")){ $self->_proceed(); last if $self->_is_next_token('ClosingParen'); push @res, $self->_expression(); } $self->_match("ClosingParen"); return \@res; } # -> m/[a-zA-Z_]\w*/ # parses a variable name, and returns it sub _get_variable { my $self = shift; my $var_name = $self->_match("Name"); return ['$', $var_name]; } # -> <_assignment> | # parses a statement, eg an _assignment or an expression. sub _statement { my $self = shift; my $e = $self->_expression(); if ($self->_is_next_token("AssignmentOp")){ $e = $self->_assignment($e); } if ($self->{config}->{force_semicolon}){ # forced semicolon at the and of a statement, but last statement # isn't forced to have one. if ($self->_next_token()){ $self->_match("Colon"); } } else { # optional semicolon at end of statement if ($self->_is_next_token("Colon")){ $self->_proceed(); } } return $e; } # <_assignment> ::= '=' # expects the lvalue as first argument sub _assignment { my $self = shift; my $e = shift; $self->_match("AssignmentOp"); my $val = $self->_expression(); if (is_lvalue($e)){ return ['=', $e, $val]; } else { confess("Not an lvalue in _assignment"); } } # ::= [('*'|'/') ]* # the AST is a bit weird, a simple product is expressed as # ['*', $v1, $v2, ... ] # a division is a bit more complex: # a / b / c becomes ['*', a, ['/', b], ['/', c]] sub _term { my $self = shift; my $val = $self->_exponential(); my @res = ('*', $val); while (my $op = $self->_is_next_token("MulOp")){ if ($op eq '*'){ $self->_proceed(); push @res, $self->_exponential(); } elsif ($op eq '/'){ $self->_proceed(); push @res, ['/', $self->_exponential()]; } elsif ($op eq '%') { $self->_proceed(); # XXX not very efficient @res = ('*', ['%', [@res], $self->_exponential()]); } else { die "Don't know how to handle MulOp $op\n"; } } return _return_simplify(@res); } # ::= ['+'|'-']? [('+'|'-') term]* sub _expression { my $self = shift; # print STDERR "expression...\n"; my @res = ('+'); if (my $op = $self->_is_next_token("AddOp")){ # unary +/- $self->_proceed(); if ($op eq '+'){ push @res, $self->_term(); } else { push @res, ['-', $self->_term()]; } } else { push @res, $self->_term(); } while (my $op = $self->_is_next_token("AddOp")){ if ($op eq '+'){ $self->_proceed(); push @res, $self->_term(); } else { # a '-' $self->_proceed(); push @res, ['-', $self->_term()]; } } return _return_simplify(@res); } # ::= | '(' ')' sub _factor { my $self = shift; my $val; if ($self->_is_next_token("OpenParen")){ $self->_match("OpenParen"); $val = $self->_expression(); $self->_match("ClosingParen"); } else { $val = $self->_value(); } return $val; } # ::= [ '^' ]? # note that 2**3**4 is not defined sub _exponential { my $self = shift; my $val = $self->_factor(); if ($self->_is_next_token("ExpOp")){ $self->_match("ExpOp"); return ['^', $val, $self->_factor()]; } else { return $val; } } sub _return_simplify { return $_[1] if @_ == 2; return \@_; } 1; # vim: sw=4 ts=4 expandtab