#!/usr/bin/perl =pod 2005 September 7 This file is a development snapshot of the prolog regex engine I started over the weekend. A "look at what I'm up to". It passes most (95%) of a perl 5.9 re test file (re_tests), but it's real p5 re coverage is much less. The test passing was simply to permit refactoring with confidence. It uses Language::Prolog::Yaswi, which requires swi-prolog be installed with non-default settings, and so is not intended for "general" use. It's more of a pugs bootstrap exercise. This code is currently completely in flux. It is simultaneously being overhauled, cleaned up, and p6 added. It's basically three days old, and only now is changing from an experimental "let's see if something like this can work" to "ok, this looks doable". So yes, the code is an utter mess. The project objectives are nominally: 1- Provide an environment in which p6 rules can be run against strings, sufficient to permit recommencing writing a grammar for perl6. This will likely be pure p5/prolog, with no p6 runtime associated. Then, - aid in prototyping the regexp system - the regexp tree, its compilation to pil, how regexp modification macros work, etc. Working with a p5 accessible runtime (pil2js or pilrun), and/or motivating implementations in p5/p6/haskell. - perhaps serve as a reference implementation for a while. By sacrificing ease of installation and portability, we may have bought fairly simple code with tolerable performance. However, pugs critical path is currently more oo than rules, my time is limited, and there are other bootstrap approaches to rules, so this file may or may not go anywhere. The plan as of this moment is to continue cleaning and adding p6, tuck it into the pugs distribution somewhere with test files, and do objective 1 above. If anyone would like to help out, let me know, and I'll upload more of the development environment. Recognition is due Salvador Fandino Garcia for Language::Prolog::Yaswi, http://search.cpan.org/~salva/Language-Prolog-Yaswi-0.08/ and Robert D. Cameron for his "Perl Style Regular Expressions in Prolog". http://www.cs.sfu.ca/people/Faculty/cameron/Teaching/383/regexp-plg.html Without their work, I almost certainly would have punted. Also to iblech, and Michael Schilli of JavaScript::SpiderMonkey, whose successful work got me thinking about trying a prolog plugin. Fyi, Yaswi apparently requires swi prolog be built ./configure --enable-shared. =cut # export LD_LIBRARY_PATH=/usr/local/lib/pl-5.5.27/lib/x86_64-linux my $prolog = <<'END'; %====================================================================== :- style_check(-discontiguous). % remove is plausible p5_start(Z) --> p5_concatenation(W), p5_alternatives(W, Z), !. p5_start(noop) --> {true}. p5_alternatives(W, Z) --> "|", p5_concatenation(X), p5_alternatives(union(W,X), Z), !. p5_alternatives(W, W) --> {true}. p6_start(Z) --> p6_conjunct(W), p6_alternatives(W, Z). p6_conjunct(Z) --> p6_concatenation(W), p6_conjunctions(W,Z). p6_alternatives(W, Z) --> "|", p6_conjunction(X), p6_alternatives(union(W,X), Z), !. p6_alternatives(W, W) --> {true}. p6_conjunctions(W, Z) --> "&", p6_concatenation(X), p6_conjunctions(intersect(W,X), Z), !. p6_conjunctions(W, W) --> {true}. p5_concatenation(Z) --> p5_quantified(W), p5_concatenation_more(W, Z), !. p5_concatenation(noop) --> {true}. p5_concatenation_more(W, Z) --> p5_quantified(X), p5_concatenation_more(conc(W,X), Z), !. p5_concatenation_more(W, W) --> {true}. p6_concatenation(Z) --> p6_quantified(W), p6_concatenation_more(W, Z), !. p6_concatenation_more(W, Z) --> p6_quantified(X), p6_concatenation_more(conc(W,X), Z), !. p6_concatenation_more(W, W) --> {true}. p5_quantified(Z) --> p5_elemental(W), p5_quantified_op(W, Z). p5_quantified_op(W, Z) --> p56_quantified_op(W,Z), !. p5_quantified_op(W, Z) --> p5_range(W, Z), !. p5_quantified_op(W, W) --> {true}. p6_quantified(Z) --> p6_elemental(W), p6_quantified_op(W, Z). p6_quantified_op(W, Z) --> p56_quantified_op(W,Z), !. p6_quantified_op(W, Z) --> p6_range(W, Z), !. p6_quantified_op(W, W) --> {true}. p56_quantified_op(W, ques_ng(W)) --> "??", !. p56_quantified_op(W, star_ng(W)) --> "*?", !. p56_quantified_op(W, plus_ng(W)) --> "+?", !. p56_quantified_op(W, ques(W)) --> "?", !. p56_quantified_op(W, star(W)) --> "*", !. p56_quantified_op(W, plus(W)) --> "+", !. p5_range(W, Z) --> "{", !, p5_range_bounds(Min,Max), "}", p5_range_greed(W,Min,Max,Z), !. p5_range_greed(W,Min,Max,repeat_ng(W,Min,Max)) --> "?", !. p5_range_greed(W,Min,Max, repeat(W,Min,Max)) --> {true}. p5_range_bounds(Min,Max) --> digits_int(Min), !, p5_range_bounds_RHS(Min,Max), !. p5_range_bounds( 0 ,Max) --> ",", digits_int(Max). p5_range_bounds_RHS(_Min,Max) --> ",", digits_int(Max), !. p5_range_bounds_RHS(_Min,-) --> ",", !. p5_range_bounds_RHS(Min ,Min) --> {true}. %# XXX - spaces permitted around ** p6_range(W, Z) --> "**{", !, p6_range_bounds(Min,Max), "}", p6_range_greed(W,Min,Max,Z), !. p6_range_greed(W,Min,Max,repeat_ng(W,Min,Max)) --> "?", !. p6_range_greed(W,Min,Max, repeat(W,Min,Max)) --> {true}. p6_range_bounds(Min,Max) --> digits_int(Min), !, p6_range_bounds_RHS(Min,Max), !. p6_range_bounds( 0 ,Max) --> "...", digits_int(Max). p6_range_bounds_RHS(_Min,Max) --> "..", digits_int(Max), !. p6_range_bounds_RHS(_Min,-) --> "...", !. p6_range_bounds_RHS(Min ,Min) --> {true}. p5_elemental(group(R)) --> "(?:", !, p5_start(R), ")". p5_elemental(standalone(R)) --> "(?>", !, p5_start(R), ")". p5_elemental(lookahead(R)) --> "(?=", !, p5_start(R), ")". p5_elemental(lookahead_not(R)) --> "(?!", !, p5_start(R), ")". p5_elemental(lookbehind(R)) --> "(?<=", !, p5_start(Rx), {re_reverse(Rx,R)}, ")". p5_elemental(lookbehind_not(R)) --> "(? "(?", p5_condition_body(Rk,R1,R2), !. p5_elemental(X) --> "(?", p5_modifier_spec(M), !, p5_modifier_scope(M,X). p5_elemental(cap(X)) --> "(", !, p5_start(X), ")". p5_elemental(bos) --> "\\A". p5_elemental(eosnl) --> "\\Z". p5_elemental(eos) --> "\\z". p5_elemental(bol) --> "^". p5_elemental(eol) --> "$". %" p5_elemental(word_boundary) --> "\\b", !. p5_elemental(word_boundary_not) --> "\\B", !. p5_elemental(dot) --> ".". p5_elemental(R) --> p56_char_class(R). p5_elemental(R) --> p5_simple_escape_sequence(R). p5_elemental(negSet(X)) --> "[^", !, p5_charset_def(X), "]". p5_elemental(posSet(X)) --> "[", p5_charset_def(X), "]". p5_elemental(matchref(I)) --> p5_matchref(I). p5_elemental(ws(C)) --> [C], {isSpacePerl(C)}, !. p5_elemental(char(C)) --> [C], {\+(p5_metachar([C]))}. p5_elemental(char(C)) --> "\\", [C], {p5_metachar([C])}. %# XXX - needs overhaul p5_metachar("\\"). p5_metachar("\|"). p5_metachar("?"). p5_metachar("*"). p5_metachar("+"). p5_metachar("\."). p5_metachar("["). p5_metachar("^"). p5_metachar("$"). %" p5_metachar("("). p5_metachar(")"). p5_metachar("{"). isP6metacharacter(C) :- member(C,"\{\}\[\]\(\)\^\$\.\|\*\+\?\#\\"). p6_metachar(C) --> [C], {isP6metacharacter(C)}. p5_matchref(I) --> "\\", digits_int(I1), {I is I1 - 1}, {I1 \= 0}, {(I1 =< 9; fail)}, !. p5_condition_body(Rk,R1,R2) --> p5_condition_test(Rk), p5_concatenation(R1), p5_condition_alternative(R2). p5_condition_alternative(R) --> "|", p5_concatenation(R), ")", !. p5_condition_alternative(noop) --> ")". p5_condition_test(lookahead(R)) --> p5_elemental(lookahead(R)). p5_condition_test(lookahead_not(R)) --> p5_elemental(lookahead_not(R)). p5_condition_test(lookbehind(R)) --> p5_elemental(lookbehind(R)). p5_condition_test(lookbehind_not(R)) --> p5_elemental(lookbehind_not(R)). p5_condition_test(condi(I)) --> "(", digits_int(I1), ")", {I is I1 - 1}. p5_modifier_scope(M,modify(M)) --> ")", !. p5_modifier_scope(M,group_modify(M,R)) --> ":", !, p5_start(R), ")", !. p5_modifier_spec(L) --> p5_modifier_spec_pos(L). p5_modifier_spec_pos([i|L]) --> "i", p5_modifier_spec_pos(L), !. p5_modifier_spec_pos([m|L]) --> "m", p5_modifier_spec_pos(L), !. p5_modifier_spec_pos([s|L]) --> "s", p5_modifier_spec_pos(L), !. p5_modifier_spec_pos([x|L]) --> "x", p5_modifier_spec_pos(L), !. p5_modifier_spec_pos(L) --> "-", p5_modifier_spec_neg(L), !. p5_modifier_spec_pos([]) --> {true}. p5_modifier_spec_neg([-i|L]) --> "i", p5_modifier_spec_neg(L), !. p5_modifier_spec_neg([-m|L]) --> "m", p5_modifier_spec_neg(L), !. p5_modifier_spec_neg([-s|L]) --> "s", p5_modifier_spec_neg(L), !. p5_modifier_spec_neg([-x|L]) --> "x", p5_modifier_spec_neg(L), !. p5_modifier_spec_neg([]) --> {true}. p5_charset_def([char(45)|L]) --> "-", p5_charset_items_opt(L0), p5_charset_def_x(L0,L), !. % 45 is - p5_charset_def([char(93)|L]) --> "]", p5_charset_items_opt(L0), p5_charset_def_x(L0,L), !. % 93 is ] p5_charset_def(L) --> p5_charset_items(L0), p5_charset_def_x(L0,L). p5_charset_def_x(L0,L1) --> "-", {append(L0,[char(45)],L1)}, !. p5_charset_def_x(L0,L0) --> {true}. p5_charset_items([Item1|MoreItems]) --> p5_charset_item(Item1), p5_charset_items_opt(MoreItems). p5_charset_items_opt([Item1|MoreItems]) --> p5_charset_item(Item1), p5_charset_items_opt(MoreItems). p5_charset_items_opt([]) --> {true}. p5_charset_item(class_neg(X)) --> "[:^", charset_identifier(X), ":]", !. p5_charset_item(class_pos(X)) --> "[:", charset_identifier(X), ":]", !. p5_charset_item(R) --> p56_char_class(R). p5_charset_item(R) --> p5_simple_escape_sequence(R). p5_charset_item(setrange(A,B)) --> p5_charset_item(char(A)), "-", p5_charset_range_x(B). p5_charset_range_x(_) --> p5_charset_item(class_pos(_)), {!, fail}. p5_charset_range_x(_) --> p5_charset_item(class_neg(_)), {!, fail}. p5_charset_range_x(B) --> p5_charset_item(char(B)). p5_charset_item(char(C)) --> [C], {\+(set_metachar([C]))}. p5_charset_item(char(C)) --> "\\", [C], {set_metachar([C])}. %# XXX - should permit \misc, no? set_metachar("\\"). set_metachar("]"). set_metachar("-"). p5_charset_item(char(C)) --> [C], {[C] = "-"}. charset_identifier(A) --> alnums(L), { atom_codes(A,L) }. p56_char_class(class_pos(digit)) --> "\\d", !. p56_char_class(class_neg(digit)) --> "\\D", !. p56_char_class(class_pos(space_perl)) --> "\\s", !. p56_char_class(class_neg(space_perl)) --> "\\S", !. p56_char_class(class_pos(word)) --> "\\w", !. p56_char_class(class_neg(word)) --> "\\W", !. p6_char_class(class_pos(horizontal_space)) --> "\\h", !. p6_char_class(class_neg(horizontal_space)) --> "\\H", !. p6_char_class(class_pos(vertical_space)) --> "\\v", !. p6_char_class(class_neg(vertical_space)) --> "\\H", !. p6_char_class(class_pos(newline)) --> "\\n", !. %# XXX - not quite? p6_char_class(class_neg(newline)) --> "\\N", !. p5_simple_escape_sequence(char(C)) --> "\\n", {[C] = "\n"}, !. %# XXX - not really p5_simple_escape_sequence(char(C)) --> "\\r", {[C] = "\r"}, !. p5_simple_escape_sequence(char(C)) --> "\\t", {[C] = "\t"}, !. %# XXX - more p6_simple_escape_sequence(char(C)) --> "\\t", {[C] = "\t"}, !. p6_simple_escape_sequence(char(C)) --> "\\r", {[C] = "\r"}, !. p6_simple_escape_sequence(char(C)) --> "\\f", {[C] = "\f"}, !. p6_simple_escape_sequence(char(C)) --> "\\e", {[C] = [0x1B]}, !. % XXX - connect to above p5_comment_opt --> p5_comment(_), !. p5_comment_opt --> {true}. p5_comment(L) --> "(?#", !, p5_comment_body(L), ")". p5_comment_body([C|L]) --> [C], {[C] \== ")"}, p5_comment_body(L), !. p5_comment_body([]) --> {true}. p6_comment --> "#", rest_of_line(_). rest_of_line([10]) --> [10], !. rest_of_line([C|T]) --> [C], rest_of_line(T). rest_of_line([]). p6_octidecimal_escape_sequence(X) --> "\\0", p6_oct_spec(X), !. p6_oct_spec(char(C)) --> octaldigit3_int(C), !. p6_oct_spec(R) --> [Open], brackets(Open,Close), p6_oct_chars(R), [Close]. p6_oct_chars(R) --> octdigits_int(C), p6_oct_chars_more(char(C),R). p6_oct_chars_more(Rc0,conc(R,char(C))) --> ";", octdigits_int(C), p6_oct_chars_more(Rc0,R), !. p6_oct_chars_more(Rc0,Rc0) --> {true}. p6_hexidecimal_escape_sequence(X) --> "\\", ("x";"X"), !, p6_hex_spec(X), !. p6_hex_spec(char(C)) --> hexdigits4_int(C), !. p6_hex_spec(char(C)) --> hexdigits2_int(C), !. p6_hex_spec(R) --> [Open], brackets(Open,Close), p6_hex_chars(R), [Close]. p6_hex_chars(R) --> hexdigits_int(C), p6_hex_chars_more(char(C),R). p6_hex_chars_more(Rc0,conc(R,char(C))) --> ";", hexdigits_int(C), p6_hex_chars_more(Rc0,R), !. p6_hex_chars_more(Rc0,Rc0) --> {true}. p6_named_character(char(C)) --> "\\", ("c","C"), [Open], brackets(Open,Close), p6_named_character_tail(Name,Close), !, uncode_charname_to_char(Name,C), !. p6_named_character_tail([],Close) --> [Close], !. p6_named_character_tail([C,T],Close) --> [C], p6_named_character_tail(T,Close). brackets("{","}"). brackets("(",")"). brackets("[","]"). brackets("<",">"). brackets(">","<"). brackets("/","/"). brackets("!","!"). brackets("=","="). brackets("?","?"). brackets("#","#"). brackets([0xabd],[0xbbd]). brackets([0xbbd],[0xabd]). digits_int(I) --> digit(D0), digits(D), { number_chars(I, [D0|D]) }. digits([D|T]) --> digit(D), digits(T), !. digits([]) --> []. digit(D) --> [D], { code_type(D, digit) }. hexdigits_int(I) --> hexdigit(D0), hexdigits(D), { number_chars(I, [42,120,D0|D]) }.% [42,120]="0x" hexdigits([D|T]) --> hexdigit(D), hexdigits(T), !. hexdigits([]) --> []. hexdigit(D) --> [D], { code_type(D, xdigit(_)) }. hexdigits4_int(I) --> hexdigit(D0), hexdigit(D1), hexdigit(D2), hexdigit(D3), {number_chars(I,[42,120,D0,D1,D2,D3])}. hexdigits2_int(I) --> hexdigit(D0), hexdigit(D1), {number_chars(I,[42,120,D0,D1])}. octdigits_int(I) --> octdigit(D0), octdigits(D), { number_chars(I, [42,111,D0|D]) }.% [42,111]="0o" octdigits([D|T]) --> octdigit(D), octdigits(T), !. octdigits([]) --> []. octdigit(D) --> [D], { member(D,"01234567") }. octdigits3_int(I) --> octdigit(D0), octdigit(D1), octdigit(D2), {number_chars(I,[42,111,D0,D1,D2])}. alnums([C|T]) --> [C], {code_type(C,alnum)}, alnums(T), !. alnums([]) --> {true}. uncode_charname_to_char(Name,C) :- cached_uncode_charname_to_char(Name,C), !. uncode_charname_to_char(Name,C) :- atom_codes(AName,Name), concat_atom(['use charnames ":full";unpack("U*","\\N{',AName,'}\n")'],Cmd), !, perl5_eval(Cmd,[C,10]), !, % XXX - warn on failure assert(cached_uncode_charname_to_char(Name,C)), !. %------------------------- re_reverse(conc(Ra0,Rb0),conc(Rb1,Ra1)) :- re_reverse(Ra0,Ra1), re_reverse(Rb0,Rb1), !. %# XXX - not quite right... need some symetric op pairs. re_reverse(bos,eos). re_reverse(eos,bos). re_reverse(bol,eol). re_reverse(eol,bol). %# XXX - captures in reverse dont currently work. %# XXX - more... re_reverse(R,R). %---------------------------------------------------------------------- re_rewrite(R0,R1) :- re_rewrite(R0,R1,rewr(0),_). re_rewrite(plus(cap(plus(R))),R1,RW0,RW1) :- re_rewrite(cap(plus(R)),R1,RW0,RW1). re_rewrite(cap(R0),capture(N,R1),RW0,RW1) :- !, RW0 = rewr(N), N1 is N + 1, RWx = rewr(N1), re_rewrite(R0,R1,RWx,RW1). re_rewrite(R0,R1,RW0,RW1) :- R0 =.. [P|A], re_rewrite_list(A,A1,RW0,RW1), R1 =.. [P|A1], !. re_rewrite(X,X,RW0,RW0). re_rewrite_list([],[],RW0,RW0). re_rewrite_list([T0|L0],[T1|L1],RW0,RW1) :- re_rewrite(T0,T1,RW0,RWx), re_rewrite_list(L0,L1,RWx,RW1). re_parse(Pat,Re) :- p5_start(Re0,Pat,[]), re_rewrite(Re0,Re). %---------------------------------------------------------------------- rm(noop,F0,F0,B0,B0,E0,E0). rm(standalone(Re),F0,F1,B0,B1,E0,E1) :- rm(Re,F0,F1,B0,B1,E0,E1), !. % this cut is semantic. rm(condition(condi(I),Ra,Rb),F0,F1,B0,B1,E0,E1) :- !, %# XXX - move (is "capture matched" not) ( \+(match_captures_get(I,E0,_)) ; match_captures_get(I,E0,X), var(X) ) -> rm(Rb,F0,F1,B0,B1,E0,E1) ; rm(Ra,F0,F1,B0,B1,E0,E1). rm(condition(Rk,Ra,Rb),F0,F1,B0,B1,E0,E1) :- !, rm(Rk,F0,_,B0,_,E0,_) -> rm(Ra,F0,F1,B0,B1,E0,E1) ; rm(Rb,F0,F1,B0,B1,E0,E1). rm(lookahead(Re),F0,F0,B0,B0,E0,E0) :- rm(Re,F0,_,B0,_,E0,_). rm(lookahead_not(Re),F0,F0,B0,B0,E0,E0) :- \+(rm(Re,F0,_,B0,_,E0,_)). rm(lookbehind(Re),F0,F0,B0,B0,E0,E0) :- rm(Re,B0,_,F0,_,E0,_). rm(lookbehind_not(Re),F0,F0,B0,B0,E0,E0) :- \+(rm(Re,B0,_,F0,_,E0,_)). rm(modify(M),F0,F0,B0,B0,E0,E1) :- set_modifiers(E0,M,E1). rm(group_modify(M,Re),F0,F1,B0,B1,E0,E1) :- set_modifiers(E0,M,Ex), rm(Re,F0,F1,B0,B1,Ex,Ey), set_modifiers_from(Ey,E0,E1). rm(group(Re),F0,F1,B0,B1,E0,E1) :- rm(Re,F0,F1,B0,B1,E0,Ex), set_modifiers_from(Ex,E0,E1). rm(union(Re1,_Re2),F0,F1,B0,B1,E0,E1) :- rm(Re1,F0,F1,B0,B1,E0,E1). rm(union(_Re1,Re2),F0,F1,B0,B1,E0,E1) :- rm(Re2,F0,F1,B0,B1,E0,E1). rm(intersection(Re1,Re2),F0,F1,B0,B1,E0,E1) :- rm(Re1,F0,F1,B0,B1,E0,Ex), rm(Re2,F0,F1,B0,B1,Ex,E1). rm(conc(Re1,Re2),F0,F1,B0,B1,E0,E1) :- rm(Re1,F0,Fx,B0,Bx,E0,Ex), rm(Re2,Fx,F1,Bx,B1,Ex,E1). rm(ques_ng(_Re),F0,F0,B0,B0,E0,E0). rm(ques_ng(Re),F0,F1,B0,B1,E0,E1) :- rm(Re,F0,F1,B0,B1,E0,E1). rm(star_ng(_Re),F0,F0,B0,B0,E0,E0). rm(star_ng(Re),F0,F1,B0,B1,E0,E1) :- rm(Re, F0,Fx,B0,Bx,E0,Ex), ( (F0 == Fx) -> (Fx = F1, Bx = B1, Ex = E1) ; rm(star_ng(Re),Fx,F1,Bx,B1,Ex,E1) ). rm(plus_ng(Re),F0,F1,B0,B1,E0,E1) :- rm(Re, F0,Fx,B0,Bx,E0,Ex), ( (F0 == Fx) -> (Fx = F1, Bx = B1, Ex = E1) ; rm(star_ng(Re),Fx,F1,Bx,B1,Ex,E1) ). rm(ques(Re),F0,F1,B0,B1,E0,E1) :- rm(Re,F0,F1,B0,B1,E0,E1). rm(ques(_Re),F0,F0,B0,B0,E0,E0). rm(star(Re),F0,F1,B0,B1,E0,E1) :- rm(Re, F0,Fx,B0,Bx,E0,Ex), ( (F0 == Fx) -> (Fx = F1, Bx = B1, Ex = E1) ; rm(star(Re),Fx,F1,Bx,B1,Ex,E1) ). rm(star(_Re),F0,F0,B0,B0,E0,E0). rm(plus(Re),F0,F1,B0,B1,E0,E1) :- rm(Re, F0,Fx,B0,Bx,E0,Ex), ( (F0 == Fx) -> (Fx = F1, Bx = B1, Ex = E1) ; rm(star(Re),Fx,F1,Bx,B1,Ex,E1) ). rm(repeat(Re,Min, - ),F0,F1,B0,B1,E0,E1) :- rm_repeat_min(Re,Min,F0,Fx,B0,Bx,E0,Ex), rm(star(Re),Fx,F1,Bx,B1,Ex,E1). rm(repeat(Re,Min,Max),F0,F1,B0,B1,E0,E1) :- number(Max), Min =< Max, rm_repeat_min(Re,Min,F0,Fx,B0,Bx,E0,Ex), Left is Max - Min, rm_repeat_max(Re,Left,Fx,F1,Bx,B1,Ex,E1). rm(repeat_ng(Re,Min, - ),F0,F1,B0,B1,E0,E1) :- rm_repeat_min(Re,Min,F0,Fx,B0,Bx,E0,Ex), rm(star_ng(Re),Fx,F1,Bx,B1,Ex,E1). rm(repeat_ng(Re,Min,Max),F0,F1,B0,B1,E0,E1) :- number(Max), Min =< Max, rm_repeat_min(Re,Min,F0,Fx,B0,Bx,E0,Ex), Left is Max - Min, rm_repeat_max_ng(Re,Left,Fx,F1,Bx,B1,Ex,E1). rm_repeat_min(_Re,0,F0,F0,B0,B0,E0,E0). rm_repeat_min(Re,N,F0,F1,B0,B1,E0,E1) :- rm(Re,F0,Fx,B0,Bx,E0,Ex), N1 is N - 1, rm_repeat_min(Re,N1,Fx,F1,Bx,B1,Ex,E1). rm_repeat_max(_Re,0,F0,F0,B0,B0,E0,E0). rm_repeat_max(Re,N,F0,F1,B0,B1,E0,E1) :- rm(Re,F0,Fx,B0,Bx,E0,Ex), N1 is N - 1, rm_repeat_max(Re,N1,Fx,F1,Bx,B1,Ex,E1). rm_repeat_max(_Re,_,F0,F0,B0,B0,E0,E0). rm_repeat_max_ng(_Re,0,F0,F0,B0,B0,E0,E0). rm_repeat_max_ng(_Re,_,F0,F0,B0,B0,E0,E0). rm_repeat_max_ng(Re,N,F0,F1,B0,B1,E0,E1) :- rm(Re,F0,Fx,B0,Bx,E0,Ex), N1 is N - 1, rm_repeat_max_ng(Re,N1,Fx,F1,Bx,B1,Ex,E1). rm(capture(N,Re),F0,F1,B0,B1,E0,E1) :- % match_captures_nest(E0,Ex0), % rm(Re,F0,F1,B0,B1,Ex0,Ex), rm(Re,F0,F1,B0,B1,E0,Ex), append(Str,F1,F0), length(F0,NPos), match_captures_set(N,[Str,NPos,[]],Ex,Ey), set_modifiers_from(Ey,E0,E1). % match_captures(E0,C0), % match_captures(Ex,Cx), % append(C0,[[Str,NPos,[]]],Cy), append(Cy,Cx,C1), % match_captures(Ex,C1,E1). rm(matchref(I),F0,F1,B0,B1,E0,E1) :- match_captures_get(I,E0,M), nonvar(M), %# XXX - excess caution M = [Str|_], %# XXX - need api re_parse(Str,Re), %# XXX - eeeeeeeep. and need to quote. rm(Re,F0,F1,B0,B1,E0,E1). rm(dot, [C|F1],F1,B0,[C|B0],E0,E0) :- C \= 10, !. %\n rm(dot, [C|F1],F1,B0,[C|B0],E0,E0) :- flag_s(E0), !. %#rm(ws(C), [C|F1],F1,B0,B0,E0,E0) :- flag_x(E0), !. XXX - so how??? rm(ws(C), [C|F1],F1,B0,[C|B0],E0,E0) :- !. rm(char(C), [C|F1],F1,B0,[C|B0],E0,E0) :- !. rm(char(C), [Ci|F1],F1,B0,[C|B0],E0,E0) :- flag_i(E0), case_insensitive_char_eq(C,Ci), !. rm(class_pos(X),[C|F1],F1,B0,[C|B0],E0,E0) :- class_member(class_pos(X),C), !. rm(class_neg(X),[C|F1],F1,B0,[C|B0],E0,E0) :- class_member(class_neg(X),C), !. rm(eos, [],[],B0,B0,E0,E0) :- !. rm(eosnl, [],[],B0,B0,E0,E0) :- !. rm(eosnl, [10],[10],B0,B0,E0,E0) :- !. rm(bos, F0,F0,[],[],E0,E0) :- !. rm(bol, F0,F0,[],[],E0,E0) :- !. rm(bol, [X|Fx],[X|Fx],[10|Bx],[10|Bx],E0,E0) :- flag_m(E0), !. rm(eol, [],[],B0,B0,E0,E0) :- !. rm(eol, [10],[10],B0,B0,E0,E0) :- !. rm(eol, [10|Fx],[10|Fx],B0,B0,E0,E0) :- flag_m(E0), !. rm(word_boundary,[C|Fx],[C|Fx],[],[],E0,E0) :- !, char_type(C,csym). rm(word_boundary,[],[],[C|Bx],[C|Bx],E0,E0) :- !, char_type(C,csym). rm(word_boundary,F0,F0,B0,B0,E0,E0) :- F0 = [CF|_], B0 = [CB|_], ( char_type(CF,csym) -> \+(char_type(CB,csym)) ; char_type(CB,csym) ), !. rm(word_boundary_not,F0,F0,B0,B0,E0,E0) :- F0 = [CF|_], B0 = [CB|_], ( char_type(CF,csym) -> char_type(CB,csym) ; \+(char_type(CB,csym)) ), !. rm(negSet(Set),[C|F1],F1,B0,[C|B0],E0,E0) :- \+(charset_member(E0,C,Set)). rm(posSet(Set),[C|F1],F1,B0,[C|B0],E0,E0) :- charset_member(E0,C,Set). class_member(class_pos(space_perl),C) :- !, (char_type(C,space) ; C = 11). %\v chr(11) class_member(class_neg(space_perl),C) :- !, \+(char_type(C,space)), C \= 11. class_member(class_pos(word) ,C) :- !, char_type(C,csym). class_member(class_neg(word) ,C) :- !, \+(char_type(C,csym)). class_member(class_pos(X) ,C) :- catch( char_type(C, X),_,fail), !. class_member(class_neg(X) ,C) :- catch(\+(char_type(C, X)),_,fail), !. class_member(class_pos(xdigit) ,C) :- !, char_type(C,xdigit(_)). class_member(class_neg(xdigit) ,C) :- !, \+(char_type(C,xdigit(_))). class_member(class_pos(ascii) ,C) :- !, isAscii(C). class_member(class_neg(ascii) ,C) :- !, \+(isAscii(C)). class_member(class_pos(print) ,C) :- !, isPrint(C). class_member(class_neg(print) ,C) :- !, \+(isPrint(C)). isAscii(C) :- 0 =< C, C =< 127. isPrint(C) :- (char_type(C,alnum);char_type(C,punct);C=32),!. % 32 = space isSpacePerl(C) :- (char_type(C,space) ; C = 11), !. %\v chr(11) case_insensitive_char_eq(Ca,Cb) :- char_type(Ca,to_lower(C)), char_type(Cb,to_lower(C)). charset_member(_E,C,[char(C)|_]) :- !. charset_member(E0,C,[char(X)|_]) :- flag_i(E0), case_insensitive_char_eq(C,X), !. charset_member(_E,C,[setrange(C1,C2)|_]) :- C1 =< C, C =< C2, !. charset_member(E0,C,[setrange(C1,C2)|_]) :- flag_i(E0), code_type(C,to_lower(Cx)), code_type(C1,to_lower(C1x)), code_type(C2,to_lower(C2x)), C1x =< Cx, Cx =< C2x, !. charset_member(_,C,[class_pos(X)|_]) :- class_member(class_pos(X),C), !. charset_member(_,C,[class_neg(X)|_]) :- class_member(class_neg(X),C), !. charset_member(E0,C,[_|T]) :- charset_member(E0,C,T). %---------------------------------------------------------------------- rmSomewhere(Re,F0,F1,B0,B1,E0,E1) :- rm(Re,F0,F1,B0,B1,E0,Ex), append(Str,F1,F0), length(F0,NPos), match_captures(Ex,Cx), C1 = [Str,NPos,Cx], match_captures(Ex,C1,E1). rmSomewhere(Re,[C|Fx],F1,B0,B1,E0,E1) :- rmSomewhere(Re,Fx,F1,[C|B0],B1,E0,E1). rmGlobal(Re,F0,F1,B0,B1,E0,E1) :- rmSomewhere(Re,F0,Fx,B0,Bx,E0,Ex), (rmGlobal(Re,Fx,F1,Bx,B1,Ex,E1), !; F1 = Fx, B1 = Bx, E1 = Ex). %---------------------------------------------------------------------- replace_at_n(0,Replace,[],[Replace]) :- !. replace_at_n(0,Replace,[_|T],[Replace|T]) :- !. replace_at_n(N,Replace,[],[_H|T1]) :- !, N1 is N - 1, replace_at_n(N1,Replace,[],T1). replace_at_n(N,Replace,[H|T],[H|T1]) :- !, N1 is N - 1, replace_at_n(N1,Replace,T,T1). replace_at_n_if_undef(0,Replace,[],[Replace]) :- !. replace_at_n_if_undef(0,Replace,[X|T],[Replace|T]) :- !, var(X). replace_at_n_if_undef(N,Replace,[],[_H|T1]) :- !, N1 is N - 1, replace_at_n_if_undef(N1,Replace,[],T1). replace_at_n_if_undef(N,Replace,[H|T],[H|T1]) :- !, N1 is N - 1, replace_at_n_if_undef(N1,Replace,T,T1). ground_list([],_,[]). ground_list([H0|T0],D,[H1|T1]) :- (var(H0), H1 = D ; ground_list(H0,D,H1) ; H1 = H0), !, ground_list(T0,D,T1). initialE(amat([],flgs(-,-,-,-,-,-,-,-,-,-))). match_captures(amat(C0,_G),C0). match_captures(amat(_C0,G),C1,amat(C1,G)). match_captures_nest(amat(_C0,G),amat([],G)). match_captures_push(amat(C0,G),CL,amat(C1,G)) :- append(C0,CL,C1). match_captures_set(N,X,amat(C0,G),amat(C1,G)) :- replace_at_n(N,X,C0,C1). match_captures_set_if_undef(N,X,amat(C0,G),amat(C1,G)) :- replace_at_n_if_undef(N,X,C0,C1). match_captures_get(N,amat(C0,_G),X) :- nth0(N,C0,X). set_modifiers(E0,[Flag|M],E1) :- set_flag(E0,Flag,Ex), set_modifiers(Ex,M,E1). set_modifiers(E0,[],E0). set_flag(E0,i,E1) :- set_flag_i(E0,i,E1), !. set_flag(E0,-i,E1) :- set_flag_i(E0,-,E1), !. set_flag(E0,m,E1) :- set_flag_m(E0,m,E1), !. set_flag(E0,-m,E1) :- set_flag_m(E0,-,E1), !. set_flag(E0,s,E1) :- set_flag_s(E0,s,E1), !. set_flag(E0,-s,E1) :- set_flag_s(E0,-,E1), !. set_flag(E0,x,E1) :- set_flag_x(E0,x,E1), !. set_flag(E0,-x,E1) :- set_flag_x(E0,-,E1), !. set_flag(E0,_,E0). set_modifiers_from(E0,Em,E1) :- E0 = amat(C0,_), Em = amat(_,flgs(A,B,C,D,E,F,G,H,I,J)), E1 = amat(C0,flgs(A,B,C,D,E,F,G,H,I,J)). set_flag_i(amat(C0,flgs(_,B,C,D,E,F,G,H,I,J)),X,amat(C0,flgs(X,B,C,D,E,F,G,H,I,J))). set_flag_m(amat(C0,flgs(A,_,C,D,E,F,G,H,I,J)),X,amat(C0,flgs(A,X,C,D,E,F,G,H,I,J))). set_flag_s(amat(C0,flgs(A,B,_,D,E,F,G,H,I,J)),X,amat(C0,flgs(A,B,X,D,E,F,G,H,I,J))). set_flag_x(amat(C0,flgs(A,B,C,_,E,F,G,H,I,J)),X,amat(C0,flgs(A,B,C,X,E,F,G,H,I,J))). flag_i(amat(_,flgs(i,_,_,_,_,_,_,_,_,_))). flag_m(amat(_,flgs(_,m,_,_,_,_,_,_,_,_))). flag_s(amat(_,flgs(_,_,s,_,_,_,_,_,_,_))). flag_x(amat(_,flgs(_,_,_,x,_,_,_,_,_,_))). %---------------------------------------------------------------------- listify(X,L) :- atom(X), atom_codes(X,L), !. listify(X,L) :- string_to_list(X,L), !. listify(L,L). listify_lists([X|Xs],[L|Ls]) :- listify(X,L), listify_lists(Xs,Ls). listify_lists([],[]). domatch(R,T,O) :- listify(R,RS), listify(T,F0), re_parse(RS,Re), initialE(E0), !, rmSomewhere(Re,F0,_F1,[],_B1,E0,E1), match_captures(E1,C1), ground_list(C1,'undef',O). END #====================================================================== use strict; use warnings; BEGIN { require Language::Prolog::Yaswi::Low; @Language::Prolog::Yaswi::Low::args = ($Language::Prolog::Yaswi::Low::args[0], @ARGV); } use Language::Prolog::Yaswi qw(:query :interactive :load :assert :context :run); #$Language::Prolog::Yaswi::swi_converter->pass_as_opaque('UNIVERSAL'); #Predicates perl5_call/3, perl5_eval/2 and perl5_method/4 will be available do{open F,">deleteme_pro.pl";print F $prolog;close F;} if 1; swi_inline($prolog); use Language::Prolog::Types::overload; use Language::Prolog::Sugar functors => [qw(foo tokenize domatch)], vars => [qw (L X Y Z)]; #use Language::Prolog::Types qw(:ctors); use Benchmark qw(:all); require "m02.pl"; require "re05.pl"; sub mkMatch { my($len,$a)=@_; return MatchX->new()->set_as_failed() if !defined $a; my($strA,$npos,$A)=@$a; my $str = join("",map{chr($_)} @$strA); my $cap = [map{mkMatch($len,$_)}@$A]; my $from = $len-$npos; my $to = $from + length($str); MatchX->new()->set(1,$str,$cap,{},$from,$to); } sub rx { my($re,$mods)=@_; my $rem = $re; $rem = "(?$mods)(?:$rem)" if $mods; #print "re>$rem<\n"; sub { my($s)=@_; #print "s>$s<\n"; #use Data::Dumper; print Dumper($s); my $ret = swi_find_one(domatch($rem,$s,L),L); if ($ret) { #print ">>$ret<<\n"; my $res = eval($ret); die "bug $@\n$ret" if $@; #use Data::Dumper; #print Dumper($res); mkMatch(length($s),$res); } else { MatchX->new()->set_as_failed() } }; } my $rx = \℞ $/=undef; do { Pkg_re_tests::test(\&rx); exit; } if !@ARGV; do { print "----\n"; my $r = rx(' +|([0-9]+|\\+|-)'); print $r->('12 + 4 - 29'),"\n"; print $r->('x12 blah + 4 - 29'),"\n"; print $r->('12 + 4 - 29')->describe,"\n"; print $r->('xxx')->describe,"\n"; print $rx->('abc')->('abc')->describe,"\n"; print $rx->('(a+|b)*')->('ab')->describe,"\n"; print "----\n"; }if(1); Language::Prolog::Yaswi::swi_toplevel; __END__ % :- use_module(library(time)). call_with_time_limit, but allegedly cant from perl. :( # XXX - Known bugs () isnt an unsuccessful capture. (a\1) isnt handled properly (a(b)?)* aba doesnt have $2 as an unsuccessful match. //x isnt being handled (re_rewrite step?) lots of stuff with an XXX above crlf handling unix specific