package Encode::UTR22; =head1 TITLE Encode::UTR22 - Implement Unicode TR22 complex conversions =DESCRIPTION Implements all of UTR22 except: validity negative regular expression groups header bidirectional re-ordering history v attribute for versioning aliases - that's the job of another module fbu and fub are treated synonymously with single directional a, with equal priority Features of this module There is some error checking of description files to help those writing them It's slow at the moment! Supports priority attribute =cut require 5.6.0; use XML::Parser::Expat; use Carp; use strict; use vars qw($current $curr_side); sub new { my ($class, $infile, %attrs) = @_; my ($self) = $class->process_file($infile, %attrs) || return undef; $self->compile(%attrs); $self; } sub process_file { my ($class, $infile, %attrs) = @_; my ($xml) = XML::Parser::Expat->new(); my ($context) = {}; my ($r); bless $context, ref $class || $class; $xml->{' mycontext'} = $context; my (%regex_classes) = ( 'group' => 'Encode::UTR22::Regexp::Group', 'class-ref' => 'Encode::UTR22::Regexp::classRef', 'context-ref' => 'Encode::UTR22::Regexp::contextRef', 'eos' => 'Encode::UTR22::Regexp::EOS' ); $xml->setHandlers('Start' => sub { my ($xml, $tag, %attrs) = @_; my ($this, $temp); $attrs{'line'} = $xml->current_line; if ($tag eq 'characterMapping') { $xml->{' mycontext'}{'info'} = {%attrs}; } elsif ($tag eq 'assignments') { $xml->{' mycontext'}{'sub'}[0] = pack('C', hex($attrs{'sub'})); $xml->{' mycontext'}{'sub'}[1] = pack('U', 0xFFFD); } elsif ($tag eq 'a' || $tag eq 'fbu' || $tag eq 'fub') { error($xml, undef, "b and u attributes are required in a element") unless (defined $attrs{'b'} && defined $attrs{'u'}); push(@{$xml->{' mycontext'}{'rules'}}, { 'line' => $xml->current_line, 'b' => pack('C0C*', map {hex($_)} $attrs{'b'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og), 'u' => pack('U0U*', map {hex($_)} $attrs{'u'} =~ m/\G\s*([0-9a-fA-F]{4,6})\s*/og), 'type' => $tag, 'bactxt' => $attrs{'bactxt'}, 'bbctxt' => $attrs{'bbctxt'}, 'uactxt' => $attrs{'uactxt'}, 'ubctxt' => $attrs{'ubctxt'}, 'priority' => $attrs{'priority'}}); } elsif ($tag eq 'range') { $xml->{' mycontext'}{'rules'} = [] unless $xml->{' mycontext'}{'rules'}; process_range($xml, $xml->{' mycontext'}{'rules'}, %attrs); } elsif (defined($regex_classes{$tag})) { $this = $regex_classes{$tag}->new(%attrs) || error($xml, undef, "illegal $tag element"); if ($current) { $current = $current->add_child($this); } else { error($xml, $this, "top level regexps must be named") unless ($attrs{'id'}); error($xml, $this, "top level regexps may not share names") if (defined $xml->{' mycontext'}{'contexts'}{$attrs{'id'}}); $xml->{' mycontext'}{'contexts'}{$attrs{'id'}} = $this; $current = $this; } $this->{'owner'} = $xml->{' mycontext'}; error($xml, $this, "context name $attrs{'id'} must not include /") if ($attrs{'id'} && $attrs{'id'} =~ m|/|o); } elsif ($tag eq 'class') { error($xml, undef, "classes must be named") unless ($attrs{'name'}); error($xml, undef, "size must be 'bytes' in class definition $attrs{'name'}") if (defined $attrs{'size'} && $attrs{'size'} ne 'bytes'); $this = Encode::UTR22::Regexp::class->new(%attrs); $current = $this; $xml->{' mycontext'}{'classes'}{$attrs{'name'}} = $this; } elsif ($tag eq 'class-include') { error($xml, undef, "class-include $attrs{'name'} not in classes element") unless (defined $current && $current->can('add_from')); $temp = $xml->{' mycontext'}{'classes'}{$attrs{'name'}} || error($xml, $current, "Class $attrs{'name'} not yet defined"); error($xml, $current, "Class-include $attrs{'name'} must be the same data size") if ($temp->{'size'} ne $current->{'size'}); $current->add_from($temp); } elsif ($tag eq 'class-range') { $current->add_range(hex($attrs{'first'}), hex($attrs{'last'})); } elsif ($tag eq 'ordering') { $curr_side = lc($attrs{'side'}); error($xml, undef, "ordering must have a side of 'unicode' or 'bytes'") unless ($curr_side eq 'unicode' || $curr_side eq 'bytes'); } elsif ($tag eq 'order') { error($xml, undef, "order element must have b and u attributes") unless (defined $attrs{'b'} && defined $attrs{'u'}); error($xml, undef, "order must occur inside ordering") unless (defined $curr_side); push(@{$xml->{' mycontext'}{'orders'}{$curr_side}}, { 'line' => $xml->current_line, 'b' => $attrs{'b'}, 'u' => $attrs{'u'}}); } }, 'End' => sub { my ($xml, $tag) = @_; if ($tag eq 'class') { undef $current; } elsif (defined $regex_classes{$tag}) { $current = $current->{'parent'}; } elsif ($tag eq 'ordering') { $curr_side = ''; } }, 'Char' => sub { my ($xml, $str) = @_; if (defined $current && $current->can('add_elements')) { if ($current->{'size'} && $current->{'size'} eq 'bytes') { $current->add_elements(map {pack('C0C', hex($_))} $str =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); } else { $current->add_elements(map {pack('U0U', hex($_))} $str =~ m/\G\s*([0-9a-fA-F]{4,6})\s*/og); } } }); if ($attrs{'-path'}) { my ($done); foreach $r (@{$attrs{'-path'}}) { if (-f "$r/$infile") { $xml->parsefile("$r/$infile"); $done = 1; last; } } $xml->parsefile($infile) unless $done; } elsif (ref $infile) { $xml->parse($infile); } else { $xml->parsefile($infile); } return $context; } sub compile { my ($self, %attrs) = @_; my ($r); foreach $r (sort {$self->{'contexts'}{$a}{'line'} <=> $self->{'contexts'}{$b}{'line'}} keys %{$self->{'contexts'}}) { $self->{'regexps'}{$r} = $self->{'contexts'}{$r}->as_perl(1); } if (defined $attrs{'toBytes'}) { $self->compile_map($attrs{'toBytes'}); $self->compile_order($attrs{'toBytes'}); } else { $self->compile_map(0); $self->compile_order(0); $self->compile_map(1); $self->compile_order(1); unless($attrs{'debug'}) { foreach $r (qw(rules classes contexts regexps orders)) { delete $self->{$r}; } } } $self; } sub decode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $count, $tpos, $found); return undef unless ($self->{'bsimple'} || $self->{'bconv'}); use bytes; $str = $self->reorder($str, $self->{'border'}[0], 1) if (defined $self->{'border'}[0]); $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'bconv'}{$c}) { foreach $r (@{$self->{'bconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; last; } } } unless ($found) { if (defined $self->{'bsimple'}{$c}) { $res .= $self->{'bsimple'}{$c}; } elsif (ref $check eq 'CODE') { $res .= &{$check}($str, pos($str)); } elsif ($check) { $res .= $check; } else { $res .= pack('U', 0xFFFD); } pos($str) = $tpos; } } $res = $self->reorder($res, $self->{'border'}[1], 0) if (defined $self->{'border'}[1]); $res; } sub encode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $tpos, $found); return undef unless ($self->{'usimple'} || $self->{'uconv'}); use utf8; $str = $self->reorder($str, $self->{'uorder'}[0], 0) if (defined $self->{'uorder'}[0]); $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'uconv'}{$c}) { foreach $r (@{$self->{'uconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; last; } } } unless ($found) { if (defined $self->{'usimple'}{$c}) { $res .= $self->{'usimple'}{$c}; } elsif (ref $check eq 'CODE') { $res .= &{$check}($str, pos($str)); } elsif ($check) { $res .= $check; } else { $res .= $self->{'sub'}[0]; } pos($str) = $tpos; } } $res = $self->reorder($res, $self->{'uorder'}[1], 1) if (defined $self->{'uorder'}[1]); $res; } sub debug_decode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $count, $tpos, $found, $debug, $debstr); return undef unless ($self->{'bsimple'} || $self->{'bconv'}); use bytes; ($str, $debug) = $self->debug_reorder($str, $self->{'border'}[0], 1) if (defined $self->{'border'}[0]); $debug .= "\nMapping from Bytes to Unicode\n"; $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'bconv'}{$c}) { foreach $r (@{$self->{'bconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; $debug .= "matched line $r->[2]: " . debug_blist($str, $temp) . " =~ $r->[0] -> " . debug_ulist($r->[1]) . "\n\n"; last; } else { $debug .= "tried line $r->[2]: " . debug_blist($str, $temp) . " =~ $r->[0]\n"; } } } unless ($found) { if (defined $self->{'bsimple'}{$c}) { $res .= $self->{'bsimple'}{$c}; $debug .= "simple: " . debug_blist($c) . " = " . debug_ulist($self->{'bsimple'}{$c}) . "\n\n"; } elsif (ref $check eq 'CODE') { $debug .= "checked at " . pos($str) . "\n\n"; $res .= &{$check}($str, pos($str)); } elsif ($check) { $debug .= "added check: $check\n\n"; $res .= $check; } else { $debug .= "failed\n\n"; $res .= pack('U', 0xFFFD); } pos($str) = $tpos; } } ($res, $debstr) = $self->debug_reorder($res, $self->{'border'}[1], 0) if (defined $self->{'border'}[1]); ($res, $debug . $debstr); } sub debug_encode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $tpos, $found, $debug, $debstr); return undef unless ($self->{'usimple'} || $self->{'uconv'}); use utf8; ($str, $debug) = $self->debug_reorder($str, $self->{'uorder'}[0], 0) if (defined $self->{'uorder'}[0]); $debug .= "\nMapping from Unicode to Bytes\n"; $len = length($str); pos($str) = 0; while (pos($str) < $len) { use utf8; $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'uconv'}{$c}) { foreach $r (@{$self->{'uconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $debug .= "matched line $r->[2]: " . debug_ulist($str, $temp) . " =~ $r->[0] -> " . debug_blist($r->[1]) . "\n\n"; $found = 1; last; } else { $debug .= "tried line $r->[2]: " . debug_ulist($str, $temp) . " =~ $r->[0]\n"; } } } unless ($found) { if (defined $self->{'usimple'}{$c}) { $debug .= "simple: " . debug_ulist($c) . " = " . debug_blist($self->{'usimple'}{$c}) . "\n\n"; $res .= $self->{'usimple'}{$c}; } elsif (ref $check eq 'CODE') { $debug .= "check at " . pos($str) . "\n\n"; $res .= &{$check}($str, pos($str)); } elsif ($check) { $debug .= "check: $check\n\n"; $res .= $check; } else { $debug .= "failed: $self->{'sub'}[0]\n\n"; $res .= $self->{'sub'}[0]; } pos($str) = $tpos; } } ($res, $debstr) = $self->debug_reorder($res, $self->{'uorder'}[1], 1) if (defined $self->{'uorder'}[1]); ($res, $debug . $debstr); } sub name { my ($self) = @_; return $self->{'info'}{'id'}; } sub new_sequence { return $_[0]; } sub compile_map { my ($self, $toBytes) = @_; my ($srcl) = $toBytes ? 'u' : 'b'; my ($destl) = $toBytes ? 'b' : 'u'; my ($r, $res, $pre, $post, $lres, $lpre, $lpost, $line, $first, $dump); return $self if ($self->{"${srcl}simple"} || $self->{"${srcl}conv"}); foreach $r (@{$self->{'rules'}}) { next if ($r->{'type'} ne 'a' && (($toBytes == 1) ^ ($r->{'type'} eq 'fub'))); $pre = ''; $post = ''; $res = ''; $lpre = 0; $lpost = 0; $line = $r->{'line'}; if ($r->{"${srcl}bctxt"}) { error (undef, undef, "No regexp " . $r->{"${srcl}bctxt"} . " for ${srcl}bctxt at line $r->{line}") unless ($self->{'regexps'}{$r->{"${srcl}bctxt"}}[0]); ($pre, $dump, $lpre) = @{$self->{'regexps'}{$r->{"${srcl}bctxt"}}}; } $res = $r->{$srcl}; error (undef, undef, "Empty mapping to " . strerror($r->{$destl}, $toBytes) . " not allowed") if ($res eq ''); if ($r->{"${srcl}actxt"}) { error (undef, undef, "No regexp " . $r->{"${srcl}actxt"} . " for ${srcl}actxt at line $r->{line}") unless ($self->{'regexps'}{$r->{"${srcl}actxt"}}[0]); ($post, $dump, $lpost) = @{$self->{'regexps'}{$r->{"${srcl}actxt"}}}; $post = "(?=" . $post . ")"; } if ($toBytes) { use utf8; $r->{'u'} =~ m/^(.)/os; $first = $1; # substr() not working yet # $line = -length($res) if (length($res) > 1); # doesn't work yet my (@temp) = unpack('U*', $res); if ($#temp == 0 && $pre eq '' and $post eq '') { error (undef, undef, "Ambiguous mapping from " . strerror($first, !$toBytes) . " to at least " . strerror($self->{'usimple'}{$first}, $toBytes) . " and " . strerror($r->{$destl}, $toBytes)) if (defined $self->{'usimple'}{$first}); $self->{'usimple'}{$first} = $r->{$destl}; } else { $lres = $#temp + 1; $res =~ s/([$%\\^&*(){}\[\]|"'?\/+.`~\-])/\\$1/ogs; $res =~ s/([^\x21-\x7e])/sprintf("\\x{%04X}", unpack('U', $1))/ogse; push (@{$self->{"uconv"}{$first}}, [qr/$pre\G$res$post/, $r->{$destl}, $line, $lres, $lpre, $lpost, $r->{'priority'}]); # print STDERR "qr/$pre\\G$res$post/, $r->{$destl}, $line\n"; } } else { use bytes; $first = substr($res, 0, 1); if (length($res) == 1 && $pre eq '' && $post eq '') { error (undef, undef, "Ambiguous mapping from " . strerror($first, !$toBytes) . " to at least " . strerror($self->{'bsimple'}{$first}, $toBytes) . " and " . strerror($r->{$destl}, $toBytes)) if (defined $self->{'bsimple'}{$first}); $self->{'bsimple'}{$first} = $r->{$destl}; } else { $lres = length($res); $res =~ s/([$%\\^&*(){}\[\]|"'?\/+.`~\-])/\\$1/ogs; $res =~ s/([^\x21-\x7e])/sprintf("\\x%02x", ord($1))/ogse; push (@{$self->{"bconv"}{$first}}, [qr/$pre\G$res$post/, $r->{$destl}, $line, $lres, $lpre, $lpost, $r->{'priority'}]); } } } $res = $self->{"${srcl}conv"}; foreach $first (keys %{$res}) { my ($has_short); $r = $res->{$first}; $res->{$first} = [sort { $b->[6] <=> $a->[6] || # highest priority attribute first $b->[3] <=> $a->[3] || # longest match string first $b->[4] + $b->[5] <=> $a->[4] + $a->[5] || # pre+post longest first $a->[2] <=> $b->[2]; # lowest line number first } @{$r}]; foreach (@{$res->{$first}}) { $has_short ||= ($_->[3] == 0);} error (undef, undef, 'No default mapping for ' . ($toBytes ? sprintf("U+%04X", unpack('U', $first)) : sprintf("0x%02x", ord($first)))) unless (!$has_short || defined $self->{"${srcl}simple"}{$first}); } $self; } sub compile_order { my ($self, $toBytes) = @_; my ($srcl) = $toBytes ? 'u' : 'b'; my ($destl) = $toBytes ? 'b' : 'u'; my ($output) = $toBytes ? 'uorder' : 'border'; my (@sides) = $toBytes ? ('unicode', 'bytes') : ('bytes', 'unicode'); my ($count, $r, $obj, $i, $reg, $list, %names, @nums, $name, $reg1, $eval); for ($count = 0; $count < 2; $count++) { $obj = $self->{'orders'}{$sides[$count]}; next unless defined $obj; foreach $r (@{$obj}) { @nums = (); %names = (); error(undef, undef, "No regexp called $r->{$srcl} available") unless defined $r->{$srcl}; ($reg, $list) = @{$self->{'regexps'}{$r->{$srcl}}}; for ($i = 0; $i <= $#{$list}; $i++) { $name = $list->[$i]; if ($name =~ s{^\Q$r->{$srcl}\E(?:/|$)}{}) { $name =~ s{^\Q$r->{$destl}\E(?:/|$)}{}; } # next unless ($name !~ m|/|o && $name ne ''); $names{$name} = $i; } error(undef, undef, "No regexp called $r->{$destl} available") unless defined $r->{$destl}; ($reg1, $list) = @{$self->{'regexps'}{$r->{$destl}}}; for ($i = 0; $i <= $#{$list}; $i++) { $name = $list->[$i]; if ($name =~ s{^\Q$r->{$destl}\E(?:/|$)}{}) { $name =~ s{^\Q$r->{$srcl}\E(?:/|$)}{}; } # next unless ($name !~ m|/|o && $name ne ''); push (@nums, $names{$name}+1) if ($name && $names{$name}); } $eval = join('', map {"\$$_"} @nums); if ($sides[$count] eq 'unicode') { use utf8; push (@{$self->{$output}[$count]}, [qr/\G$reg/, $eval]); } else { use bytes; push (@{$self->{$output}[$count]}, [qr/\G$reg/, $eval]); } } } $self; } sub reorder { my ($self, $str, $rules, $isbytes) = @_; my ($r, $res, $found, $len, @ress, $temp, $oldpos); use bytes; $len = length($str); while (pos($str) < $len) { $found = 0; foreach $r (@{$rules}) { if ($isbytes) { use bytes; # the \G seems to be anchoring the global search # here so it only finds $r->[0] once next unless (@ress = $str =~ m/$r->[0]/gcs); } else { use utf8; # and here next unless (@ress = $str =~ m/$r->[0]/gcs); } $oldpos += length($ress[0]); pos($str) = $oldpos; $temp = $r->[1]; $temp =~ s/\$(\d+)/$ress[$1 - 1]/og; $res .= $temp; $found = 1; last; } unless ($found) { if ($isbytes) { use bytes; $str =~ m/\G(.)/ogcs; $res .= $1; } else { use utf8; $str =~ m/\G(.)/ogcs; $res .= $1; } } } $res; } sub debug_reorder { my ($self, $str, $rules, $isbytes) = @_; my ($r, $res, $found, $len, @ress, $temp, $debug, $oldpos); $debug = "\nRe-ordering:\n"; if ($isbytes || $] < 5.008) { use bytes; $len = length($str); } else { use utf8; $len = length($str); } while (pos($str) < $len) { $found = 0; foreach $r (@{$rules}) { if ($isbytes) { use bytes; # the \G seems to be anchoring the global search # here so it only finds $r->[0] once next unless (@ress = $str =~ m/$r->[0]/gcs) } else { use utf8; # and here next unless (@ress = $str =~ m/$r->[0]/gcs); } $oldpos += length($ress[0]); pos($str) = $oldpos; $temp = $r->[1]; $temp =~ s/\$(\d+)/$ress[$1 - 1]/og; $res .= $temp; if ($isbytes) { $debug .= "reorder: $r->[0] = " . join(",", map {debug_blist($_)} @ress); $debug .= " -> $r->[1] = "; $debug .= debug_blist($temp) . "\n\n"; } else { $debug .= "reorder: $r->[0] = " . join(",", map {debug_ulist($_)} @ress); $debug .= " -> $r->[1]) = "; $debug .= debug_ulist($temp) . "\n\n"; } $found = 1; last; } unless ($found) { if ($isbytes) { use bytes; $str =~ m/\G(.)/ogcs; $res .= $1; } else { use utf8; $str =~ m/\G(.)/ogcs; $res .= $1; } } } if ($isbytes) { $debug .= "Final result: " . debug_blist($res) . "\n"; } else { $debug .= "Final result: " . debug_ulist($res) . "\n"; } ($res, $debug); } sub process_range { my ($xml, $store, %attrs) = @_; my (@first, @last, @max, @min, $uFirst, $uLast); my (@current, $i, $j, $done); @first = map {hex($_)} ($attrs{'bFirst'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og) or return error($xml, undef, "bFirst attribute required in range"); @last = map {hex($_)} ($attrs{'bLast'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og) or return error($xml, undef, "bLast attribute required in range"); @max = map {hex($_)} ($attrs{'bMax'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); @min = map {hex($_)} ($attrs{'bMin'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); $uFirst = hex($attrs{'uFirst'}); $uLast = hex($attrs{'uLast'}) || return error($xml, undef, "uLast attribute require in range"); @current = @first; for ($i = $uFirst; $i <= $uLast; $i++) { push(@{$store}, { 'line' => $xml->current_line, 'b' => pack('C0C*', @current), 'u' => pack('U0U', $i), 'type' => 'a', 'bactxt' => $attrs{'bactxt'}, 'bbctxt' => $attrs{'bbctxt'}, 'uactxt' => $attrs{'uactxt'}, 'ubctxt' => $attrs{'ubctxt'}, 'priority' => $attrs{'priority'}}); last if $i == $uLast; for ($j = 0; $j <= $#current; $j++) { $current[$j]++; if (defined $max[$j] && $current[$j] > $max[$j]) { $current[$j] = $min[$j]; } else { last; } } } for ($j = 0; $j <= $#current; $j++) { if ($current[$j] != $last[$j]) { error($xml, undef, "Number of byte codes does not correspond to number of Unicodes in range"); last; } } } sub strerror { my ($str, $isBytes) = @_; my ($res); if ($isBytes) { use bytes; $res = join(" ", map {sprintf("0x%02x", ord($_))} split('', $str)); } else { use utf8; $res = join(" ", map {sprintf("U+%04X", unpack('U', $_))} split('', $str)); } $res; } sub error { my ($xml, $obj, $str, $die) = @_; if (defined $obj && $obj->can('as_error')) { $str .= "\n in " . $obj->as_error; } if ($die) { if ($xml) { $xml->xpcroak($str); } else { die($str); } } else { if ($xml) { $xml->xpcarp($str); } else { print STDERR "$str\n"; } } undef; } sub debug_ulist { my ($str, $pos) = @_; my (@res1) = map{sprintf("%04X",$_)} unpack('U*', substr($str, 0, $pos)); my (@res2) = map{sprintf("%04X",$_)} unpack('U*', substr($str, $pos)); return join(" ", defined $pos ? (@res1, '|') : (), @res2); } sub debug_blist { my ($str, $pos) = @_; my (@res) = map{sprintf("x%02X",$_)} unpack('C*', $str); splice(@res, $pos, 0, '|') if defined $pos; return join(" ", @res); } package Encode::UTR22::Regexp::Element; sub new { my ($class, %attrs) = @_; my ($self) = {%attrs}; bless $self, ref ($class) || $class; $self; } sub add_child { my ($self, $child) = @_; $child->{'parent'} = $self; push (@{$self->{'child'}}, $child); if ($child->{'id'}) { $self->{'named'}{$child->{'id'}} = $child; } elsif ($child->{'name'}) { $self->{'named'}{$child->{'name'}} = $child; } $child; } sub as_error { $_[0]->{'id'}; } package Encode::UTR22::Regexp::Group; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart) = @_; my ($r, $res, $names, $count, $text, $sub, $subl, $lacc); my ($min, $max); $min = defined $self->{'min'} ? $self->{'min'} : 1; $max = defined $self->{'max'} ? $self->{'max'} : 1; $names = []; if ($self->{'id'} ne '') { $res = "("; $names = [$self->{'id'}]; } if ($self->{'id'} eq '' || $min != 1 || $max != 1) { $res .= "(?:"; } $lacc = 0; foreach $r (@{$self->{'child'}}) { ($text, $sub, $subl) = @{$r->as_perl($atstart)}; if (defined $self->{'alt'}) { if ($count) { $res .= "|"; } else { $count = 1; } $res .= $text; $lacc = $subl if $subl > $lacc; } else { $res .= $text; $atstart = 0; $lacc += $subl; } push (@{$names}, map {"$self->{'id'}/$_"} @{$sub}); } $res .= ')' if ($self->{'id'} eq '' || $min != 1 || $max != 1); if ($max > 1) { if ($max != $min) { $res .= "{$min,$max}"; } else { $res .= "{$max}"; } } elsif ($min == 0) { $res .= "?"; } $res .= ")" if ($self->{'id'} ne ''); return [$res, $names, $lacc * $max]; } package Encode::UTR22::Regexp::classRef; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart) = @_; my ($class) = $self->{'owner'}{'classes'}{$self->{'name'}}; my ($res, $temp, $wrap); my ($min, $max); $min = defined $self->{'min'} ? $self->{'min'} : 1; $max = defined $self->{'max'} ? $self->{'max'} : 1; return warn("No class defined for $self->{'name'}\n in " . $self->as_error) unless defined $class; if ($class->{'size'} && $class->{'size'} eq 'bytes') { $temp = join('', @{$class->{'elements'}}); $temp =~ s/([$%\\^&*(){}\[\]|+"'?\/.`~\-])/\\$1/og; $temp =~ s/([^\x21-\x7e])/sprintf("\\x%02x", ord($1))/oge; } else { use utf8; $temp = join('', @{$class->{'elements'}}); $temp =~ s/([$%\\^&*(){}\[\]|+"'?\/.`~\-])/\\$1/og; $temp =~ s/([^\x21-\x7e])/sprintf("\\x{%04X}", unpack('U', $1))/oge; } $res = "(" if ($self->{'id'}); $wrap = ($#{$class->{'elements'}} > 0 || defined $self->{'neg'} || $min != 1 || $max != 1); $res .= "[" if $wrap; $res .= '^' if (defined $self->{'neg'}); $res .= "$temp"; $res .= "]" if $wrap; if ($max > 1) { if ($min != $max) { $res .= "{$min,$max}"; } else { $res .= "{$max}"; } } elsif ($min == 0) { $res .= "?"; } if ($self->{'id'}) { $res .= ')'; return [$res, [$self->{'id'}], $max]; } else { return [$res, [], $max]; } } package Encode::UTR22::Regexp::contextRef; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart) = @_; my ($ref, $n, $res, $ind, $temp, $len, $id); unless ($self->{'name'}) { print STDERR "No name attribute in context-ref at line $self->{'line'}\n"; return [undef, '', 0]; } foreach $n (split('/', $self->{'name'})) { if ($ref) { $ref = $ref->{'named'}{$n}; } else { $ref = $self->{'owner'}{'contexts'}{$n}; } unless ($ref) { print STDERR "Can't find reference to $n in $self->{'name'} at line $self->{'line'}\n"; return ['', []]; } } $self->{'named'} = $ref->{'named'}; if (defined $self->{'max'} || defined $self->{'min'}) { $temp = bless {%$ref}, ref $ref; $temp->{'max'} = $self->{'max'} if defined $self->{'max'}; $temp->{'min'} = $self->{'min'} if defined $self->{'min'}; ($res, $ind, $len) = @{$temp->as_perl($atstart)}; } else { ($res, $ind, $len) = @{$ref->as_perl($atstart)}; } $id = $self->{'id'} || $self->{'name'}; foreach $n (@{$ind}) { $n =~ s|^[^/]+|$id|o; } return [$res, $ind, $len]; } package Encode::UTR22::Regexp::EOS; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart) = @_; return [$atstart ? '^' : '$', [], 0]; } package Encode::UTR22::Regexp::class; sub new { my ($class, %attrs) = @_; my ($self) = {%attrs}; bless $self, ref $class || $class; } sub add_elements { my ($self) = shift; push (@{$self->{'elements'}}, @_); $self; } sub add_from { my ($self, $other) = @_; push (@{$self->{'elements'}}, @{$other->{'elements'}}); $self; } sub add_range { my ($self, $start, $end) = @_; push (@{$self->{'elements'}}, map {pack($self->{'size'} eq 'bytes' ? 'C' : 'U', $_)} ($start .. $end)); $self; } 1;