package Regexp::Cherokee; use base qw(Exporter); use utf8; BEGIN { use strict; use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS %CherokeeClasses %CherokeeEquivalence $pseudoMatrix); $VERSION = "0.03"; @EXPORT_OK = qw(%CherokeeClasses %CherokeeEquivalence &getForm &setForm &subForm &formatForms); %EXPORT_TAGS = ( utils => [qw(&getForm &setForm &subForm &formatForms)] ); %CherokeeClasses =( 1 => "ᎠᎦᎧᎭᎳᎹᎾᎿᏀᏆᏌᏍᏓᏔᏜᏝᏣᏩᏯ", 2 => "ᎡᎨᎮᎴᎺᏁᏇᏎᏕᏖᏞᏤᏪᏰ", 3 => "ᎢᎩᎯᎵᎻᏂᏈᏏᏗᏘᏟᏥᏫᏱ", 4 => "ᎣᎪᎰᎶᎼᏃᏉᏐᏙᏠᏦᏬᏲ", 5 => "ᎤᎫᎱᎷᎽᏄᏊᏑᏚᏡᏧᏭᏳ", 6 => "ᎥᎬᎲᎸᏅᏋᏒᏛᏢᏨᏮᏴ", Ꭰ => "Ꭰ-Ꭵ", Ꭶ => "Ꭶ-Ꭼ", Ꭽ => "Ꭽ-Ꮂ", Ꮃ => "Ꮃ-Ꮈ", Ꮉ => "Ꮉ-Ꮍ", Ꮎ => "Ꮎ-Ꮕ", Ꮖ => "Ꮖ-Ꮛ", Ꮜ => "Ꮜ-Ꮢ", Ꮣ => "Ꮣ-Ꮫ", Ꮬ => "Ꮬ-Ꮲ", Ꮳ => "Ꮳ-Ꮸ", Ꮹ => "Ꮹ-Ꮾ", Ꮿ => "Ꮿ-Ᏼ" ); # # Cherokee Rules Orthography Equivalence # %CherokeeEquivalence =( Ꭶ => "ᎦᎧ", Ꮎ => "ᎾᎿᏀ", Ꮜ => "ᏌᏍ", Ꮣ => "ᏓᏔ", Ꮥ => "ᏕᏖ", Ꮧ => "ᏗᏘ", Ꮬ => "ᏜᏝ" ); $CherokeeEquivalence{'Ꭷ'} = $CherokeeEquivalence{'Ꭶ'} ; $CherokeeEquivalence{'Ꮏ'} = $CherokeeEquivalence{'Ꮐ'} = $CherokeeEquivalence{'Ꮎ'} ; $CherokeeEquivalence{'Ꮝ'} = $CherokeeEquivalence{'Ꮜ'} ; $CherokeeEquivalence{'Ꮤ'} = $CherokeeEquivalence{'Ꮣ'} ; $CherokeeEquivalence{'Ꮦ'} = $CherokeeEquivalence{'Ꮥ'} ; $CherokeeEquivalence{'Ꮨ'} = $CherokeeEquivalence{'Ꮧ'} ; $CherokeeEquivalence{'Ꮭ'} = $CherokeeEquivalence{'Ꮬ'} ; # use a long string as a pseudo matrix # get index in pseudo matrix, then find in index+form combination position in matrix # 6x13 matrix # Form 1: "ᎠᎦᎭᎳᎹᎾᏆᏌᏓᏜᏣᏩᏯ", # Form 2: "ᎡᎨᎮᎴᎺᏁᏇᏎᏕᏞᏤᏪᏰ", # Form 3: "ᎢᎩᎯᎵᎻᏂᏈᏏᏗᏟᏥᏫᏱ", # Form 4: "ᎣᎪᎰᎶᎼᏃᏉᏐᏙᏠᏦᏬᏲ", # Form 5: "ᎤᎫᎱᎷᎽᏄᏊᏑᏚᏡᏧᏭᏳ", # Form 6: "ᎥᎬᎲᎸXᏅᏋᏒᏛᏢᏨᏮᏴ", $pseudoMatrix = "ᎠᎦᎭᎳᎹᎾᏆᏌᏓᏜᏣᏩᏯᎡᎨᎮᎴᎺᏁᏇᏎᏕᏞᏤᏪᏰᎢᎩᎯᎵᎻᏂᏈᏏᏗᏟᏥᏫᏱᎣᎪᎰᎶᎼᏃᏉᏐᏙᏠᏦᏬᏲᎤᎫᎱᎷᎽᏄᏊᏑᏚᏡᏧᏭᏳᎥᎬᎲᎸXᏅᏋᏒᏛᏢᏨᏮᏴ"; } sub import { my @args = ( shift ); # package foreach (@_) { if ( /overload/o ) { use overload; overload::constant 'qr' => \&getRe; } elsif ( /:forms/o ) { Regexp::Cherokee->export_to_level (1, $args[0], ':forms'); # this works too... } elsif ( /:utils/o ) { Regexp::Cherokee->export_to_level (1, $args[0], ':utils'); # this works too... } else { push (@args, $_); } } if ($#args) { Regexp::Cherokee->export_to_level (1, @args); # this works too... } } sub getForm { my ($letter) = @_; foreach my $form (1..6) { return $form if ( $CherokeeClasses{$form} =~ $letter ); } } # # unfortunately the index function in Perl 5.8.0 is broken for some # Unicode sequences: http://rt.perl.org/rt2/Ticket/Display.html?id=22375 # sub _index { my ( $haystack, $needle ) = @_; my $pos = my $found = 0; foreach (split (//, $haystack) ) { $found = 1 if ( /$needle/ ); $pos++ unless ( $found ); } $pos; } sub setForm { my ($letter, $form) = @_; $form--; # # simplify # $letter =~ s/Ꭷ/Ꭶ/; $letter =~ s/[ᎿᏀ]/Ꮎ/; $letter =~ s/Ꮝ/Ꮜ/; $letter =~ s/Ꮤ/Ꮣ/; $letter =~ s/Ꮦ/Ꮥ/; $letter =~ s/Ꮨ/Ꮧ/; $letter =~ s/Ꮭ/Ꮬ/; # print "letter = $letter / form = $form\n
"; my $index = _index ( $pseudoMatrix, $letter ); # print "index = $index
\n"; my $offset = ( ($index%13) + $form*13 ); substr ( $pseudoMatrix, $offset, 1 ); } sub subForm { my ($set, $get) = @_; setForm ( $set, getForm ( $get ) ); } sub formatForms { my ($format, $string) = @_; my @chars = split ( //, $string ); if ( @chars != ($format =~ s/%/%/g) ) { $format =~ s/\p{Cherokee}//g; warn ( "\"$string\" is of different length from $format." ); return; } foreach (@chars) { $format =~ s/%(\d+)/setForm($_, $1)/e; } $format; } sub handleChars { my ($chars,$form) = @_; return ( $CherokeeClasses{$form} ) if ( $chars eq "all" ); my $re; $chars =~ s/(\w)(?=\w)/$1,/og; my @Chars = split ( /,/, $chars ); foreach (@Chars) { if ( /(\w)-(\w)/o ) { my ($a,$b) = ($1,$2); foreach my $char (sort keys %CherokeeClasses) { next if ( length($char) > 1 ); next unless ( (ord($a) <= ord($char)) && (ord($char) <= ord($b)) ); if ( $form eq "all" ) { $re .= $CherokeeClasses{$char}; } else { $CherokeeClasses{$form} =~ /([$CherokeeClasses{$char}])/; $re .= $1; } } } else { if ( $form eq "all" ) { $re .= $CherokeeClasses{$_}; } else { $CherokeeClasses{$form} =~ /([$CherokeeClasses{$_}])/; $re .= $1; } } } $re; } sub setRange { my ($chars,$forms,$not) = @_; $not ||= $_[3]; my $re; if ( $forms eq "all" ) { $re = handleChars ( $chars, $forms ); } else { my @Forms = split ( /,/, $forms); # # next time, put @Chars loop on the outside and set # up character ranges with - # foreach (@Forms) { if ( /(\d)-(\d)/o ) { my ($a,$b) = ($1,$2); foreach my $form ($a..$b) { $re .= handleChars ( $chars, $form ); } } else { my $form = $_; $re .= handleChars ( $chars, $form ); } } } ($re) ? ($not) ? "[$not$re]" : "[$re]" : ""; } sub getRe { $_ = ($#_) ? $_[1] : $_[0]; s/\[=(\p{Cherokee})=\]/($CherokeeEquivalence{$1}) ? "[$CherokeeEquivalence{$1}]" : $1/eog; s/\[#(\p{Cherokee}|\d)#\]/($CherokeeClasses{$1}) ? "[$CherokeeClasses{$1}]" : ""/eog; s/\[#(\^)?([\d,-]+)#\]/setRange("all",$2,$1)/eog; s/\[#(\^)?([\p{Cherokee},-]+)#\]/setRange($2,"all",$1)/eog; # # for some stupid reason the below doesn't work, so \w # is used in place of \p{Cherokee}, dangerous... # # s/(\p{Cherokee})\{%([\d,-]+)\}/setRange($1,$2)/eog; s/(\w)\{#([\d,-]+)#\}/setRange($1,$2)/eog; s/\[(\^)?(\p{Cherokee}+.*?)\]\{(\^)?#([\d,-]+)#\}/setRange($2,$4,$1,$3)/eog; $_; } ######################################################### # Do not change this, Do not put anything below this. # File must return "true" value at termination 1; ########################################################## __END__ =head1 NAME Regexp::Cherokee - Regular Expressions Support for Cherokee Script. =head1 SYNOPSIS # # Overloading Perl REs: # use utf8; use Regexp::Cherokee qw(overload setForm); : s/([#2#])/setForm($1,6)/eg; s/([ᎠᎦᎧᎭ]%2)/setForm($1,6)/eg; s/([ᎠᎦᎧᎭ]%{1,3})/setForm($1,6)/eg; s/([ᎠᎦᎧᎭ]%{1-3,7})/setForm($1,6)/eg; s/([#Ꮎ#])/subForm('Ꮬ',$1)/eg; # substitute, a 'Ꮬ' for a 'Ꮎ' in the form found for the 'Ꮎ' if ( /[#Ꮜ#]/ ) { # # do something # : } : : # # Without overloading: # use utf8; require Regexp::Cherokee; my $string = "[ᎠᎦᎧᎭ]%{1-3,7}"; my $re = Regexp::Cherokee::getRe ( $string ); s/abc($re)xyz/"abc".Regexp::Cherokee::setForm($1,6)."xyz"/eg; =head1 DESCRIPTION The Regexp::Cherokee module provides POSIX style character class definitions for working with the Cherokee syllabary. The character classes provided by the Regexp::Cherokee package correspond to inate properties of the script and are language independent. The Regexp::Cherokee package is NOT derived from the Regexp class and may not be instantiated into an object. Regexp::Cherokee can optionally export the utility functions C, C, C and C (or all with the C<:utils> pragma) to query or set the form of an Cherokee character. Tags of variables in the form names set to form values may be exported under the C<:forms> pragma. See the files in the doc/ and examples/ directories that are included with this package. =head2 Substituion Utilities =head3 getForm A utility function to query the "form" of an Cherokee syllable. It will return an integer between 1 and 12 corresponding to the [#\d+#] classes. print getForm ( "Ꮿ" ), "\n"; # prints 1 =head3 setForm A utility function to set the form number of a syllable. The form number must be an integer between 1 and 12 corresponding to the [#\d+#] classes. s/(.)/setForm($1, 1)/eg; =head3 subForm A utility function to set the form number of a syllable based on the form of another syllable. s/(\w+)([#Ꮎ#]/$1.subForm('Ꮬ', $2)/eg; =head3 formatForms A utility function somewhat analogous to C for a sequence of syllables: print formatForms ( "%1%2%3%4", "ᎠᎦᎧᎭ" ), "\n"; # prints ᎠᎨᎯᎶ =head1 LIMITATIONS The overloading mechanism only applies to the constant part of the RE. The following would not be handled by the Regexp::Ethiopic package as expected: use Regexp::Cherokee 'overload'; my $x = "Ꭷ"; : : if ( /[#$x#]/ ) { : : } The package never gets to see the variable C<$x> to then perform the RE expansion. The work around is to use the package as per: use Regexp::Cherokee 'overload'; my $x = "Ꭷ"; : : my $re = Regexp::Cherokee::getRe ( "[#$x#]" ); if ( /$re/ ) { : : } This works as expected at the cost of one extra step. The overloading and functional modes of the Regexp::Cherokee package may be used together without conflict. =head1 REQUIRES Works perfectly with Perl 5.8.0, may work with Perl 5.6.x but has not yet been tested. =head1 BUGS None presently known. =head1 AUTHOR Daniel Yacob, L =head1 SEE ALSO Included with this package: examples/overload.pl examples/utils.p =cut