# ################################################################### Otakar Smrz, 2003/01/23 # # Encoding of Arabic: ArabTeX Notation by Klaus Lagally ##################################### # $Id: RE.pm,v 1.3 2004/08/23 08:57:53 smrz Exp $ package Encode::Arabic::ArabTeX::RE; use 5.008; use strict; use warnings; use Scalar::Util 'blessed'; our $VERSION = do { my @r = q$Revision: 1.3 $ =~ /\d+/g; sprintf "%d." . "%02d" x $#r, @r }; use Encode::Encoding; use base 'Encode::Encoding'; __PACKAGE__->Define('arabtex-re', 'ArabTeX-RE'); our (%encode_used, %decode_used, @shams, @qamar); our $enmode; our $demode; our %modemap = ( 'default' => 3, 'undef' => 0, 'fullvocalize' => 4, 'full' => 4, 'vocalize' => 3, 'nosukuun' => 3, 'novocalize' => 2, 'novowels' => 2, 'none' => 2, 'noshadda' => 1, 'noneplus' => 1, ); sub import { # perform import as if Encode were used one level before this module require Encode; Encode->export_to_level(1, @_); } sub encode ($$;$) { my ($cls, $text, $check) = @_; $_[1] = '' if $check; # this is what in-place edit needs $cls->initialize_encode() unless %encode_used; $text = join '', map { exists $encode_used{$_} ? $encode_used{$_} : $_ } split '', $text; $text =~ s/([\_\.\^]?\w)\\shadda{}/$1$1/g; $text =~ s/(\\ham{.})\\shadda{}/$1$1/g; $text =~ s/([\=\s\-\%])\\alif{}\\vow{a}l/$1"al-/g; $text =~ s/([\=\s\-\%])\\alif{}l/$1al-/g; $text =~ s/(b\\vow{i})\\alif{}l/$1-al-/g; $text =~ s/([\=\s\-\%])\\alif{}\\vow{([ui])}/$1"$2/g; $text =~ s/([\=\s\-\%])\\alif{}/$1i/g; $text =~ s/(\\vow{aN})\\alif{}/$1/g; $text =~ s/\\alif{}(\\vow{aN})/$1/g; $text =~ s/\\vow{a}\\alif{}/\\vow{A}/g; $text =~ s/\\alif{}/\\aux{A}/g; $text =~ s/\\madda{}/'A/g; $text =~ s/\\vow{a}\\maq{}/\\vow{Y}/g; $text =~ s/\\maq{}(\\vow{aN})/\\vow{aNY}/g; $text =~ s/\\vow{aN}\\maq{}/\\vow{aNY}/g; $text =~ s/\\maq{}/\\aux{Y}/g; $text =~ s/i(}?)y([^aiuAIUY])/I$1$2/g; # produces \ham{I}, too $text =~ s/u(}?)w([^aiuAIUY])/U$1$2/g; $text =~ s/([iIuU]})(\\ham{a})/$1-$2/g; $text =~ s/([\-\s])\\ham{a}([^\\])/$1'\\aux{a}$2/g; $text =~ s/([^\}\s])\\ham{a}/$1\\aux{a}'/g; $text =~ s/([^\-\s])(\\ham{i})/$1-$2/g; $text =~ s/\\ham{i}([^\\])/'\\aux{i}$1/g; $text =~ s/([^\}])(\\ham{w})/$1\\aux{u}$2/g; $text =~ s/(\\ham{w})([^\\])/$1\\aux{u}$2/g; $text =~ s/([^\}])(\\ham{y})/$1\\aux{i}$2/g; $text =~ s/(\\ham{y})([^\\])/$1\\aux{i}$2/g; $text =~ s/\\ham{[aiwy]}/'/g; $text =~ s/\\ham{I}/'I/g; $text =~ s/(?initialize_decode() unless %decode_used; $text = "\n" . $text . "\n"; $text =~ s/_a/a/g; # is there any special character for the defective fatha? $text =~ s/_U/U/g; # _U only affects the pronunciation, not the spelling foreach $one (@shams, '\\\'', @qamar) { $text =~ s/(?? $text =~ s/a'(\\D{})/a\\H{a}$1/g; # how do you write ? $text =~ s/'A/\\M{}/g; $text =~ s/'a/\\H{a}a/g; $text =~ s/'Y/\\H{a}Y/g; $text =~ s/'/\\H{a}a/g; # final resort foreach $one (@shams) { $text =~ s/($one)\-$one/l$1\\D{}/g; } foreach $one ('\\\\', @qamar) { $text =~ s/l\-($one)/l\\O{}$1/g; } # alas! using $one in the replacing expression produces extra \\ $text =~ s/aNY/\\V{aN}\\Q{}/g; $text =~ s/(?<=A\\H{}|\\H{a})aN/\\V{aN}/g; $text =~ s/(?<=T)aN/\\V{aN}/g; $text =~ s/(? "\x{0629}", "T", # C9 "\311", 'T', # 201 "\xC9", # "\xD8\xA9" "\x{062A}", "t", # CA "\312", 't', # 202 "\xCA", # "\xD8\xAA" "\x{062B}", "_t", # CB "\313", '\\_t', # 203 "\xCB", # "\xD8\xAB" <_t> "\x{062C}", "^g", # CC "\314", '\\^g', # 204 "\xCC", # "\xD8\xAC" <^g> "\x{062D}", ".h", # CD "\315", '\\.h', # 205 "\xCD", # "\xD8\xAD" <.h> "\x{062E}", "_h", # CE "\316", '\\_h', # 206 "\xCE", # "\xD8\xAE" <_h> "\x{062F}", "d", # CF "\317", 'd', # 207 "\xCF", # "\xD8\xAF" "\x{0630}", "_d", # D0 "\320", '\\_d', # 208 "\xD0", # "\xD8\xB0" <_d> "\x{0631}", "r", # D1 "\321", 'r', # 209 "\xD1", # "\xD8\xB1" "\x{0632}", "z", # D2 "\322", 'z', # 210 "\xD2", # "\xD8\xB2" "\x{0633}", "s", # D3 "\323", 's', # 211 "\xD3", # "\xD8\xB3" "\x{0634}", "^s", # D4 "\324", '\\^s', # 212 "\xD4", # "\xD8\xB4" <^s> "\x{0635}", ".s", # D5 "\325", '\\.s', # 213 "\xD5", # "\xD8\xB5" <.s> "\x{0636}", ".d", # D6 "\326", '\\.d', # 214 "\xD6", # "\xD8\xB6" <.d> # "\327", # D7 #"\327", "\x{0637}", ".t", # D8 "\330", '\\.t', # 216 "\xD8", # "\xD8\xB7" <.t> "\x{0638}", ".z", # D9 "\331", '\\.z', # 217 "\xD9", # "\xD8\xB8" <.z> "\x{0639}", "`", # DA "\332", '\\`', # 218 "\xDA", # "\xD8\xB9" <`> "\x{063A}", ".g", # DB "\333", '\\.g', # 219 "\xDB", # "\xD8\xBA" <.g> "\x{0640}", "-", # DC "\334", '--', # 220 "\xDC", # "\xD9\x80" ta.twiil "\x{0641}", "f", # DD "\335", 'f', # 221 "\xDD", # "\xD9\x81" "\x{0642}", "q", # DE "\336", 'q', # 222 "\xDE", # "\xD9\x82" "\x{0643}", "k", # DF "\337", 'k', # 223 "\xDF", # "\xD9\x83" # "\340", # E0 #"\340", "\x{0644}", "l", # E1 "\341", 'l', # 225 "\xE1", # "\xD9\x84" # "\342", # E2 #"\342", "\x{0645}", "m", # E3 "\343", 'm', # 227 "\xE3", # "\xD9\x85" "\x{0646}", "n", # E4 "\344", 'n', # 228 "\xE4", # "\xD9\x86" "\x{0647}", "h", # E5 "\345", 'h', # 229 "\xE5", # "\xD9\x87" "\x{0648}", "w", # E6 "\346", 'w', # 230 "\xE6", # "\xD9\x88" "\x{0649}", "\\maq{}", # EC "\354", '\\\\Q{}', # 236 "\xEC", # "\xD9\x89" 'alif maq.suura "\x{064A}", "y", # ED "\355", 'y', # 237 "\xED", # "\xD9\x8A" "\x{064B}", "\\vow{aN}", # F0 "\360", '\\\\V{aN}', # 240 "\xF0", # "\xD9\x8B" "\x{064C}", "\\vow{uN}", # F1 "\361", '\\\\V{uN}', # 241 "\xF1", # "\xD9\x8C" "\x{064D}", "\\vow{iN}", # F2 "\362", '\\\\V{iN}', # 242 "\xF2", # "\xD9\x8D" "\x{064E}", "\\vow{a}", # F3 "\363", '\\\\V{a}', # 243 "\xF3", # "\xD9\x8E" "\x{064F}", "\\vow{u}", # F5 "\365", '\\\\V{u}', # 245 "\xF5", # "\xD9\x8F" "\x{0650}", "\\vow{i}", # F6 "\366", '\\\\V{i}', # 246 "\xF6", # "\xD9\x90" "\x{0651}", "\\shadda{}", # F8 "\370", '\\\\D{}', # 248 "\xF8", # "\xD9\x91" ^sadda "\x{0652}", "\\sukun{}", # FA "\372", '\\\\O{}', # 250 "\xFA", # "\xD9\x92" sukuun "\x{0670}", # '\\\\A{}', # 243 "\xF3", # "\xD9\xB0" <_a> -> "\x{0671}", # '\\\\W{}', # 199 "\xC7", # "\xD9\xB1" wa.sla-on-'alif -> bare 'alif ); no strict 'refs'; $cls->enmode(defined ${ $cls . '::enmode' } ? ${ $cls . '::enmode' } : 'default'); } sub initialize_decode ($) { my $cls = shift @_; @shams = ('t', '\\_t', 'd', '\\_d', 'r', 'z', 's', '\\^s', '\\.s', '\\.d', '\\.t', '\\.z', 'l', 'n'); @qamar = ('b', '\\^g', '\\.h', '\\_h', '\\`', '\\.g', 'f', 'q', 'k', 'm', 'h', 'w', 'y'); %decode_used = ( # 'p', # 129 "\x81", # '\\^c', # 141 "\x8D", # '\\^z', # 142 "\x8E", # 'g', # 144 "\x90", '\\,', "\x{060C}", # 161 "\xA1", # "\xD8\x8C" right-to-left-comma '\\;', "\x{061B}", # 186 "\xBA", # "\xD8\x9B" right-to-left-semicolon '\\?', "\x{061F}", # 191 "\xBF", # "\xD8\x9F" right-to-left-question-mark '\\\\H{}', "\x{0621}", # 193 "\xC1", # "\xD8\xA1" hamza-on-the-line '\\\\M{}', "\x{0622}", # 194 "\xC2", # "\xD8\xA2" madda-over-'alif '\\\\H{a}', "\x{0623}", # 195 "\xC3", # "\xD8\xA3" hamza-over-'alif '\\\\H{w}', "\x{0624}", # 196 "\xC4", # "\xD8\xA4" hamza-over-waaw '\\\\H{i}', "\x{0625}", # 197 "\xC5", # "\xD8\xA5" hamza-under-'alif '\\\\H{y}', "\x{0626}", # 198 "\xC6", # "\xD8\xA6" hamza-over-yaa' '\\\\L{}', "\x{0627}", # 199 "\xC7", # "\xD8\xA7" bare 'alif 'b', "\x{0628}", # 200 "\xC8", # "\xD8\xA8" 'T', "\x{0629}", # 201 "\xC9", # "\xD8\xA9" 't', "\x{062A}", # 202 "\xCA", # "\xD8\xAA" '\\_t', "\x{062B}", # 203 "\xCB", # "\xD8\xAB" <_t> '\\^g', "\x{062C}", # 204 "\xCC", # "\xD8\xAC" <^g> '\\.h', "\x{062D}", # 205 "\xCD", # "\xD8\xAD" <.h> '\\_h', "\x{062E}", # 206 "\xCE", # "\xD8\xAE" <_h> 'd', "\x{062F}", # 207 "\xCF", # "\xD8\xAF" '\\_d', "\x{0630}", # 208 "\xD0", # "\xD8\xB0" <_d> 'r', "\x{0631}", # 209 "\xD1", # "\xD8\xB1" 'z', "\x{0632}", # 210 "\xD2", # "\xD8\xB2" 's', "\x{0633}", # 211 "\xD3", # "\xD8\xB3" '\\^s', "\x{0634}", # 212 "\xD4", # "\xD8\xB4" <^s> '\\.s', "\x{0635}", # 213 "\xD5", # "\xD8\xB5" <.s> '\\.d', "\x{0636}", # 214 "\xD6", # "\xD8\xB6" <.d> '\\.t', "\x{0637}", # 216 "\xD8", # "\xD8\xB7" <.t> '\\.z', "\x{0638}", # 217 "\xD9", # "\xD8\xB8" <.z> '\\`', "\x{0639}", # 218 "\xDA", # "\xD8\xB9" <`> '\\.g', "\x{063A}", # 219 "\xDB", # "\xD8\xBA" <.g> '--', "\x{0640}", # 220 "\xDC", # "\xD9\x80" ta.twiil 'f', "\x{0641}", # 221 "\xDD", # "\xD9\x81" 'q', "\x{0642}", # 222 "\xDE", # "\xD9\x82" 'k', "\x{0643}", # 223 "\xDF", # "\xD9\x83" 'l', "\x{0644}", # 225 "\xE1", # "\xD9\x84" 'm', "\x{0645}", # 227 "\xE3", # "\xD9\x85" 'n', "\x{0646}", # 228 "\xE4", # "\xD9\x86" 'h', "\x{0647}", # 229 "\xE5", # "\xD9\x87" 'w', "\x{0648}", # 230 "\xE6", # "\xD9\x88" '\\\\Q{}', "\x{0649}", # 236 "\xEC", # "\xD9\x89" 'alif maq.suura 'y', "\x{064A}", # 237 "\xED", # "\xD9\x8A" '\\\\V{aN}', "\x{064B}", # 240 "\xF0", # "\xD9\x8B" '\\\\V{uN}', "\x{064C}", # 241 "\xF1", # "\xD9\x8C" '\\\\V{iN}', "\x{064D}", # 242 "\xF2", # "\xD9\x8D" '\\\\V{a}', "\x{064E}", # 243 "\xF3", # "\xD9\x8E" '\\\\V{u}', "\x{064F}", # 245 "\xF5", # "\xD9\x8F" '\\\\V{i}', "\x{0650}", # 246 "\xF6", # "\xD9\x90" '\\\\D{}', "\x{0651}", # 248 "\xF8", # "\xD9\x91" ^sadda '\\\\O{}', "\x{0652}", # 250 "\xFA", # "\xD9\x92" sukuun '\\\\A{}', "\x{0670}", # 243 "\xF3", # "\xD9\xB0" <_a> -> '\\\\W{}', "\x{0671}", # 199 "\xC7", # "\xD9\xB1" wa.sla-on-'alif -> bare 'alif ); no strict 'refs'; $cls->demode(defined ${ $cls . '::demode' } ? ${ $cls . '::demode' } : 'default'); } sub enmode ($$) { my ($cls, $mode) = @_; $cls = blessed $cls if ref $cls; $mode = 'undef' unless defined $mode; $mode = $modemap{$mode} if exists $modemap{$mode}; no strict 'refs'; my $return = ${ $cls . '::enmode' }; ${ $cls . '::enmode' } = $mode; return $return; } sub demode ($$) { my ($cls, $mode) = @_; $cls = blessed $cls if ref $cls; $mode = 'undef' unless defined $mode; $mode = $modemap{$mode} if exists $modemap{$mode}; no strict 'refs'; my $return = ${ $cls . '::demode' }; ${ $cls . '::demode' } = $mode; return $return; } 1; __END__ =head1 NAME Encode::Arabic::ArabTeX::RE - Deprecated Encode::Arabic::ArabTeX implemented with regular expressions =head1 REVISION $Revision: 1.3 $ $Date: 2004/08/23 08:57:53 $ =head1 SYNOPSIS use Encode::Arabic::ArabTeX::RE; $string = decode 'arabtex-re', $octets; $octets = encode 'arabtex-re', $string; =head1 DESCRIPTION Deprecated method using sequential regular-expression substitutions. Limited in scope over the ArabTeX notation and non-efficient in data processing, still, not requiring the L module. =head2 EXPORTS & MODES Exports as if C also appeared in the package. Experimental and incomplete support for B is provided, see L. =head1 SEE ALSO L =head1 AUTHOR Otakar Smrz, L eval { 'E' . 'smrz' . "\x40" . ( join '.', qw 'ckl mff cuni cz' ) . 'E' } Perl is also designed to make the easy jobs not that easy ;) =head1 COPYRIGHT AND LICENSE Copyright 2003, 2004 by Otakar Smrz This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut