package LiveGeez::CacheAsSERA;
use base qw(HTML::Parser Exporter);
BEGIN
{
use strict;
use vars qw($VERSION @EXPORT_OK @gFile @FontStack %SystemList %CheckTags $s $p);
$VERSION = '0.20';
@EXPORT_OK = qw(Local Remote);
require LiveGeez::Services;
require Convert::Ethiopic::System;
require HTML::Entities;
require LiveGeez::HTML;
require LiveGeez::Directives;
use URI;
$URI::ABS_REMOTE_LEADING_DOTS = 1;
# $#gFile = 100;
$#FontStack = 4;
%CheckTags = (
font => 1,
# a => 1,
# frame => 1,
# base => 1,
# link => 1,
);
$s = new Convert::Ethiopic::System ( "sera" );
$p = new LiveGeez::CacheAsSERA ( api_version => 3,
start_h => ['start', "self, tagname, attr, text"],
end_h => ['end', "self, tagname, text"],
text_h => ['textOut', "self, dtext, text"],
default_h => [sub { push ( @gFile, @_ ) }, 'text']
);
}
sub textOut
{
my ($self, $decoded, $orig) = @_;
$_ = $orig;
# if ( /\S/so && !/^[\s\xa0]+$/o && !/^/ ) {
if ( !/^[\s\xa0]+$/o && !/^/ ) {
# unless ( /^[\s\xa0]+$/o ) {
$_ = $decoded;
$decoded = &decode_more_entities(HTML::Entities::decode($decoded));
if ( /\S/so && $self->{fontStack} && ${$FontStack[ $self->{fontStack} ]{sysIn}} ) {
$self->{request}->{sysIn} = ${$FontStack [ $self->{fontStack} ]{sysIn}};
# $self->{request}->{string} = &decode_more_entities(HTML::Entities::decode($_));
$self->{request}->{string} = $decoded;
# print "[ $self->{request}->{string} ]\n";
# print "FONT: ${$FontStack[ $self->{fontStack} ]{sysIn}} \n";
$_ = "" . LiveGeez::Services::ProcessString ( $self->{request} ) . "";
}
else {
$_ = $orig; # original text
}
}
push ( @gFile, $_ );
}
sub start
{
my ($self, $tagname, $attr, $text) = @_;
my $test = 0;
# print STDERR "TAG: $tagname | $text\n";
# if ( exists($CheckTags{$tagname}) || ( $test = ( $attr->{style} && $attr->{style} =~ /font-family/ ) ) ) {
if ( ( $test = ( $attr->{style} && $attr->{style} =~ /font-family/ ) ) || exists($CheckTags{$tagname}) ) {
if ( $test || ( $tagname eq "font" && $attr->{face} ) ) {
$FontStack[ ++$self->{fontStack} ]{tag} = $tagname;
$text = "";
if ( $tagname eq "font" ) {
$FontStack[ $self->{fontStack} ]{sysIn} = GetSystemOut ( $self, $attr->{face} );
if ( my $newtext = $self->UpdateFontTag($attr) ) {
# printf STDERR "OLD <$_[3]> => NEW <$newtext>\n";
$FontStack[ $self->{fontStack} ]{keep} = 1;
$text = $newtext;
}
# $FontStack[ $self->{fontStack} ]{sysIn} = $attr->{face};
# print " " x $self->{fontStack};
# print "OPEN <$tagname>: $attr->{face} [$self->{fontStack}]\n";
}
elsif ( $attr->{style} && $attr->{style} =~ /font-family:\s*(['"]|("))?([\w -]+)[;'"&]?/i ) {
$CheckTags{$tagname} = 1;
$FontStack[ $self->{fontStack} ]{sysIn} = ( $3 ) ? GetSystemOut ( $self, $3 ) : undef;
if ( my $newtext = $self->UpdateStyle($tagname, $attr) ) {
if ( $newtext eq "" ) {
$newtext = "";
}
else {
$FontStack[ $self->{fontStack} ]{keep} = 1;
}
$text = $newtext;
}
# $FontStack[ $self->{fontStack} ]{sysIn} = ( $3 ) ? $3 : undef;
# print " " x $self->{fontStack};
# print "OPEN <$tagname>: $3 [$self->{fontStack}]\n";
}
}
elsif ( exists ($FontStack[ $self->{fontStack} ]{tag}) && ($tagname eq $FontStack[ $self->{fontStack} ]{tag}) ) {
$FontStack[ ++$self->{fontStack} ]{tag} = $tagname;
$FontStack[ $self->{fontStack} ]{sysIn} = $FontStack[ ($self->{fontStack} - 1) ]{sysIn};
$FontStack[ $self->{fontStack} ]{keep} = 1;
}
}
# print STDERR "PUSHING: $text\n==========================\n";
push ( @gFile, $text );
}
sub end
{
my ($self, $tagname, $text) = @_;
if ( exists ($FontStack[ $self->{fontStack} ]{tag}) && ($tagname eq $FontStack[ $self->{fontStack} ]{tag}) ) {
if ( exists($FontStack[ $self->{fontStack} ]{keep}) ) {
push ( @gFile, $text );
delete($FontStack[ $self->{fontStack} ]{keep});
}
delete($FontStack[ $self->{fontStack} ]{tag});
delete($FontStack[ $self->{fontStack}-- ]{sysIn});
}
else {
push ( @gFile, $text );
}
}
sub UpdateFontTag
{
my ( $self, $attr ) = @_;
delete ( $attr->{face} );
return unless ( %{$attr} );
my $args;
foreach ( keys %$attr ) {
$args .= " $_=\"$attr->{$_}\"";
}
"";
}
sub UpdateStyle
{
my ( $self, $tagname, $attr ) = @_;
my $style = $attr->{style};
delete ( $attr->{style} );
#
# if ( ($style !~ /font-size/i) && ($style !~ /font-color/i) ) {
#
# return if ( $style !~ ";" || $style =~ "char-type:" );
return if ( $style =~ "char-type:" );
# $style =~ s/(\s*)?((\w+-)*)?font-family:\s*(['"]|")?(.*?);+?//gi;
#
# This takes care of font names with boundaries:
#
$style =~ s/(\s*)?((\w+-)*)?font-family:\s*(['"]|")(.*?)$1//gi;
#
# This takes care of font names without boundaries:
#
$style =~ s/(\s*)?((\w+-)*)?font-family:\s*[\w -]+//gi;
$attr->{style} = $style if ( $style );
return unless ( %$attr );
my $args;
foreach ( keys %$attr ) {
$args .= " $_=\"$attr->{$_}\"";
}
"<$tagname$args>";
}
sub GetSystemOut
{
my ( $self, $face ) = ( shift, shift );
if ( !$face ) {
return unless ( $self->{fontStack} && ${$FontStack[ $self->{fontStack} ]{sysIn}} );
$face = $lastFace;
}
$lastFace = $face;
$SystemList{$face} ||= new Convert::Ethiopic::System ( $face );
\$SystemList{$face}; # Return the pointer
}
sub PostUpdateHREF
{
my ($base_uri, $link, $file_query) = @_;
# printf STDERR "Entering PostUpdateHREF $file_query\n";
my $attr = $link;
$attr =~ s/(href\s*=\s*\S+)(.*)?/$1/i;
$attr =~ s/href//i;
$attr =~ s/=//;
$attr =~ s/"//g;
$attr =~ s/^\s+//;
my $uri = new URI ( $attr );
if ( my $scheme = $uri->scheme ) {
# printf STDERR "YES SCHEME\n";
return ( $link ) if ( $scheme eq "mailto" || $scheme eq "file" );
return ( $link ) if ( $link =~ s/nolivegeezlink//i );
}
else {
# printf STDERR "NO SCHEME\n";
my $uri_out = URI->new_abs ( $uri, $base_uri->{_uri} );
# printf STDERR $uri_out->canonical,"\n";
return ( "href=\"".$uri_out->canonical."\"" ) if ( $link =~ /nolivegeezlink/i );
$uri = $uri_out;
}
# printf STDERR "QUERY: $URIS{file_query}\n";
qq(href="$file_query) . $uri->canonical . qq(");
}
sub UpdateTitle
{
#
# This will use the last "sysIn" value which may not correspond to the encoding
# here, so this approach will be hit-and-miss. Consider a 2 part font where
# the last encoding might have been the 2nd part (GeezNewB), we should have a
# method to get the first part encoding
#
$_[0]->{request}->{string} = &decode_more_entities ( HTML::Entities::decode ( $_[0]->{request}->{string} ) );
LiveGeez::Services::ProcessString ( $_[0]->{request} );
}
sub FakeCCS
{
my ( $sourceFile ) = @_;
open (FILE, "$sourceFile");
$_ = join ( "", );
close (FILE);
if ( ///oi;
s/(<.*?>)/$1/isgo;
s/()/$1/isgo;
}
s/(<\/(p|(t[dh]))>)/<\/font>$1/isgo;
open (FILEX, ">$sourceFile");
print FILEX;
close (FILEX);
}
sub Local
{
my ( $file, $sourceFile ) = @_;
open ( FILE, "$sourceFile" );
local $/ = undef;
# @gFile = ;
# $_ = join ( "", @gFile);
$_ = ;
close ( FILE );
# $#gFile = -1;
my $updated = 0;
if ( /livegeez/i ) {
$updated = 1;
$_ = LiveGeez::Directives::ParseDirectives ( $file, $_ );
}
if ( /href/i ) {
$updated = 1;
unless ( $file->{request}->{config}->{usecookies} ) {
my $uri = new LiveGeez::URI ( $file->{request}->{uri}->canonical );
s#]+)>(.*?)#$space = $1; $arg = $2; $data = $3; $link = ($3 =~ "" && $arg !~ $file->{scriptRoot} && $arg !~ /mailto:/i) ? PostUpdateHREF( $uri, $arg, $file->{request}->{config}->{uris}->{file_query} ) : $arg ; "$data"#oeisg;
}
s/ NOLIVEGEEZLINK>/>/oig;
}
$file->{refsUpdated} = 1;
return ( $sourceFile ) unless ( $updated );
my $seraFileIn = $sourceFile;
$seraFileIn =~ s/$file->{ext}$/sera.html/i;
$seraFileIn =~ s/sera\.sera\.html$/zobel.html/i;
$seraFileIn =~ s/$file->{request}->{config}->{uris}->{webroot}/$file->{request}->{config}->{uris}->{cachelocal}/
unless ( $sourceFile =~ "cache" );
my $seraFileOut = $seraFileIn;
unless ( $seraFileIn =~ /index\.($file->{request}->{sysOut}->{lang}\.)?zobel\.html$/ ) {
$seraFileOut .= ".gz";
$file->{isZipped} = "true";
}
$file->{request}->{sysIn} = $s;
# printf STDERR "SeraFile[$$]: $seraFileOut\n";
if (-e $seraFileOut) {
# printf STDERR "Found[$$]: $seraFileOut\n";
# $file->{refsUpdated} = 1;
return ( $seraFileOut )
}
open ( SERACACHE, ">$seraFileIn" ) || $file->{request}->DieCgi
( "!: Could Not Open Source File: $seraFileIn!\n" );
print SERACACHE;
close ( SERACACHE );
system ( 'gzip' , $seraFileIn ) if ( $file->{isZipped} eq "true" );
$seraFileOut; # this is the return value
}
sub Remote
{
my ( $file, $sourceFile ) = @_;
#
# The first thing we do is check if we have a cached sera file,
# if so we're outa here:
#
my $seraFileIn = $sourceFile;
# printf STDERR "SOURCE[$$]: $sourceFile\n";
$seraFileIn =~ s/$file->{ext}$/sera.html/i;
$seraFileIn =~ s/sera\.sera\.html$/zobel.html/i;
$seraFileIn =~ s/$file->{request}->{config}->{uris}->{webroot}/$file->{request}->{config}->{uris}->{cachelocal}/
unless ( $sourceFile =~ "cache" );
my $seraFileOut = $seraFileIn;
unless ( $seraFileIn =~ /index\.($file->{request}->{sysOut}->{lang}\.)?zobel\.html$/ ) {
$seraFileOut .= ".gz";
$file->{isZipped} = "true";
}
# printf STDERR "SeraFile[$$]: $seraFileOut\n";
if (-e $seraFileOut) {
# printf STDERR "Found[$$]: $seraFileOut\n";
$file->{refsUpdated} = 1;
$file->{request}->{sysIn} = $s;
return ( $seraFileOut )
}
$p->{uri} = new LiveGeez::URI ( $file->{request}->{uri}->canonical );
if ( $file->{request}->{sysIn}->{sysName} eq "sera" ) {
open ( FILE, "$sourceFile" );
# @gFile = ;
# $_ = join ( "", @gFile);
local $/ = undef;
$_ = ;
close ( FILE );
}
else {
FakeCCS ( $sourceFile )
if ( $file->{request}->{file} =~ m|http://www.waltainfo.com|i );
$p->{fontStack} = 0;
$p->{request}->{sysOut} = $s;
$p->{request}->{sysOut}->{langNum} = $file->{request}->{sysOut}->{langNum};
$p->{request}->{sysOut}->{options} = $file->{request}->{sysOut}->{options};
$p->{request}->{sysOut}->{iPath} = "";
$p->{request}->{sysOut}->{fontNum} = 0;
$p->{request}->{pragma} = $file->{request}->{pragma};
system ( 'gzip', '-d', $sourceFile ) if ( $sourceFile =~ s/\.gz$// );
# printf STDERR "Parsing[$$]: $sourceFile\n";
$p->parse_file( $sourceFile );
# printf STDERR "Done[$$]: $sourceFile\n";
$_ = join ( "", @gFile );
# printf STDERR "$_\n";
#
# convert title to sera if 8-bit chars present
#
my ($space, $link, $data);
s#(.*?)#$title = $1; if ( $title =~ /[\x80-\xff]/ ) { $p->{request}->{string} = $title ; $title = UpdateTitle ( $p ); } "$title"#imse;
$#gFile = -1;
}
# $#gFile = -1;
$file->{refsUpdated} = -1;
$file->{request}->{sysIn} = $s;
$_ = LiveGeez::Directives::ParseDirectives ( $file, $_ );
#
# strip extra and tags
#
s#(<((br)|((/)?(p)))>)?#$1#og;
s# # #g;
#
# set up local links with Ethiopic text to use Zobel
#
# printf STDERR "Before PostUpdateHREF [$URIS{zuri}]\n";
# printf STDERR "$_\n";
unless ( $file->{request}->{config}->{usecookies} ) {
s#]+)>(.*?)#$space = $1; $arg = $2; $data = $3; $link = ($3 =~ "" && $arg !~ $file->{scriptRoot} && $arg !~ /mailto:/i) ? PostUpdateHREF( $p->{uri}, $arg, $file->{request}->{config}->{uris}->{file_query} ) : $arg ; "$data"#oeisg;
}
s/ NOLIVEGEEZLINK>/>/oig;
#
# strip meta tags which we no longer need and may infact set
# charsets that we don't want.
#
s/]+)>(\r)?(\n)?//oig;
#
# strip out anything before the $seraFileIn" ) || $file->{request}->DieCgi
( "!: Could Not Open Source File: $seraFileIn!\n" );
print SERACACHE;
close ( SERACACHE );
system ( 'gzip' , $seraFileIn ) if ( $file->{isZipped} eq "true" );
$seraFileOut; # this is the return value
}
%entity2char =(
'sbquo' => "\x82",
'bdquo' => "\x84",
'hellip' => "\x85",
'dagger' => "\x86",
'Dagger' => "\x87",
'permil' => "\x89",
'circ' => "\x88",
'Scaron' => "\x8a",
'lsaquo' => "\x8b",
'OElig' => "\x8c",
'lsquo' => "\x91",
'rsquo' => "\x92",
'ldquo' => "\x93",
'rdquo' => "\x94",
'bull' => "\x95",
'ndash' => "\x96",
'mdash' => "\x97",
'tilde' => "\x98",
'trade' => "\x99",
'scaron' => "\x9a",
'rsaquo' => "\x9b",
'oelig' => "\x9c",
'Yuml' => "\x9f"
);
sub decode_more_entities
{
my $array;
if (defined wantarray) {
$array = [@_]; # copy
}
else {
$array = \@_; # modify in-place
}
for (@$array) {
s/(&(\w+);?)/$entity2char{$2} || $1/eg;
}
$array->[0];
}
#########################################################
# Do not change this, Do not put anything below this.
# File must return "true" value at termination
1;
##########################################################
__END__
=head1 NAME
LiveGeez::CacheAsSERA - HTML Conversion for LiveGe'ez
=head1 SYNOPSIS
$cacheFile = LiveGeez::CacheAsSERA::HTML($f, $sourceFile)
Where $f is a File.pm object and $sourceFile is the pre-cached file name.
=head1 DESCRIPTION
CacheAsSERA.pm contains the routines for conversion of HTML document content
from Ethiopic encoding systems into SERA for document caching and later
conversion into other Ethiopic systems.
=head1 AUTHOR
Daniel Yacob, L
=head1 SEE ALSO
S>
=cut