############################################################################# ## Name: Parser.pm ## Purpose: Pod::HtmlEasy::Parser ## Author: Graciliano M. P. ## Modified by: Geoffrey Leach ## Created: 11/01/2004 ## Updated: 2008-05-31 ## Copyright: (c) 2004 Graciliano M. P. (c) 2007, 2008 Geoffrey Leach ## Licence: This program is free software; you can redistribute it and/or ## modify it under the same terms as Perl itself ############################################################################# package Pod::HtmlEasy::Parser; use 5.006002; use base qw{ Pod::Parser }; use Pod::Parser; use Pod::ParseLink; use Readonly; use Pod::HtmlEasy::Data qw(EMPTY NUL); use Carp; use English qw{ -no_match_vars }; use Regexp::Common qw{ whitespace number URI }; use Regexp::Common::URI::RFC2396 qw { $escaped }; use Pod::Escapes qw{ e2char }; use Switch qw{ Perl6 }; use strict; use warnings; use version; our $VERSION = qv('1.0.1'); ######## # VARS # ######## Readonly::Scalar my $NUL => NUL; Readonly::Scalar my $MAIL_RE => qr{ ( # grab all of this [\w-]+ # some word chars with '-' included foo \0? # possible NUL escape \@ # literal '@' @ [\w\\-]+ # another word bar (?: # non-grabbing pattern \. # literal '.' . [\w\-\.]+# that word stuff stuff \. # another literal '.' . [\w\-]+ # another word and | # or \. # literal '.' . [\w\-]+ # word nonsense | # or empty? ) # end of non-grab ) # end of grab }smx; # [6062] # Treatment of embedded HTML-significant characters and embedded URIs. # There are some characters (%html_entities below) which may in some # circumstances be interpreted by a browser, and you probably don't want that # Consequently, they are replaced by names defined by the W3C UNICODE spec, # http://www.w3.org/TR/MathML2/bycodes.html, bracketed by '&' and ';' # Thus, '>' becomes '<' This is handled by _encode_entities() # There's a "gotchya" in this process. As we are generating HTML, # the encoding needs to take place _before_ any HTML is generated. # If the HTML appears garbled, and UNICODE entities appear where they # shouldn't, this encoding has happened to late at some point. # This is all further complicated by the fact that the POD formatting # codes syntax uses some of the same characters, as in "L<...>", for example, # and we can't expand those first, because some of them generate # HTML. This is resolved by tagging the characters that we want # to distinguish from HTML with ASCII NUL ('\0', $NUL). Thus, '$lt;' becomes # '\0&' in _encode_entities(). Generated HTML is also handled # this way by _nul_escape(). After all processing of the POD formatting # codes are processed, this is reversed by _remove _nul_escapes(). # Then there's the issue of embedded URIs. URIs are also generated # by the processing of L<...>, and can show up _inside L<...>, we # delay processing of embedded URIs until after all of the POD # formatting codes is complete. URIs that result from that processing # are tagged (you guessed it!) with a NUL character, but not preceeding # the generated URI, but after the first character. These NULs are removed # by _remove _nul_escapes() Readonly::Hash my %html_entities => ( q{&} => q{amp}, q{>} => q{gt}, q{<} => q{lt}, q{"} => q{quot}, ); my $HTML_ENTITIES_RE = join q{|}, keys %html_entities; $HTML_ENTITIES_RE = qr{$HTML_ENTITIES_RE}mx; ################# # _NUL_ESCAPE # ################# # Escape HTML-significant characters with ASCII NUL to differentiate them # from the same characters that get converted to entity names sub _nul_escape { my $txt_ref = shift; ${$txt_ref} =~ s{($HTML_ENTITIES_RE)}{$NUL$1}gsmx; return; } ####################### # _REMOVE_NUL_ESCAPSE # ####################### sub _remove_nul_escapes { my $txt_ref = shift; ${$txt_ref} =~ s{$NUL}{}gsmx; return; } #################### # _ENCODE_ENTITIES # #################### sub _encode_entities { my ( $parser, $txt_ref ) = @_; foreach my $chr ( keys %html_entities ) { # $chr gets a lookbehind to avoid converting flagged from E<...> my $re = qq{(? bracketing # Note that the HTML-significant characters are escaped; # The escapes are removed by _encode_entities # Note that there's no presumption that there's a URI in the # text, so not matching is _not_ and error. sub _add_uri_href { my ( $parser, $txt_ref ) = @_; if ( ${$txt_ref} =~ m{https?:}smx ) { # Replace escaped characters in URL with their ASCII equivalents # Regexp::Common escapes in path part, but not in host part, which appears correct # per the RFC. However, the Spamassassin folks use it in the host. # $escaped is defined by Regexp::Common::URI::RFC2396, and matches %xx # This is done first because if needed, the host part won't be parsed correctly while ( ${$txt_ref} =~ m{($escaped)}mx ) { my $esc = $1; my $new = $1; $new =~ s{%}{0x}mx; $new = e2char($new); ${$txt_ref} =~ s{$esc}{$new}gmx; } # target='_blank' causes load to a new window or tab # See HTML 4.01 spec, section 6.16 Frame target names # Doing this because URI RE grabs non-word trailing characters # ${$txt_ref} =~ m{$RE{URI}{HTTP}{-keep}{-scheme=>'https?'}}mx; # my $uri = $1; # my $host = $3; # $uri =~ s{[^/\w]+\z}{}mx; # ${$txt_ref} =~ s{$uri}{$host}mx; ${$txt_ref} =~ s{$RE{URI}{HTTP}{-keep}{-scheme=>'https?'}}{$3}gsmx; return; } if ( ${$txt_ref} =~ m{ftp:}smx ) { ${$txt_ref} =~ s{$RE{URI}{FTP}{-keep}}{$5}gsmx; return; } if ( ${$txt_ref} =~ m{file:}smx ) { ${$txt_ref} =~ s{$RE{URI}{file}{-keep}}{$3}gsmx; return; } if ( ${$txt_ref} =~ m{$MAIL_RE}smx ) { ${$txt_ref} =~ s{mailto://}{}smx; ${$txt_ref} =~ s{($MAIL_RE)}{$1}gsmx; return; } return; } ########### # COMMAND # ########### # Index levels, which translate into indentation in the index Readonly::Scalar my $LEVEL1 => 1; Readonly::Scalar my $LEVEL2 => 2; Readonly::Scalar my $LEVEL3 => 3; Readonly::Scalar my $LEVEL4 => 4; Readonly::Scalar my $LEVELL => 0; # Overrides command() provided by base class in Pod::Parser sub command { my ( $parser, $command, $paragraph, $line_num, $pod ) = @_; if ( defined $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER} ) { _verbatim($parser); } # [6062] my $expansion = $parser->interpolate( $paragraph, $line_num ); $expansion =~ s{$RE{ws}{crop}}{}gsmx; # delete surrounding whitespace # Encoding puts in a NUL; we're finished with the text, so remove them _encode_entities( $parser, \$expansion ); _remove_nul_escapes( \$expansion ); my $html; given ($command) { when q{head1} { _add_index( $parser, $expansion, $LEVEL1 ); $html = $parser->{POD_HTMLEASY} ->{ON_HEAD1}( $parser->{POD_HTMLEASY}, $expansion ); } when q{head2} { _add_index( $parser, $expansion, $LEVEL2 ); $html = $parser->{POD_HTMLEASY} ->{ON_HEAD2}( $parser->{POD_HTMLEASY}, $expansion ); } when q{head3} { _add_index( $parser, $expansion, $LEVEL3 ); $html = $parser->{POD_HTMLEASY} ->{ON_HEAD3}( $parser->{POD_HTMLEASY}, $expansion ); } when q{head4} { _add_index( $parser, $expansion, $LEVEL4 ); $html = $parser->{POD_HTMLEASY} ->{ON_HEAD4}( $parser->{POD_HTMLEASY}, $expansion ); } when q{begin} { $html = $parser->{POD_HTMLEASY} ->{ON_BEGIN}( $parser->{POD_HTMLEASY}, $expansion ); } when q{end} { $html = $parser->{POD_HTMLEASY} ->{ON_END}( $parser->{POD_HTMLEASY}, $expansion ); } when q{over} { $html = $parser->{POD_HTMLEASY} ->{ON_OVER}( $parser->{POD_HTMLEASY}, $expansion ); } when q{item} { # Items that begin with '* ' are ugly. Is it there for pod2man? # Which is not the same as _only_ '*' $expansion =~ s{\A\*\s+}{}mx; if ( $parser->{INDEX_ITEM} ) { _add_index( $parser, $expansion, $LEVELL ); } # This is for the folks who use =item to list URLs _add_uri_href( $parser, \$expansion ); $html = $parser->{POD_HTMLEASY} ->{ON_ITEM}( $parser->{POD_HTMLEASY}, $expansion ); } when q{back} { $html = $parser->{POD_HTMLEASY} ->{ON_BACK}( $parser->{POD_HTMLEASY}, $expansion ); } when q{for} { $html = $parser->{POD_HTMLEASY} ->{ON_FOR}( $parser->{POD_HTMLEASY}, $expansion ); } default { if ( defined $parser->{POD_HTMLEASY}->{qq{ON_\U$command\E}} ) { $html = $parser->{POD_HTMLEASY} ->{qq{ON_\U$command\E}}( $parser->{POD_HTMLEASY}, $expansion ); } elsif ( $command !~ /^(?:pod|cut)$/imx ) { $html = qq{
=$command $expansion
}; } else { $html = EMPTY; } } }; if ( $html ne EMPTY ) { push @{ $parser->{POD_HTMLEASY}->{HTML} }, $html; } return; } ############ # VERBATIM # ############ # Overrides verbatim() provided by base class in Pod::Parser sub verbatim { my ( $parser, $paragraph, $line_num ) = @_; if ( exists $parser->{POD_HTMLEASY}->{IN_BEGIN} ) { return; } $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER} .= $paragraph; return; } sub _verbatim { my ($parser) = @_; if ( exists $parser->{POD_HTMLEASY}->{IN_BEGIN} ) { return; } my $expansion = $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER}; $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER} = EMPTY; _encode_entities( $parser, \$expansion ); my $html = $parser->{POD_HTMLEASY} ->{ON_VERBATIM}( $parser->{POD_HTMLEASY}, $expansion ); # Now look for any embedded URIs _add_uri_href( $parser, \$html ); # And remove any NUL escapes _remove_nul_escapes( \$html ); if ( $html ne EMPTY ) { push @{ $parser->{POD_HTMLEASY}->{HTML} }, $html; } # [6062] return; } ############# # TEXTBLOCK # ############# # Overrides textblock() provided by base class in Pod::Parser sub textblock { my ( $parser, $paragraph, $line_num ) = @_; if ( exists $parser->{POD_HTMLEASY}->{IN_BEGIN} ) { return; } if ( defined $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER} ) { _verbatim($parser); } # [6062] my $expansion = $parser->interpolate( $paragraph, $line_num ); $expansion =~ s{$RE{ws}{crop}}{}gsmx; # delete surrounding whitespace $expansion =~ s{\s+$}{}gsmx; # Encode HTML-specific characters before adding any HTML (eg

) _encode_entities( $parser, \$expansion ); my $html = $parser->{POD_HTMLEASY} ->{ON_TEXTBLOCK}( $parser->{POD_HTMLEASY}, $expansion ); # Now look for any embedded URIs _add_uri_href( $parser, \$html ); # And remove any NUL escapes _remove_nul_escapes( \$html ); if ( $html ne EMPTY ) { push @{ $parser->{POD_HTMLEASY}->{HTML} }, $html; } return; } ##################### # INTERIOR_SEQUENCE # ##################### # Overrides interior_sequence() provided by base class in Pod::Parser sub interior_sequence { my ( $parser, $seq_command, $seq_argument, $pod_seq ) = @_; my $ret; given ($seq_command) { when q{B} { $ret = $parser->{POD_HTMLEASY} ->{ON_B}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{C} { $ret = $parser->{POD_HTMLEASY} ->{ON_C}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{E} { $ret = $parser->{POD_HTMLEASY} ->{ON_E}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{F} { $ret = $parser->{POD_HTMLEASY} ->{ON_F}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{I} { $ret = $parser->{POD_HTMLEASY} ->{ON_I}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{L} { # L<> causes problems, but not with parselink. if ( $seq_argument eq EMPTY ) { _errors( $parser, q{Empty L<>} ); return EMPTY; } $ret = $parser->{POD_HTMLEASY}->{ON_L}( $parser->{POD_HTMLEASY}, Pod::ParseLink::parselink($seq_argument) ); } when q{S} { $ret = $parser->{POD_HTMLEASY} ->{ON_S}( $parser->{POD_HTMLEASY}, $seq_argument ); } when q{Z} { $ret = $parser->{POD_HTMLEASY} ->{ON_Z}( $parser->{POD_HTMLEASY}, $seq_argument ); } default { if ( defined $parser->{POD_HTMLEASY}->{qq{ON_\U$seq_command\E}} ) { $ret = $parser->{POD_HTMLEASY} ->{qq{ON_\U$seq_command\E}}( $parser->{POD_HTMLEASY}, $seq_argument ); } else { $ret = qq{$seq_command<$seq_argument>}; } } } # Escape HTML-significant characters _nul_escape( \$ret ); return $ret; } ######################## # PREPROCESS_PARAGRAPH # ######################## Readonly::Scalar my $INFO_DONE => 3; # Overrides preprocess_paragraph() provided by base class in Pod::Parser # NB: the text is _not_ altered. sub preprocess_paragraph { my ( $parser, $text, $line_num ) = @_; if ( $parser->{POD_HTMLEASY}{INFO_COUNT} == $INFO_DONE ) { return $text; } if ( not exists $parser->{POD_HTMLEASY}{PACKAGE} ) { if ( $text =~ m{package}smx ) { my ($pack) = $text =~ m{package\s+(\w+(?:::\w+)*)}smx; if ( defined $pack ) { $parser->{POD_HTMLEASY}{PACKAGE} = $pack; $parser->{POD_HTMLEASY}{INFO_COUNT}++; } } } if ( not exists $parser->{POD_HTMLEASY}{VERSION} ) { if ( $text =~ m{VERSION}smx ) { my ($ver) = $text =~ m{($RE{num}{decimal})}smx; if ( defined $ver ) { $parser->{POD_HTMLEASY}{VERSION} = $ver; $parser->{POD_HTMLEASY}{INFO_COUNT}++; } } } # This situation is created by evt_on_head1() # _do_title has found nothing following =head1 NAME, so it # creates ...{TITLE}, and leaves it undef, so that it will be # picked up here when the paragraph following is processed. if ( ( exists $parser->{POD_HTMLEASY}{TITLE} ) and ( not defined $parser->{POD_HTMLEASY}{TITLE} ) ) { my @lines = split m{\n}smx, $text; my $tmp_text = shift @lines; if ( not defined $tmp_text ) { return $text; } $tmp_text =~ s{$RE{ws}{crop}}{}gsmx; # delete surrounding whitespace $parser->{POD_HTMLEASY}{TITLE} = $tmp_text; $parser->{POD_HTMLEASY}{INFO_COUNT}++; } return $text; } ############## # _ADD_INDEX # ############## sub _add_index { my ( $parser, $txt, $level ) = @_; # Don't index star items if ( $txt eq q{*} ) { return; } push @{ $parser->{POD_HTMLEASY}->{INDEX} }, [ $level, $txt ]; return; } ############# # BEGIN_POD # ############# # Overrides begin_pod() provided by base class in Pod::Parser sub begin_pod { my ($parser) = @_; delete $parser->{POD_HTMLEASY}->{INDEX}; $parser->{POD_HTMLEASY}->{INDEX} = []; return 1; } ########### # END_POD # ########### # Overrides end_pod() provided by base class in Pod::Parser sub end_pod { my ($parser) = @_; if ( defined $parser->{POD_HTMLEASY}->{VERBATIM_BUFFER} ) { _verbatim($parser); } return 1; } ########### # _ERRORS # ########### sub _errors { my ( $parser, $error ) = @_; carp "$error"; $error =~ s{^\s*\**\s*errors?:?\s*}{}ismx; $error =~ s{\s+$}{}smx; my $html = $parser->{POD_HTMLEASY} ->{ON_ERROR}( $parser->{POD_HTMLEASY}, $error ); if ( $html ne EMPTY ) { push @{ $parser->{POD_HTMLEASY}->{HTML} }, $html; } return 1; } ########### # DESTROY # ########### sub DESTROY { } ####### # END # ####### 1;