package Pod::Html; use strict; require Exporter; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); $VERSION = '1.09_04'; @ISA = qw(Exporter); @EXPORT = qw(pod2html htmlify); @EXPORT_OK = qw(anchorify); use Carp; use Config; use Cwd; use File::Spec; use File::Spec::Unix; use Getopt::Long; use locale; # make \w work right in non-ASCII lands my($Cachedir); my($Dircache, $Itemcache); my @Begin_Stack; my @Libpods; my($Htmlroot, $Htmldir, $Htmlfile, $Htmlfileurl); my($Podfile, @Podpath, $Podroot); my $Css; my $Recurse; my $Quiet; my $HiddenDirs; my $Verbose; my $Doindex; my $Backlink; my($Listlevel, @Listtype); my $ListNewTerm; use vars qw($Ignore); # need to localize it later. my(%Items_Named, @Items_Seen); my($Title, $Header); my $Top; my $Paragraph; my %Sections; # Caches my %Pages = (); # associative array used to find the location # of pages referenced by L<> links. my %Items = (); # associative array used to find the location # of =item directives referenced by C<> links my %Local_Items; my $Is83; my $Curdir = File::Spec->curdir; _init_globals(); sub _init_globals { $Cachedir = "."; # The directory to which item and directory # caches will be written. $Dircache = "pod2htmd.tmp"; $Itemcache = "pod2htmi.tmp"; @Begin_Stack = (); # begin/end stack @Libpods = (); # files to search for links from C<> directives $Htmlroot = "/"; # http-server base directory from which all # relative paths in $podpath stem. $Htmldir = ""; # The directory to which the html pages # will (eventually) be written. $Htmlfile = ""; # write to stdout by default $Htmlfileurl = ""; # The url that other files would use to # refer to this file. This is only used # to make relative urls that point to # other files. $Podfile = ""; # read from stdin by default @Podpath = (); # list of directories containing library pods. $Podroot = $Curdir; # filesystem base directory from which all # relative paths in $podpath stem. $Css = ''; # Cascading style sheet $Recurse = 1; # recurse on subdirectories in $podpath. $Quiet = 0; # not quiet by default $Verbose = 0; # not verbose by default $Doindex = 1; # non-zero if we should generate an index $Backlink = ''; # text for "back to top" links $Listlevel = 0; # current list depth @Listtype = (); # list types for open lists $ListNewTerm = 0; # indicates new term in definition list; used # to correctly open/close
\n$text\n"; } else { _process_text( \$text ); # experimental: check for a paragraph where all lines # have some ...\t...\t...\n pattern if( $text =~ /\t/ ){ my @lines = split( "\n", $text ); if( @lines > 1 ){ my $all = 2; foreach my $line ( @lines ){ if( $line =~ /\S/ && $line !~ /\t/ ){ $all--; last if $all == 0; } } if( $all > 0 ){ $text =~ s/\t+/
$text
\n"; } $after_item = 0; } } # finish off any pending directives _finish_list(); # link to page index print HTML "\n" if $Doindex and $index and $Backlink; print HTML <\n"; if( $level == 1 && ! $Top ){ print HTML "$Backlink\n" if $hasindex and $Backlink; print HTML "
\n - can be a ref or
# need to extract text
my $par = _go_ahead( $rstr, 'C', $closing );
## clean-up of the link target
my $text = _depod( $par );
### my $x = $par =~ /[BI] ? 'yes' : 'no' ;
### print STDERR "-->call _emit_C($par) lev=$lev, par with BI=$x\n";
$res = _emit_C( $text, $lev > 1 || ($par =~ /[BI]) );
} elsif( $func eq 'E' ){
# E - convert to character
$$rstr =~ s/^([^>]*)>//;
my $escape = $1;
$escape =~ s/^(\d+|X[\dA-F]+)$/#$1/i;
$res = "&$escape;";
} elsif( $func eq 'F' ){
# F - italicize
$res = '' . _process_text1( $lev, $rstr ) . '';
} elsif( $func eq 'I' ){
# I - italicize
$res = '' . _process_text1( $lev, $rstr ) . '';
} elsif( $func eq 'L' ){
# L - link
## L => produce text, use cross-ref for linking
## L => make text from cross-ref
## need to extract text
my $par = _go_ahead( $rstr, 'L', $closing );
# some L<>'s that shouldn't be:
# a) full-blown URL's are emitted as-is
if( $par =~ m{^\w+://}s ){
return _make_URL_href( $par );
}
# b) C<...> is stripped and treated as C<>
if( $par =~ /^C<(.*)>$/ ){
my $text = _depod( $1 );
return _emit_C( $text, $lev > 1 || ($par =~ /[BI]) );
}
# analyze the contents
$par =~ s/\n/ /g; # undo word-wrapped tags
my $opar = $par;
my $linktext;
if( $par =~ s{^([^|]+)\|}{} ){
$linktext = $1;
}
# make sure sections start with a /
$par =~ s{^"}{/"};
my( $page, $section, $ident );
# check for link patterns
if( $par =~ m{^([^/]+?)/(?!")(.*?)$} ){ # name/ident
# we've got a name/ident (no quotes)
if (length $2) {
( $page, $ident ) = ( $1, $2 );
} else {
( $page, $section ) = ( $1, $2 );
}
### print STDERR "--> L<$par> to page $page, ident $ident\n";
} elsif( $par =~ m{^(.*?)/"?(.*?)"?$} ){ # [name]/"section"
# even though this should be a "section", we go for ident first
( $page, $ident ) = ( $1, $2 );
### print STDERR "--> L<$par> to page $page, section $section\n";
} elsif( $par =~ /\s/ ){ # this must be a section with missing quotes
( $page, $section ) = ( '', $par );
### print STDERR "--> L<$par> to void page, section $section\n";
} else {
( $page, $section ) = ( $par, '' );
### print STDERR "--> L<$par> to page $par, void section\n";
}
# now, either $section or $ident is defined. the convoluted logic
# below tries to resolve L<> according to what the user specified.
# failing this, we try to find the next best thing...
my( $url, $ltext, $fid );
RESOLVE: {
if( defined $ident ){
## try to resolve $ident as an item
( $url, $fid ) = _coderef( $page, $ident );
if( $url ){
if( ! defined( $linktext ) ){
$linktext = $ident;
$linktext .= " in " if $ident && $page;
$linktext .= "the $page manpage" if $page;
}
### print STDERR "got coderef url=$url\n";
last RESOLVE;
}
## no luck: go for a section (auto-quoting!)
$section = $ident;
}
## now go for a section
my $htmlsection = htmlify( $section );
$url = _page_sect( $page, $htmlsection );
if( $url ){
if( ! defined( $linktext ) ){
$linktext = $section;
$linktext .= " in " if $section && $page;
$linktext .= "the $page manpage" if $page;
}
### print STDERR "got page/section url=$url\n";
last RESOLVE;
}
## no luck: go for an ident
if( $section ){
$ident = $section;
} else {
$ident = $page;
$page = undef();
}
( $url, $fid ) = _coderef( $page, $ident );
if( $url ){
if( ! defined( $linktext ) ){
$linktext = $ident;
$linktext .= " in " if $ident && $page;
$linktext .= "the $page manpage" if $page;
}
### print STDERR "got section=>coderef url=$url\n";
last RESOLVE;
}
# warning; show some text.
$linktext = $opar unless defined $linktext;
warn "$0: $Podfile: cannot resolve L<$opar> in paragraph $Paragraph.\n" unless $Quiet;
}
# now we have a URL or just plain code
$$rstr = $linktext . '>' . $$rstr;
if( defined( $url ) ){
$res = "" . _process_text1( $lev, $rstr ) . '';
} else {
$res = '' . _process_text1( $lev, $rstr ) . '';
}
} elsif( $func eq 'S' ){
# S - non-breaking spaces
$res = _process_text1( $lev, $rstr );
$res =~ s/ / /g;
} elsif( $func eq 'X' ){
# X<> - ignore
warn "$0: $Podfile: invalid X<> in paragraph $Paragraph.\n"
unless $$rstr =~ s/^[^>]*>// or $Quiet;
} elsif( $func eq 'Z' ){
# Z<> - empty
warn "$0: $Podfile: invalid Z<> in paragraph $Paragraph.\n"
unless $$rstr =~ s/^>// or $Quiet;
} else {
my $term = _pattern $closing;
while( $$rstr =~ s/\A(.*?)(([BCEFILSXZ])<(<+[^\S\n]+)?|$term)//s ){
# all others: either recurse into new function or
# terminate at closing angle bracket(s)
my $pt = $1;
$pt .= $2 if !$3 && $lev == 1;
$res .= $lev == 1 ? _pure_text( $pt ) : _inIS_text( $pt );
return $res if !$3 && $lev > 1;
if( $3 ){
$res .= _process_text1( $lev, $rstr, $3, _closing $4 );
}
}
if( $lev == 1 ){
$res .= _pure_text( $$rstr );
} elsif( ! $Quiet ) {
my $snippet = substr($$rstr,0,60);
warn "$0: $Podfile: undelimited $func<> in paragraph $Paragraph: '$snippet'.\n"
}
$res = _process_text_rfc_links($res);
}
return $res;
}
#
# _go_ahead: extract text of an IS (can be nested)
#
sub _go_ahead($$$){
my( $rstr, $func, $closing ) = @_;
my $res = '';
my @closing = ($closing);
while( $$rstr =~
s/\A(.*?)(([BCEFILSXZ])<(<+\s+)?|@{[_pattern $closing[0]]})//s ){
$res .= $1;
unless( $3 ){
shift @closing;
return $res unless @closing;
} else {
unshift @closing, _closing $4;
}
$res .= $2;
}
unless ($Quiet) {
my $snippet = substr($$rstr,0,60);
warn "$0: $Podfile: undelimited $func<> in paragraph $Paragraph (_go_ahead): '$snippet'.\n"
}
return $res;
}
#
# _emit_C - output result of C
# $text is the depod-ed text
#
sub _emit_C($;$$){
my( $text, $nocode, $args ) = @_;
$args = '' unless defined $args;
my $res;
my( $url, $fid ) = _coderef( undef(), $text );
# need HTML-safe text
my $linktext = _html_escape( "$text$args" );
if( defined( $url ) &&
(!defined( $EmittedItem ) || $EmittedItem ne $fid ) ){
$res = "$linktext";
} elsif( 0 && $nocode ){
$res = $linktext;
} else {
$res = "$linktext";
}
return $res;
}
#
# _html_escape: make text safe for HTML
#
sub _html_escape {
my $rest = $_[0];
$rest =~ s/&/&/g;
$rest =~ s/</g;
$rest =~ s/>/>/g;
$rest =~ s/"/"/g;
# ' is only in XHTML, not HTML4. Be conservative
#$rest =~ s/'/'/g;
return $rest;
}
#
# dosify - convert filenames to 8.3
#
sub _dosify {
my($str) = @_;
return lc($str) if $^O eq 'VMS'; # VMS just needs casing
if ($Is83) {
$str = lc $str;
$str =~ s/(\.\w+)/substr ($1,0,4)/ge;
$str =~ s/(\w+)/substr ($1,0,8)/ge;
}
return $str;
}
#
# _page_sect - make a URL from the text of a L<>
#
sub _page_sect($$) {
my( $page, $section ) = @_;
my( $linktext, $page83, $link); # work strings
# check if we know that this is a section in this page
if (!defined $Pages{$page} && defined $Sections{$page}) {
$section = $page;
$page = "";
### print STDERR "reset page='', section=$section\n";
}
$page83=_dosify($page);
$page=$page83 if (defined $Pages{$page83});
if ($page eq "") {
$link = "#" . anchorify( $section );
} elsif ( $page =~ /::/ ) {
$page =~ s,::,/,g;
# Search page cache for an entry keyed under the html page name,
# then look to see what directory that page might be in. NOTE:
# this will only find one page. A better solution might be to produce
# an intermediate page that is an index to all such pages.
my $page_name = $page ;
$page_name =~ s,^.*/,,s ;
if ( defined( $Pages{ $page_name } ) &&
$Pages{ $page_name } =~ /([^:]*$page)\.(?:pod|pm):/
) {
$page = $1 ;
}
else {
# NOTE: This branch assumes that all A::B pages are located in
# $Htmlroot/A/B.html . This is often incorrect, since they are
# often in $Htmlroot/lib/A/B.html or such like. Perhaps we could
# analyze the contents of %Pages and figure out where any
# cousins of A::B are, then assume that. So, if A::B isn't found,
# but A::C is found in lib/A/C.pm, then A::B is assumed to be in
# lib/A/B.pm. This is also limited, but it's an improvement.
# Maybe a hints file so that the links point to the correct places
# nonetheless?
}
$link = "$Htmlroot/$page.html";
$link .= "#" . anchorify( $section ) if ($section);
} elsif (!defined $Pages{$page}) {
$link = "";
} else {
$section = anchorify( $section ) if $section ne "";
### print STDERR "...section=$section\n";
# if there is a directory by the name of the page, then assume that an
# appropriate section will exist in the subdirectory
# if ($section ne "" && $Pages{$page} =~ /([^:]*[^(\.pod|\.pm)]):/) {
if ($section ne "" && $Pages{$page} =~ /([^:]*(?, Foo.(pod|pm) is preferred to A/Foo.(pod|pm)
if ($Pages{$page} =~ /([^:]*)\.(?:pod|pm):/) {
$link = "$Htmlroot/$1.html$section";
} else {
$link = "";
}
}
}
if ($link) {
# Here, we take advantage of the knowledge that $Htmlfileurl ne ''
# implies $Htmlroot eq ''. This means that the link in question
# needs a prefix of $Htmldir if it begins with '/'. The test for
# the initial '/' is done to avoid '#'-only links, and to allow
# for other kinds of links, like file:, ftp:, etc.
my $url ;
if ( $Htmlfileurl ne '' ) {
$link = "$Htmldir$link" if $link =~ m{^/}s;
$url = relativize_url( $link, $Htmlfileurl );
# print( " b: [$link,$Htmlfileurl,$url]\n" );
}
else {
$url = $link ;
}
return $url;
} else {
return undef();
}
}
sub relativize_url {
my ($dest,$source) = @_ ;
my ($dest_volume,$dest_directory,$dest_file) =
File::Spec::Unix->splitpath( $dest ) ;
$dest = File::Spec::Unix->catpath( $dest_volume, $dest_directory, '' ) ;
my ($source_volume,$source_directory,$source_file) =
File::Spec::Unix->splitpath( $source ) ;
$source = File::Spec::Unix->catpath( $source_volume, $source_directory, '' ) ;
my $rel_path = '' ;
if ( $dest ne '' ) {
$rel_path = File::Spec::Unix->abs2rel( $dest, $source ) ;
}
if ( $rel_path ne '' &&
substr( $rel_path, -1 ) ne '/' &&
substr( $dest_file, 0, 1 ) ne '#'
) {
$rel_path .= "/$dest_file" ;
}
else {
$rel_path .= "$dest_file" ;
}
return $rel_path ;
}
#
# _coderef - make URL from the text of a C<>
#
sub _coderef($$){
my( $page, $item ) = @_;
my( $url );
my $fid = _fragment_id( $item );
if( defined( $page ) && $page ne "" ){
# we have been given a $page...
$page =~ s{::}{/}g;
Carp::confess("Undefined fragment '$item' from _fragment_id() in _coderef() in $Podfile")
if !defined $fid;
# Do we take it? Item could be a section!
my $base = $Items{$fid} || "";
$base =~ s{[^/]*/}{};
if( $base ne "$page.html" ){
### print STDERR "_coderef( $page, $item ): items{$fid} = $Items{$fid} = $base => discard page!\n";
$page = undef();
}
} else {
# no page - local items precede cached items
if( defined( $fid ) ){
if( exists $Local_Items{$fid} ){
$page = $Local_Items{$fid};
} else {
$page = $Items{$fid};
}
}
}
# if there was a pod file that we found earlier with an appropriate
# =item directive, then create a link to that page.
if( defined $page ){
if( $page ){
if( exists $Pages{$page} and $Pages{$page} =~ /([^:.]*)\.[^:]*:/){
$page = $1 . '.html';
}
my $link = "$Htmlroot/$page#" . anchorify($fid);
# Here, we take advantage of the knowledge that $Htmlfileurl
# ne '' implies $Htmlroot eq ''.
if ( $Htmlfileurl ne '' ) {
$link = "$Htmldir$link" ;
$url = relativize_url( $link, $Htmlfileurl ) ;
} else {
$url = $link ;
}
} else {
$url = "#" . anchorify($fid);
}
confess "url has space: $url" if $url =~ /"[^"]*\s[^"]*"/;
}
return( $url, $fid );
}
#
# Adapted from Nick Ing-Simmons' PodToHtml package.
sub relative_url {
my $source_file = shift ;
my $destination_file = shift;
my $source = URI::file->new_abs($source_file);
my $uo = URI::file->new($destination_file,$source)->abs;
return $uo->rel->as_string;
}
#
# _finish_list - finish off any pending HTML lists. this should be called
# after the entire pod file has been read and converted.
#
sub _finish_list {
if( $Listlevel ){
warn "$0: $Podfile: unterminated list(s) at =head in paragraph $Paragraph. ignoring.\n" unless $Quiet;
while( $Listlevel ){
_process_back();
}
}
}
#
# htmlify - converts a pod section specification to a suitable section
# specification for HTML. Note that we keep spaces and special characters
# except ", ? (Netscape problem) and the hyphen (writer's problem...).
#
sub htmlify {
my( $heading) = @_;
$heading =~ s/(\s+)/ /g;
$heading =~ s/\s+\Z//;
$heading =~ s/\A\s+//;
# The hyphen is a disgrace to the English language.
# $heading =~ s/[-"?]//g;
$heading =~ s/["?]//g;
$heading = lc( $heading );
return $heading;
}
#
# similar to htmlify, but turns non-alphanumerics into underscores
#
sub anchorify {
my ($anchor) = @_;
$anchor = htmlify($anchor);
$anchor =~ s/\W/_/g;
return $anchor;
}
#
# _depod - convert text by eliminating all interior sequences
# Note: can be called with copy or modify semantics
#
my %E2c;
$E2c{lt} = '<';
$E2c{gt} = '>';
$E2c{sol} = '/';
$E2c{verbar} = '|';
$E2c{amp} = '&'; # in Tk's pods
sub _depod1($;$$);
sub _depod($){
my $string;
if( ref( $_[0] ) ){
$string = ${$_[0]};
${$_[0]} = _depod1( \$string );
} else {
$string = $_[0];
_depod1( \$string );
}
}
sub _depod1($;$$){
my( $rstr, $func, $closing ) = @_;
my $res = '';
return $res unless defined $$rstr;
if( ! defined( $func ) ){
# skip to next begin of an interior sequence
while( $$rstr =~ s/\A(.*?)([BCEFILSXZ])<(<+[^\S\n]+)?//s ){
# recurse into its text
$res .= $1 . _depod1( $rstr, $2, _closing $3);
}
$res .= $$rstr;
} elsif( $func eq 'E' ){
# E - convert to character
$$rstr =~ s/^([^>]*)>//;
$res .= $E2c{$1} || "";
} elsif( $func eq 'X' ){
# X<> - ignore
$$rstr =~ s/^[^>]*>//;
} elsif( $func eq 'Z' ){
# Z<> - empty
$$rstr =~ s/^>//;
} else {
# all others: either recurse into new function or
# terminate at closing angle bracket
my $term = _pattern $closing;
while( $$rstr =~ s/\A(.*?)(([BCEFILSXZ])<(<+[^\S\n]+)?|$term)//s ){
$res .= $1;
last unless $3;
$res .= _depod1( $rstr, $3, _closing $4 );
}
## If we're here and $2 ne '>': undelimited interior sequence.
## Ignored, as this is called without proper indication of where we are.
## Rely on _process_text to produce diagnostics.
}
return $res;
}
{
my %seen; # static fragment record hash
sub _flush_seen {
%seen = ();
}
sub _fragment_id_readable {
my $text = shift;
my $generate = shift; # optional flag
my $orig = $text;
# leave the words for the fragment identifier,
# change everything else to underbars.
$text =~ s/[^A-Za-z0-9_]+/_/g; # do not use \W to avoid locale dependency.
$text =~ s/_{2,}/_/g;
$text =~ s/\A_//;
$text =~ s/_\Z//;
unless ($text)
{
# Nothing left after removing punctuation, so leave it as is
# E.g. if option is named: "=item -#"
$text = $orig;
}
if ($generate) {
if ( exists $seen{$text} ) {
# This already exists, make it unique
$seen{$text}++;
$text = $text . $seen{$text};
} else {
$seen{$text} = 1; # first time seen this fragment
}
}
$text;
}}
my @HC;
sub _fragment_id_obfuscated { # This was the old "_2d_2d__"
my $text = shift;
my $generate = shift; # optional flag
# text? Normalize by obfuscating the fragment id to make it unique
$text =~ s/\s+/_/sg;
$text =~ s{(\W)}{
defined( $HC[ord($1)] ) ? $HC[ord($1)]
: ( $HC[ord($1)] = sprintf( "%%%02X", ord($1) ) ) }gxe;
$text = substr( $text, 0, 50 );
$text;
}
#
# _fragment_id - construct a fragment identifier from:
# a) =item text
# b) contents of C<...>
#
sub _fragment_id {
my $text = shift;
my $generate = shift; # optional flag
$text =~ s/\s+\Z//s;
if( $text ){
# a method or function?
return $1 if $text =~ /(\w+)\s*\(/;
return $1 if $text =~ /->\s*(\w+)\s*\(?/;
# a variable name?
return $1 if $text =~ /^([\$\@%*]\S+)/;
# some pattern matching operator?
return $1 if $text =~ m|^(\w+/).*/\w*$|;
# fancy stuff... like "do { }"
return $1 if $text =~ m|^(\w+)\s*{.*}$|;
# honour the perlfunc manpage: func [PAR[,[ ]PAR]...]
# and some funnies with ... Module ...
return $1 if $text =~ m{^([a-z\d_]+)(\s+[A-Z,/& ][A-Z\d,/& ]*)?$};
return $1 if $text =~ m{^([a-z\d]+)\s+Module(\s+[A-Z\d,/& ]+)?$};
return _fragment_id_readable($text, $generate);
} else {
return;
}
}
#
# _make_URL_href - generate HTML href from URL
# Special treatment for CGI queries.
#
sub _make_URL_href($){
my( $url ) = @_;
if( $url !~
s{^(http:[-\w/#~:.+=&%@!]+)(\?.*)$}{$1}i ){
$url = "$url";
}
return $url;
}
1;
__END__
=head1 NAME
Pod::Html - Convert POD files to HTML
=head1 VERSION
This document describes version 1.09_04 of Pod::HTML, released
2008-05-12.
=head1 SYNOPSIS
use Pod::Html;
pod2html([options]);
=head1 DESCRIPTION
Converts files from pod format (see L) to HTML format. It
can automatically generate indexes and cross-references, and it keeps
a cache of things it knows how to cross-reference.
=head1 FUNCTIONS
=head2 pod2html
pod2html("pod2html",
"--podpath=lib:ext:pod:vms",
"--podroot=/usr/src/perl",
"--htmlroot=/perl/nmanual",
"--libpods=perlfunc:perlguts:perlvar:perlrun:perlop",
"--recurse",
"--infile=foo.pod",
"--outfile=/perl/nmanual/foo.html");
pod2html takes the following arguments:
=over 4
=item backlink
--backlink="Back to Top"
Adds "Back to Top" links in front of every C heading (except for
the first). By default, no backlinks are generated.
=item cachedir
--cachedir=name
Creates the item and directory caches in the given directory.
=item css
--css=stylesheet
Specify the URL of a cascading style sheet. Also disables all HTML/CSS
C