package HTML::ToDocBook;
use strict;
use warnings;

=head1 NAME

HTML::ToDocBook - Converts an XHTML file into DocBook.

=head1 VERSION

This describes version B<0.03> of HTML::ToDocBook.

=cut

our $VERSION = '0.03';

=head1 SYNOPSIS

    use HTML::ToDocBook;

    my $obj = HTML::ToDocBook->new(%args);

    $obj->convert(infile=>$filename);

    # convert HTML file
    $obj->convert(infile=>$filename, html=>1);

=head1 DESCRIPTION

This module converts an XHTML file into DocBook format using both
heuristics and XSLT processing.  By default, this expects the input file
to be correct XHTML -- there are other programs such as html tidy
(http://tidy.sourceforge.net/) which can correct files for you; this
does not do that.

Note also this is very simple; it doesn't deal with things like
<div> or <span> which it has no way of guessing the meaning of.
(For some, however, if they have class names which match DocBook tags,
they will be turned into those tags)

This does not merge multiple XHTML files into a single document, so this
converts each XHTML file into a <chapter>, with each header being a
section (sect1 to sect5).  The <title> tag is used for the chapter title.

There are likely to be validity errors, depending on how good the
original HTML was.  There may be broken links, <xref> elements that
should be <link>s, and overuse of <emphasis> and <emphasis role="bold">.

=cut

use Cwd 'abs_path';
use File::Basename;
use File::Spec;
use XML::LibXSLT;
use XML::LibXML;
use HTML::SimpleParse;

=head1 METHODS

=head2 new

    my $conv = HTML::ToDocBook->new();

    my $conv = HTML::ToDocBook->new(stylesheet=>$stylesheet);

Arguments:

=over

=item stylesheet

A replacement XSLT stylesheet to use for conversions instead of
the built-in one.  This can either be a file name or a string
containing the entire stylesheet.

=back

=cut

sub new {
    my ($class, %parameters) = @_;
    my $self = bless {%parameters}, ref($class) || $class;

    my $parser = XML::LibXML->new();
    my $xslt   = XML::LibXSLT->new();
    $self->{_parser} = $parser;
    $self->{_xslt}   = $xslt;

    if ($self->{stylesheet} and -f $self->{stylesheet}) {
        # The stylesheet argument names an existing file.
        my $fn = abs_path($self->{stylesheet});
        my $style_doc = $parser->parse_file($fn)
            or die "Could not parse $fn XSLT file";
        $self->{_xslt_sheet} = $xslt->parse_stylesheet($style_doc)
            or die "Could not parse $fn stylesheet";
    }
    elsif ($self->{stylesheet}) {
        # Anything else that is true is treated as the stylesheet text.
        my $style_doc = $parser->parse_string($self->{stylesheet})
            or die "Could not parse string XSLT";
        $self->{_xslt_sheet} = $xslt->parse_stylesheet($style_doc)
            or die "Could not parse stylesheet";
    }
    else {
        # Build the parsed stylesheet from the text stored after the
        # __DATA__ marker at the end of this file.  A script may create
        # more than one instance of this object, so remember the position
        # of the DATA handle before reading and seek back to it afterwards;
        # the next instance can then read the stylesheet again.  For the
        # same reason the handle is deliberately never closed.
        my $curpos    = tell(DATA);
        my $style_doc = $parser->parse_fh(\*DATA);
        seek(DATA, $curpos, 0);
        $self->{_xslt_sheet} = $xslt->parse_stylesheet($style_doc);
    }

    return ($self);
} # new

=head2 convert

    $obj->convert(infile=>$filename, html=>1);

Converts the input with the stylesheet chosen in L</new>, writes the
result out and also returns it as a string.

The output file has the same directory and base name as the input file,
with a trailing C<.html> or C<.htm> replaced by C<.xml>.  If B<infile> is
C<->, the input is read from STDIN and the result is printed to STDOUT.

Dies if B<infile> is missing, or if the input or output file cannot be
opened.

Arguments:

=over

=item infile

The name of the file to convert, or C<-> for STDIN.

=item html

Parse the input as HTML rather than XML.

=back

=cut

sub convert {
    my $self = shift;
    my %args = (
        html => 0,
        @_
    );

    my $filename = $args{infile};
    die "convert requires an 'infile' argument"
        unless defined $filename;

    # '-' means "read STDIN, write STDOUT"; any other name gets a sibling
    # .xml file next to the input.
    my $use_stdio = ($filename eq '-');
    my $outfile   = '-';
    if (!$use_stdio) {
        my ($basename, $path) = fileparse($filename, qr{\.html?}i);
        $outfile = File::Spec->catfile($path, "${basename}.xml");
    }

    # The whole document is slurped because it has to be pre-processed
    # (section divs inserted) before it is handed to the XML parser.
    my $file_str;
    {
        local $/;
        if ($use_stdio) {
            $file_str = <STDIN>;
        }
        else {
            open(my $fh, "<", $filename)
                or die "could not open $filename";
            $file_str = <$fh>;
            close $fh;
        }
    }
    $file_str = $self->insert_sections($file_str);

    my $source = $args{html}
        ? $self->{_parser}->parse_html_string($file_str)
        : $self->{_parser}->parse_string($file_str);
    undef $file_str;    # the parsed tree is all that is needed from here on

    my $stylesheet = $self->{_xslt_sheet};
    my $results    = $stylesheet->transform($source);
    my $result_str = $stylesheet->output_string($results);

    # print the result
    if ($use_stdio) {
        print STDOUT $result_str;
    }
    else {
        open(my $outfh, ">", $outfile)
            or die "Can't open $outfile for writing!";
        print {$outfh} $result_str;
        # Buffered write errors only surface when the handle is closed.
        close($outfh)
            or die "Can't close $outfile: $!";
    }

    return $result_str;
} # convert

=head1 Private Methods

These
are not guaranteed to be stable.

=head2 insert_sections

    my $str = $obj->insert_sections($string);

This inserts <div class="sectN"> tags to enclose all levels of header.
These will then be picked up by the XSLT stylesheet and converted into
section tags.

Every header start tag (C<hN>) opens a new div of class C<sectN>, after
closing any open divs of the same or a deeper level.  All divs that are
still open are closed just before the C<body> end tag, or at the end of
the input if there is no such tag.

Returns the modified markup as a single string.

=cut

sub insert_sections {
    my ($self, $string) = @_;

    my $hp = HTML::SimpleParse->new();
    $hp->text($string);
    $hp->parse();

    my @newhtml = ();
    my @levels  = ();    # levels of the currently open section divs, innermost last

    for my $tok ($hp->tree()) {
        if ($tok->{type} eq 'starttag' and $tok->{content} =~ /^h(\d)/i) {
            # we have a header
            my $header_level = $1;

            # A new header ends every open section of the same or a deeper
            # level.  Checking @levels first keeps the comparison away from
            # an empty stack when a shallower header follows a deeper one.
            while (@levels and $levels[-1] >= $header_level) {
                pop @levels;
                push @newhtml, "</div>\n";
            }

            # start a new div for the new header
            push @newhtml, sprintf("\n<div class='sect%d'>\n", $header_level);
            push @levels, $header_level;
        }
        elsif ($tok->{type} eq 'endtag' and $tok->{content} =~ /^\/body/i) {
            # close any remaining open section divs before the body ends
            push @newhtml, ("</div>\n") x @levels;
            @levels = ();
        }
        push @newhtml, $hp->execute($tok);
    } # go through all the tags

    # Input without a body end tag (for example a fragment) would otherwise
    # leave its section divs unbalanced.
    push @newhtml, ("</div>\n") x @levels;

    return join('', @newhtml);
} # insert_sections

=head1 REQUIRES

    Cwd
    File::Basename
    File::Spec
    XML::LibXML
    XML::LibXSLT
    HTML::SimpleParse
    Test::More

=head1 INSTALLATION

To install this module, run the following commands:

    perl Build.PL
    ./Build
    ./Build test
    ./Build install

Or, if you're on a platform (like DOS or Windows) that doesn't like the
"./" notation, you can do this:

    perl Build.PL
    perl Build
    perl Build test
    perl Build install

In order to install somewhere other than the default, such as
in a directory under your home directory, like "/home/fred/perl"
go

    perl Build.PL --install_base /home/fred/perl

as the first step instead.

This will install the files underneath /home/fred/perl. You will then need to make sure that you alter the PERL5LIB variable to find the modules, and the PATH variable to find the script. Therefore you will need to change: your path, to include /home/fred/perl/script (where the script will be) PATH=/home/fred/perl/script:${PATH} the PERL5LIB variable to add /home/fred/perl/lib PERL5LIB=/home/fred/perl/lib:${PERL5LIB} =head1 SEE ALSO perl(1). =head1 BUGS Please report any bugs or feature requests to the author. =head1 AUTHOR Kathryn Andersen (RUBYKAT) perlkat AT katspace dot com http://www.katspace.org/tools =head1 COPYRIGHT AND LICENCE XSLT stylesheet based on the one at http://wiki.docbook.org/topic/Html2DocBook by Jeff Beal Copyright (c) 2006 by Kathryn Andersen This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; # End of HTML::ToDocBook #------------------------------------------------------------------------ # The XSLT stylesheet! # The original stylesheet came from # http://wiki.docbook.org/topic/Html2DocBook # __DATA__ <?xml version="1.0" encoding="UTF-8"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:html="http://www.w3.org/1999/xhtml" exclude-result-prefixes="xsl html"> <xsl:output method="xml" indent="yes"/> <xsl:param name="filename"></xsl:param> <xsl:param name="prefix">wb</xsl:param> <xsl:param name="graphics_location">images/</xsl:param> <!-- This needs to match elements with both the html: namespace and without, because files parsed as HTML rather than XHTML don't have the html: namespace in them, for whatever reason. --> <!-- Main block-level conversions --> <xsl:template match="html:html|html"> <xsl:apply-templates select="html:body|body"/> </xsl:template> <!-- This template converts each HTML file encountered into a DocBook chapter. 
For a title, it selects the title, else the first h1 element --> <xsl:template match="html:body|body"> <chapter> <xsl:if test="$filename != ''"> <xsl:attribute name="id"> <xsl:value-of select="$prefix"/> <xsl:text>_</xsl:text> <xsl:value-of select="translate($filename,' ()','__')"/> </xsl:attribute> </xsl:if> <title> <xsl:value-of select="/html:html/html:head/html:title |/html/head/title |.//html:h1[1] |.//html:h2[1] |.//html:h3[1]"/>
<xsl:choose> <xsl:when test="count(html:a/@name)"> <xsl:attribute name="id"> <xsl:value-of select="html:a/@name"/> </xsl:attribute> </xsl:when> <xsl:when test="count(a/@name)"> <xsl:attribute name="id"> <xsl:value-of select="a/@name"/> </xsl:attribute> </xsl:when> <xsl:when test="preceding-sibling::* = preceding-sibling::html:a[@name != '']"> <xsl:attribute name="id"> <xsl:value-of select="concat($prefix,preceding-sibling::html:a[1]/@name)"/> </xsl:attribute> </xsl:when> <xsl:when test="preceding-sibling::* = preceding-sibling::a[@name != '']"> <xsl:attribute name="id"> <xsl:value-of select="concat($prefix,preceding-sibling::a[1]/@name)"/> </xsl:attribute> </xsl:when> <xsl:when test="following-sibling::* = following-sibling::html:a[@name != '']"> <xsl:attribute name="id"> <xsl:value-of select="concat($prefix,following-sibling::html:a[1]/@name)"/> </xsl:attribute> </xsl:when> <xsl:when test="following-sibling::* = following-sibling::a[@name != '']"> <xsl:attribute name="id"> <xsl:value-of select="concat($prefix,following-sibling::a[1]/@name)"/> </xsl:attribute> </xsl:when> </xsl:choose> <xsl:apply-templates/> _ inlinemediaobject inlinemediaobject mediaobject Matched
No template for No template for
_ _ Attempting to count columns on a non-table element Row parameter is not a valid row