#!/usr/bin/perl -w
#
# $Id: htmltopalmdoc,v 1.3 2004/10/05 21:10:27 cpb Exp $
use strict;
use Getopt::Std;
use Palm::PDB;
use Palm::Doc;
use HTML::FormatText;
use HTML::TreeBuilder;
use Encode 'from_to';
use HTML::Entities;
$Getopt::Std::STANDARD_HELP_VERSION = 1;
# Command-line handling.
#
#   htmltopalmdoc <input.html> <output.pdb>   read the HTML from a file
#                                             ("-" means standard input)
#   htmltopalmdoc <output.pdb>                read the HTML from standard input
#
# Whichever form is used, the HTML ends up readable on STDIN and $output
# holds the name of the Palm database file to write.
my $output;
if ( @ARGV == 2 ) {
    my $input = $ARGV[0];
    unless ( $input eq '-' ) {
        # Three-argument open: characters such as '>' or '|' in the file
        # name are never interpreted as an open mode.  The operating
        # system error is in $!, not $@.
        open STDIN, '<', $input
            or die "Cannot open $input: $!\n";
    }
    $output = $ARGV[1];
}
elsif ( @ARGV == 1 ) {
    $output = $ARGV[0];
}
else {
    print STDERR <<"END";
usage: $0 <input.html> <output.pdb>
   or $0 <output.pdb>  (HTML is read from standard input)
END
    exit 1;
}
# Parse the HTML arriving on STDIN.  The handle is read as raw bytes; the
# document's declared character set is looked up below.
binmode STDIN;
my $tree = HTML::TreeBuilder->new_from_file( *STDIN );
die "Failed to parse HTML" unless defined $tree;

# Character set of the document.  Defaults to Latin-1 unless a <meta> tag
# says otherwise.  Every <meta> element is examined (the declaration is not
# necessarily in the first one), and both forms are understood:
#   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
#   <meta charset="utf-8">
my $charset = 'iso-8859-1';
for my $meta ( $tree->find_by_tag_name('meta') ) {
    my $declared = $meta->attr('charset');
    unless ( defined $declared ) {
        my $content = $meta->attr('content');
        next unless defined $content;    # e.g. <meta name="..."> without content
        # Stop at whitespace, quotes or ';' so trailing junk is not
        # captured as part of the charset name.
        ($declared) = $content =~ /charset\s*=\s*["']?([^\s"';]+)/i;
    }
    next unless defined $declared and length $declared;
    $charset = $declared;
    last;
}

# Document name: the <title> text if there is one, otherwise the output
# file name.  A document without a <title> element must not be fatal.
my $title = $tree->find_by_tag_name('title');
my $name = defined $title ? $title->as_text() : '';
$name = $output unless $name =~ /\S/;
# Render the parsed tree as plain text, wrapped at column 80 with no left
# margin.
my $formatter = HTML::FormatText->new( 'lm' => 0, 'rm' => 80 );
my $text = $formatter->format( $tree );

# strip out newlines within paragraphs: a line break that follows a word
# character and is not followed by another newline is replaced by a space
$text =~ s!(\w)[ \t]*\n([^\n])!$1 $2!g;

# make sure we're dealing with latin1 (charset names are case-insensitive,
# so "ISO-8859-1" needs no conversion either)
from_to( $text, $charset, 'iso-8859-1' ) unless $charset =~ /8859-1$/i;

decode_entities($text); # convert HTML entities to characters

# Anything that can't be represented in 8859-1 is either converted to a
# quick and dirty ASCII equivalent (dashes and curly quotes) or replaced
# by '?'.  This is done twice over:
#  - on entity text that is still present after decode_entities (older
#    HTML::Entities releases may leave numeric references above 255
#    undecoded -- not verified here -- and unrecognised named entities
#    are always left alone), and
#  - on the characters themselves, because current HTML::Entities decodes
#    such references to Unicode characters above 0xFF.
$text =~ s/&\#821[12];?/-/g;
$text =~ s/&\#821[678];?/'/g;
$text =~ s/&\#822[012];?/"/g;
# strip out the rest
$text =~ s/&\#\d+;?/?/g;
$text =~ s/&\#[xX][\da-fA-F]+;?/?/g;
$text =~ s/&\w+;?/?/g;

# Same mapping, character form: U+2013/U+2014 are code points 8211/8212,
# U+2018..U+201A are 8216..8218 and U+201C..U+201E are 8220..8222.
$text =~ s/[\x{2013}\x{2014}]/-/g;
$text =~ s/[\x{2018}-\x{201A}]/'/g;
$text =~ s/[\x{201C}-\x{201E}]/"/g;
$text =~ s/[^\x00-\xFF]/?/g;

# Every character is now <= 0xFF, so the string can be stored as plain
# bytes; the second argument makes this a no-op rather than an error
# should that ever not hold.
utf8::downgrade( $text, 1 );
# Build the Palm Doc database from the converted text and write it to
# $output.
my $doc = Palm::Doc->new();
$doc->text( $text );

# Database name: keep only ASCII letters, digits and spaces from the
# title.
$name =~ s/[^a-z0-9 ]+//gi;

# A title made up entirely of other characters would leave nothing at
# all; use the (equally sanitized) output file name in that case.
if ( $name !~ /\S/ ) {
    ( $name = $output ) =~ s/[^a-z0-9 ]+//gi;
}

# Limit the name to 25 characters.
$name = substr( $name, 0, 25 );
$doc->{'name'} = $name;

$doc->Write( $output );
exit 0;
__END__
=head1 NAME
htmltopalmdoc - simple HTML to Palm Doc converter
=head1 SYNOPSIS
htmltopalmdoc input.html output.pdb
cat file.html | htmltopalmdoc output.pdb
=head1 DESCRIPTION
C<htmltopalmdoc> is a very simple HTML to Palm Doc conversion script. It's
basically just a wrapper around L<HTML::FormatText>. I wouldn't recommend it
for serious use. L does a better job.
=head1 AUTHOR
Christophe Beauregard E<lt>cpb@cpan.orgE<gt>
=head1 SEE ALSO
Palm::Doc(3)