#!/usr/bin/perl -w
#
# $Id: htmltopalmdoc,v 1.3 2004/10/05 21:10:27 cpb Exp $
#
# Convert an HTML document (a named file or standard input) into a Palm Doc
# database: parse the HTML, render it as plain text, squeeze it into
# ISO-8859-1 and hand the result to Palm::Doc. Usage is described in the POD
# after __END__.

use strict;
use Getopt::Std;
use Palm::PDB;
use Palm::Doc;
use HTML::FormatText;
use HTML::TreeBuilder;
use Encode 'from_to';
use HTML::Entities;

$Getopt::Std::STANDARD_HELP_VERSION = 1;

# Charset assumed when the document does not declare one (or declares one
# that Encode does not know).
my $DEFAULT_CHARSET = 'iso-8859-1';

my $output;
if( @ARGV == 2 ) {
    # Two arguments: <input> <output>. An input of '-' means "keep reading
    # standard input".
    unless( $ARGV[0] eq '-' ) {
        # Three-argument open so the file name can never be taken as a mode,
        # and $! (not $@) because that is where open reports its error.
        open STDIN, '<', $ARGV[0]
            or die "Cannot open $ARGV[0]: $!\n";
    }
    $output = $ARGV[1];
} elsif( @ARGV == 1 ) {
    # One argument: <output>, HTML arrives on standard input.
    $output = $ARGV[0];
} else {
    print STDERR <<END;
Usage: $0 <input.html> <output.pdb>
    or $0 <output.pdb>
END
    exit 1;
}

binmode STDIN;

my $tree = HTML::TreeBuilder->new_from_file( *STDIN );
die "Failed to parse HTML" unless defined $tree;

my $charset = detect_charset( $tree );

# Prefer the document title for the database name; fall back to the output
# file name when there is no <title> or nothing usable is left of it.
my $name  = '';
my $title = $tree->find_by_tag_name('title');
$name = doc_name( $title->as_text() ) if defined $title;
$name = doc_name( $output ) unless length $name;

my $formatter = HTML::FormatText->new( 'lm' => 0, 'rm' => 80 );
my $text = $formatter->format( $tree );

# strip out newlines within paragraphs
$text =~ s!(\w)[ \t]*\n([^\n])!$1 $2!g;

# make sure we're dealing with latin1
unless( $charset =~ /8859-1$/ ) {
    if( Encode::find_encoding( $charset ) ) {
        from_to( $text, $charset, 'iso-8859-1' );
    } else {
        # A bogus charset declaration should not abort the conversion.
        warn "Unknown charset '$charset'; assuming $DEFAULT_CHARSET\n";
    }
}

decode_entities($text); # convert HTML entities to characters

my $doc = Palm::Doc->new();
$doc->text( to_latin1_text( $text ) );
$doc->{'name'} = $name;
$doc->Write( $output );

exit 0;

# detect_charset( $tree )
#
# Returns the charset declared by the first <meta> element that names one,
# either as a "charset" attribute or as "charset=..." inside its "content"
# attribute. Returns $DEFAULT_CHARSET when no <meta> declares a charset.
sub detect_charset {
    my( $html ) = @_;

    # List context: every <meta> element, not just the first one, since the
    # first is frequently a keywords/description tag.
    for my $meta ( $html->find_by_tag_name('meta') ) {
        my $direct = $meta->attr('charset');
        return $direct if defined $direct and length $direct;

        my $content = $meta->attr('content');
        next unless defined $content;
        return $1 if $content =~ /charset=([^\s;]+)/i;
    }

    return $DEFAULT_CHARSET;
}

# doc_name( $candidate )
#
# Reduces $candidate to ASCII letters, digits and spaces and truncates it to
# at most 25 characters. May return an empty string.
sub doc_name {
    my( $candidate ) = @_;
    return '' unless defined $candidate;

    $candidate =~ s/[^a-z0-9 ]+//gi;
    $candidate = substr( $candidate, 0, 25 ) if length( $candidate ) > 25;

    return $candidate;
}

# to_latin1_text( $text )
#
# Returns $text with everything that cannot be represented in ISO-8859-1
# replaced: common typographic dashes and quotes become their ASCII
# look-alikes, anything else becomes '?'. Handles both already-decoded
# characters and entity references that survived decoding.
sub to_latin1_text {
    my( $body ) = @_;

    # Characters beyond Latin-1 (decode_entities turns numeric entities such
    # as &#8211; into the corresponding Unicode character).
    $body =~ tr/\x{2013}\x{2014}/-/;
    $body =~ tr/\x{2018}\x{2019}\x{201A}/'/;
    $body =~ tr/\x{201C}\x{201D}\x{201E}/"/;
    $body =~ s/[^\x00-\xFF]/?/g;

    # Entity references that were left undecoded: do a quick and dirty
    # conversion for the ones we are aware of ...
    $body =~ s/&\#821[12];?/-/g;
    $body =~ s/&\#821[678];?/'/g;
    $body =~ s/&\#822[012];?/"/g;

    # ... and strip out the rest. Named references must end in ';' so that a
    # literal ampersand followed by a word ("R&D") is left alone.
    $body =~ s/&\#\d+;?/?/g;
    $body =~ s/&\#[xX][\da-fA-F]+;?/?/g;
    $body =~ s/&\w+;/?/g;

    return $body;
}

__END__

=head1 NAME

htmltopalmdoc - simple HTML to Palm Doc converter

=head1 SYNOPSIS

    htmltopalmdoc <input.html> <output.pdb>

    cat file.html | htmltopalmdoc <output.pdb>

=head1 DESCRIPTION

C<htmltopalmdoc> is a very simple HTML to Palm Doc conversion script.
It's basically just a wrapper around L<HTML::FormatText>. I wouldn't
recommend it for serious use. Purpose-built converters do a better job.

=for comment
The two link targets in the paragraph above were lost from the source text.
The first is reconstructed from the modules this script uses; the second
could not be recovered and is worded generically. Confirm against the
original distribution.

=head1 AUTHOR

Christophe Beauregard E<lt>cpb@cpan.orgE<gt>

=head1 SEE ALSO

Palm::Doc(3)

=cut