#!/usr/bin/perl
#
# $Id: mobisucks,v 1.7 2005/02/14 03:50:53 cpb Exp $
#
# Convert a Mobipocket document into a plain-text PalmDoc, in place:
# the file named on the command line is loaded, its HTML body is rendered
# to text, and the result is written back to the same file.

use strict;
use warnings;

use HTML::TreeBuilder;
use HTML::FormatText;
use Palm::PDB;
use Palm::Doc;
use HTML::Entities;

# Exactly one argument is required: it is both the input and the output.
if ( @ARGV != 1 ) {
    print STDERR "usage: $0 file.pdb\n";
    exit 1;
}
my $file = $ARGV[0];

my $doc = Palm::PDB->new;
$doc->Load($file);

# The document body must start with an HTML-ish tag; anything else is
# rejected before we touch the file.
my $html = $doc->text();
if ( ( not defined $html ) or $html !~ /^<[a-z]+>/i ) {
    print STDERR "$file doesn't look like a Mobireader document\n";
    exit 1;
}

# Render the HTML to plain text.
my $tree      = HTML::TreeBuilder->new_from_content($html);
my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 80 );
my $text      = $formatter->format($tree);

$text = replace_entities($text);
$text = unwrap_lines($text);

$doc->text($text);
$doc->Write($file);

exit 0;

# Decode HTML entities in the given text and return the result.
# Entities still present after decode_entities() are mapped to a plain
# ASCII stand-in where we know one (dashes, quotes) and to '?' otherwise.
sub replace_entities {
    my ($text) = @_;

    decode_entities($text);    # decodes in place

    # Quick and dirty conversions for entities we are aware of.
    $text =~ s/&\#821[12];?/-/g;     # en/em dash
    $text =~ s/&\#821[678];?/'/g;    # single quotation marks
    $text =~ s/&\#822[012];?/"/g;    # double quotation marks

    # Strip out the rest.
    $text =~ s/&\#\d+;?/?/g;
    $text =~ s/&\#[xX][\da-fA-F]+;?/?/g;

    # NOTE(review): this also rewrites a bare ampersand followed by word
    # characters (e.g. "AT&T"); kept as-is to preserve existing output.
    $text =~ s/&\w+;?/?/g;

    return $text;
}

# Join hard-wrapped lines and return the result.
# FormatText's margins aren't really appropriate for PDA reading. Better to
# remove them entirely and let the reader handle that stuff. We really should
# just write our own formatter around HTML::TreeBuilder...
sub unwrap_lines {
    my ($text) = @_;

    # A newline with a non-newline character on both sides becomes a space;
    # runs of two or more newlines (paragraph breaks) are left alone.
    # Lookarounds are used so the neighbouring characters are not consumed:
    # otherwise the newline after a one-character line would never be joined.
    $text =~ s/(?<=[^\n])\n(?=[^\n])/ /g;

    return $text;
}

__END__

=head1 NAME

mobisucks - convert Mobipocket reader files to regular text Palm Docs.

=head1 SYNOPSIS

mobisucks file.pdb

=head1 DESCRIPTION

I hate Mobipocket reader, but it's used by quite a few ebook providers
(http://www.baen.com/library/, http://www.blackmask.com/, etc) rather
than a more useful format like straight PalmDoc, Ztxt, or even Plucker.

This little script converts Mobipocket to bog-standard PalmDoc text.
The file given on the command line is overwritten with the converted
document.

=head1 BUGS

Doesn't handle every Mobipocket book, nor is the output always great.
Overall, it's a bit of a hack.

=head1 AUTHOR

Christophe Beauregard E<lt>cpb@cpan.orgE<gt>

=head1 SEE ALSO

Palm::Doc(3)

=cut