#!/usr/bin/perl

######################################################################
##                                                                  ##
##  Script: html2tei.pl                                             ##
##  Author: D. Hageman <dhageman@dracken.com>                       ##
##                                                                  ##
##  Description:                                                    ##
##                                                                  ##
##  Utility to convert reasonably compliant HTML files to TEILite.  ##
##                                                                  ##
######################################################################

##==================================================================##
##  Libraries and Variables                                         ##
##==================================================================##

require 5.006;

use strict;
use warnings;

use TEI::Lite;
use XML::LibXML;

our $VERSION = "0.50";

##==================================================================##
##  Main Execution                                                  ##
##==================================================================##

{
    ## Check to see if we are given a file to convert.  print_usage()
    ## does not return - it exits with a non-zero status.
    if( scalar( @ARGV ) < 1 )
    {
        print_usage();
    }

    ## Create a parser to pull in the HTML file.
    my $parser = XML::LibXML->new();

    ## Parse the HTML file given to utility - if it is reasonably
    ## compliant - it should work.
    my $html_file = $parser->parse_html_file( $ARGV[0] );
    my $html_root = $html_file->documentElement;

    ## An empty or unusable input can leave us without a root element;
    ## fail with a clear message instead of a method call on undef.
    if( !defined( $html_root ) )
    {
        die "html2tei.pl: no root element found in '$ARGV[0]'\n";
    }

    my $tei_file = TEI::Lite::Document->new( 'Corpus'    => 0,
                                             'Composite' => 0 );

    ## We need to add a header to document.
    my $tei_header = $tei_file->addHeader();

    ## Grab the body element of the TEI document.
    my $tei_body = $tei_file->getBody();

    ## Time to set the title.
    my $title = $html_root->findvalue( '//head/title' );

    ## Clean up the title a bit (strip leading/trailing whitespace) ...
    $title =~ s/^\s+//;
    $title =~ s/\s+$//;

    ## Set the title correctly
    $tei_header->setTitle( $title );

    ## Only the first <body> element found is converted.
    my( $body ) = $html_root->findnodes( '//body' );

    if( !defined( $body ) )
    {
        die "html2tei.pl: no <body> element found in '$ARGV[0]'\n";
    }

    ## Convert the serialized HTML body into a TEI fragment string.
    ## NOTE(review): tei_convert_html_fragment is not defined in this
    ## file - presumably it is exported by TEI::Lite; the meaning of
    ## the leading 0 argument should be confirmed against that module.
    my $body_string = tei_convert_html_fragment( 0, $body->toString() );

    ## Re-parse the converted fragment as XML so that it can be grafted
    ## into the TEI document as a node.
    my $doc = $parser->parse_string( $body_string );

    $tei_body->appendChild( $doc->documentElement );

    ## Print the document ...
    print $tei_file->toString( 2 ) . "\n";

    ## We are done, exit nicely and go away!
    exit( 0 );
}

##==================================================================##
##  Function(s)                                                     ##
##==================================================================##

##----------------------------------------------##
##  print_usage                                 ##
##----------------------------------------------##
##  Subroutine to print usage information.      ##
##  Writes to STDERR so that the message never  ##
##  ends up in redirected TEI output, then      ##
##  exits with status 1.  Does not return.      ##
##----------------------------------------------##
sub print_usage
{
    print STDERR "\nUsage: html2tei.pl <html_file>\n\n";

    exit( 1 );
}

##==================================================================##
##  End of Code                                                     ##
##==================================================================##
1;

##==================================================================##
##  Plain Old Documentation (POD)                                   ##
##==================================================================##

__END__

=head1 NAME

html2tei.pl - convert an HTML file to TEI Lite

=head1 SYNOPSIS

 html2tei.pl <html_file>

=head1 DESCRIPTION

Utility to convert a HTML file to a TEI Lite file.  The resulting TEI
document is written to standard output.

=head1 AUTHOR

D. Hageman E<lt>dhageman@dracken.comE<gt>

=head1 SEE ALSO

L<TEI::Lite>, L<XML::LibXML>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2002-2005 D. Hageman (Dracken Technologies).
All rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut