package HTML::DublinCore;
use strict;
use warnings;
use Carp qw( croak );
use base qw( DublinCore::Record HTML::Parser );
use DublinCore::Element;
our $VERSION = .4;
=head1 NAME
HTML::DublinCore - Extract Dublin Core metadata from HTML
=head1 SYNOPSIS
use HTML::DublinCore;
## pass HTML to constructor
my $dc = HTML::DublinCore->new( $html );
## get the title element and print it's content
my $title = $dc->element( 'Title' );
print "title: ", $title->content(), "\n";
## get the same title content in one step
print "title: ", $dc->element( 'Title' )->content(), "\n";
## list context will retrieve all of a particular element
foreach my $element ( $dc->element( 'Creator' ) ) {
print "creator: ",$element->content(),"\n";
}
## qualified dublin core
my $creation = $dc->element( 'Date.created' )->content();
=head1 DESCRIPTION
HTML::DublinCore is a module for easily extracting Dublin Core metadata
that is embedded in HTML documents. The Dublin Core is a small set of metadata
elements for describing information resources. Dublin Core is typically
stored in the EHEADE of and HTML document using the EMETAE tag.
For more information on embedding DublinCore in HTML see RFC 2731
L. For a definition of the
meaning of various Dublin Core elements please see
L.
HTML::DublinCore actually extends Brian Cassidy's excellent DublinCore::Record
framework by adding some asHTML() methods, and a new constructor.
=head1 METHODS
=cut
## valid dublin core elements
=head2 new()
Constructor which you pass HTML content.
$dc = HTML::DublinCore->new( $html );
=cut
sub new {
my ( $class, $html ) = @_;
my $self = $class->SUPER::new;
bless $self, $class;
croak( "please supply string of HTML as argument to new()" ) if !$html;
$self->{ "DC_errors" } = [];
## initialize our parser, and parse
$self->init();
$self->parse( $html );
}
=head2 asHtml()
Serialize your Dublin Core metadata as HTML EMETAE tags.
print $dc->asHtml();
=cut
sub asHtml {
my $self = shift;
my $html = '';
foreach my $element ( $self->elements ) {
$html .= $element->asHtml() . "\n";
}
return( $html );
}
=head1 TODO
=over 4
=item * More comprehensive tests.
=item * Handle HTML entities properly.
=item * Collect error messages so they can be reported out of the object.
=back
=head1 SEE ALSO
=over 4
=item * DublinCore::Record
=item * Dublin Core L
=item * RFC 2731 L
=item * HTML::Parser
=item * perl4lib L
=back
=head1 AUTHORS
=over 4
=item * Ed Summers Eehs@pobox.comE
=item * Brian Cassidy Ebricas@cpan.orgE
=back
=head1 COPYRIGHT AND LICENSE
Copyright 2004 by Ed Summers, Brian Cassidy
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
## start tag hander. This automatically gets called in new() when we
## parse HTML since HTML::DublinCore inherits from HTML::Parser.
sub start {
my ( $self, $tagname, $attr, $attrseq, $origtext ) = @_;
return if ( $tagname ne 'meta' );
## lowercase keys
my %attributes = map { lc($_) => $attr->{$_} } keys( %$attr );
## parse name attribute (eg. DC.Identifier.ISBN )
return( undef ) if ! exists( $attributes{ name } );
my ( $namespace, $element, $qualifier ) =
split /\./, lc( $attributes{ name } );
## ignore non-DublinCore data
return( undef ) if $namespace ne 'dc';
## make sure element is dublin core
if ( ! grep { $element } @DublinCore::Record::VALID_ELEMENTS ) {
$self->_error( "invalid element: $element found" );
return( undef );
}
## return if we don't have a content attribute
if ( ! exists( $attributes{ content } ) ) {
$self->_error( "element $element lacks content" );
return( undef );
}
## create a new HTML::DublinCore::Element object
my $dc = DublinCore::Element->new();
$dc->name( $element );
$dc->qualifier( $qualifier );
$dc->content( $attributes{ content } );
if ( exists( $attributes{ scheme } ) ) {
$dc->scheme( $attributes{ scheme } );
}
if ( exists( $attributes{ lang } ) ) {
$dc->language( $attributes{ lang } );
}
## stash it for later
$self->add( $dc );
}
sub _error {
my ( $self, $msg ) = @_;
push( @{ $self->{ DC_errors } }, $msg );
return( 1 );
}
# add in a method to write DC elements as HTML meta tags.
package DublinCore::Element;
sub asHtml {
my $self = shift;
my $name = ucfirst( $self->name() );
if ( $self->qualifier() ) { $name .= '.' . $self->qualifier(); }
my $content = $self->content();
my $scheme = $self->scheme();
my $lang = $self->language();
my $html = qq(