package XML::Atom::Microformats; use 5.010; use autodie; use strict; use utf8; use HTML::Microformats 0.100 qw(); use JSON 2.00 qw(); use RDF::Trine 0.135 qw(); use RDF::Query 2.900 qw(); use XML::Atom::OWL 0.100 qw(); our ($VERSION, $AUTHORITY, $HAS_RDFA); BEGIN { $XML::Atom::Microformats::VERSION = '0.004'; $XML::Atom::Microformats::AUTHORITY = 'cpan:TOBYINK'; $XML::Atom::Microformats::HAS_RDFA = eval "use RDF::RDFa::Parser 1.097; 1" || 0; } sub new_feed { my ($class, $xml, $base_uri) = @_; my $awol; if ($HAS_RDFA) { my $cfg = RDF::RDFa::Parser::Config->new( RDF::RDFa::Parser::Config->HOST_ATOM, RDF::RDFa::Parser::Config->RDFA_11, atom_parser => 1, ); $awol = RDF::RDFa::Parser->new($xml, $base_uri, $cfg, RDF::Trine::Store::Memory->temporary_store)->consume; } else { $awol = XML::Atom::OWL->new($xml, $base_uri, undef, RDF::Trine::Store::Memory->temporary_store)->consume; } my $self = bless { 'AWOL' => $awol , 'base' => $base_uri }, $class; return $self->_find_contexts->_prepare_contexts; } sub _find_contexts { my ($self) = @_; my $sparql = < PREFIX iana: SELECT ?entry ?entryid ?entrylink ?contenttype ?contentbody ?contentbase ?contentlang ?profile WHERE { ?entry a awol:Entry ; awol:content ?content ; awol:id ?entryid . ?content a awol:Content ; awol:type ?contenttype ; awol:body ?contentbody . OPTIONAL { ?entry iana:self ?entrylink . } OPTIONAL { ?content awol:base ?contentbase . } OPTIONAL { ?content awol:lang ?contentlang . } OPTIONAL { { ?feed awol:entry ?entry ; iana:profile ?profile . } UNION { ?entry iana:profile ?profile . } } } SPARQL my $query = RDF::Query->new($sparql); my $result = $query->execute($self->{'AWOL'}->graph); my $data = {}; while (my $row = $result->next) { my $e = $row->{'entry'}->as_ntriples; $data->{$e}->{'entryid'} ||= $row->{'entryid'}->literal_value; $data->{$e}->{'contentbody'} ||= $row->{'contentbody'}->literal_value; $data->{$e}->{'contenttype'} ||= $row->{'contenttype'}->literal_value; $data->{$e}->{'contentlang'} ||= $row->{'contentlang'}->literal_value if defined $row->{'contentlang'}; $data->{$e}->{'contentbase'} ||= $row->{'contentbase'}->uri if defined $row->{'contentbase'}; $data->{$e}->{'entrylink'} ||= $row->{'entrylink'}->uri if defined $row->{'entrylink'}; if (defined $row->{'profile'}) { push @{ $data->{$e}->{'profiles'} }, $row->{'profile'}->uri; } } $self->{'contexts'} = [values %$data]; return $self; } sub _prepare_contexts { my ($self) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'contenttype'} eq 'text/html' || $context->{'contenttype'} eq 'application/xhtml+xml'; my $dom; my $html = sprintf("
%s
", $context->{'contentlang'}, $context->{'contentlang'}, $context->{'contentbody'}); my $hmf = HTML::Microformats->new_document( $html, ($context->{'contentbase'} || $self->{'base'}), type => $context->{'contenttype'}); if ($@ || !defined $hmf) { warn sprintf("ENTRY <%s>: %s", $context->{'entryid'}, ($@ || "Could not process entry.")); next; } $hmf->{'context'}->{'document_uri'} = $context->{'entrylink'} || $context->{'entryid'}; $hmf->add_profile( @{$context->{'profiles'}} ); $context->{'HMF'} = $hmf; } return $self; } sub add_profile { my ($self, @profiles) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; $context->{'HMF'}->add_profile(@profiles); } return $self; } sub entry_add_profile { my ($self, $entry, @profiles) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; next unless $context->{'entryid'} eq $entry; $context->{'HMF'}->add_profile(@profiles); } return $self; } sub assume_profile { my ($self, @profiles) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; $context->{'HMF'}->assume_profile(@profiles); } return $self; } sub entry_assume_profile { my ($self, $entry, @profiles) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; next unless $context->{'entryid'} eq $entry; $context->{'HMF'}->assume_profile(@profiles); } return $self; } sub assume_all_profiles { my ($self) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; $context->{'HMF'}->assume_all_profiles; } return $self; } sub entry_assume_all_profiles { my ($self, $entry) = @_; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; next unless $context->{'entryid'} eq $entry; $context->{'HMF'}->assume_all_profiles; } return $self; } sub parse_microformats { my ($self) = @_; return $self if $self->{'parsed'}; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; $context->{'objects'} = $context->{'HMF'}->objects; } $self->{'parsed'} = 1; return $self; } sub clear_microformats { my ($self) = @_; foreach my $context (@{$self->{'contexts'}}) { $context->{'objects'} = undef; next unless $context->{'HMF'}; $context->{'HMF'}->clear_microformats; } $self->{'parsed'} = 0; return $self; } sub objects { my ($self, $format, $entry) = @_; $self->parse_microformats; my @rv; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; if ($entry eq $context->{'entryid'} || !defined $entry) { my @these = $context->{'HMF'}->objects($format); push @rv, @these; } } return @rv if (wantarray); return \@rv; } sub entry_objects { my ($self, $entry, $format) = @_; return $self->objects($format, $entry); } sub all_objects { my ($self, $entry) = @_; my $rv = {}; foreach my $format (HTML::Microformats->formats) { $rv->{$format} = $self->objects($format, $entry); } return $rv; } *entry_all_objects = \&all_objects; sub TO_JSON { return $_[0]->all_objects; } sub json { my ($self, %opts) = @_; $opts{'convert_blessed'} = 1 unless defined $opts{'convert_blessed'}; $opts{'utf8'} = 1 unless defined $opts{'utf8'}; return JSON::to_json($self->all_objects, \%opts); } sub entry_json { my ($self, $entry, %opts) = @_; $opts{'convert_blessed'} = 1 unless defined $opts{'convert_blessed'}; $opts{'utf8'} = 1 unless defined $opts{'utf8'}; return JSON::to_json($self->entry_all_objects($entry), \%opts); } sub model { my ($self, %opts) = @_; my $model = RDF::Trine::Model->temporary_model; $self->add_to_model($model, %opts); return $model; } sub entry_model { my ($self, $entry, %opts) = @_; my $model = RDF::Trine::Model->temporary_model; $self->entry_add_to_model($model, %opts); return $model; } sub add_to_model { my ($self, $model, %opts) = @_; $self->parse_microformats; my %entry_opts = %opts; $entry_opts{'atomowl'} = 0; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; $self->entry_add_to_model($context->{'entryid'}, $model, %entry_opts); } if ($opts{'atomowl'}) { my $iter = $self->{'AWOL'}->graph->as_stream; while (my $st = $iter->next) { $model->add_statement($st); } } return $self; } sub entry_add_to_model { my ($self, $entry, $model, %opts) = @_; $self->parse_microformats; foreach my $context (@{$self->{'contexts'}}) { next unless $context->{'HMF'}; next unless $entry eq $context->{'entryid'}; my $iter = $context->{'HMF'}->model->as_stream; while (my $st = $iter->next) { $model->add_statement( RDF::Trine::Statement->new(($st->nodes)[0..2]), RDF::Trine::Node::Resource->new($entry), ); } } if ($opts{'atomowl'}) { my $iter = $self->{'AWOL'}->graph->as_stream; while (my $st = $iter->next) { $model->add_statement($st); } } return $self; } 1; __END__ =head1 NAME XML::Atom::Microformats - parse microformats in Atom content =head1 SYNOPSIS use XML::Atom::Microformats; my $feed = XML::Atom::Microformats -> new_feed( $xml, $base_uri ) -> assume_profile( qw(hCard hCalendar) ); print $feed->json(pretty => 1); my $results = RDF::Query -> new( $sparql ) -> execute( $feed->model ); =head1 DESCRIPTION The XML::Atom::Microformats module brings the functionality of L to Atom 1.0 Syndication feeds. It finds microformats embedded in the EcontentE elements (note: not EsummaryE) of Atom entries. The general pattern of usage is to create an XML::Atom::Microformats object (which corresponds to an Atom 1.0 feed) using the C method; tell it which types of Microformat you expect to find there; then ask for the data, as a Perl hashref, a JSON string, or an RDF::Trine model. =head2 Constructor =over =item C<< XML::Atom::Microformats->new_feed($xml, $base_url) >> Constructs a feed object. $xml is the Atom source (string) or an XML::LibXML::Document. $base_url is the feed URL, important for resolving relative URL references. =back =head2 Profile Management Methods HTML::Microformats uses HTML profiles (i.e. the profile attribute on the HTML element) to detect which Microformats are used on a page. Any microformats which do not have a profile URI declared will not be parsed. XML::Atom::Microformats uses a similar mechanism. Because Atom does not have a EheadE element, Atom ElinkE is used instead: These links can be used on a per-entry basis, or for the whole feed. Because many feeds fail to properly declare which profiles they use, there are various profile management methods to tell XML::Atom::Microformats to assume the presence of particular profile URIs, even if they're actually missing. =over =item C<< add_profile(@profiles) >> Using C you can add one or more profile URIs, and they are treated as if they were found on the document. For example: $feed->add_profile('http://microformats.org/profile/rel-tag') This is useful for adding profile URIs declared outside the document itself (e.g. in HTTP headers). =item C<< entry_add_profile($entryid, @profiles) >> C is a variant to allow you to add a profile which applies only to one specific entry within the feed, if you know that entry's ID. =item C<< assume_profile(@microformats) >> For example: $feed->assume_profile(qw(hCard adr geo)) This method acts similarly to C but allows you to use names of microformats rather than URIs. Microformat names are case sensitive, and must match HTML::Microformats::Format::Foo module names. =item C<< entry_assume_profile($entryid, @profiles) >> C is a variant to allow you to add a profile which applies only to one specific entry within the feed, if you know that entry's ID. =item C<< assume_all_profiles >> This method is equivalent to calling C for all known microformats. =item C<< entry_assume_all_profiles($entryid) >> This method is equivalent to calling C for all known microformats. =back =head2 Parsing Methods You can probably skip this section. The C, C and C methods will automatically do this for you. =over =item C<< parse_microformats >> Scans through the feed, finding microformat objects. On subsequent calls, does nothing (as everything is already parsed). =item C<< clear_microformats >> Forgets information gleaned by C and thus allows C to be run again. This is useful if you've added some profiles between runs of C. =back =head2 Data Retrieval Methods These methods allow you to retrieve the feed's data, and do things with it. =over =item C<< objects($format) >> $format is, for example, 'hCard', 'adr' or 'RelTag'. Returns a list of objects of that type. (If called in scalar context, returns an arrayref.) Each object is, for example, an HTML::Microformat::hCard object, or an HTML::Microformat::RelTag object, etc. See the relevent documentation for details. =item C<< entry_objects($entryid, $format) >> C is a variant to allow you to fetch data for one specific entry within the feed, if you know that entry's ID. =item C<< all_objects >> Returns a hashref of data. Each hashref key is the name of a microformat (e.g. 'hCard', 'RelTag', etc), and the values are arrayrefs of objects. Each object is, for example, an HTML::Microformat::hCard object, or an HTML::Microformat::RelTag object, etc. See the relevent documentation for details. =item C<< entry_all_objects($entryid) >> C is a variant to allow you to fetch data for one specific entry within the feed, if you know that entry's ID. =item C<< json(%opts) >> Returns data roughly equivalent to the C method, but as a JSON string. %opts is a hash of options, suitable for passing to the L module's to_json function. The 'convert_blessed' and 'utf8' options are enabled by default, but can be disabled by explicitly setting them to 0, e.g. print $feed->json( pretty=>1, canonical=>1, utf8=>0 ); =item C<< entry_json($entryid, %opts) >> C is a variant to allow you to fetch data for one specific entry within the feed, if you know that entry's ID. =item C<< model(%opts) >> Returns data as an RDF::Trine::Model, suitable for serialising as RDF or running SPARQL queries. Quads are used (rather than triples) which allows you to trace statements to the entries from which they came. $opts{'atomowl'} is a boolean indicating whether or not to include data from XML::Atom::OWL in the returned model. If enabled, this always includes AtomOWL data for the whole feed (not just for a specific entry), even if you use the C method. If RDF::RDFa::Parser 1.096 or above is installed, then $opts{'atomowl'} will automatically pull in DataRSS data too. =item C<< entry_model($entryid, %opts) >> C is a variant to allow you to fetch data for one specific entry within the feed, if you know that entry's ID. =item C<< add_to_model($model, %opts) >> Adds data to an existing RDF::Trine::Model. Otherwise, the same as C. =item C<< entry_add_to_model($entry, $model, %opts) >> Adds data to an existing RDF::Trine::Model. Otherwise, the same as C. =begin private =item C<< TO_JSON >> Provided for the benefit of L's convert_blessed feature. =end private =back =head1 BUGS Please report any bugs to L. =head1 SEE ALSO L, L, L, L. L, L. =head1 AUTHOR Toby Inkster Etobyink@cpan.orgE. =head1 COPYRIGHT Copyright 2010-2012 Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.