=head1 NAME HTML::Microformats::Utilities - utility functions for searching and manipulating HTML =head1 DESCRIPTION This module includes a few functions for searching and manipulating HTML trees. =cut package HTML::Microformats::Utilities; use base qw(Exporter); use common::sense; use utf8; use 5.010; use Object::AUTHORITY; BEGIN { $HTML::Microformats::Utilities::AUTHORITY = 'cpan:TOBYINK'; $HTML::Microformats::Utilities::VERSION = '0.104'; } our @EXPORT_OK; BEGIN { @EXPORT_OK = qw(searchClass searchAncestorClass searchRel searchRev searchID searchAncestorTag stringify xml_stringify); } use HTML::Microformats::Datatype::String; use XML::LibXML qw(:all); =over 4 =item C<< searchClass($class, $node, [$prefix]) >> Returns a list of elements which are descendents of $node and have class name $class. $class can be a plain string, or a regular expression. If $prefix is supplied it is used as an optional prefix for $class. For example, with $class 'bar' and $prefix 'foo', searchClass will look for all of the following classes: 'bar', 'foobar', 'foo-bar' and 'foo:bar'. =cut sub searchClass { my $target = shift; my $dom = shift; my $prefix = shift || undef; my @matches; return @matches unless $dom; foreach my $node ($dom->getElementsByTagName('*')) { my $classList; $classList = $node->getAttribute('class'); $classList = $node->getAttribute('name') if (!length $classList) && ($node->tagName eq 'param'); next unless length $classList; if ((defined $prefix) && $classList =~ / (^|\s) ($prefix [:\-]?)? $target (\s|$) /x) { push @matches, $node; } elsif ($classList =~ / (^|\s) $target (\s|$) /x) { push @matches, $node; } } return @matches; } =item C<< searchAncestorClass($class, $node, [$skip]) >> Returns the first element which is an ancestor of $node having class name $class. $class can be a plain string, or a regular expression. $skip is the number of levels of ancestor to skip. If $skip is 0, then potentially searchAncestorClass will return $node itself. If $skip is 1, then it will not return $node but could potentially return its parent, and so on. =cut sub searchAncestorClass { my $target = shift; my $dom = shift; my $skip = shift; return undef unless defined $dom; if (!defined $skip or $skip <= 0) { my $classList; $classList = $dom->getAttribute('class'); $classList = $dom->getAttribute('name') if (!length $classList and $dom->tagName eq 'param'); if ($classList =~ / (^|\s) $target (\s|$) /x) { return $dom; } } if (defined $dom->parentNode and $dom->parentNode->isa('XML::LibXML::Element')) { return searchAncestorClass($target, $dom->parentNode, $skip-1); } return undef; } =item C<< searchRel($relationship, $node) >> Returns a list of elements which are descendents of $node and have relationship $relationship. $relationship can be a plain string, or a regular expression. =cut sub searchRel { my $target = shift; my $dom = shift; $target =~ tr/[\:\.]/\[\:\.\]/ unless ref $target; my @matches = (); for my $node ($dom->getElementsByTagName('*')) { my $classList = $node->getAttribute('rel'); next unless length $classList; if ($classList =~ / (^|\s) $target (\s|$) /ix) { push @matches, $node; } } return @matches; } =item C<< searchRev($relationship, $node) >> As per searchRel, but uses the rev attribute. =cut sub searchRev { my $target = shift; my $dom = shift; $target =~ tr/[\:\.]/\[\:\.\]/ unless ref $target; my @matches = (); for my $node ($dom->getElementsByTagName('*')) { my $classList = $node->getAttribute('rev'); next unless length $classList; if ($classList =~ / (^|\s) $target (\s|$) /ix) { push @matches, $node; } } return @matches; } =item C<< searchID($id, $node) >> Returns a descendent of $node with id attribute $id, or undef. =cut sub searchID { my $target = shift; my $dom = shift; $target =~ s/^\#//; for my $node ($dom->getElementsByTagName('*')) { my $id = $node->getAttribute('id') || next; return $node if $id eq $target; } } =item C<< searchAncestorTag($tagname, $node) >> Returns the nearest ancestor of $node with tag name $tagname, or undef. =cut sub searchAncestorTag { my ($target, $node) = @_; return $node if $node->localname =~ /^ $target $/ix; return searchAncestorTag($target, $node->parentNode) if defined $node->parentNode && $node->parentNode->nodeType == XML_ELEMENT_NODE; return undef; } =item C<< stringify($node, \%options) >> Returns a stringified version of a DOM element. This is conceptually equivalent to C<< $node->textContent >>, but follows microformat-specific stringification rules, including value excerption, the abbr pattern and so on. =cut # This function takes on too much responsibility. # It should delegate stuff. sub stringify { my $dom = shift; my $valueClass = shift || undef; my $doABBR = shift || (length $valueClass); my $str; my %opts; if (ref($valueClass) eq 'HASH') { %opts = %$valueClass; $valueClass = $opts{'excerpt-class'}; $doABBR = $opts{'abbr-pattern'}; } return unless $dom; # value-title if ($opts{'value-title'} =~ /(allow|require)/i or ($opts{'datetime'} && $opts{'value-title'} !~ /(forbid)/i)) { KIDDY: foreach my $kid ($dom->childNodes) { next if $kid->nodeName eq '#text' && $kid->textContent !~ /\S/; # skip whitespace last # anything without class='value-title' and a title attribute causes us to bail out. unless $opts{'value-title'} =~ /(lax)/i || ($kid->can('hasAttribute') && $kid->hasAttribute('class') && $kid->hasAttribute('title') && $kid->getAttribute('class') =~ /\b(value\-title)\b/); my $str = $kid->getAttribute('title'); utf8::encode($str); return HTML::Microformats::Datatype::String::ms($str, $kid); } } return if $opts{'value-title'} =~ /(require)/i; # ABBR pattern if ($doABBR) { if ($dom->nodeType==XML_ELEMENT_NODE && length $dom->getAttribute('data-cpan-html-microformats-content')) { my $title = $dom->getAttribute('data-cpan-html-microformats-content'); return HTML::Microformats::Datatype::String::ms($title, $dom); } elsif ( ($dom->nodeType==XML_ELEMENT_NODE && $dom->tagName eq 'abbr' && $dom->hasAttribute('title')) || ($dom->nodeType==XML_ELEMENT_NODE && $dom->tagName eq 'acronym' && $dom->hasAttribute('title')) || ($dom->nodeType==XML_ELEMENT_NODE && $dom->getAttribute('title') =~ /data\:/) ) { my $title = $dom->getAttribute('title'); utf8::encode($title); if ($title =~ / [\(\[\{] data\: (.*) [\)\]\}] /x || $title =~ / data\: (.*) $ /x ) { $title = $1; } if (defined $title) { return (ms $title, $dom); } } elsif ($dom->nodeType==XML_ELEMENT_NODE && $opts{'datetime'} && $dom->hasAttribute('datetime')) { my $str = $dom->getAttribute('datetime'); utf8::encode($str); return HTML::Microformats::Datatype::String::ms($str, $dom); } } # Value excerpting. if (length $valueClass) { my @nodes = searchClass($valueClass, $dom); my @strs; if (@nodes) { foreach my $valueNode (@nodes) { push @strs, stringify($valueNode, { 'excerpt-class' => undef, 'abbr-pattern' => $doABBR, 'datetime' => $opts{'datetime'}, 'keep-whitespace' => 1 }); } # In datetime mode, be smart enough to detect when date, time and # timezone have been given in wrong order. if ($opts{'datetime'}) { my $dt_things = {}; foreach my $x (@strs) { if ($x =~ /^\s*(Z|[+-]\d{1,2}(\:?\d\d)?)\s*$/i) { push @{$dt_things->{'z'}}, $1; } elsif ($x =~ /^\s*T?([\d\.\:]+)\s*$/i) { push @{$dt_things->{'t'}}, $1; } elsif ($x =~ /^\s*([\d-]+)\s*$/i) { push @{$dt_things->{'d'}}, $1; } elsif ($x =~ /^\s*T?([\d\.\:]+)\s*(Z|[+-]\d{1,2}(\:?\d\d)?)\s*$/i) { push @{$dt_things->{'t'}}, $1; push @{$dt_things->{'z'}}, $2; } elsif ($x =~ /^\s*(\d+)(?:[:\.](\d+))?(?:[:\.](\d+))?\s*([ap])\.?\s*[m]\.?\s*$/i) { my $h = $1; if (uc $4 eq 'P' && $h<12) { $h += 12; } elsif (uc $4 eq 'A' && $h==12) { $h = 0; } my $t = (defined $3) ? sprintf("%02d:%02d:%02d", $h, $2, $3) : sprintf("%02d:%02d", $h, $2); push @{$dt_things->{'t'}}, $t; } } if (defined $opts{'datetime-feedthrough'} && !defined $dt_things->{'d'}->[0]) { push @{ $dt_things->{'d'} }, $opts{'datetime-feedthrough'}->ymd('-'); } if (defined $opts{'datetime-feedthrough'} && !defined $dt_things->{'z'}->[0]) { push @{ $dt_things->{'z'} }, $opts{'datetime-feedthrough'}->strftime('%z'); } $str = sprintf("%s %s %s", $dt_things->{'d'}->[0], $dt_things->{'t'}->[0], $dt_things->{'z'}->[0]); } unless (length $str) { $str = HTML::Microformats::Datatype::String::ms((join $opts{'joiner'}, @strs), $dom); } } } my $inpre = searchAncestorTag('pre', $dom) ? 1 : 0; eval { $str = _stringify_helper($dom, $inpre, 0) unless defined $str; }; #$str = '***UTF-8 ERROR (WTF Happened?)***' if $@; #$str = '***UTF-8 ERROR (Not UTF-8)***' unless utf8::is_utf8("$str"); #$str = '***UTF-8 ERROR (Bad UTF-8)***' unless utf8::valid("$str"); if ($opts{'datetime'} && defined $opts{'datetime-feedthrough'}) { if ($str =~ /^\s*T?([\d\.\:]+)\s*$/i) { $str = sprintf('%s %s %s', $opts{'datetime-feedthrough'}->ymd('-'), $1, $opts{'datetime-feedthrough'}->strftime('%z'), ); } elsif ($str =~ /^\s*T?([\d\.\:]+)\s*(Z|[+-]\d{1,2}(\:?\d\d)?)\s*$/i) { $str = sprintf('%s %s %s', $opts{'datetime-feedthrough'}->ymd('-'), $1, $2, ); } elsif ($str =~ /^\s*([\d]+)(?:[:\.](\d+))(?:[:\.](\d+))?\s*([ap])\.?\s*[m]\.?\s*$/i) { my $h = $1; if (uc $4 eq 'P' && $h<12) { $h += 12; } elsif (uc $4 eq 'A' && $h==12) { $h = 0; } my $t = (defined $3) ? sprintf("%02d:%02d:%02d", $h, $2, $3) : sprintf("%02d:%02d", $h, $2); $str = sprintf('%s %s %s', $opts{'datetime-feedthrough'}->ymd('-'), $t, $opts{'datetime-feedthrough'}->strftime('%z'), ); } } unless ($opts{'keep-whitespace'}) { # \x1D is used as a "soft" line break. It can be "absorbed" into an adjacent # "hard" line break. $str =~ s/\x1D+/\x1D/g; $str =~ s/\x1D\n/\n/gs; $str =~ s/\n\x1D/\n/gs; $str =~ s/\x1D/\n/gs; $str =~ s/(^\s+|\s+$)//gs; } return HTML::Microformats::Datatype::String::ms($str, $dom); } sub _stringify_helper { my $domNode = shift || return; my $inPRE = shift || 0; my $indent = shift || 0; my $rv = ''; my $tag; if ($domNode->nodeType == XML_ELEMENT_NODE) { $tag = lc($domNode->tagName); } elsif ($domNode->nodeType == XML_COMMENT_NODE) { return HTML::Microformats::Datatype::String::ms(''); } # Change behaviour within
.
	$inPRE++ if $tag eq 'pre';
	
	# Text node, or equivalent.
	if (!$tag || $tag eq 'img' || $tag eq 'input' || $tag eq 'param')
	{
		$rv = $domNode->getData
			unless $tag;
		$rv = $domNode->getAttribute('alt')
			if $tag && $domNode->hasAttribute('alt');
		$rv = $domNode->getAttribute('value')
			if $tag && $domNode->hasAttribute('value');

		utf8::encode($rv);

		unless ($inPRE)
		{
			$rv =~ s/[\s\r\n]+/ /gs;
		}
		
		return $rv;
	}
	
	# Breaks.
	return "\n" if ($tag eq 'br');
	return "\x1D\n====\n\n"
		if ($tag eq 'hr');
	
	# Deleted text.
	return '' if ($tag eq 'del');

	# Get stringified children.
	my (@parts, @ctags, @cdoms);
	my $extra = 0;
	if ($tag =~ /^([oud]l|blockquote)$/)
	{
		$extra += 6; # Advisory for word wrapping.
	}
	foreach my $child ($domNode->getChildNodes)
	{
		my $ctag = $child->nodeType==XML_ELEMENT_NODE ? lc($child->tagName) : undef;
		my $str  = _stringify_helper($child, $inPRE, $indent + $extra);
		push @ctags, $ctag;
		push @parts, $str;
		push @cdoms, $child;
	}
	
	if ($tag eq 'ul' || $tag eq 'dir' || $tag eq 'menu')
	{
		$rv .= "\x1D";
		my $type = lc($domNode->getAttribute('type')) || 'disc';

		for (my $i=0; defined $parts[$i]; $i++)
		{
			next unless ($ctags[$i] eq 'li');
			
			$_ = $parts[$i];
			s/(^\x1D|\x1D$)//g;
			s/\x1D+/\x1D/g;
			s/\x1D\n/\n/gs;
			s/\n\x1D/\n/gs;
			s/\x1D/\n/gs;
			s/\n/\n    /gs;

			my $marker_type = $type;
			$marker_type = lc($cdoms[$i]->getAttribute('type'))
				if (length $cdoms[$i]->getAttribute('type'));

			my $marker = '*';
			if ($marker_type eq 'circle')    { $marker = '-'; }
			elsif ($marker_type eq 'square') { $marker = '+'; }
			
			$rv .= "  $marker $_\n";
		}
		$rv .= "\n";
	}
	
	elsif ($tag eq 'ol')
	{
		$rv .= "\x1D";
		
		my $count = 1;
		$count = $domNode->getAttribute('start')
			if (length $domNode->getAttribute('start'));
		my $type = $domNode->getAttribute('type') || '1';
		
		for (my $i=0; defined $parts[$i]; $i++)
		{
			next unless ($ctags[$i] eq 'li');
			
			$_ = $parts[$i];
			s/(^\x1D|\x1D$)//g;
			s/\x1D+/\x1D/g;
			s/\x1D\n/\n/gs;
			s/\n\x1D/\n/gs;
			s/\x1D/\n/gs;
			s/\n/\n    /gs;
			
			my $marker_value = $count;
			$marker_value = $cdoms[$i]->getAttribute('value')
				if (length $cdoms[$i]->getAttribute('value'));
			
			my $marker_type = $type;
			$marker_type = $cdoms[$i]->getAttribute('type')
				if (length $cdoms[$i]->getAttribute('type'));
				
			my $marker = sprintf('% 2d', $marker_value);
			if (uc($marker_type) eq 'A' && $marker_value > 0 && $marker_value <= 26)
				{ $marker = ' ' . chr( ord($marker_type) + $marker_value - 1 ); }
			elsif ($marker_type eq 'i' && $marker_value > 0 && $marker_value <= 3999)
				{ $marker = sprintf('% 2s', roman($marker_value)); }
			elsif ($marker_type eq 'I' && $marker_value > 0 && $marker_value <= 3999)
				{ $marker = sprintf('% 2s', Roman($marker_value)); }
				
			$rv .= sprintf("\%s. \%s\n", $marker, $_);

			$count++;
		}
		$rv .= "\n";
	}

	elsif ($tag eq 'dl')
	{
		$rv .= "\x1D";
		for (my $i=0; defined $parts[$i]; $i++)
		{
			next unless ($ctags[$i] eq 'dt' || $ctags[$i] eq 'dd');
			
			if ($ctags[$i] eq 'dt')
			{
				$rv .= $parts[$i] . ':';
				$rv =~ s/\:\s*\:$/\:/;
				$rv .= "\n";
			}
			elsif ($ctags[$i] eq 'dd')
			{
				$_ = $parts[$i];
				s/(^\x1D|\x1D$)//g;
				s/\x1D+/\x1D/g;
				s/\x1D\n/\n/gs;
				s/\n\x1D/\n/gs;
				s/\x1D/\n/gs;
				s/\n/\n    /gs;
				$rv .= sprintf("    \%s\n\n", $_);
			}
		}
	}

	elsif ($tag eq 'blockquote')
	{
		$rv .= "\x1D";
		for (my $i=0; defined $parts[$i]; $i++)
		{
			next unless ($ctags[$i]);
			
			$_ = $parts[$i];
			s/(^\x1D|\x1D$)//g;
			s/\x1D+/\x1D/g;
			s/\x1D\n/\n/gs;
			s/\n\x1D/\n/gs;
			s/\x1D/\n/gs;
			s/\n\n/\n/;
			s/\n/\n> /gs;
			$rv .= "> $_\n";
		}
		$rv =~ s/> $/\x1D/;
	}
	
	else
	{
		$rv = '';
		for (my $i=0; defined $parts[$i]; $i++)
		{
			$rv .= $parts[$i];
			
			# Hopefully this is a sensible algorithm for inserting whitespace
			# between childnodes. Needs a bit more testing though.
			
			# Don't insert whitespace if this tag or the next one is a block-level element.
			# Probably need to expand this list of block elements.
#			next if ($ctags[$i]   =~ /^(p|h[1-9]?|div|center|address|li|dd|dt|tr|caption|table)$/);
#			next if ($ctags[$i+1] =~ /^(p|h[1-9]?|div|center|address|li|dd|dt|tr|caption|table)$/);
			
			# Insert whitespace unless the string already ends in whitespace, or next
			# one begins with whitespace.
#			$rv .= ' '
#				unless ($rv =~ /\s$/ || (defined $parts[$i+1] && $parts[$i+1] =~ /^\s/));
		}
		
		if ($tag =~ /^(p|h[1-9]?|div|center|address|li|dd|dt|tr|caption|table)$/ && !$inPRE)
		{
			$rv =~ s/^[\t ]//s;
			#local($Text::Wrap::columns);
			#$Text::Wrap::columns = 78 - $indent;
			$rv = "\x1D".$rv;#Text::Wrap::wrap('','',$rv);
			if ($tag =~ /^(p|h[1-9]?|address)$/)
			{
				$rv .= "\n\n";
			}
		}
		
		if ($tag eq 'sub')
			{ $rv = "($rv)"; }
		elsif ($tag eq 'sup')
			{ $rv = "[$rv]"; }
		elsif ($tag eq 'q')
			{ $rv = "\"$rv\""; }
		elsif ($tag eq 'th' || $tag eq 'td')
			{ $rv = "$rv\t"; }
	}

	return $rv;
}

=item C<< xml_stringify($node) >>

Returns an XML serialisation of a DOM element. This is conceptually equivalent
to C<< $node->toStringEC14N >>, but hides certain attributes which
HTML::Microformats::DocumentContext adds for internal processing.

=cut

sub xml_stringify
{
	my $node  = shift;
	my $clone = $node->cloneNode(1);
	
	foreach my $attr ($clone->attributes)
	{
		if ($attr->nodeName =~ /^data-cpan-html-microformats-/)
		{
			$clone->removeAttribute($attr->nodeName);
		}
	}
	foreach my $kid ($clone->getElementsByTagName('*'))
	{
		foreach my $attr ($kid->attributes)
		{
			if ($attr->nodeName =~ /^data-cpan-html-microformats-/)
			{
				$kid->removeAttribute($attr->nodeName);
			}
		}
	}
	
	$node->ownerDocument->documentElement->appendChild($clone);
	my $rv = $clone->toStringEC14N;
	$node->ownerDocument->documentElement->removeChild($clone);
	return $rv;
}

1;

__END__

=back

=head1 BUGS

Please report any bugs to L.

=head1 SEE ALSO

L.

=head1 AUTHOR

Toby Inkster Etobyink@cpan.orgE.

=head1 COPYRIGHT

Copyright 2008-2011 Toby Inkster

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 DISCLAIMER OF WARRANTIES

THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.


=cut