The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use common::sense;
use constant XHTML_NS => 'http://www.w3.org/1999/xhtml';
use CGI qw'';
use CGI::Carp qw'fatalsToBrowser';
use Data::Dumper qw'';
use Digest::SHA1 qw'sha1_hex';
use File::Slurp qw'slurp';
use HTML::HTML5::Writer qw(DOCTYPE_HTML5 DOCTYPE_XHTML_RDFA);
use HTTP::Cache::Transparent (BasePath=>'/tmp/cache/');
use HTTP::Negotiate qw'choose';
use JSON -convert_blessed_universally;
use LWPx::ParanoidAgent;
use PHP::Serialization qw'';
use RDF::RDFa::Generator;
use RDF::RDFa::Linter;
use RDF::RDFa::Linter::Error;
use RDF::RDFa::Parser;
use URI qw'';
use WWW::RobotRules;
use XML::LibXML qw':all';
use YAML::Any qw'';

my $CGI = CGI->new;
$CGI::POST_MAX = 1024*1024*2;

my @services = qw(Facebook Google CreativeCommons);

my $url = $CGI->param('url') || $CGI->param('uri') || shift @ARGV;
my $content;
my $source;

if ($CGI->param('source') =~ /upload/i)
{
	$source = 'uploaded file';
	my $fh = $CGI->upload('file');
	$content = do { local $/; <$fh>; };
	$url = 'widget://'.sha1_hex($content).'.rdfa/self'
		unless length $url;
}
elsif ($CGI->param('source') =~ /post/i)
{
	$source = 'posted data';
	$content = $CGI->param('data');
	$url = 'widget://'.sha1_hex($content).'.rdfa/self'
		unless length $url;
}
elsif ($url =~ /^referr?er$/i && length $CGI->referer)
{
	$url = $CGI->referer;
	$source = "referer <$url>";
}
elsif (length $url)
{
	$source = "<$url>";
}
else
{
	die "Must provide a URL!";
}

# Set up primary RDFa parser
my $ua = LWPx::ParanoidAgent->new(
	agent => sprintf("check.rdfa/0.0.1 (+http://check.rdfa.info/) RDF-RDFa-Parser/%s RDF-RDFa-Linter/%s RDF-RDFa-Generator/%s ",
		$RDF::RDFa::Parser::VERSION, $RDF::RDFa::Linter::VERSION, $RDF::RDFa::Generator::VERSION));
my $rdfa_parser;
my $opts = RDF::RDFa::Parser::Config->new(
	$CGI->param('language')||'html5',
	$CGI->param('version')||'1.0',
	lwp_ua => $ua,
	);
if (defined $content)
{
	$rdfa_parser = RDF::RDFa::Parser->new($content, $url, $opts);
}
else
{
	my $robots_txt_url = URI->new_abs('/robots.txt', $url);
	my $robots_txt     = $ua->get($robots_txt_url);
	my $rules          = WWW::RobotRules->new('check.rdfa');
	$rules->parse($robots_txt_url, $robots_txt->decoded_content)
		if $robots_txt->is_success;
	
	die "Forbidden by robots.txt."
		unless $rules->allowed($url);
	
	$rdfa_parser = RDF::RDFa::Parser->new_from_url($url, $opts);
}

my @main_errs;
$rdfa_parser->set_callbacks({oncurie => \&main_cb_oncurie});

my $var = $CGI->param('format') || choose([
	[ 'html',  1.000, 'text/html'],
	[ 'xhtml', 0.900, 'application/xhtml+xml'],
	[ 'json',  0.500, 'application/json'],
	[ 'yaml',  0.100, 'text/x-yaml'],
	[ 'pl',    0.100, 'text/x-perl'],
	[ 'php',   0.100, 'application/vnd.php.serialized'],
	]) || 'html';
$var = lc $var;

if ($var eq 'json' or $var eq 'yaml' or $var eq 'pl' or $var eq 'php')
{
	my $data = {};
	
	$data->{'RDFa'}->{'Data'}   = $rdfa_parser->graph->as_hashref;
	$data->{'RDFa'}->{'Errors'} = \@main_errs;

	foreach my $srv (@services)
	{
		my $this_parser = RDF::RDFa::Parser->new($rdfa_parser->dom, $rdfa_parser->uri, $opts);
		my $linter      = RDF::RDFa::Linter->new($srv, $url, $this_parser);
		
		if ($linter->filtered_graph->count_statements)
		{
			$data->{$srv}->{'Info'}   = $linter->info;
			$data->{$srv}->{'Data'}   = $linter->filtered_graph->as_hashref;
			$data->{$srv}->{'Errors'} = [ $linter->find_errors ];
		}
		else
		{
			$data->{$srv} = undef;
		}
	}
	
	if ($var eq 'json')
	{
		print $CGI->header('application/json')
			if defined $CGI->request_method;
		print JSON->new->utf8->convert_blessed->encode($data);
	}
	elsif ($var eq 'yaml')
	{
		print $CGI->header('text/x-yaml')
			if defined $CGI->request_method;
		print YAML::Any::Dump($data);
	}
	elsif ($var eq 'pl')
	{
		print $CGI->header('text/x-perl')
			if defined $CGI->request_method;
		print Data::Dumper::Dumper($data);
	}
	elsif ($var eq 'php')
	{
		print $CGI->header('application/vnd.php.serialized')
			if defined $CGI->request_method;
		print PHP::Serialization::serialize($data);
	}
	exit;	
}
else
{
	my $template = slurp('linter-template.xml');
	my $dom = XML::LibXML->new->parse_string($template);
	my $xpc = XML::LibXML::XPathContext->new($dom);
	$xpc->registerNs('x', XHTML_NS);
	my $gen = RDF::RDFa::Generator->new(style=>'HTML::Pretty', safe_xml_literals=>1);

	# Title
	my @title = $dom->getElementsByTagName('title');
	$title[0]->appendTextNode("check.rdfa: $url");

	# Header
	my @head = $xpc->findnodes('//x:*[@class="head"]');
	$head[0]->addNewChild(XHTML_NS, 'h1')->appendWellBalancedChunk('check<span class="space"> </span><span class="r">rdfa</span>');

	# Summary
	my @summary = $xpc->findnodes('//x:*[@class="summary"]');
	$summary[0]->addNewChild(XHTML_NS, 'p')->appendTextNode("results for $source");

	# Main tab
	my $main_tab = _add_tab($xpc, 'RDFa', undef, 0, 'All Data');
	$main_tab->addNewChild(XHTML_NS, 'p')->appendTextNode("This tab shows all RDFa data extracted from your page; the other tabs filter this data down to show what particular services will see.");
	foreach my $node ($gen->nodes($rdfa_parser->graph, notes=>\@main_errs))
	{
		$node->setAttribute('class', $node->getAttribute('class').' rdfa');
		$main_tab->appendChild($node);
	}

	# Service tabs
	foreach my $srv (@services)
	{
		my $this_parser = RDF::RDFa::Parser->new($rdfa_parser->dom, $rdfa_parser->uri, $opts);
		my $linter      = RDF::RDFa::Linter->new($srv, $url, $this_parser);
		
		my $this_tab    = _add_tab($xpc, $linter->info->{'short'}, undef, 0, $linter->info->{'title'});	
		$this_tab->addNewChild(XHTML_NS, 'p')->appendTextNode($linter->info->{'description'});
		
		if ($linter->filtered_graph->count_statements)
		{
			foreach my $node ($gen->nodes($linter->filtered_graph, notes=>[$linter->find_errors]))
			{
				$node->setAttribute('class', $node->getAttribute('class').' rdfa');
				$this_tab->appendChild($node);
			}
		}
		else
		{
			$this_tab->addNewChild(XHTML_NS, 'p')->addNewChild(XHTML_NS, 'strong')->appendTextNode("No data found by this service.");
		}
	}

	# Output
	my $doctype = {
		html  => DOCTYPE_HTML5,
		xhtml => DOCTYPE_XHTML_RDFA,
		};
	$dom->documentElement->removeAttributeNS(undef, 'lang')
		if $var eq 'xhtml';
	print $CGI->header(($var eq 'html' ? 'text/html' : 'application/xhtml+xml')."; charset=utf-8")
		if defined $CGI->request_method;
	print HTML::HTML5::Writer->new(charset=>'ascii',markup=>$var,doctype=>$doctype->{$var})->document($dom);
	exit;
}

sub _xpath_has_class
{
	my ($nodelist, $class) = @_;
	my $result = XML::LibXML::NodeList->new;
	for my $node ($nodelist->get_nodelist)
	{
		next unless $node->nodeType eq XML_ELEMENT_NODE;
		next unless $node->hasAttribute('class');
		$result->push($node) if $node->getAttribute('class') =~ /\b($class)\b/;
	}
	return $result;
}

sub _add_tab
{
	my ($xpc, $title, $id, $index, $long) = @_;
	
	($id = 'tab-'.lc $title) =~ s/[^a-z0-9-]//i
		unless defined $id;
	
	$index = 0 unless defined $index;
		
	my @containers = $xpc->findnodes('//x:*[@class="tabs"]');
	my $tab = $containers[$index]->addNewChild(XHTML_NS, 'div');
	$tab->setAttribute('id', $id);	
	$tab->addNewChild(XHTML_NS, 'h2')->appendTextNode($long||$title);
	
	my @menus = $xpc->findnodes('//x:*[@class="tabNavigation"]');
	my $item = $menus[$index]->addNewChild(XHTML_NS, 'li');
	my $a = $item->addNewChild(XHTML_NS, 'a');
	$a->setAttribute('href', '#'.$id);
	$a->appendTextNode($title);
	
	return $tab;
}

sub main_cb_oncurie
{
	my ($parser, $node, $curie, $uri) = @_;

	return $uri unless $curie eq $uri || $uri eq '';

	my $preferred = {
		bibo => 'http://purl.org/ontology/bibo/' ,
		cc => 'http://creativecommons.org/ns#' ,
		ctag => 'http://commontag.org/ns#' ,
		dbp => 'http://dbpedia.org/property/' ,
		dc => 'http://purl.org/dc/terms/' ,
		doap => 'http://usefulinc.com/ns/doap#' ,
		fb => 'http://developers.facebook.com/schema/' ,
		foaf => 'http://xmlns.com/foaf/0.1/' ,
		geo => 'http://www.w3.org/2003/01/geo/wgs84_pos#' ,
		gr => 'http://purl.org/goodrelations/v1#' ,
		ical => 'http://www.w3.org/2002/12/cal/ical#' ,
		og => 'http://opengraphprotocol.org/schema/' ,
		owl => 'http://www.w3.org/2002/07/owl#' ,
		rdf => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' ,
		rdfa => 'http://www.w3.org/ns/rdfa#' ,
		rdfs => 'http://www.w3.org/2000/01/rdf-schema#' ,
		rel => 'http://purl.org/vocab/relationship/' ,
		rev => 'http://purl.org/stuff/rev#' ,
		rss => 'http://purl.org/rss/1.0/' ,
		sioc => 'http://rdfs.org/sioc/ns#' ,
		skos => 'http://www.w3.org/2004/02/skos/core#' ,
		v => 'http://rdf.data-vocabulary.org/#' ,
		vann => 'http://purl.org/vocab/vann/' ,
		vcard => 'http://www.w3.org/2006/vcard/ns#' ,
		void => 'http://rdfs.org/ns/void#' ,
		xfn => 'http://vocab.sindice.com/xfn#' ,
		xhv => 'http://www.w3.org/1999/xhtml/vocab#' ,
		xsd => 'http://www.w3.org/2001/XMLSchema#' ,
		};
	
	if ($curie =~ m/^([^:]+):(.*)$/)
	{
		my ($pfx, $sfx) = ($1, $2);
		
		if (defined $preferred->{$pfx})
		{
			push @main_errs,
				RDF::RDFa::Linter::Error->new(
					'subject' => RDF::Trine::Node::Resource->new($url),
					'text'    => "CURIE '$curie' used but '$pfx' is not bound - perhaps you forgot to specify xmlns:${pfx}=\"".$preferred->{$pfx}."\"",
					'level'   => 5,
					);
		}
		elsif ($pfx !~ m'^(http|https|file|ftp|urn|tag|mailto|acct|data|
			fax|tel|modem|gopher|info|news|sip|irc|javascript|sgn|ssh|xri|widget)$'ix)
		{
			push @main_errs,
				RDF::RDFa::Linter::Error->new(
					'subject' => RDF::Trine::Node::Resource->new($url),
					'text'    => "CURIE '$curie' used but '$pfx' is not bound - perhaps you forgot to specify xmlns:${pfx}=\"SOMETHING\"",
					'level'   => 1,
					);
		}
	}

	return $uri;
}