/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_xslt_parse.c - Raptor GRDDL XSLT Parser implementation * * Copyright (C) 2005-2006, David Beckett http://purl.org/net/dajobe/ * Copyright (C) 2005, University of Bristol, UK http://www.bristol.ac.uk/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * */ /* * W3C Gleaning Resource Descriptions from Dialects of Languages (GRDDL) * http://www.w3.org/2004/01/rdxh/spec * * See also * http://www.w3.org/2003/g/data-view * * * Looks for indication of GRDDL meaning intended in the XML (XHTML) * document source. * * 1. /html/head[@profile="http://www.w3.org/2003/g/data-view"] * 2. /html/head/link[@rel="transformation"] (may be repeated) * * Indicating that the sheet in the value of @href of #2 transforms * the document into RDF/XML and hence RDF triples. * * In example: * * * * ... * * * The may be repeated. */ #ifdef HAVE_CONFIG_H #include #endif #ifdef WIN32 #include #endif #include #include #include #include #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_STDLIB_H #include #endif /* Raptor includes */ #include "raptor.h" #include "raptor_internal.h" #include #include #include #include #include /* * libxslt API notes * * Inputs to an XSLT transformation process with libxslt are: * 1. A set of (key:value) parameters. * 2. An xsltStylesheetPtr for the XSLT sheet * Which could be made from a file or an xmlDoc; and the xmlDoc. * made from a file or memory buffer. * 3. An xmlDoc for the XML source * Which could be made from a file or a memory buffer. * */ /* * XSLT parser object */ struct raptor_xslt_parser_context_s { xmlSAXHandler sax; /* XML document ctxt */ xmlParserCtxtPtr ctxt; /* Create xpath evaluation context */ xmlXPathContextPtr xpathCtx; /* Evaluate xpath expression */ xmlXPathObjectPtr xpathObj; /* (RDF/XML) parser for dealing with the result */ raptor_parser* rdfxml; }; typedef struct raptor_xslt_parser_context_s raptor_xslt_parser_context; static int raptor_xslt_parse_init(raptor_parser* rdf_parser, const char *name) { raptor_xslt_parser_context *xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context; xslt_parser->rdfxml=raptor_new_parser("rdfxml"); if(!xslt_parser->rdfxml) { raptor_parser_error(rdf_parser, "Failed to create RDF/XML parser"); return 1; } return 0; } static void raptor_xslt_parse_terminate(raptor_parser *rdf_parser) { raptor_xslt_parser_context *xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context; if(xslt_parser->ctxt) { if(xslt_parser->ctxt->myDoc) { xmlFreeDoc(xslt_parser->ctxt->myDoc); xslt_parser->ctxt->myDoc=NULL; } xmlFreeParserCtxt(xslt_parser->ctxt); } if(xslt_parser->xpathCtx) xmlXPathFreeContext(xslt_parser->xpathCtx); if(xslt_parser->xpathObj) xmlXPathFreeObject(xslt_parser->xpathObj); if(xslt_parser->rdfxml) raptor_free_parser(xslt_parser->rdfxml); } static int raptor_xslt_parse_start(raptor_parser *rdf_parser) { raptor_xslt_parser_context* xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context; raptor_locator *locator=&rdf_parser->locator; raptor_parser *p=xslt_parser->rdfxml; locator->line=1; /* copy any user data to the internal parser */ raptor_parser_copy_user_state(p, rdf_parser); return 0; } static struct { const xmlChar* xpath; int is_value_list; const xmlChar* xslt_sheet_uri; } match_table[]={ /* XHTML document where the GRDDL profile is in * inside the html */ { (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/html:link[@rel=\"transformation\"]/@href", 0, NULL } , /* XHTML document where the GRDDL profile is in * inside the html */ { (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/../..//html:a[@rel=\"transformation\"]/@href", 0, NULL } , /* XML document linking to transform via attribute dataview:transformation * Example: http://www.w3.org/2004/01/rdxh/grddl-p3p-example **/ { (const xmlChar*)"//@dataview:transformation", 1, /* list of URIs */ NULL } , #if 0 /* FIXME Disabled. This returns the wrong namespaces in * Id: dc-extract.xsl,v 1.10 2005/09/07 17:10:06 connolly Exp */ /* Dublin Core in tags http://dublincore.org/documents/dcq-html/ */ { (const xmlChar*)"/html:html/html:head/html:link[@href=\"http://purl.org/dc/elements/1.1/\"]", 0, (const xmlChar*)"http://www.w3.org/2000/06/dc-extract/dc-extract.xsl" } , #endif /* Embedded RDF * inside */ { (const xmlChar*)"/html:html/html:head[contains(@profile,\"http://purl.org/NET/erdf/profile\")]", 0, (const xmlChar*)"http://purl.org/NET/erdf/extract-rdf.xsl" } , /* hCalendar microformat http://microformats.org/wiki/hcalendar */ { (const xmlChar*)"//*[@class=\"vevent\"]", 0, (const xmlChar*)"http://www.w3.org/2002/12/cal/glean-hcal.xsl" } , { NULL, 0, 0 } }; /* Run a GRDDL transform using a pre-parsed XSLT stylesheet already * formed into a libxml document (with URI) */ static int raptor_xslt_run_grddl_transform_doc(raptor_parser* rdf_parser, raptor_uri* xslt_uri, xmlDocPtr xslt_doc, xmlDocPtr doc) { raptor_xslt_parser_context* xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context; int ret=0; xsltStylesheetPtr sheet=NULL; xmlDocPtr res=NULL; xmlChar *doc_txt=NULL; int doc_txt_len=0; RAPTOR_DEBUG3("Running GRDDL transform with XSLT URI '%s' on doc URI '%s'\n", raptor_uri_as_string(xslt_uri), raptor_uri_as_string(rdf_parser->base_uri)); sheet = xsltParseStylesheetDoc(xslt_doc); if(!sheet) { raptor_parser_error(rdf_parser, "Failed to parse stylesheet in '%s'", raptor_uri_as_string(xslt_uri)); ret=1; goto cleanup_xslt; } res = xsltApplyStylesheet(sheet, doc, NULL); /* no params */ if(!res) { raptor_parser_error(rdf_parser, "Failed to apply stylesheet in '%s'", raptor_uri_as_string(xslt_uri)); ret=1; goto cleanup_xslt; } /* write the resulting XML to a string */ xsltSaveResultToString(&doc_txt, &doc_txt_len, res, sheet); if(!doc_txt || !doc_txt_len) { /* empty document - continue? FIXME */ raptor_parser_warning(rdf_parser, "XSLT returned an empty document"); } else { RAPTOR_DEBUG2("XSLT returned %d bytes RDF/XML document\n", doc_txt_len); /* generate the triples */ raptor_start_parse(xslt_parser->rdfxml, rdf_parser->base_uri); raptor_parse_chunk(xslt_parser->rdfxml, doc_txt, doc_txt_len, 1); } cleanup_xslt: if(doc_txt) xmlFree(doc_txt); if(res) xmlFreeDoc(res); if(sheet) xsltFreeStylesheet(sheet); return ret; } typedef struct { raptor_parser* rdf_parser; xmlParserCtxtPtr xc; } raptor_xslt_parse_bytes_context; static void raptor_xslt_uri_parse_bytes(raptor_www* www, void *userdata, const void *ptr, size_t size, size_t nmemb) { raptor_xslt_parse_bytes_context* pbc=(raptor_xslt_parse_bytes_context*)userdata; int len=size*nmemb; int rc=0; if(!pbc->xc) { xmlParserCtxtPtr xc; xc = xmlCreatePushParserCtxt(NULL, NULL, (const char*)ptr, len, (const char*)raptor_uri_as_string(www->uri)); if(!xc) rc=1; else { int libxml_options = 0; #ifdef XML_PARSE_NONET if(pbc->rdf_parser->features[RAPTOR_FEATURE_NO_NET]) libxml_options |= XML_PARSE_NONET; #endif #ifdef HAVE_XMLCTXTUSEOPTIONS xmlCtxtUseOptions(xc, libxml_options); #endif xc->replaceEntities = 1; xc->loadsubset = 1; } pbc->xc=xc; } else rc=xmlParseChunk(pbc->xc, (const char*)ptr, len, 0); if(rc) raptor_parser_error(pbc->rdf_parser, "Parsing failed"); } /* Run a GRDDL transform using a XSLT stylesheet at a given URI */ static int raptor_xslt_run_grddl_transform_uri(raptor_parser* rdf_parser, raptor_uri* xslt_uri, xmlDocPtr doc) { raptor_www *www=NULL; xmlParserCtxtPtr xslt_ctxt=NULL; raptor_xslt_parse_bytes_context pbc; int ret=0; RAPTOR_DEBUG2("Running GRDDL transform with XSLT URI '%s'\n", raptor_uri_as_string(xslt_uri)); /* make an xsltStylesheetPtr via the raptor_xslt_uri_parse_bytes * callback as bytes are returned */ pbc.xc=NULL; pbc.rdf_parser=rdf_parser; www=raptor_www_new(); if(rdf_parser->uri_filter) raptor_www_set_uri_filter(www, rdf_parser->uri_filter, rdf_parser->uri_filter_user_data); else if(rdf_parser->features[RAPTOR_FEATURE_NO_NET]) raptor_www_set_uri_filter(www, raptor_parse_uri_no_net_filter, rdf_parser); raptor_www_set_write_bytes_handler(www, raptor_xslt_uri_parse_bytes, &pbc); if(raptor_www_fetch(www, xslt_uri)) { ret=1; goto cleanup_xslt; } xslt_ctxt=pbc.xc; xmlParseChunk(pbc.xc, NULL, 0, 1); ret=raptor_xslt_run_grddl_transform_doc(rdf_parser, xslt_uri, xslt_ctxt->myDoc, doc); cleanup_xslt: if(xslt_ctxt) xmlFreeParserCtxt(xslt_ctxt); if(www) raptor_www_free(www); return ret; } static int raptor_xslt_parse_chunk(raptor_parser* rdf_parser, const unsigned char *s, size_t len, int is_end) { raptor_xslt_parser_context* xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context; int i; int ret=0; const unsigned char* uri_string; raptor_uri* uri; /* XML document DOM */ xmlDocPtr doc; xmlNodeSetPtr nodes; int expri; xmlChar *base_uri_string; raptor_uri* base_uri=NULL; if(!xslt_parser->ctxt) { uri_string=raptor_uri_as_string(rdf_parser->base_uri); /* first time, so init context with first read bytes */ xslt_parser->ctxt = xmlCreatePushParserCtxt(NULL, NULL, (const char*)s, len, (const char*)uri_string); if(!xslt_parser->ctxt) { raptor_parser_error(rdf_parser, "Failed to create XML parser"); return 1; } raptor_libxml_init_sax_error_handlers(&xslt_parser->sax); raptor_libxml_init_generic_error_handlers(rdf_parser); xslt_parser->ctxt->replaceEntities = 1; xslt_parser->ctxt->loadsubset = 1; if(is_end) xmlParseChunk(xslt_parser->ctxt, (const char*)s, 0, is_end); } else if((s && len) || is_end) xmlParseChunk(xslt_parser->ctxt, (const char*)s, len, is_end); if(!is_end) return 0; doc=xslt_parser->ctxt->myDoc; if(!doc) { raptor_parser_error(rdf_parser, "Failed to create XML DOM for document"); return 1; } /* Create the XPath evaluation context */ xslt_parser->xpathCtx=NULL; xslt_parser->xpathCtx = xmlXPathNewContext(doc); if(!xslt_parser->xpathCtx) { raptor_parser_error(rdf_parser, "Failed to create XPath context for document"); return 1; } xmlXPathRegisterNs(xslt_parser->xpathCtx, (const xmlChar*)"html", (const xmlChar*)"http://www.w3.org/1999/xhtml"); xmlXPathRegisterNs(xslt_parser->xpathCtx, (const xmlChar*)"dataview", (const xmlChar*)"http://www.w3.org/2003/g/data-view#"); base_uri=NULL; /* Try all XPaths */ for(expri=0; match_table[expri].xpath; expri++) { const xmlChar* xpathExpr=match_table[expri].xpath; /* Evaluate xpath expression */ xslt_parser->xpathObj = xmlXPathEvalExpression(xpathExpr, xslt_parser->xpathCtx); if(!xslt_parser->xpathObj) { raptor_parser_error(rdf_parser, "Unable to evaluate XPath expression \"%s\"", xpathExpr); return 1; } nodes=xslt_parser->xpathObj->nodesetval; if(!nodes || xmlXPathNodeSetIsEmpty(nodes)) { RAPTOR_DEBUG3("No match found with XPath expression \"%s\" over '%s'\n", xpathExpr, raptor_uri_as_string(rdf_parser->base_uri)); continue; } RAPTOR_DEBUG3("Found match with XPath expression \"%s\" over '%s'\n", xpathExpr, raptor_uri_as_string(rdf_parser->base_uri)); if(match_table[expri].xslt_sheet_uri) { /* Ignore what matched, use a hardcoded XSLT URI */ uri_string=match_table[expri].xslt_sheet_uri; base_uri=raptor_uri_copy(rdf_parser->base_uri); RAPTOR_DEBUG2("Using hard-coded XSLT URI '%s'\n", uri_string); uri=raptor_new_uri_relative_to_base(base_uri, uri_string); ret=raptor_xslt_run_grddl_transform_uri(rdf_parser, uri, doc); raptor_free_uri(uri); if(base_uri) raptor_free_uri(base_uri); } else { for(i=0; i < xmlXPathNodeSetGetLength(nodes); i++) { xmlNodePtr node=nodes->nodeTab[i]; if(node->type != XML_ATTRIBUTE_NODE) { raptor_parser_error(rdf_parser, "Got unexpected node type %d", node->type); continue; } /* returns base URI or NULL - must be freed with xmlFree() */ base_uri_string=xmlNodeGetBase(doc, node); uri_string=(const unsigned char*)node->children->content; if(base_uri_string) { base_uri=raptor_new_uri(base_uri_string); xmlFree(base_uri_string); RAPTOR_DEBUG2("Got XML base URI '%s'\n", raptor_uri_as_string(base_uri)); } else if(rdf_parser->base_uri) base_uri=raptor_uri_copy(rdf_parser->base_uri); else base_uri=NULL; if(match_table[expri].is_value_list) { char *start; char *end; char* buffer; size_t list_len=strlen((const char*)uri_string); buffer=(char*)RAPTOR_MALLOC(cstring, list_len+1); strncpy(buffer, (const char*)uri_string, list_len+1); for(start=end=buffer; end; start=end+1) { end=strchr(start, ' '); if(end) *end='\0'; RAPTOR_DEBUG2("Got list URI '%s'\n", start); uri=raptor_new_uri_relative_to_base(base_uri, (const unsigned char*)start); ret=raptor_xslt_run_grddl_transform_uri(rdf_parser, uri, doc); raptor_free_uri(uri); } RAPTOR_FREE(cstring, buffer); } else { uri=raptor_new_uri_relative_to_base(base_uri, uri_string); ret=raptor_xslt_run_grddl_transform_uri(rdf_parser, uri, doc); raptor_free_uri(uri); } if(base_uri) raptor_free_uri(base_uri); } } if(rdf_parser->failed || ret != 0) break; } /* end XPath expression loop */ if(rdf_parser->failed) return 1; return (ret != 0); } static int raptor_xslt_parse_recognise_syntax(raptor_parser_factory* factory, const unsigned char *buffer, size_t len, const unsigned char *identifier, const unsigned char *suffix, const char *mime_type) { int score= 0; if(suffix) { if(!strcmp((const char*)suffix, "xhtml")) score=7; if(!strcmp((const char*)suffix, "html")) score=2; } if(identifier) { if(strstr((const char*)identifier, "xhtml")) score+=5; } return score; } static void raptor_xslt_parser_register_factory(raptor_parser_factory *factory) { factory->context_length = sizeof(raptor_xslt_parser_context); factory->need_base_uri = 1; factory->init = raptor_xslt_parse_init; factory->terminate = raptor_xslt_parse_terminate; factory->start = raptor_xslt_parse_start; factory->chunk = raptor_xslt_parse_chunk; factory->recognise_syntax = raptor_xslt_parse_recognise_syntax; raptor_parser_factory_add_mime_type(factory, "text/html", 2); raptor_parser_factory_add_mime_type(factory, "application/html+xml", 2); } void raptor_init_parser_grddl(void) { raptor_parser_register_factory("grddl", "GRDDL over XHTML/XML using XSLT", &raptor_xslt_parser_register_factory); }