The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
 * 	Copyright (c) 2000-2006 All rights reserved
 * 	Alberto Reggiori <areggiori@webweaving.org>
 *	Dirk-Willem van Gulik <dirkx@webweaving.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by
 *        Alberto Reggiori <areggiori@webweaving.org> and
 *        Dirk-Willem van Gulik <dirkx@webweaving.org>."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 *
 * 5. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * 6. Products derived from this software may not be called "RDFStore"
 *    nor may "RDFStore" appear in their names without prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 * This software consists of work developed by Alberto Reggiori and
 * Dirk-Willem van Gulik. The RDF specific part is based on public
 * domain software written at the Stanford University Database Group by
 * Sergey Melnik. For more information on the RDF API Draft work,
 * please see <http://www-db.stanford.edu/~melnik/rdf/api.html>
 * The DBMS TCP/IP server part is based on software originally written
 * by Dirk-Willem van Gulik for Web Weaving Internet Engineering m/v Enschede,
 * The Netherlands.
 *
 * $Id: rdfstore_digest.c,v 1.12 2006/06/19 10:10:21 areggiori Exp $
 *
 */

#include <stdio.h>

#include "rdfstore_digest.h"
#include "rdfstore_log.h"
#include "rdfstore_serializer.h"

/*
#define RDFSTORE_DEBUG_DIGEST
*/

/*
 * DIGEST_PRINT(dd) - debugging aid.
 *
 * When RDFSTORE_DEBUG_DIGEST is defined it prints the
 * RDFSTORE_SHA_DIGESTSIZE bytes of dd to stdout as upper-case hex,
 * prefixed with the file and line of the call site; otherwise it expands
 * to an empty statement.
 *
 * Both variants are wrapped in do { ... } while (0) so that the macro
 * behaves like a single statement (e.g. inside an if/else without braces).
 * The argument is parenthesised and the loop variable carries a prefix so
 * that it cannot capture an identifier used in the argument expression.
 */
#ifdef RDFSTORE_DEBUG_DIGEST
#define DIGEST_PRINT(dd) do {\
		int             digest_print_i = 0;\
		printf("Statement digest at line %s:%d is '",__FILE__,__LINE__);\
		for (digest_print_i = 0; digest_print_i < RDFSTORE_SHA_DIGESTSIZE; digest_print_i++) {\
			printf("%02X", (dd)[digest_print_i]);\
		}\
		printf("'\n");\
	} while (0)
#else
#define DIGEST_PRINT(dd) do { } while (0)
#endif


/*
 * Compute the SHA-1 digest of a byte buffer.
 *
 * input  - bytes to hash; may be NULL only when len is 0
 * len    - number of bytes of input to hash; must not be negative
 * digest - output buffer of RDFSTORE_SHA_DIGESTSIZE bytes
 *
 * Returns 0 on success, -1 when the arguments are invalid (in which case
 * digest is left untouched).
 */
int
rdfstore_digest_digest(unsigned char *input, int len, unsigned char digest[RDFSTORE_SHA_DIGESTSIZE])
{
	RDFSTORE_AP_SHA1_CTX sha_info;

	/*
	 * refuse anything that would make the SHA-1 code read or write
	 * through a NULL pointer, or see a negative byte count
	 */
	if (digest == NULL || len < 0 || (input == NULL && len > 0))
		return -1;

	rdfstore_ap_SHA1Init(&sha_info);
	rdfstore_ap_SHA1Update(&sha_info, input, len);
	rdfstore_ap_SHA1Final(digest, &sha_info);

	DIGEST_PRINT(digest);
	return 0;
}

/*
 * Name of the digest algorithm used by this module.
 *
 * Returns a pointer to a constant, NUL terminated string; the caller must
 * neither modify nor free it.
 */
const char *
rdfstore_digest_get_digest_algorithm(void)
{
	return "SHA-1";
}

/* As it stands - this is a 32 bit (partial) hash - we are not using the full
 * 160 bots of a normal SHA1 operation
  */
static rdf_store_digest_t
rdfstore_digest_crc64(unsigned char * dd)
{
	if (dd == NULL)
		return 0;

	return (rdf_store_digest_t) htonl(*(uint32_t *) dd);
}

/*
 * Compute the SHA-1 digest of an RDF node.
 *
 * The bytes being hashed are:
 *
 *   - resource or bNode (node->type != 1): the identifier, followed by the
 *     single character '1' for a bNode (node->type == 2) or '0' otherwise;
 *
 *   - literal (node->type == 1): the string (possibly empty) wrapped in
 *     double quotes, followed - only when unique is non-zero - by the
 *     xml:lang value and then by either RDFSTORE_RDF_PARSETYPE_LITERAL
 *     (when parseType == 1) or the rdf:datatype value.
 *
 * The quotes make sure that the literal "http://www.google.com" and the
 * resource <http://www.google.com> get different digests.
 *
 * node   - the node to digest
 * dd     - output buffer of RDFSTORE_SHA_DIGESTSIZE bytes
 * unique - when non-zero, keep literal digests distinct per xml:lang and
 *          rdf:datatype
 *
 * Returns the status of rdfstore_digest_digest() (0 on success), or -1 when
 * node or dd is NULL, a resource has no identifier, a length is negative
 * or the working buffer cannot be allocated.
 */
int 
rdfstore_digest_get_node_digest(RDF_Node * node, unsigned char dd[RDFSTORE_SHA_DIGESTSIZE], int unique)
{
	unsigned char  *input = NULL;
	int             status = 0;
	int             len = 0;

	if (node == NULL || dd == NULL)
		return -1;

	if (node->type != 1) {
		if (node->value.resource.identifier == NULL)
			return -1;

#ifdef RDFSTORE_DEBUG_DIGEST
		printf("get_node_digest( RESOURCE '%s')\n", node->value.resource.identifier);
#endif

		len = node->value.resource.identifier_len;
		if (len < 0)
			return -1;	/* would otherwise reach malloc/memcpy as a huge size */

		/* one extra byte for the bNode flag appended below */
		input = (unsigned char *) RDFSTORE_MALLOC(
			sizeof(unsigned char) * (len + 1));

		if (input == NULL)
			return -1;

		memcpy(input, node->value.resource.identifier, len);

		/* '1' marks a bNode, '0' any other resource */
		input[len] = (node->type == 2) ? '1' : '0';
		len++;
	} else {
		int             len_lang = 0;
		int             len_dt = 0;
		int             pos = 0;

		/* literals can be empty i.e. node->value.literal.string can be NULL */

#ifdef RDFSTORE_DEBUG_DIGEST
		printf("get_node_digest( LITERAL '%s')\n", node->value.literal.string);
#endif

		len = (node->value.literal.string != NULL) ?  node->value.literal.string_len : 0;
		if (len < 0)
			return -1;	/* would otherwise reach malloc/memcpy as a huge size */

		if (unique) {
			if (node->value.literal.lang != NULL)
				len_lang = strlen(node->value.literal.lang);

			if (node->value.literal.parseType == 1)
				len_dt = strlen(RDFSTORE_RDF_PARSETYPE_LITERAL);
			else if (node->value.literal.dataType != NULL)
				len_dt = strlen(node->value.literal.dataType);
		}

		/* two extra bytes for the double quotes around the string */
		input = (unsigned char *) RDFSTORE_MALLOC(
			sizeof(unsigned char) * (len + len_lang + len_dt + 2));

		if (input == NULL)
			return -1;

		/* "string" */
		input[pos++] = '"';
		if (node->value.literal.string != NULL) {
			memcpy(input + pos, node->value.literal.string, len);
			pos += len;
		}
		input[pos++] = '"';

		/* keep the digest unique per xml:lang and rdf:datatype if requested */
		if (unique) {
			if (node->value.literal.lang != NULL)
				memcpy(input + pos, node->value.literal.lang, len_lang);
			pos += len_lang;	/* len_lang is 0 when there is no xml:lang */

			if (node->value.literal.parseType == 1)
				memcpy(input + pos, RDFSTORE_RDF_PARSETYPE_LITERAL, len_dt);
			else if (node->value.literal.dataType != NULL)
				memcpy(input + pos, node->value.literal.dataType, len_dt);
		}

		len += len_lang + len_dt + 2;
	}

	status = rdfstore_digest_digest(input, len, dd);

	RDFSTORE_FREE(input);

	return status;
}

/*
 * Hash code of an RDF node - see Stanford API Draft and GUID stuff.
 *
 * A non-zero node->hashcode is returned as is. Otherwise the SHA-1 digest
 * of the node is computed via rdfstore_digest_get_node_digest() and folded
 * with rdfstore_digest_crc64(). The result is only returned: this function
 * never stores it into node->hashcode.
 *
 * node   - the node to hash
 * unique - passed through to rdfstore_digest_get_node_digest()
 *
 * Returns the hash code, or 0 when node is NULL or the digest could not be
 * computed.
 */
rdf_store_digest_t
rdfstore_digest_get_node_hashCode( RDF_Node * node, int unique )
{
	unsigned char   sha[RDFSTORE_SHA_DIGESTSIZE];

	if (node == NULL)
		return 0;

#ifdef RDFSTORE_DEBUG_DIGEST
	if (node->hashcode != 0)
		printf("Node hashcode for '%s' already carried out '%d'\n", (node->type != 1) ? node->value.resource.identifier : node->value.literal.string, node->hashcode);
#endif

	/* already known: nothing to compute */
	if (node->hashcode != 0)
		return node->hashcode;

	if (rdfstore_digest_get_node_digest(node, sha, unique) != 0)
		return 0;

	return rdfstore_digest_crc64(sha);
}

/*
 * Compute the SHA-1 digest of an RDF statement.
 *
 * The statement digest is the SHA-1 of the concatenation of the node
 * digests of subject, predicate and object, followed by the node digest of
 * the context when there is one. All node digests are computed with the
 * "unique" flag set, i.e. literals are told apart by xml:lang and
 * rdf:datatype. When the object is a literal (type == 1) its digest is
 * rotated by one byte before being concatenated.
 *
 * statement     - the statement to digest
 * given_context - when not NULL it is used instead of statement->context
 * dd            - output buffer of RDFSTORE_SHA_DIGESTSIZE bytes
 *
 * Returns the status of rdfstore_digest_digest() (0 on success), or -1 when
 * statement or dd is NULL or one of the node digests cannot be computed.
 */
int 
rdfstore_digest_get_statement_digest(RDF_Statement * statement, RDF_Node * given_context, unsigned char dd[RDFSTORE_SHA_DIGESTSIZE])
{
	/*
	 * the input of the final SHA-1 has a small fixed maximum size, so it
	 * lives on the stack and each node digest is written straight into
	 * its slot
	 */
	unsigned char   input[RDFSTORE_SHA_DIGESTSIZE * 4];	/* s,p,o,c */
	unsigned char  *dds = input;
	unsigned char  *ddp = input + RDFSTORE_SHA_DIGESTSIZE;
	unsigned char  *ddo = input + (2 * RDFSTORE_SHA_DIGESTSIZE);
	unsigned char  *ddc = input + (3 * RDFSTORE_SHA_DIGESTSIZE);
	RDF_Node       *context = NULL;
	int             input_len = RDFSTORE_SHA_DIGESTSIZE * 3;	/* s,p,o */
	int             status = 0;

	if (statement == NULL || dd == NULL)
		return -1;

	/* a given context takes precedence over the one of the statement */
	context = (given_context != NULL) ? given_context : statement->context;

	if ((rdfstore_digest_get_node_digest(statement->subject, dds, 1)) != 0)
		return -1;

	DIGEST_PRINT(dds);

	if ((rdfstore_digest_get_node_digest(statement->predicate, ddp, 1)) != 0)
		return -1;

	DIGEST_PRINT(ddp);

	if ((rdfstore_digest_get_node_digest(statement->object, ddo, 1)) != 0) /* distinguish RDF literal hashcode by xml:lang or rdf:datatype */
		return -1;

	DIGEST_PRINT(ddo);

	if (context != NULL) {
		if ((rdfstore_digest_get_node_digest(context, ddc, 1)) != 0)
			return -1;

		DIGEST_PRINT(ddc);

		input_len = RDFSTORE_SHA_DIGESTSIZE * 4;	/* s,p,o,c */
	}

	/* statement->object is not NULL here: its node digest succeeded above */
	if (statement->object->type == 1) {
		/*
		 * rotate the digest of a literal object by one byte: every
		 * byte moves one position towards the start and the first
		 * byte goes to the end - see why at
		 * http://www-db.stanford.edu/~melnik/rdf/api.html#digest
		 */
		unsigned char   first = ddo[0];

		memmove(ddo, ddo + 1, RDFSTORE_SHA_DIGESTSIZE - 1);
		ddo[RDFSTORE_SHA_DIGESTSIZE - 1] = first;
	}

	status = rdfstore_digest_digest(input, input_len, dd);

	DIGEST_PRINT(dd);

	return status;
}

/*
 * Hash code of an RDF statement.
 *
 * A non-zero statement->hashcode is returned as is. Otherwise the
 * statement digest is computed via rdfstore_digest_get_statement_digest()
 * and folded with rdfstore_digest_crc64(). The result is only returned:
 * this function never stores it into statement->hashcode.
 *
 * statement     - the statement to hash
 * given_context - passed through to rdfstore_digest_get_statement_digest()
 *
 * Returns the hash code, or 0 when statement is NULL or the digest could
 * not be computed.
 *
 * NOTE(review): a non-zero statement->hashcode is returned even when
 * given_context is not NULL - confirm that callers rely on this.
 */
rdf_store_digest_t
rdfstore_digest_get_statement_hashCode(RDF_Statement * statement, RDF_Node * given_context)
{
	unsigned char   sha[RDFSTORE_SHA_DIGESTSIZE];
	rdf_store_digest_t code;

	if (statement == NULL)
		return 0;

#ifdef RDFSTORE_DEBUG_DIGEST
	if (statement->hashcode != 0) {
		char           *nt = rdfstore_ntriples_statement(statement, NULL);

		printf("Statement hashcode for '%s' already carried out '%d'\n", nt, statement->hashcode);
		RDFSTORE_FREE(nt);
	}
#endif

	/* already known: nothing to compute */
	if (statement->hashcode != 0)
		return statement->hashcode;

	/*
	 * perhaps it is instead => s.hashCode() * 7) + p.hashCode()) *
	 * 7 + o.hashCode() + c.hashCode()
	 */
	code = (rdfstore_digest_get_statement_digest(statement, given_context, sha) == 0)
		? rdfstore_digest_crc64(sha)
		: 0;

#ifdef RDFSTORE_DEBUG_DIGEST
	{
		char           *nt = rdfstore_ntriples_statement(statement, NULL);

		printf("Just computed statement hashcode for '%s' to '%d' %s\n", nt, code, (given_context != NULL) ? "(not to be cached)" : "");
		RDFSTORE_FREE(nt);
	}
#endif

	return code;
}