scossu
/
lakesuperior_legacy


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
							from rdflib import URIRef, BNode, Literal

#from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport uint64_t
from libc.stdlib cimport malloc, free
from libc.string cimport memcpy

#from lakesuperior.cy_include.cyspookyhash cimport spookyhash_128
from lakesuperior.cy_include cimport cytpl as tpl


# Numeric codes written to the ``type`` field of a serialized term and read
# back by ``deserialize`` to pick the RDFLib class to instantiate.
DEF LSUP_TERM_TYPE_URIREF = 1
DEF LSUP_TERM_TYPE_BNODE = 2
DEF LSUP_TERM_TYPE_LITERAL = 3
# tpl format strings. ``serialize`` packs identifier terms (URIRef, BNode;
# type code + data) with the first one and literals (type code + data +
# datatype + language) with the second one.
DEF LSUP_PK_FMT_ID = b'S(cs)'
DEF LSUP_PK_FMT_LIT = b'S(csss)'


# Number of bytes of ``term_hash_seed`` copied into each of the two seed
# variables below.
DEF _SEED_LEN = 8
# Presumably the digest length in bytes (``hash_`` documents 16-byte
# digests) -- TODO confirm against the declaration of the ``Hash`` type.
DEF _HLEN = 16

# Module-level names mirroring the compile-time constants above.
HLEN = _HLEN
SEED_LEN = _SEED_LEN

term_hash_seed = b'\xff\xf2Q\xf2j\x0bG\xc1\x8a}\xca\x92\x98^y\x12'
"""
Seed for computing the term hash.

This is a 16-byte string that will be split up into two ``uint64``
numbers to make up the ``spookyhash_128`` seeds.
"""
# Split the seed: the first ``SEED_LEN`` bytes go into ``term_hash_seed1``,
# the following ``SEED_LEN`` bytes into ``term_hash_seed2``.
# NOTE(review): ``term_hash_seed1`` and ``term_hash_seed2`` are not declared
# in the visible part of this file (presumably in the matching ``.pxd``).
# ``term_hash_seed + SEED_LEN`` only acts as a pointer offset if
# ``term_hash_seed`` is C-typed there as well -- confirm.
memcpy(&term_hash_seed1, term_hash_seed, SEED_LEN)
memcpy(&term_hash_seed2, term_hash_seed + SEED_LEN, SEED_LEN)

# We only need one function from spookyhash. No need for a pxd file.
cdef extern from 'spookyhash_api.h':
    void spookyhash_128(
            const void *input, size_t input_size, uint64_t *hash_1,
            uint64_t *hash_2)

cdef int serialize(
        term, unsigned char **pack_data, size_t *pack_size) except -1:
    """
    Pack an RDFLib term into a tpl byte buffer.

    Literals are packed with ``LSUP_PK_FMT_LIT`` (type code, value, datatype,
    language); URIRefs and BNodes with ``LSUP_PK_FMT_ID`` (type code, value).
    A missing datatype or language is stored as an empty string.

    NOTE(review): with ``TPL_MEM`` the tpl library is expected to allocate
    the output buffer, leaving it to the caller to free ``pack_data[0]`` --
    confirm against the tpl documentation.

    :param term: The term to serialize: an RDFLib ``Literal``, ``URIRef`` or
        ``BNode``.
    :param pack_data: Output: address of the pointer that will receive the
        packed buffer.
    :param pack_size: Output: address of the variable that will receive the
        size of the packed buffer.

    :return: 0 on success.

    :raise ValueError: If ``term`` is not one of the supported term types.
    :raise IOError: If tpl reports a failure while packing the term.
    """
    cdef:
        # The ``bytes`` objects are kept in local variables so that the C
        # pointers stored in the structs stay valid until packing is done.
        bytes term_data
        bytes term_datatype
        bytes term_lang
        IdentifierTerm id_t
        LiteralTerm lit_t
        int rc

    if isinstance(term, Literal):
        term_data = term.encode()
        term_datatype = (term.datatype or '').encode()
        term_lang = (term.language or '').encode()

        lit_t.type = LSUP_TERM_TYPE_LITERAL
        lit_t.data = term_data
        lit_t.datatype = <unsigned char *>term_datatype
        lit_t.lang = <unsigned char *>term_lang

        rc = tpl.tpl_jot(
                tpl.TPL_MEM, pack_data, pack_size, LSUP_PK_FMT_LIT, &lit_t)
    else:
        # Validate the type before touching the term, so that unsupported
        # objects always yield a ValueError.
        if isinstance(term, URIRef):
            id_t.type = LSUP_TERM_TYPE_URIREF
        elif isinstance(term, BNode):
            id_t.type = LSUP_TERM_TYPE_BNODE
        else:
            raise ValueError(f'Unsupported term type: {type(term)}')
        term_data = term.encode()
        id_t.data = term_data

        rc = tpl.tpl_jot(
                tpl.TPL_MEM, pack_data, pack_size, LSUP_PK_FMT_ID, &id_t)

    # A non-zero return code means that the output arguments were not
    # populated; do not let the caller read them.
    if rc != 0:
        raise IOError(f'Error serializing term: tpl_jot returned {rc}.')

    return 0


cdef deserialize(const unsigned char *data, const size_t data_size):
    """
    Rebuild an RDFLib term from a tpl-packed buffer.

    The format string of the buffer is inspected first in order to tell
    literals (``LSUP_PK_FMT_LIT``) from identifiers (``LSUP_PK_FMT_ID``);
    the fields are then extracted and passed to the matching RDFLib class.

    :param data: Pointer to the packed term.
    :param data_size: Size of the packed term, as passed to ``tpl_peek``.

    :return: A ``Literal``, ``URIRef`` or ``BNode`` instance.

    :raise IOError: If the buffer cannot be read by tpl, if its format is
        not one of the two known formats, or if an identifier carries an
        unknown term type code.
    """
    cdef:
        char term_type
        char *fmt = NULL
        char *_pk = NULL
        unsigned char *term_data = NULL
        unsigned char *term_lang = NULL
        unsigned char *term_datatype = NULL

    datatype = None
    lang = None

    fmt = tpl.tpl_peek(tpl.TPL_MEM, data, data_size)
    try:
        # A NULL format string must not be compared or formatted: converting
        # a NULL C string to a Python object is invalid.
        if fmt == NULL:
            raise IOError('Could not read structure pack format.')

        if fmt == LSUP_PK_FMT_LIT:
            _pk = tpl.tpl_peek(
                    tpl.TPL_MEM | tpl.TPL_DATAPEEK, data, data_size, b'csss',
                    &term_type, &term_data, &term_datatype, &term_lang)
            # On failure the output pointers cannot be trusted.
            if _pk == NULL:
                raise IOError('Could not unpack literal term data.')

            # Empty strings stand for "not set". Data type takes precedence
            # over language.
            if len(term_datatype) > 0:
                datatype = term_datatype.decode()
            elif len(term_lang) > 0:
                lang = term_lang.decode()

            return Literal(term_data.decode(), datatype=datatype, lang=lang)

        elif fmt == LSUP_PK_FMT_ID:
            _pk = tpl.tpl_peek(
                    tpl.TPL_MEM | tpl.TPL_DATAPEEK, data, data_size, b'cs',
                    &term_type, &term_data)
            if _pk == NULL:
                raise IOError('Could not unpack identifier term data.')

            uri = term_data.decode()
            if term_type == LSUP_TERM_TYPE_URIREF:
                return URIRef(uri)
            elif term_type == LSUP_TERM_TYPE_BNODE:
                return BNode(uri)
            else:
                raise IOError(f'Unknown term type code: {term_type}')
        else:
            msg = f'Unknown structure pack format: {fmt}'
            raise IOError(msg)
    finally:
        # All pointers start as NULL, so freeing the ones that were never
        # populated is harmless.
        free(term_data)
        free(term_datatype)
        free(term_lang)
        free(_pk)
        free(fmt)


cdef inline void hash_(
        const unsigned char *message, size_t message_size, Hash *digest):
    """
    Compute the hash of a serialized object and store it in ``digest``.

    Hashing is done with `SpookyHash
    <http://burtleburtle.net/bob/hash/spooky.html>`_, which yields 128-bit
    (16-byte) digests.

    The two seeds are the module-level ``term_hash_seed1`` and
    ``term_hash_seed2`` values.

    :param message: Pointer to the bytes to be hashed.
    :param message_size: Number of bytes to hash.
    :param digest: Output: location that receives ``sizeof(Hash)`` bytes of
        the resulting hash.
    """
    # Working buffer: handed to spookyhash_128 pre-loaded with the seeds and
    # holding the resulting hash halves after the call.
    cdef Hash_128 state = [term_hash_seed1, term_hash_seed2]

    spookyhash_128(message, message_size, &state[0], &state[1])

    memcpy(digest, &state[0], sizeof(Hash))