term.pyx 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. from rdflib import URIRef, BNode, Literal
  2. #from cpython.mem cimport PyMem_Malloc, PyMem_Free
  3. from libc.stdint cimport uint64_t
  4. from libc.stdlib cimport malloc, free
  5. from libc.string cimport memcpy
  6. #from lakesuperior.cy_include.cyspookyhash cimport spookyhash_128
  7. from lakesuperior.cy_include cimport cytpl as tpl
# Numeric codes stored in a serialized term to tell which kind of RDF term
# it holds (see ``serialize`` and ``deserialize``).
DEF LSUP_TERM_TYPE_URIREF = 1
DEF LSUP_TERM_TYPE_BNODE = 2
DEF LSUP_TERM_TYPE_LITERAL = 3

# tpl format strings: ``serialize`` passes them to ``tpl_jot`` and
# ``deserialize`` compares them with the format reported by ``tpl_peek``.
# The ``ID`` format is used for URIRef and BNode terms, ``LIT`` for literals.
DEF LSUP_PK_FMT_ID = b'S(cs)'
DEF LSUP_PK_FMT_LIT = b'S(csss)'

# Compile-time sizes in bytes: one half of the hash seed, and (presumably)
# the length of a hash digest.
DEF _SEED_LEN = 8
DEF _HLEN = 16

# Module-level copies of the compile-time sizes above.
HLEN = _HLEN
SEED_LEN = _SEED_LEN

term_hash_seed = b'\xff\xf2Q\xf2j\x0bG\xc1\x8a}\xca\x92\x98^y\x12'
"""
Seed for computing the term hash.
This is a 16-byte string that will be split up into two ``uint64``
numbers to make up the ``spookyhash_128`` seeds.
"""

# Copy the first and second 8-byte halves of the seed into the two seed
# variables read by ``hash_``. ``term_hash_seed1`` and ``term_hash_seed2``
# are not declared in this part of the file; presumably they are ``uint64_t``
# variables declared in the companion ``.pxd``.
# NOTE(review): ``term_hash_seed + SEED_LEN`` is only valid pointer
# arithmetic if ``term_hash_seed`` is declared elsewhere as a C pointer;
# confirm against the ``.pxd``.
memcpy(&term_hash_seed1, term_hash_seed, SEED_LEN)
memcpy(&term_hash_seed2, term_hash_seed + SEED_LEN, SEED_LEN)
# We only need one function from spookyhash. No need for a pxd file.
cdef extern from 'spookyhash_api.h':
    # As used by ``hash_`` in this module: ``hash_1`` and ``hash_2`` hold the
    # two seed values on entry, and the digest is read back from the same
    # memory after the call.
    void spookyhash_128(
            const void *input, size_t input_size, uint64_t *hash_1,
            uint64_t *hash_2)
  30. cdef int serialize(
  31. term, unsigned char **pack_data, size_t *pack_size) except -1:
  32. cdef:
  33. bytes term_data = term.encode()
  34. bytes term_datatype
  35. bytes term_lang
  36. IdentifierTerm id_t
  37. LiteralTerm lit_t
  38. if isinstance(term, Literal):
  39. term_datatype = (getattr(term, 'datatype') or '').encode()
  40. term_lang = (getattr(term, 'language') or '').encode()
  41. lit_t.type = LSUP_TERM_TYPE_LITERAL
  42. lit_t.data = term_data
  43. lit_t.datatype = <unsigned char *>term_datatype
  44. lit_t.lang = <unsigned char *>term_lang
  45. tpl.tpl_jot(tpl.TPL_MEM, pack_data, pack_size, LSUP_PK_FMT_LIT, &lit_t)
  46. else:
  47. if isinstance(term, URIRef):
  48. id_t.type = LSUP_TERM_TYPE_URIREF
  49. elif isinstance(term, BNode):
  50. id_t.type = LSUP_TERM_TYPE_BNODE
  51. else:
  52. raise ValueError(f'Unsupported term type: {type(term)}')
  53. id_t.data = term_data
  54. tpl.tpl_jot(tpl.TPL_MEM, pack_data, pack_size, LSUP_PK_FMT_ID, &id_t)
  55. cdef deserialize(const unsigned char *data, const size_t data_size):
  56. cdef:
  57. char term_type
  58. char *fmt = NULL
  59. char *_pk = NULL
  60. unsigned char *term_data = NULL
  61. unsigned char *term_lang = NULL
  62. unsigned char *term_datatype = NULL
  63. datatype = None
  64. lang = None
  65. fmt = tpl.tpl_peek(tpl.TPL_MEM, data, data_size)
  66. try:
  67. if fmt == LSUP_PK_FMT_LIT:
  68. _pk = tpl.tpl_peek(
  69. tpl.TPL_MEM | tpl.TPL_DATAPEEK, data, data_size, b'csss',
  70. &term_type, &term_data, &term_datatype, &term_lang)
  71. if len(term_datatype) > 0:
  72. datatype = term_datatype.decode()
  73. elif len(term_lang) > 0:
  74. lang = term_lang.decode()
  75. return Literal(term_data.decode(), datatype=datatype, lang=lang)
  76. elif fmt == LSUP_PK_FMT_ID:
  77. _pk = tpl.tpl_peek(
  78. tpl.TPL_MEM | tpl.TPL_DATAPEEK, data, data_size, b'cs',
  79. &term_type, &term_data)
  80. uri = term_data.decode()
  81. if term_type == LSUP_TERM_TYPE_URIREF:
  82. return URIRef(uri)
  83. elif term_type == LSUP_TERM_TYPE_BNODE:
  84. return BNode(uri)
  85. else:
  86. raise IOError(f'Unknown term type code: {term_type}')
  87. else:
  88. msg = f'Unknown structure pack format: {fmt}'
  89. raise IOError(msg)
  90. finally:
  91. free(term_data)
  92. free(term_datatype)
  93. free(term_lang)
  94. free(_pk)
  95. free(fmt)
  96. cdef inline void hash_(
  97. const unsigned char *message, size_t message_size, Hash *digest):
  98. """
  99. Get the hash value of a serialized object.
  100. The hashing algorithm is `SpookyHash
  101. <http://burtleburtle.net/bob/hash/spooky.html>`_ which produces 128-bit
  102. (16-byte) digests.
  103. The initial seeds are determined in the application configuration.
  104. """
  105. cdef Hash_128 seed = [term_hash_seed1, term_hash_seed2]
  106. spookyhash_128(message, message_size, seed, seed + 1)
  107. memcpy(digest, seed, sizeof(Hash))