term.pyx 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. from uuid import uuid4
  2. from rdflib import URIRef, BNode, Literal
  3. #from cpython.mem cimport PyMem_Malloc, PyMem_Free
  4. from libc.stdint cimport uint64_t
  5. from libc.stdlib cimport free
  6. from libc.string cimport memcpy
  7. from lakesuperior.cy_include cimport cytpl as tpl
  8. from lakesuperior.dictionaries.namespaces import ns_collection as nsc
  9. from lakesuperior.model.base cimport Buffer, buffer_dump
  # Numeric codes stored in the ``type`` field of a Term. ``from_rdflib``
  # assigns them and ``to_rdflib`` dispatches on them.
  10. DEF LSUP_TERM_TYPE_URIREF = 1
  11. DEF LSUP_TERM_TYPE_BNODE = 2
  12. DEF LSUP_TERM_TYPE_LITERAL = 3
  # tpl format of the four Term fields, in the order ``deserialize`` passes
  # them to ``tpl_peek``: type, data, datatype, lang.
  13. DEF LSUP_TERM_PK_FMT = b'csss' # Reflects the Term structure
  # Same fields wrapped as a tpl structure; this is the format ``serialize``
  # hands to ``tpl_jot`` together with a pointer to the whole Term.
  14. DEF LSUP_TERM_STRUCT_PK_FMT = b'S(' + LSUP_TERM_PK_FMT + b')'
  15. # URI parsing regular expression. Conforms to RFC3986.
  16. #DEF URI_REGEX_STR = (
  17. # b'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  18. #)
  19. __doc__ = """
  20. Term model.
  21. ``Term`` is not defined as a Cython or Python class. It is a C structure,
  22. hence only visible by the Cython layer of the application.
  23. Terms can be converted from/to RDFlib terms, and deserialized from, or
  24. serialized to, binary buffer structures. This is the form that terms are stored
  25. in the data store.
  26. If uses require a public API, a proper Term Cython class with a Python API
  27. could be developed in the future.
  28. """
  29. #cdef char* ptn = URI_REGEX_STR
  30. #regcomp(&uri_regex, ptn, REG_NOSUB)
  31. # Compile with no catch groups.
  32. # TODO This should be properly cleaned up on application shutdown:
  33. # regfree(&uri_regex)
  34. #cdef int term_new(
  35. # Term* term, char type, char* data, char* datatype=NULL, char* lang=NULL
  36. #) except -1:
  37. # if regexec(&uri_regex, data, 0, NULL, 0) == REG_NOMATCH:
  38. # raise ValueError('Not a valid URI.')
  39. # term.type = type
  40. # term.data = (
  41. # data # TODO use C UUID v4 (RFC 4122) generator
  42. # if term.type == LSUP_TERM_TYPE_BNODE
  43. # else data
  44. # )
  45. # if term.type == LSUP_TERM_TYPE_LITERAL:
  46. # term.datatype = datatype
  47. # term.lang = lang
  48. #
  49. # return 0
  cdef int serialize(const Term *term, Buffer *sterm) except -1:
      """
      Serialize a Term into a binary buffer.

      The term is packed with ``tpl_jot`` in ``TPL_MEM`` mode using the
      ``LSUP_TERM_STRUCT_PK_FMT`` format. The address and size of the packed
      image are written to ``sterm.addr`` and ``sterm.sz``.

      NOTE(review): in ``TPL_MEM`` mode tpl allocates the image itself, so the
      caller presumably owns ``sterm.addr`` and has to free it -- confirm
      against the callers.

      :param term: Pointer to the term to serialize.
      :param sterm: Buffer receiving the address and size of the packed data.

      :return: 0 on success.
      :raise MemoryError: If tpl reports a failure while packing the term.
      """
      # tpl_jot signals failure through its return value. Ignoring it would
      # leave ``sterm`` unset while still reporting success to the caller.
      if tpl.tpl_jot(
          tpl.TPL_MEM, &sterm.addr, &sterm.sz, LSUP_TERM_STRUCT_PK_FMT, term
      ) == -1:
          raise MemoryError('Error serializing term.')

      return 0
  cdef int deserialize(const Buffer *data, Term *term) except -1:
      """
      Populate a Term from serialized binary data.

      The buffer is read with ``tpl_peek`` in ``TPL_MEM | TPL_DATAPEEK`` mode
      using the ``LSUP_TERM_PK_FMT`` format, which fills the ``type``,
      ``data``, ``datatype`` and ``lang`` fields of ``term``.

      :param data: Buffer holding the serialized term.
      :param term: Term struct to fill in.

      :return: 0 on success.
      :raise MemoryError: If ``tpl_peek`` returns NULL.
      """
      #print(f'Deserializing: {buffer_dump(data)}')
      peeked_fmt = tpl.tpl_peek(
          tpl.TPL_MEM | tpl.TPL_DATAPEEK,
          data.addr, data.sz, LSUP_TERM_PK_FMT,
          &term.type, &term.data, &term.datatype, &term.lang
      )

      if peeked_fmt is NULL:
          raise MemoryError('Error deserializing term.')

      # Only the decoded fields are of interest; the pointer returned by
      # tpl_peek is released right away.
      free(peeked_fmt)

      return 0
  cdef int from_rdflib(term_obj, Term *term) except -1:
      """
      Fill a Term struct from a Python/RDFLib term.

      ``term.data`` is set to the UTF-8 encoded string form of ``term_obj``.
      For a ``Literal``, ``term.datatype`` and ``term.lang`` are set from the
      ``datatype`` and ``language`` attributes when these are truthy; in all
      other cases they are left NULL.

      NOTE(review): the pointers stored in ``term`` refer to ``bytes`` objects
      that are local to this function. Assuming the Term fields are plain
      ``char *``, they are presumably only valid while those objects are
      alive -- verify how callers use the struct after this call returns.

      :param term_obj: A ``Literal``, ``URIRef`` or ``BNode`` instance.
      :param term: Term struct to fill in.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is none of the supported types.
      """
      data_bytes = str(term_obj).encode()
      term.data = data_bytes
      term.datatype = NULL
      term.lang = NULL

      if isinstance(term_obj, Literal):
          dtype = getattr(term_obj, 'datatype', None)
          language = getattr(term_obj, 'language', None)
          term.type = LSUP_TERM_TYPE_LITERAL

          if dtype:
              # Keep the encoded value in a named local: a pointer cannot be
              # taken from a temporary Python object.
              dtype_bytes = dtype.encode()
              term.datatype = dtype_bytes
          if language:
              lang_bytes = language.encode()
              term.lang = lang_bytes

      elif isinstance(term_obj, URIRef):
          term.type = LSUP_TERM_TYPE_URIREF

      elif isinstance(term_obj, BNode):
          term.type = LSUP_TERM_TYPE_BNODE

      else:
          raise ValueError(f'Unsupported term type: {type(term_obj)}')

      return 0
  cdef int serialize_from_rdflib(term_obj, Buffer *data) except -1:
      """
      Serialize a Python/RDFLib term into a Buffer struct.

      The term is first converted to a temporary Term struct with
      ``from_rdflib``, which is then packed into ``data`` by ``serialize``.

      NOTE(review): ``from_rdflib`` stores pointers to ``bytes`` objects that
      are local to it, so the temporary Term is presumably read here after
      those objects have been released -- confirm and fix at the conversion
      level if so.

      :param term_obj: A ``Literal``, ``URIRef`` or ``BNode`` instance.
      :param data: Buffer receiving the serialized term.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is not a supported term type
          (raised by ``from_rdflib``).
      """
      # The former ``addr`` and ``sz`` locals were never used and are gone.
      cdef Term _term

      from_rdflib(term_obj, &_term)
      serialize(&_term, data)

      return 0
  cdef object to_rdflib(const Term *term):
      """
      Build an RDFLib term from a Term struct.

      The ``data`` field is decoded to a string and wrapped according to the
      type code: ``URIRef``, ``BNode`` or ``Literal``. For a literal, a
      non-NULL ``lang`` takes precedence over ``datatype``; if both are NULL
      a plain literal is returned.

      :param term: Pointer to the term to convert.

      :return: A ``URIRef``, ``BNode`` or ``Literal`` instance.
      :raise IOError: If the type code is not one of the known ones.
      """
      cdef str value = (<bytes>term.data).decode()

      if term.type == LSUP_TERM_TYPE_URIREF:
          return URIRef(value)

      if term.type == LSUP_TERM_TYPE_BNODE:
          return BNode(value)

      if term.type != LSUP_TERM_TYPE_LITERAL:
          raise IOError(f'Unknown term type code: {term[0].type}')

      # From here on the term is a literal.
      if term.lang:
          return Literal(value, lang=(<bytes>term.lang).decode())

      if term.datatype:
          return Literal(value, datatype=(<bytes>term.datatype).decode())

      return Literal(value)
  cdef object deserialize_to_rdflib(const Buffer *data):
      """
      Return a Python/RDFLib term from a serialized Cython term.

      The buffer is decoded into a temporary Term struct with
      ``deserialize`` and converted with ``to_rdflib``. The strings that tpl
      allocates for the temporary struct are released before returning, since
      nothing else holds on to them.

      :param data: Buffer holding the serialized term.

      :return: A ``URIRef``, ``BNode`` or ``Literal`` instance.
      :raise MemoryError: If the buffer cannot be deserialized.
      :raise IOError: If the decoded type code is unknown.
      """
      cdef Term t

      # Start from NULL pointers so that the cleanup below can never be handed
      # an uninitialized address (free(NULL) is a no-op).
      t.data = NULL
      t.datatype = NULL
      t.lang = NULL

      deserialize(data, &t)
      try:
          # to_rdflib copies the C strings into Python objects, so the
          # originals are no longer needed once it returns or raises.
          return to_rdflib(&t)
      finally:
          # Strings obtained through tpl_peek in TPL_DATAPEEK mode are
          # heap-allocated and owned by the caller.
          free(<void *>t.data)
          free(<void *>t.datatype)
          free(<void *>t.lang)
  cdef object to_bytes(const Term *term):
      """
      Return a Python bytes object of the serialized term.

      The term is packed into a temporary buffer by ``serialize``; its
      contents are copied into a new ``bytes`` object and the temporary
      buffer is then released.

      :param term: Pointer to the term to serialize.

      :return: The serialized term as ``bytes``.
      """
      cdef:
          Buffer pk_t
          unsigned char *bytestream

      serialize(term, &pk_t)
      bytestream = <unsigned char *>pk_t.addr

      try:
          # Slicing the pointer copies exactly ``pk_t.sz`` bytes into a new
          # Python object, independent of the C buffer.
          return <bytes>(bytestream)[:pk_t.sz]
      finally:
          # The image produced by tpl in TPL_MEM mode is heap-allocated and
          # was previously leaked on every call.
          free(<void *>pk_t.addr)