term.pyx 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. from uuid import uuid4
  2. from rdflib import URIRef, BNode, Literal
  3. #from cpython.mem cimport PyMem_Malloc, PyMem_Free
  4. from libc.stdint cimport uint64_t
  5. from libc.stdlib cimport free
  6. from libc.string cimport memcpy
  7. from lakesuperior.cy_include cimport cytpl as tpl
  8. from lakesuperior.model.base cimport Buffer, buffer_dump
  # Codes written to, and read from, the ``type`` field of a ``Term`` struct to
  # tell URI references, blank nodes and literals apart.
  DEF LSUP_TERM_TYPE_URIREF = 1
  DEF LSUP_TERM_TYPE_BNODE = 2
  DEF LSUP_TERM_TYPE_LITERAL = 3
  # tpl format for the four Term fields, in order: one char (``type``) followed
  # by three strings (``data``, ``datatype``, ``lang``). This bare form is the
  # one handed to ``tpl_peek`` when deserializing.
  DEF LSUP_TERM_PK_FMT = b'csss' # Reflects the Term structure
  # The same field list wrapped in tpl's structure notation ``S(...)``. This is
  # the form handed to ``tpl_jot`` when serializing.
  DEF LSUP_TERM_STRUCT_PK_FMT = b'S(' + LSUP_TERM_PK_FMT + b')'
  14. # URI parsing regular expression. Conforms to RFC3986.
  15. #DEF URI_REGEX_STR = (
  16. # b'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  17. #)
  18. __doc__ = """
  19. Term model.
  20. ``Term`` is not defined as a Cython or Python class. It is a C structure,
  21. hence only visible by the Cython layer of the application.
  22. Terms can be converted from/to RDFlib terms, and deserialized from, or
  23. serialized to, binary buffer structures. This is the form in which terms are
  24. stored in the data store.
  25. If users require a public API, a proper Term Cython class with a Python API
  26. could be developed in the future.
  27. """
  28. #cdef char* ptn = URI_REGEX_STR
  29. #regcomp(&uri_regex, ptn, REG_NOSUB)
  30. # Compile with no catch groups.
  31. # TODO This should be properly cleaned up on application shutdown:
  32. # regfree(&uri_regex)
  33. #cdef int term_new(
  34. # Term* term, char type, char* data, char* datatype=NULL, char* lang=NULL
  35. #) except -1:
  36. # if regexec(&uri_regex, data, 0, NULL, 0) == REG_NOMATCH:
  37. # raise ValueError('Not a valid URI.')
  38. # term.type = type
  39. # term.data = (
  40. # data # TODO use C UUID v4 (RFC 4122) generator
  41. # if term.type == LSUP_TERM_TYPE_BNODE
  42. # else data
  43. # )
  44. # if term.type == LSUP_TERM_TYPE_LITERAL:
  45. # term.datatype = datatype
  46. # term.lang = lang
  47. #
  48. # return 0
  cdef int serialize(const Term *term, Buffer *sterm) except -1:
      """
      Serialize a Term into a binary buffer.

      The term is packed with ``tpl_jot`` in ``TPL_MEM`` mode using the
      structure format ``LSUP_TERM_STRUCT_PK_FMT``. On success the address and
      size of the packed image are stored in ``sterm.addr`` and ``sterm.sz``.

      In ``TPL_MEM`` mode tpl allocates the output image itself; whoever owns
      ``sterm`` is responsible for releasing ``sterm.addr`` with ``free()``
      once the image is no longer needed.

      :param term: Pointer to the term structure to pack.
      :param sterm: Buffer structure receiving the address and size of the
          packed image.

      :return: 0 on success.
      :raise MemoryError: If tpl reports that the term could not be packed. In
          that case the contents of ``sterm`` must not be used.
      """
      # tpl_jot returns 0 on success and a non-zero code on failure. Without
      # this check a failed pack would go unnoticed and callers would read an
      # address and size that were never set.
      if tpl.tpl_jot(
          tpl.TPL_MEM, &sterm.addr, &sterm.sz, LSUP_TERM_STRUCT_PK_FMT, term
      ) != 0:
          raise MemoryError('Error serializing term.')

      return 0
  cdef int deserialize(const Buffer *data, Term *term) except -1:
      """
      Populate a Term from serialized binary data.

      The image at ``data.addr`` (``data.sz`` bytes long) is read with
      ``tpl_peek`` in ``TPL_MEM | TPL_DATAPEEK`` mode, using the bare field
      format ``LSUP_TERM_PK_FMT``. The four extracted values are written
      straight into ``term.type``, ``term.data``, ``term.datatype`` and
      ``term.lang``.

      NOTE(review): the string fields filled in by tpl are presumably
      heap-allocated copies that the owner of ``term`` has to free; confirm
      against the tpl documentation.

      :param data: Buffer holding the serialized term.
      :param term: Term structure that receives the extracted fields.

      :raise MemoryError: If ``tpl_peek`` returns ``NULL``.
      """
      peeked_fmt = tpl.tpl_peek(
          tpl.TPL_MEM | tpl.TPL_DATAPEEK, data.addr, data.sz,
          LSUP_TERM_PK_FMT,
          &term.type, &term.data, &term.datatype, &term.lang
      )

      if peeked_fmt is NULL:
          raise MemoryError('Error deserializing term.')

      # Only the extracted fields are of interest; the pointer returned by
      # tpl_peek is released right away.
      free(peeked_fmt)
  cdef int from_rdflib(term_obj, Term *term) except -1:
      """
      Fill a Term struct from a Python/RDFLib term.

      The string form of ``term_obj`` is encoded and stored in ``term.data``.
      For a ``Literal`` the datatype and language are encoded as well, with an
      empty string standing in for a missing value. For a ``URIRef`` or a
      ``BNode`` both ``term.datatype`` and ``term.lang`` are set to ``NULL``.

      NOTE(review): the ``char`` pointers written into ``term`` point into
      bytes objects that are local to this function, so they look valid only
      as long as those objects are alive. Confirm how callers use ``term``
      after this function has returned.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal`` instance.
      :param term: Term structure to populate.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is not one of the supported RDFLib
          types. ``term.data``, ``term.datatype`` and ``term.lang`` have
          already been written at that point.
      """
      encoded_value = str(term_obj).encode()
      term.data = encoded_value

      if not isinstance(term_obj, Literal):
          # Only literals carry a datatype or a language tag.
          term.datatype = NULL
          term.lang = NULL

          if isinstance(term_obj, URIRef):
              term.type = LSUP_TERM_TYPE_URIREF
          elif isinstance(term_obj, BNode):
              term.type = LSUP_TERM_TYPE_BNODE
          else:
              raise ValueError(f'Unsupported term type: {type(term_obj)}')

          return 0

      encoded_datatype = (term_obj.datatype or '').encode()
      encoded_lang = (term_obj.language or '').encode()

      term.type = LSUP_TERM_TYPE_LITERAL
      term.datatype = encoded_datatype
      term.lang = encoded_lang

      return 0
  cdef int serialize_from_rdflib(term_obj, Buffer *data) except -1:
      """
      Serialize a Python/RDFLib term into a Buffer struct.

      A temporary Term structure is filled from ``term_obj`` and then packed
      with ``serialize``. The conversion is done inline so that the encoded
      bytes objects stay referenced by local variables, which keeps the
      ``char`` pointers stored in the temporary structure valid for as long as
      ``serialize`` is running.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal`` instance.
      :param data: Buffer structure that receives the serialized term.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is not one of the supported RDFLib
          types.
      """
      cdef Term _term

      # From RDFlib
      _data = str(term_obj).encode()
      _term.data = _data

      if isinstance(term_obj, Literal):
          # A missing datatype or language is stored as an empty string.
          _datatype = (getattr(term_obj, 'datatype') or '').encode()
          _lang = (getattr(term_obj, 'language') or '').encode()
          _term.type = LSUP_TERM_TYPE_LITERAL
          _term.datatype = _datatype
          _term.lang = _lang
      else:
          # Only literals carry a datatype or a language tag.
          _term.datatype = NULL
          _term.lang = NULL
          if isinstance(term_obj, URIRef):
              _term.type = LSUP_TERM_TYPE_URIREF
          elif isinstance(term_obj, BNode):
              _term.type = LSUP_TERM_TYPE_BNODE
          else:
              raise ValueError(
                  f'Unsupported term type: {term_obj} {type(term_obj)}'
              )

      # Errors raised by ``serialize`` propagate through its ``except -1``
      # declaration.
      serialize(&_term, data)

      return 0
  cdef object to_rdflib(const Term *term):
      """
      Return an RDFLib term built from a Term struct.

      ``term.data`` is decoded into the lexical value of the term. For a
      literal, the language tag and datatype are decoded to ``str`` before
      being handed to RDFLib. A ``NULL`` pointer and an empty C string are
      both treated as "not set". A language tag takes precedence over a
      datatype, because an RDFLib ``Literal`` accepts only one of the two.

      :param term: Pointer to the term structure to convert.

      :return: A ``URIRef``, ``BNode`` or ``Literal`` instance.
      :raise IOError: If ``term.type`` is not one of the known type codes.
      """
      cdef:
          str data = (<bytes>term.data).decode()
          object datatype = None
          object lang = None

      if term.type == LSUP_TERM_TYPE_LITERAL:
          # Truth-testing a ``char`` pointer only checks it against NULL, so
          # the first byte is inspected explicitly to catch empty strings.
          if term.lang is not NULL and term.lang[0] != 0:
              lang = (<bytes>term.lang).decode()
          elif term.datatype is not NULL and term.datatype[0] != 0:
              datatype = (<bytes>term.datatype).decode()

          return Literal(data, datatype=datatype, lang=lang)

      if term.type == LSUP_TERM_TYPE_URIREF:
          return URIRef(data)

      if term.type == LSUP_TERM_TYPE_BNODE:
          return BNode(data)

      raise IOError(f'Unknown term type code: {term[0].type}')
  cdef object deserialize_to_rdflib(const Buffer *data):
      """
      Return a Python/RDFLib term from a serialized Cython term.

      The buffer is unpacked into a temporary Term structure with
      ``deserialize`` and then converted with ``to_rdflib``. The string fields
      of the temporary structure are released before returning, whether or not
      the conversion succeeds.

      :param data: Buffer holding the serialized term.

      :return: A ``URIRef``, ``BNode`` or ``Literal`` instance.
      :raise MemoryError: If the buffer cannot be deserialized.
      :raise IOError: If the deserialized term has an unknown type code.
      """
      cdef Term t

      deserialize(data, &t)
      try:
          return to_rdflib(&t)
      finally:
          # tpl's data-peek mode hands back heap-allocated copies of the
          # string fields. The RDFLib object holds its own copies by now, so
          # the C strings can be released; ``free(NULL)`` is a no-op for
          # fields that were not set.
          free(<void *>t.data)
          free(<void *>t.datatype)
          free(<void *>t.lang)
  cdef object to_bytes(const Term *term):
      """
      Return a Python bytes object of the serialized term.

      The term is packed into a temporary buffer with ``serialize``. The
      buffer contents are copied into a new ``bytes`` object and the temporary
      buffer is then released.

      :param term: Pointer to the term structure to serialize.

      :return: The serialized term as ``bytes``.
      """
      cdef:
          Buffer pk_t
          unsigned char *bytestream

      # Start from a defined state so that the cleanup below is safe even if
      # the buffer was never filled in.
      pk_t.addr = NULL
      pk_t.sz = 0

      serialize(term, &pk_t)
      try:
          bytestream = <unsigned char *>pk_t.addr
          # Slicing a C pointer copies ``pk_t.sz`` bytes into a new bytes
          # object, so the source buffer is not needed afterwards.
          return <bytes>(bytestream)[:pk_t.sz]
      finally:
          # The packed image was allocated by tpl in memory mode and is owned
          # by this function.
          free(pk_t.addr)
  151. serialize(term, &pk_t)
  152. bytestream = <unsigned char *>pk_t.addr
  153. return <bytes>(bytestream)[:pk_t.sz]