term.pyx 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. from uuid import uuid4
  2. from rdflib import URIRef, BNode, Literal
  3. #from cpython.mem cimport PyMem_Malloc, PyMem_Free
  4. from libc.stdint cimport uint64_t
  5. from libc.stdlib cimport free
  6. from libc.string cimport memcpy
  7. from cymem.cymem cimport Pool
  8. from lakesuperior.cy_include cimport cytpl as tpl
  9. from lakesuperior.model.base cimport Buffer, buffer_dump
  # Numeric codes stored in the ``type`` member of a Term to tell URI
  # references, blank nodes and literals apart.
  10. DEF LSUP_TERM_TYPE_URIREF = 1
  11. DEF LSUP_TERM_TYPE_BNODE = 2
  12. DEF LSUP_TERM_TYPE_LITERAL = 3
  # tpl format string for the Term members, in the order in which they are
  # handed to tpl_peek in this module: type, data, datatype, lang.
  13. DEF LSUP_TERM_PK_FMT = b'csss' # Reflects the Term structure
  # The same member list wrapped in a tpl ``S(...)`` spec; used with tpl_jot,
  # which receives a single pointer to the whole Term.
  14. DEF LSUP_TERM_STRUCT_PK_FMT = b'S(' + LSUP_TERM_PK_FMT + b')'
  15. # URI parsing regular expression. Conforms to RFC3986.
  16. #DEF URI_REGEX_STR = (
  17. # b'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  18. #)
  19. #cdef char* ptn = URI_REGEX_STR
  20. #regcomp(&uri_regex, ptn, REG_NOSUB)
  21. # Compile with no capture groups.
  22. # TODO This should be properly cleaned up on application shutdown:
  23. # regfree(&uri_regex)
  24. #cdef int term_new(
  25. # Term* term, char type, char* data, char* datatype=NULL, char* lang=NULL
  26. #) except -1:
  27. # if regexec(&uri_regex, data, 0, NULL, 0) == REG_NOMATCH:
  28. # raise ValueError('Not a valid URI.')
  29. # term.type = type
  30. # term.data = (
  31. # data # TODO use C UUID v4 (RFC 4122) generator
  32. # if term.type == LSUP_TERM_TYPE_BNODE
  33. # else data
  34. # )
  35. # if term.type == LSUP_TERM_TYPE_LITERAL:
  36. # term.datatype = datatype
  37. # term.lang = lang
  38. #
  39. # return 0
  cdef int serialize(const Term *term, Buffer *sterm, Pool pool=None) except -1:
      """
      Serialize a Term into a binary buffer.

      The term is packed by ``tpl_jot`` in ``TPL_MEM`` mode, which produces a
      newly allocated buffer. Without a pool, that buffer is handed over
      unchanged in ``sterm.addr`` and this function does not free it. With a
      pool, the packed bytes are copied into memory obtained from the pool and
      the temporary tpl buffer is released before returning.

      :param term: Term to serialize.
      :param sterm: Buffer whose ``addr`` and ``sz`` members are filled in.
      :param pool: Optional cymem pool used to allocate the output memory.

      :raise RuntimeError: If tpl reports a packing failure.
      :raise MemoryError: If the pool allocation yields a NULL pointer.
      """
      cdef:
          unsigned char *addr = NULL
          size_t sz = 0

      # tpl_jot signals failure through a non-zero return code; on failure
      # addr and sz are not reliable and must not be used.
      if tpl.tpl_jot(
          tpl.TPL_MEM, &addr, &sz, LSUP_TERM_STRUCT_PK_FMT, term
      ) != 0:
          raise RuntimeError('Could not serialize term.')

      if pool is None:
          sterm.addr = addr
      else:
          try:
              # The tpl buffer is not tracked by the pool, so its contents are
              # copied into pool-managed memory.
              sterm.addr = pool.alloc(sz, 1)
              if not sterm.addr:
                  raise MemoryError()
              memcpy(sterm.addr, addr, sz)
          finally:
              # The temporary tpl buffer is no longer needed once copied (or
              # once the copy has failed); release it to avoid a leak.
              free(addr)

      sterm.sz = sz

      return 0
  cdef int deserialize(const Buffer *data, Term *term) except -1:
      """
      Fill a Term struct from serialized binary data.

      The members are extracted with ``tpl_peek`` in
      ``TPL_MEM | TPL_DATAPEEK`` mode, in the order described by
      ``LSUP_TERM_PK_FMT``: type, data, datatype, lang.

      NOTE(review): the string members written by tpl_peek are presumably
      allocated by tpl and left for the caller to release; confirm against
      the tpl documentation.

      :param data: Buffer holding the serialized term.
      :param term: Term struct that receives the extracted members.

      :raise MemoryError: If tpl_peek returns NULL.
      """
      peeked_fmt = tpl.tpl_peek(
          tpl.TPL_MEM | tpl.TPL_DATAPEEK,
          data.addr, data.sz, LSUP_TERM_PK_FMT,
          &(term.type), &(term.data), &(term.datatype), &(term.lang))

      if peeked_fmt is NULL:
          raise MemoryError('Error deserializing term.')

      # Only the side effects on ``term`` are of interest; the pointer
      # returned by tpl_peek is released right away.
      free(peeked_fmt)

      return 0
  cdef int from_rdflib(term_obj, Term *term) except -1:
      """
      Fill a Term struct from a Python/RDFLib term.

      Literals get their datatype and language (empty strings when absent);
      URI references and blank nodes get NULL for both.

      NOTE(review): the ``char *`` members are taken from ``bytes`` objects
      that are local to this function, so they look valid only for as long as
      those objects are alive, i.e. until this function returns. Verify how
      callers use the resulting struct.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal``.
      :param term: Term struct to populate.

      :raise ValueError: If ``term_obj`` is none of the supported types.
      """
      encoded_value = str(term_obj).encode()
      term.data = encoded_value

      if isinstance(term_obj, Literal):
          encoded_datatype = (term_obj.datatype or '').encode()
          encoded_lang = (term_obj.language or '').encode()
          term.type = LSUP_TERM_TYPE_LITERAL
          term.datatype = encoded_datatype
          term.lang = encoded_lang

          return 0

      # Non-literal terms carry neither a datatype nor a language.
      term.datatype = NULL
      term.lang = NULL

      if isinstance(term_obj, URIRef):
          term.type = LSUP_TERM_TYPE_URIREF
      elif isinstance(term_obj, BNode):
          term.type = LSUP_TERM_TYPE_BNODE
      else:
          raise ValueError(f'Unsupported term type: {type(term_obj)}')

      return 0
  cdef int serialize_from_rdflib(
      term_obj, Buffer *data, Pool pool=None
  ) except -1:
      """
      Serialize a Python/RDFLib term into a Buffer struct.

      The conversion to a Term struct is done inline rather than through a
      helper: the struct's ``char *`` members point into the local ``bytes``
      objects created here, and those objects must stay referenced until
      ``serialize`` has packed the data.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal``.
      :param data: Buffer that receives the serialized term.
      :param pool: Optional cymem pool, passed through to ``serialize``.

      :raise ValueError: If ``term_obj`` is none of the supported types.
      """
      cdef Term _term

      # Keep every encoded value bound to a local name so that the pointers
      # stored in _term remain valid for the serialize() call below.
      _data = str(term_obj).encode()
      _term.data = _data

      if isinstance(term_obj, Literal):
          _datatype = (getattr(term_obj, 'datatype') or '').encode()
          _lang = (getattr(term_obj, 'language') or '').encode()
          _term.type = LSUP_TERM_TYPE_LITERAL
          _term.datatype = _datatype
          _term.lang = _lang
      else:
          # Non-literal terms carry neither a datatype nor a language.
          _term.datatype = NULL
          _term.lang = NULL
          if isinstance(term_obj, URIRef):
              _term.type = LSUP_TERM_TYPE_URIREF
          elif isinstance(term_obj, BNode):
              _term.type = LSUP_TERM_TYPE_BNODE
          else:
              raise ValueError(f'Unsupported term type: {type(term_obj)}')

      serialize(&_term, data, pool)

      return 0
  cdef object to_rdflib(const Term *term):
      """
      Return an RDFLib term built from a Term struct.

      For literals, a non-empty language takes precedence over the datatype,
      because a literal is built with at most one of the two. A NULL pointer
      and an empty string are both treated as "not set". Datatype and
      language are decoded to ``str`` before being passed to RDFLib.

      :param term: Term struct to convert.

      :return: A ``Literal``, ``URIRef`` or ``BNode`` instance.

      :raise IOError: If the type code of the term is not a known one.
      """
      cdef:
          str data = (<bytes>term[0].data).decode()
          str lang = None
          str datatype = None

      if term[0].type == LSUP_TERM_TYPE_LITERAL:
          # Truth-testing a char pointer only checks for NULL, so the first
          # byte is inspected as well to detect empty strings.
          if term[0].lang is not NULL and term[0].lang[0] != 0:
              lang = (<bytes>term[0].lang).decode()
          elif term[0].datatype is not NULL and term[0].datatype[0] != 0:
              datatype = (<bytes>term[0].datatype).decode()

          return Literal(data, datatype=datatype, lang=lang)

      if term[0].type == LSUP_TERM_TYPE_URIREF:
          return URIRef(data)

      if term[0].type == LSUP_TERM_TYPE_BNODE:
          return BNode(data)

      raise IOError(f'Unknown term type code: {term[0].type}')
  cdef object deserialize_to_rdflib(const Buffer *data):
      """
      Return a Python/RDFLib term from a serialized Cython term.

      The intermediate Term struct lives only within this call. Its string
      members are allocated by tpl during deserialization and are released
      here once the RDFLib object, which holds its own copies, has been
      built.

      :param data: Buffer holding the serialized term.

      :return: A ``Literal``, ``URIRef`` or ``BNode`` instance.
      """
      cdef Term t

      # Start from NULL so that the cleanup below is safe even if
      # deserialization fails before all members are set.
      t.data = NULL
      t.datatype = NULL
      t.lang = NULL

      try:
          deserialize(data, &t)

          return to_rdflib(&t)
      finally:
          # free(NULL) is a no-op, so unset members need no special casing.
          free(<void *>t.data)
          free(<void *>t.datatype)
          free(<void *>t.lang)
  cdef object to_bytes(const Term *term):
      """
      Return a Python bytes object of the serialized term.

      The term is serialized without a memory pool, so the intermediate
      buffer is owned by this function; it is released once its contents have
      been copied into the returned ``bytes`` object.

      :param term: Term to serialize.

      :return: The serialized term as ``bytes``.
      """
      cdef:
          Buffer pk_t
          unsigned char *bytestream

      serialize(term, &pk_t)

      try:
          bytestream = <unsigned char *>pk_t.addr

          # Slicing with an explicit length copies exactly pk_t.sz bytes,
          # including any null bytes in the packed data.
          return <bytes>(bytestream)[:pk_t.sz]
      finally:
          free(<void *>pk_t.addr)