term.pyx 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. from uuid import uuid4
  2. from rdflib import URIRef, BNode, Literal
  3. #from cpython.mem cimport PyMem_Malloc, PyMem_Free
  4. from libc.stdint cimport uint64_t
  5. from libc.stdlib cimport free
  6. from libc.string cimport memcpy
  7. from lakesuperior.cy_include cimport cytpl as tpl
  8. from lakesuperior.model.base cimport Buffer, buffer_dump
  # Term type codes. One of these is stored in the ``type`` field of a Term
  # by the RDFLib conversion functions in this module, and tested again when
  # converting a Term back to an RDFLib object.
  9. DEF LSUP_TERM_TYPE_URIREF = 1
  10. DEF LSUP_TERM_TYPE_BNODE = 2
  11. DEF LSUP_TERM_TYPE_LITERAL = 3
  # tpl format string used when reading a term back with ``tpl_peek``: four
  # fields read, in order, into ``type``, ``data``, ``datatype`` and ``lang``
  # (per the tpl documentation ``c`` is a character and ``s`` a string).
  12. DEF LSUP_TERM_PK_FMT = b'csss' # Reflects the Term structure
  # Same field list wrapped in ``S(...)``; this is the format passed to
  # ``tpl_jot`` when packing a whole Term structure.
  13. DEF LSUP_TERM_STRUCT_PK_FMT = b'S(' + LSUP_TERM_PK_FMT + b')'
  14. # URI parsing regular expression. Conforms to RFC3986.
  15. #DEF URI_REGEX_STR = (
  16. # b'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  17. #)
  18. #cdef char* ptn = URI_REGEX_STR
  19. #regcomp(&uri_regex, ptn, REG_NOSUB)
  20. # Compile with no capture groups.
  21. # TODO This should be properly cleaned up on application shutdown:
  22. # regfree(&uri_regex)
  23. #cdef int term_new(
  24. # Term* term, char type, char* data, char* datatype=NULL, char* lang=NULL
  25. #) except -1:
  26. # if regexec(&uri_regex, data, 0, NULL, 0) == REG_NOMATCH:
  27. # raise ValueError('Not a valid URI.')
  28. # term.type = type
  29. # term.data = (
  30. # data # TODO use C UUID v4 (RFC 4122) generator
  31. # if term.type == LSUP_TERM_TYPE_BNODE
  32. # else data
  33. # )
  34. # if term.type == LSUP_TERM_TYPE_LITERAL:
  35. # term.datatype = datatype
  36. # term.lang = lang
  37. #
  38. # return 0
  cdef int serialize(const Term *term, Buffer *sterm) except -1:
      """
      Serialize a Term into a binary buffer.

      The term is packed with ``tpl_jot`` in ``TPL_MEM`` mode using the
      ``LSUP_TERM_STRUCT_PK_FMT`` format. The address and size of the
      resulting image are written to ``sterm.addr`` and ``sterm.sz``.

      Per the tpl documentation the image is allocated by tpl and has to be
      released by the caller with ``free()``.

      :param term: Pointer to the term structure to pack.
      :param sterm: Pointer to the buffer descriptor receiving the result.

      :return: 0 on success.
      :raise MemoryError: If tpl reports that the term could not be packed.
      """
      # tpl_jot returns a non-zero status on failure; do not let a failed
      # pack pass silently and leave ``sterm`` unset.
      if tpl.tpl_jot(
          tpl.TPL_MEM, &sterm.addr, &sterm.sz, LSUP_TERM_STRUCT_PK_FMT, term
      ) != 0:
          raise MemoryError('Error serializing term.')

      return 0
  cdef int deserialize(const Buffer *data, Term *term) except -1:
      """
      Fill a Term structure from serialized binary data.

      The buffer is read with ``tpl_peek`` in data-peek mode, which extracts
      the ``LSUP_TERM_PK_FMT`` fields straight into ``term.type``,
      ``term.data``, ``term.datatype`` and ``term.lang``.

      NOTE(review): per the tpl documentation, strings extracted in data-peek
      mode are allocated by tpl and owned by the caller -- confirm that
      callers release them.

      :param data: Pointer to the buffer holding the serialized term.
      :param term: Pointer to the term structure to populate.

      :return: 0 on success.
      :raise MemoryError: If ``tpl_peek`` returns NULL.
      """
      peeked = tpl.tpl_peek(
          tpl.TPL_MEM | tpl.TPL_DATAPEEK,
          data.addr, data.sz, LSUP_TERM_PK_FMT,
          &term.type, &term.data, &term.datatype, &term.lang
      )

      if peeked is not NULL:
          # The value handed back by tpl_peek is not needed past this point.
          free(peeked)
          return 0

      raise MemoryError('Error deserializing term.')
  cdef int from_rdflib(term_obj, Term *term) except -1:
      """
      Populate a Term struct from a Python/RDFLib term.

      ``term.data`` receives the UTF-8 encoded string form of the term. For
      literals, ``datatype`` and ``lang`` receive the encoded datatype and
      language (empty strings when not set); for URIs and blank nodes both
      are set to NULL.

      NOTE(review): the char pointers stored in ``term`` point into bytes
      objects that are local to this function. They presumably become
      invalid once the function returns -- confirm how callers use the
      structure afterwards.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal``.
      :param term: Pointer to the term structure to populate.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is none of the supported types.
      """
      # Bind the encoded value to a name: the pointer is taken from it.
      encoded = str(term_obj).encode()
      term.data = encoded

      if not isinstance(term_obj, Literal):
          term.datatype = NULL
          term.lang = NULL

          if isinstance(term_obj, URIRef):
              term.type = LSUP_TERM_TYPE_URIREF
          elif isinstance(term_obj, BNode):
              term.type = LSUP_TERM_TYPE_BNODE
          else:
              raise ValueError(f'Unsupported term type: {type(term_obj)}')

          return 0

      encoded_datatype = (term_obj.datatype or '').encode()
      encoded_lang = (term_obj.language or '').encode()
      term.type = LSUP_TERM_TYPE_LITERAL
      term.datatype = encoded_datatype
      term.lang = encoded_lang

      return 0
  cdef int serialize_from_rdflib(term_obj, Buffer *data) except -1:
      """
      Serialize a Python/RDFLib term into a Buffer struct.

      A temporary Term is built from ``term_obj`` and passed to
      :py:func:`serialize`, which writes the result into ``data``.

      The conversion is done inline rather than in a helper because the Term
      fields are char pointers borrowed from the local bytes objects below;
      those objects have to stay alive until ``serialize`` has read them.

      :param term_obj: RDFLib ``URIRef``, ``BNode`` or ``Literal``.
      :param data: Pointer to the buffer descriptor receiving the result.

      :return: 0 on success.
      :raise ValueError: If ``term_obj`` is none of the supported types.
      """
      cdef Term _term

      # Each encoded value is bound to a local name so that the pointer taken
      # from it remains valid for the rest of this function.
      _data = str(term_obj).encode()
      _term.data = _data

      if isinstance(term_obj, Literal):
          _datatype = (term_obj.datatype or '').encode()
          _lang = (term_obj.language or '').encode()
          _term.type = LSUP_TERM_TYPE_LITERAL
          _term.datatype = _datatype
          _term.lang = _lang
      else:
          _term.datatype = NULL
          _term.lang = NULL
          if isinstance(term_obj, URIRef):
              _term.type = LSUP_TERM_TYPE_URIREF
          elif isinstance(term_obj, BNode):
              _term.type = LSUP_TERM_TYPE_BNODE
          else:
              raise ValueError(
                  f'Unsupported term type: {term_obj} {type(term_obj)}'
              )

      serialize(&_term, data)

      return 0
  cdef object to_rdflib(const Term *term):
      """
      Return an RDFLib term built from a Term struct.

      For literals, a non-empty language tag takes precedence over the
      datatype. A NULL pointer and an empty string are both treated as
      "not set", so a literal stored with an empty language keeps its
      datatype. All strings are decoded to ``str`` before being handed to
      RDFLib.

      :param term: Pointer to the term structure to convert.

      :rtype: rdflib.URIRef or rdflib.BNode or rdflib.Literal
      :raise IOError: If the type code of the term is not a known one.
      """
      cdef:
          str data = (<bytes>term.data).decode()
          object datatype = None
          object lang = None

      if term.type == LSUP_TERM_TYPE_LITERAL:
          # Truth-testing a char pointer only checks it against NULL, so
          # empty strings have to be detected through their first byte.
          if term.lang is not NULL and term.lang[0] != 0:
              lang = (<bytes>term.lang).decode()
          elif term.datatype is not NULL and term.datatype[0] != 0:
              datatype = (<bytes>term.datatype).decode()

          return Literal(data, datatype=datatype, lang=lang)

      if term.type == LSUP_TERM_TYPE_URIREF:
          return URIRef(data)

      if term.type == LSUP_TERM_TYPE_BNODE:
          return BNode(data)

      raise IOError(f'Unknown term type code: {term[0].type}')
  cdef object deserialize_to_rdflib(const Buffer *data):
      """
      Return a Python/RDFLib term from a serialized Cython term.

      The buffer is deserialized into a temporary Term, which is converted
      to an RDFLib object. The strings extracted during deserialization are
      released before returning; per the tpl documentation they are
      allocated by tpl and owned by the caller.

      :param data: Pointer to the buffer holding the serialized term.

      :rtype: rdflib.URIRef or rdflib.BNode or rdflib.Literal
      """
      cdef Term t

      # Start from NULL so that the cleanup below is safe even when
      # deserialization fails before every field has been assigned.
      t.data = NULL
      t.datatype = NULL
      t.lang = NULL

      try:
          deserialize(data, &t)
          # The conversion copies the strings into Python objects.
          return to_rdflib(&t)
      finally:
          free(<void *>t.data)
          free(<void *>t.datatype)
          free(<void *>t.lang)
  cdef object to_bytes(const Term *term):
      """
      Return a Python bytes object of the serialized term.

      The term is serialized into a temporary buffer, whose contents are
      copied into a new bytes object. The temporary buffer is released
      before returning; per the tpl documentation it is allocated by tpl
      and owned by the caller.

      :param term: Pointer to the term structure to serialize.

      :rtype: bytes
      :raise MemoryError: If serialization produced no buffer.
      """
      cdef:
          Buffer pk_t
          unsigned char *bytestream

      # Start from a known state so that a failed serialization is
      # detectable instead of reading an uninitialized address.
      pk_t.addr = NULL
      pk_t.sz = 0

      serialize(term, &pk_t)
      if pk_t.addr is NULL:
          raise MemoryError('Error serializing term.')

      bytestream = <unsigned char *>pk_t.addr
      try:
          # Slicing the pointer copies ``sz`` bytes into the bytes object.
          return <bytes>(bytestream)[:pk_t.sz]
      finally:
          free(pk_t.addr)