graph.pyx 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. import logging
  2. import rdflib
  3. from lakesuperior import env
  4. from libc.string cimport memcpy
  5. from libc.stdlib cimport free
  6. cimport lakesuperior.cy_include.collections as cc
  7. cimport lakesuperior.model.structures.callbacks as cb
  8. cimport lakesuperior.model.structures.keyset as kset
  9. from lakesuperior.model.base cimport Key, TripleKey
  10. from lakesuperior.model.graph cimport term
  11. from lakesuperior.model.graph.triple cimport BufferTriple
  12. from lakesuperior.model.structures.hash cimport term_hash_seed32
  13. from lakesuperior.model.structures.keyset cimport Keyset
  14. logger = logging.getLogger(__name__)
  15. cdef class Graph:
  16. """
  17. Fast and simple implementation of a graph.
  18. Most functions should mimic RDFLib's graph with less overhead. It uses
  19. the same funny but functional slicing notation.
  20. A Graph contains a :py:class:`lakesuperior.model.structures.keyset.Keyset`
  21. at its core and is bound to a
  22. :py:class:`~lakesuperior.store.ldp_rs.lmdb_triplestore.LmdbTriplestore`.
  23. This makes lookups and boolean operations very efficient because all these
  24. operations are performed on an array of integers.
  25. In order to retrieve RDF values from a ``Graph``, the underlying store
  26. must be looked up. This can be done in a different transaction than the
  27. one used to create or otherwise manipulate the graph.
  28. Every time a term is looked up or added to even a temporary graph, that
  29. term is added to the store and creates a key. This is because in the
  30. majority of cases that term is bound to be stored permanently anyway, and
  31. it's more efficient to hash it and allocate it immediately. A cleanup
  32. function to remove all orphaned terms (not in any triple or context index)
  33. can be later devised to compact the database.
  34. An instance of this class can also be converted to a ``rdflib.Graph``
  35. instance.
  36. """
  37. def __cinit__(
  38. self, store=None, size_t ct=0, str uri=None, set data=set()
  39. ):
  40. """
  41. Initialize the graph, optionally from Python/RDFlib data.
  42. :type store: lakesuperior.store.ldp_rs.lmdb_triplestore.LmdbTriplestore
  43. :param store: Triplestore where keys are mapped to terms. By default
  44. this is the default application store
  45. (``env.app_globals.rdf_store``).
  46. :param size_t ct: Initial number of allocated triples.
  47. :param str uri: If specified, the graph becomes a named graph and can
  48. utilize the :py:meth:`value()` method and special slicing notation.
  49. :param set data: If specified, ``ct`` is ignored and an initial key
  50. set is created from a set of 3-tuples of :py:class:``rdflib.Term``
  51. instances.
  52. """
  53. if not store:
  54. store = env.app_globals.rdf_store
  55. # Initialize empty data set.
  56. if data:
  57. # Populate with provided Python set.
  58. self.keys = Keyset(len(data))
  59. self.add_triples(data)
  60. else:
  61. self.keys = Keyset(ct)
  62. ## PROPERTIES ##
  63. @property
  64. def uri(self):
  65. """
  66. Get resource identifier as a RDFLib URIRef.
  67. :rtype: rdflib.URIRef.
  68. """
  69. return rdflib.URIRef(self.id)
  70. @property
  71. def data(self):
  72. """
  73. Triple data as a Python/RDFlib set.
  74. :rtype: set
  75. """
  76. cdef TripleKey spok
  77. ret = set()
  78. self.seek()
  79. while self.keys.get_next(&spok):
  80. ret.keys.add((
  81. self.store.from_key(spok[0]),
  82. self.store.from_key(spok[1]),
  83. self.store.from_key(spok[2])
  84. ))
  85. return ret
  86. ## MAGIC METHODS ##
  87. def __len__(self):
  88. """ Number of triples in the graph. """
  89. return self.keys.size()
  90. def __eq__(self, other):
  91. """ Equality operator between ``Graph`` instances. """
  92. return len(self & other) == 0
  93. def __repr__(self):
  94. """
  95. String representation of the graph.
  96. This includes the subject URI, number of triples contained and the
  97. memory address of the instance.
  98. """
  99. id_repr = f' id={self.id},' if self.id else ''
  100. return (
  101. f'<{self.__class__.__name__} @0x{id(self):02x}{id_repr} '
  102. f'length={len(self)}>'
  103. )
  104. def __str__(self):
  105. """ String dump of the graph triples. """
  106. return str(self.data)
  107. def __add__(self, other):
  108. """ Alias for set-theoretical union. """
  109. return self.__or__(other)
  110. def __iadd__(self, other):
  111. """ Alias for in-place set-theoretical union. """
  112. return self.__ior__(other)
  113. def __sub__(self, other):
  114. """ Set-theoretical subtraction. """
  115. cdef Graph gr3 = self.empty_copy()
  116. gr3.keys = kset.subtract(self.keys, other.keys)
  117. return gr3
  118. def __isub__(self, other):
  119. """ In-place set-theoretical subtraction. """
  120. self.keys = kset.subtract(self.keys, other.keys)
  121. return self
  122. def __and__(self, other):
  123. """ Set-theoretical intersection. """
  124. cdef Graph gr3 = self.empty_copy()
  125. gr3.keys = kset.intersect(self.keys, other.keys)
  126. return gr3
  127. def __iand__(self, other):
  128. """ In-place set-theoretical intersection. """
  129. self.keys = kset.intersect(self.keys, other.keys)
  130. return self
  131. def __or__(self, other):
  132. """ Set-theoretical union. """
  133. cdef Graph gr3 = self.copy()
  134. gr3.keys = kset.merge(self.keys, other.keys)
  135. return gr3
  136. def __ior__(self, other):
  137. """ In-place set-theoretical union. """
  138. self.keys = kset.merge(self.keys, other.keys)
  139. return self
  140. def __xor__(self, other):
  141. """ Set-theoretical exclusive disjunction (XOR). """
  142. cdef Graph gr3 = self.empty_copy()
  143. gr3.keys = kset.xor(self.keys, other.keys)
  144. return gr3
  145. def __ixor__(self, other):
  146. """ In-place set-theoretical exclusive disjunction (XOR). """
  147. self.keys = kset.xor(self.keys, other.keys)
  148. return self
  149. def __contains__(self, trp):
  150. """
  151. Whether the graph contains a triple.
  152. :rtype: boolean
  153. """
  154. cdef TripleKey spok
  155. spok = [
  156. self.store.to_key(trp[0]),
  157. self.store.to_key(trp[1]),
  158. self.store.to_key(trp[2]),
  159. ]
  160. return self.keys.contains(&spok)
  161. def __iter__(self):
  162. """ Graph iterator. It iterates over the set triples. """
  163. yield from self.data
  164. # Slicing.
  165. def __getitem__(self, item):
  166. """
  167. Slicing function.
  168. It behaves similarly to `RDFLib graph slicing
  169. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  170. """
  171. if isinstance(item, slice):
  172. s, p, o = item.start, item.stop, item.step
  173. return self._slice(s, p, o)
  174. elif self.id and isinstance(item, rdflib.Node):
  175. # If a Node is given, return all values for that predicate.
  176. return self._slice(self.uri, item, None)
  177. else:
  178. raise TypeError(f'Wrong slice format: {item}.')
  179. def __hash__(self):
  180. """ TODO Bogus """
  181. return self.id
  182. ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##
  183. def value(self, p, strict=False):
  184. """
  185. Get an individual value.
  186. :param rdflib.termNode p: Predicate to search for.
  187. :param bool strict: If set to ``True`` the method raises an error if
  188. more than one value is found. If ``False`` (the default) only
  189. the first found result is returned.
  190. :rtype: rdflib.term.Node
  191. """
  192. if not self.id:
  193. raise ValueError('Cannot use `value` on a non-named graph.')
  194. # TODO use slice.
  195. values = {trp[2] for trp in self.lookup((self.uri, p, None))}
  196. if strict and len(values) > 1:
  197. raise RuntimeError('More than one value found for {}, {}.'.format(
  198. self.id, p))
  199. for ret in values:
  200. return ret
  201. return None
  202. def terms_by_type(self, type):
  203. """
  204. Get all terms of a type: subject, predicate or object.
  205. :param str type: One of ``s``, ``p`` or ``o``.
  206. """
  207. i = 'spo'.index(type)
  208. return {r[i] for r in self.data}
  209. def add_triples(self, triples):
  210. """
  211. Add triples to the graph.
  212. This method checks for duplicates.
  213. :param iterable triples: iterable of 3-tuple triples.
  214. """
  215. cdef TripleKey spok
  216. for s, p, o in triples:
  217. spok = [
  218. self.store.to_key(s),
  219. self.store.to_key(p),
  220. self.store.to_key(o),
  221. ]
  222. self.keys.add(&spok, True)
  223. def remove(self, pattern):
  224. """
  225. Remove triples by pattern.
  226. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  227. """
  228. self._match_ptn_callback(
  229. pattern, self, del_trp_callback, NULL
  230. )
  231. ## CYTHON-ACCESSIBLE BASIC METHODS ##
  232. cdef Graph copy(self, str uri=None):
  233. """
  234. Create copy of the graph with a different (or no) URI.
  235. :param str uri: URI of the new graph. This should be different from
  236. the original.
  237. """
  238. cdef Graph new_gr = Graph(self.store, self.ct, uri=uri)
  239. new_gr.keys = self.keys.copy()
  240. cdef Graph empty_copy(self, str uri=None):
  241. """
  242. Create an empty copy with same capacity and store binding.
  243. :param str uri: URI of the new graph. This should be different from
  244. the original.
  245. """
  246. return Graph(self.store, self.ct, uri=uri)
  247. cpdef void set(self, tuple trp) except *:
  248. """
  249. Set a single value for subject and predicate.
  250. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  251. """
  252. if None in trp:
  253. raise ValueError(f'Invalid triple: {trp}')
  254. self.remove((trp[0], trp[1], None))
  255. self.add((trp,))
  256. def as_rdflib(self):
  257. """
  258. Return the data set as an RDFLib Graph.
  259. :rtype: rdflib.Graph
  260. """
  261. gr = Graph(identifier=self.id)
  262. for trp in self.data:
  263. gr.add(trp)
  264. return gr
  265. def _slice(self, s, p, o):
  266. """
  267. Return terms filtered by other terms.
  268. This behaves like the rdflib.Graph slicing policy.
  269. """
  270. # If no terms are unbound, check for containment.
  271. if s is not None and p is not None and o is not None: # s p o
  272. return (s, p, o) in self
  273. # If some terms are unbound, do a lookup.
  274. res = self.lookup((s, p, o))
  275. if s is not None:
  276. if p is not None: # s p ?
  277. return {r[2] for r in res}
  278. if o is not None: # s ? o
  279. return {r[1] for r in res}
  280. # s ? ?
  281. return {(r[1], r[2]) for r in res}
  282. if p is not None:
  283. if o is not None: # ? p o
  284. return {r[0] for r in res}
  285. # ? p ?
  286. return {(r[0], r[2]) for r in res}
  287. if o is not None: # ? ? o
  288. return {(r[0], r[1]) for r in res}
  289. # ? ? ?
  290. return res
  291. def lookup(self, pattern):
  292. """
  293. Look up triples by a pattern.
  294. This function converts RDFLib terms into the serialized format stored
  295. in the graph's internal structure and compares them bytewise.
  296. Any and all of the lookup terms msy be ``None``.
  297. :rtype: Graph
  298. "return: New Graph instance with matching triples.
  299. """
  300. cdef:
  301. Graph res_gr = self.empty_copy()
  302. self._match_ptn_callback(pattern, res_gr, add_trp_callback, NULL)
  303. res_gr.data.resize()
  304. return res_gr
  305. cdef void _match_ptn_callback(
  306. self, pattern, Graph gr,
  307. lookup_callback_fn_t callback_fn, void* ctx=NULL
  308. ) except *:
  309. """
  310. Execute an arbitrary function on a list of triples matching a pattern.
  311. The arbitrary function is appied to each triple found in the current
  312. graph, and to a discrete graph that can be the current graph itself
  313. or a different one.
  314. """
  315. cdef:
  316. kset.key_cmp_fn_t cmp_fn
  317. Key k1, k2, sk, pk, ok
  318. TripleKey spok
  319. s, p, o = pattern
  320. # Decide comparison logic outside the loop.
  321. if s is not None and p is not None and o is not None:
  322. # Shortcut for 3-term match.
  323. spok = [
  324. self.store.to_key(s),
  325. self.store.to_key(p),
  326. self.store.to_key(o),
  327. ]
  328. if self.keys.contains(&spok):
  329. callback_fn(gr, &spok, ctx)
  330. return
  331. if s is not None:
  332. k1 = self.store.to_key(s)
  333. if p is not None:
  334. cmp_fn = cb.lookup_skpk_cmp_fn
  335. k2 = self.store.to_key(p)
  336. elif o is not None:
  337. cmp_fn = cb.lookup_skok_cmp_fn
  338. k2 = self.store.to_key(o)
  339. else:
  340. cmp_fn = cb.lookup_sk_cmp_fn
  341. elif p is not None:
  342. k1 = self.store.to_key(p)
  343. if o is not None:
  344. cmp_fn = cb.lookup_pkok_cmp_fn
  345. k2 = self.store.to_key(o)
  346. else:
  347. cmp_fn = cb.lookup_pk_cmp_fn
  348. elif o is not None:
  349. cmp_fn = cb.lookup_ok_cmp_fn
  350. k1 = self.store.to_key(o)
  351. else:
  352. cmp_fn = cb.lookup_none_cmp_fn
  353. # Iterate over serialized triples.
  354. while self.keys.get_next(&spok):
  355. if cmp_fn(&spok, k1, k2):
  356. callback_fn(gr, &spok, ctx)
  357. ## LOOKUP CALLBACK FUNCTIONS
  358. cdef inline void add_trp_callback(
  359. Graph gr, const TripleKey* spok_p, void* ctx
  360. ):
  361. """
  362. Add a triple to a graph as a result of a lookup callback.
  363. """
  364. gr.keys.add(spok_p)
  365. cdef inline void del_trp_callback(
  366. Graph gr, const TripleKey* spok_p, void* ctx
  367. ):
  368. """
  369. Remove a triple from a graph as a result of a lookup callback.
  370. """
  371. #logger.info('removing triple: {} {} {}'.format(
  372. # buffer_dump(trp.s), buffer_dump(trp.p), buffer_dump(trp.o)
  373. #))
  374. gr.keys.remove(spok_p)