graph.pyx 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. import logging
  2. import rdflib
  3. from lakesuperior import env
  4. from cpython.object cimport Py_LT, Py_EQ, Py_GT, Py_LE, Py_NE, Py_GE
  5. from libc.string cimport memcpy
  6. from libc.stdlib cimport free
  7. cimport lakesuperior.cy_include.collections as cc
  8. cimport lakesuperior.model.callbacks as cb
  9. cimport lakesuperior.model.structures.keyset as kset
  10. from lakesuperior.model.base cimport Key, TripleKey
  11. from lakesuperior.model.rdf cimport term
  12. from lakesuperior.model.rdf.triple cimport BufferTriple
  13. from lakesuperior.model.structures.hash cimport term_hash_seed32
  14. from lakesuperior.model.structures.keyset cimport Keyset
  15. logger = logging.getLogger(__name__)
  16. cdef class Graph:
  17. """
  18. Fast and simple implementation of a graph.
  19. Most functions should mimic RDFLib's graph with less overhead. It uses
  20. the same funny but functional slicing notation.
  21. A Graph contains a :py:class:`lakesuperior.model.structures.keyset.Keyset`
  22. at its core and is bound to a
  23. :py:class:`~lakesuperior.store.ldp_rs.lmdb_triplestore.LmdbTriplestore`.
  24. This makes lookups and boolean operations very efficient because all these
  25. operations are performed on an array of integers.
  26. In order to retrieve RDF values from a ``Graph``, the underlying store
  27. must be looked up. This can be done in a different transaction than the
  28. one used to create or otherwise manipulate the graph.
  29. Every time a term is looked up or added to even a temporary graph, that
  30. term is added to the store and creates a key. This is because in the
  31. majority of cases that term is likely to be stored permanently anyway, and
  32. it's more efficient to hash it and allocate it immediately. A cleanup
  33. function to remove all orphaned terms (not in any triple or context index)
  34. can be later devised to compact the database.
  35. An instance of this class can also be converted to a ``rdflib.Graph``
  36. instance.
  37. """
  38. def __cinit__(
  39. self, store=None, size_t capacity=0, uri=None, set data=set()
  40. ):
  41. """
  42. Initialize the graph, optionally from Python/RDFlib data.
  43. When initializing a non-empty Graph, a store transaction must be
  44. opened::
  45. >>> from rdflib import URIRef
  46. >>> from lakesuperior import env_setup, env
  47. >>> store = env.app_globals.rdf_store
  48. >>> # Or alternatively:
  49. >>> # from lakesuperior.store.ldp_rs.lmdb_store import LmdbStore
  50. >>> # store = LmdbStore('/tmp/test')
  51. >>> trp = {(URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0'))}
  52. >>> with store.txn_ctx():
  53. >>> gr = Graph(store, data=trp)
  54. Similarly, any operation such as adding, changing or looking up triples
  55. needs a store transaction.
  56. Note that, even though any operation may involve adding new terms to
  57. the store, a read-only transaction is sufficient. Lakesuperior will
  58. open a write transaction automatically only if necessary and only for
  59. the time needed to enter the new terms.
  60. :type store: lakesuperior.store.ldp_rs.lmdb_triplestore.LmdbTriplestore
  61. :param store: Triplestore where keys are mapped to terms. By default
  62. this is the default application store
  63. (``env.app_globals.rdf_store``).
  64. :param size_t capacity: Initial number of allocated triples.
  65. :param str uri: If specified, the graph becomes a named graph and can
  66. utilize the :py:meth:`value()` method and special slicing notation.
  67. :param set data: If specified, ``capacity`` is ignored and an initial key
  68. set is created from a set of 3-tuples of :py:class:``rdflib.Term``
  69. instances.
  70. """
  71. self.uri = rdflib.URIRef(uri) if uri else None
  72. self.store = store if store is not None else env.app_globals.rdf_store
  73. logger.debug(f'Assigned store at {self.store.env_path}')
  74. # Initialize empty data set.
  75. if data:
  76. # Populate with provided Python set.
  77. self.keys = Keyset(len(data))
  78. self.add(data)
  79. else:
  80. self.keys = Keyset(capacity)
  81. ## PROPERTIES ##
  82. property data:
  83. def __get__(self):
  84. """
  85. Triple data as a Python/RDFlib set.
  86. :rtype: set
  87. """
  88. cdef TripleKey spok
  89. ret = set()
  90. self.keys.seek()
  91. while self.keys.get_next(&spok):
  92. ret.add((
  93. self.store.from_key(spok[0]),
  94. self.store.from_key(spok[1]),
  95. self.store.from_key(spok[2])
  96. ))
  97. return ret
  98. property capacity:
  99. def __get__(self):
  100. """
  101. Total capacity of the underlying Keyset, in number of triples.
  102. """
  103. return self.keys.capacity
  104. property txn_ctx:
  105. def __get__(self):
  106. """ Expose underlying store's ``txn_ctx``. """
  107. return self.store.txn_ctx
  108. ## MAGIC METHODS ##
  109. def __len__(self):
  110. """ Number of triples in the graph. """
  111. return self.keys.size()
  112. def __richcmp__(self, other, int op):
  113. """ Comparators between ``Graph`` instances. """
  114. if op == Py_LT:
  115. raise NotImplementedError()
  116. elif op == Py_EQ:
  117. logger.info('Comparing for equality.')
  118. return len(self ^ other) == 0
  119. elif op == Py_GT:
  120. raise NotImplementedError()
  121. elif op == Py_LE:
  122. raise NotImplementedError()
  123. elif op == Py_NE:
  124. return len(self ^ other) != 0
  125. elif op == Py_GE:
  126. raise NotImplementedError()
  127. def __repr__(self):
  128. """
  129. String representation of the graph.
  130. This includes the subject URI, number of triples contained and the
  131. memory address of the instance.
  132. """
  133. uri_repr = f', uri={self.uri}' if self.uri else ''
  134. return (
  135. f'<{self.__class__.__module__}.{self.__class__.__qualname__} '
  136. f'@0x{id(self):02x} length={len(self)}{uri_repr}>'
  137. )
  138. def __str__(self):
  139. """ String dump of the graph triples. """
  140. return str(self.data)
  141. def __add__(self, other):
  142. """ Alias for set-theoretical union. """
  143. return self.__or__(other)
  144. def __iadd__(self, other):
  145. """ Alias for in-place set-theoretical union. """
  146. return self.__ior__(other)
  147. def __sub__(self, other):
  148. """ Set-theoretical subtraction. """
  149. cdef Graph gr3 = self.empty_copy()
  150. gr3.keys = kset.subtract(self.keys, other.keys)
  151. return gr3
  152. def __isub__(self, other):
  153. """ In-place set-theoretical subtraction. """
  154. self.keys = kset.subtract(self.keys, other.keys)
  155. return self
  156. def __and__(self, other):
  157. """ Set-theoretical intersection. """
  158. cdef Graph gr3 = self.empty_copy()
  159. gr3.keys = kset.intersect(self.keys, other.keys)
  160. return gr3
  161. def __iand__(self, other):
  162. """ In-place set-theoretical intersection. """
  163. self.keys = kset.intersect(self.keys, other.keys)
  164. return self
  165. def __or__(self, other):
  166. """ Set-theoretical union. """
  167. cdef Graph gr3 = self.empty_copy()
  168. gr3.keys = kset.merge(self.keys, other.keys)
  169. return gr3
  170. def __ior__(self, other):
  171. """ In-place set-theoretical union. """
  172. self.keys = kset.merge(self.keys, other.keys)
  173. return self
  174. def __xor__(self, other):
  175. """ Set-theoretical exclusive disjunction (XOR). """
  176. cdef Graph gr3 = self.empty_copy()
  177. gr3.keys = kset.xor(self.keys, other.keys)
  178. return gr3
  179. def __ixor__(self, other):
  180. """ In-place set-theoretical exclusive disjunction (XOR). """
  181. self.keys = kset.xor(self.keys, other.keys)
  182. return self
  183. def __contains__(self, trp):
  184. """
  185. Whether the graph contains a triple.
  186. :rtype: boolean
  187. """
  188. cdef TripleKey spok
  189. spok = [
  190. self.store.to_key(trp[0]),
  191. self.store.to_key(trp[1]),
  192. self.store.to_key(trp[2]),
  193. ]
  194. return self.keys.contains(&spok)
  195. def __iter__(self):
  196. """ Graph iterator. It iterates over the set triples. """
  197. yield from self.data
  198. # Slicing.
  199. def __getitem__(self, item):
  200. """
  201. Slicing function.
  202. It behaves similarly to `RDFLib graph slicing
  203. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  204. """
  205. if isinstance(item, slice):
  206. s, p, o = item.start, item.stop, item.step
  207. return self._slice(s, p, o)
  208. elif self.uri and isinstance(item, rdflib.term.Identifier):
  209. # If a Node is given, return all values for that predicate.
  210. return self._slice(self.uri, item, None)
  211. else:
  212. raise TypeError(f'Wrong slice format: {item}.')
  213. def __hash__(self):
  214. """ TODO Not that great of a hash. """
  215. return id(self)
  216. ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##
  217. def value(self, p, strict=False):
  218. """
  219. Get an individual value.
  220. :param rdflib.termNode p: Predicate to search for.
  221. :param bool strict: If set to ``True`` the method raises an error if
  222. more than one value is found. If ``False`` (the default) only
  223. the first found result is returned.
  224. :rtype: rdflib.term.Node
  225. """
  226. if not self.uri:
  227. raise ValueError('Cannot use `value` on a non-named graph.')
  228. # TODO use slice.
  229. values = {trp[2] for trp in self.lookup((self.uri, p, None))}
  230. if strict and len(values) > 1:
  231. raise RuntimeError('More than one value found for {}, {}.'.format(
  232. self.uri, p))
  233. for ret in values:
  234. return ret
  235. return None
  236. def terms_by_type(self, type):
  237. """
  238. Get all terms of a type: subject, predicate or object.
  239. :param str type: One of ``s``, ``p`` or ``o``.
  240. """
  241. i = 'spo'.index(type)
  242. return {r[i] for r in self.data}
  243. def add(self, triples):
  244. """
  245. Add triples to the graph.
  246. This method checks for duplicates.
  247. :param iterable triples: iterable of 3-tuple triples.
  248. """
  249. cdef:
  250. TripleKey spok
  251. for s, p, o in triples:
  252. logger.info(f'Adding {s} {p} {o} to store: {self.store}')
  253. spok = [
  254. self.store.to_key(s),
  255. self.store.to_key(p),
  256. self.store.to_key(o),
  257. ]
  258. self.keys.add(&spok, True)
  259. def remove(self, pattern):
  260. """
  261. Remove triples by pattern.
  262. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  263. """
  264. # create an empty copy of the current object.
  265. new_gr = self.empty_copy()
  266. # Reverse lookup: only triples not matching the pattern are added to
  267. # the new set.
  268. self._match_ptn_callback(
  269. pattern, new_gr, add_trp_callback, False
  270. )
  271. # Replace the keyset.
  272. self.keys = new_gr.keys
  273. ## CYTHON-ACCESSIBLE BASIC METHODS ##
  274. cpdef Graph copy(self, str uri=None):
  275. """
  276. Create copy of the graph with a different (or no) URI.
  277. :param str uri: URI of the new graph. This should be different from
  278. the original.
  279. """
  280. cdef Graph new_gr = Graph(self.store, self.capacity, uri=uri)
  281. new_gr.keys = self.keys.copy()
  282. return new_gr
  283. cpdef Graph empty_copy(self, str uri=None):
  284. """
  285. Create an empty copy with same capacity and store binding.
  286. :param str uri: URI of the new graph. This should be different from
  287. the original.
  288. """
  289. return Graph(self.store, self.capacity, uri=uri)
  290. cpdef void set(self, tuple trp) except *:
  291. """
  292. Set a single value for subject and predicate.
  293. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  294. """
  295. if None in trp:
  296. raise ValueError(f'Invalid triple: {trp}')
  297. self.remove((trp[0], trp[1], None))
  298. self.add((trp,))
  299. def as_rdflib(self):
  300. """
  301. Return the data set as an RDFLib Graph.
  302. :rtype: rdflib.Graph
  303. """
  304. gr = rdflib.Graph(identifier=self.uri)
  305. for trp in self.data:
  306. gr.add(trp)
  307. return gr
  308. def _slice(self, s, p, o):
  309. """
  310. Return terms filtered by other terms.
  311. This behaves like the rdflib.Graph slicing policy.
  312. """
  313. logger.info(f'Slicing: {s} {p} {o}')
  314. # If no terms are unbound, check for containment.
  315. if s is not None and p is not None and o is not None: # s p o
  316. return (s, p, o) in self
  317. # If some terms are unbound, do a lookup.
  318. res = self.lookup((s, p, o))
  319. logger.info(f'Slicing results: {res}')
  320. if s is not None:
  321. if p is not None: # s p ?
  322. return {r[2] for r in res}
  323. if o is not None: # s ? o
  324. return {r[1] for r in res}
  325. # s ? ?
  326. return {(r[1], r[2]) for r in res}
  327. if p is not None:
  328. if o is not None: # ? p o
  329. return {r[0] for r in res}
  330. # ? p ?
  331. return {(r[0], r[2]) for r in res}
  332. if o is not None: # ? ? o
  333. return {(r[0], r[1]) for r in res}
  334. # ? ? ?
  335. return res
  336. def lookup(self, pattern):
  337. """
  338. Look up triples by a pattern.
  339. This function converts RDFLib terms into the serialized format stored
  340. in the graph's internal structure and compares them bytewise.
  341. Any and all of the lookup terms msy be ``None``.
  342. :rtype: Graph
  343. "return: New Graph instance with matching triples.
  344. """
  345. cdef:
  346. Graph res_gr = self.empty_copy()
  347. self._match_ptn_callback(pattern, res_gr, add_trp_callback)
  348. res_gr.keys.resize()
  349. return res_gr
  350. cdef void _match_ptn_callback(
  351. self, pattern, Graph gr, lookup_callback_fn_t callback_fn,
  352. bint callback_cond=True, void* ctx=NULL
  353. ) except *:
  354. """
  355. Execute an arbitrary function on a list of triples matching a pattern.
  356. The arbitrary function is applied to each triple found in the current
  357. graph, and to a discrete graph that can be the current graph itself
  358. or a different one.
  359. :param tuple pattern: A 3-tuple of rdflib terms or None.
  360. :param Graph gr: The graph instance to apply the callback function to.
  361. :param lookup_callback_fn_t callback_fn: A callback function to be
  362. applied to the target graph using the matching triples.
  363. :param bint callback_cond: Whether to apply the callback function if
  364. a match is found (``True``) or if it is not found (``False``).
  365. :param void* ctx: Pointer to an arbitrary object that can be used by
  366. the callback function.
  367. """
  368. cdef:
  369. kset.key_cmp_fn_t cmp_fn
  370. Key k1, k2, k3
  371. TripleKey spok
  372. s, p, o = pattern
  373. logger.info(f'Match Callback pattern: {pattern}')
  374. self.keys.seek()
  375. # Decide comparison logic outside the loop.
  376. if all(pattern):
  377. if callback_cond:
  378. # Shortcut for 3-term match—only if callback_cond is True.
  379. spok = [
  380. self.store.to_key(s),
  381. self.store.to_key(p),
  382. self.store.to_key(o),
  383. ]
  384. if self.keys.contains(&spok):
  385. callback_fn(gr, &spok, ctx)
  386. else:
  387. # For negative condition (i.e. "apply this function to all keys
  388. # except the matching one"), the whole set must be scanned.
  389. #logger.info('All terms bound and negative condition.')
  390. k1 = self.store.to_key(s)
  391. k2 = self.store.to_key(p)
  392. k3 = self.store.to_key(o)
  393. #logger.info(f'Keys to match: {k1} {k2} {k3}')
  394. while self.keys.get_next(&spok):
  395. #logger.info(f'Verifying spok: {spok}')
  396. if k1 != spok[0] or k2 != spok[1] or k3 != spok[2]:
  397. logger.info(f'Calling function for spok: {spok}')
  398. callback_fn(gr, &spok, ctx)
  399. return
  400. if s is not None:
  401. k1 = self.store.to_key(s)
  402. if p is not None:
  403. k2 = self.store.to_key(p)
  404. cmp_fn = cb.lookup_skpk_cmp_fn
  405. logger.info('SKPK')
  406. elif o is not None:
  407. k2 = self.store.to_key(o)
  408. cmp_fn = cb.lookup_skok_cmp_fn
  409. logger.info('SKOK')
  410. else:
  411. cmp_fn = cb.lookup_sk_cmp_fn
  412. elif p is not None:
  413. k1 = self.store.to_key(p)
  414. if o is not None:
  415. k2 = self.store.to_key(o)
  416. cmp_fn = cb.lookup_pkok_cmp_fn
  417. else:
  418. cmp_fn = cb.lookup_pk_cmp_fn
  419. elif o is not None:
  420. k1 = self.store.to_key(o)
  421. cmp_fn = cb.lookup_ok_cmp_fn
  422. else:
  423. cmp_fn = cb.lookup_none_cmp_fn
  424. logger.info(f'k1: {k1} k2: {k2}')
  425. # Iterate over serialized triples.
  426. while self.keys.get_next(&spok):
  427. logger.info(f'Verifying spok: {spok}')
  428. if cmp_fn(&spok, k1, k2) == callback_cond:
  429. callback_fn(gr, &spok, ctx)
  430. ## FACTORY METHODS
  431. def from_rdf(store=None, uri=None, *args, **kwargs):
  432. """
  433. Create a Graph from a serialized RDF string.
  434. This factory function takes the same arguments as
  435. :py:meth:`rdflib.Graph.parse`.
  436. :param store: see :py:meth:`Graph.__cinit__`.
  437. :param uri: see :py:meth:`Graph.__cinit__`.
  438. :param *args: Positional arguments passed to RDFlib's ``parse``.
  439. :param *kwargs: Keyword arguments passed to RDFlib's ``parse``.
  440. :rtype: Graph
  441. """
  442. gr = rdflib.Graph().parse(*args, **kwargs)
  443. logger.info(f'graph: {set(gr)}')
  444. return Graph(store=store, uri=uri, data={*gr})
  445. ## LOOKUP CALLBACK FUNCTIONS
  446. cdef inline void add_trp_callback(
  447. Graph gr, const TripleKey* spok_p, void* ctx
  448. ):
  449. """
  450. Add a triple to a graph as a result of a lookup callback.
  451. """
  452. gr.keys.add(spok_p)