graph.pyx 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. import logging
  2. from functools import wraps
  3. from rdflib import Graph
  4. from rdflib.term import Node
  5. from lakesuperior import env
  6. from cpython.mem cimport PyMem_Malloc, PyMem_Free
  7. from libc.string cimport memcmp
  8. from lakesuperior.cy_include cimport calg
  9. from lakesuperior.cy_include cimport cylmdb as lmdb
  10. from lakesuperior.store.ldp_rs cimport term
  11. from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
  12. KLEN, DBL_KLEN, TRP_KLEN, TripleKey, LmdbTriplestore)
  13. from lakesuperior.store.ldp_rs.keyset cimport Keyset
  14. from lakesuperior.store.ldp_rs.triple cimport Triple
  15. from lakesuperior.util.hash cimport Hash64, hash64
  16. logger = logging.getLogger(__name__)
  17. def use_data(fn):
  18. """
  19. Decorator to indicate that a set operation between two SimpleGraph
  20. instances should use the ``data`` property of the second term. The second
  21. term can also be a simple set.
  22. """
  23. @wraps(fn)
  24. def _wrapper(self, other):
  25. if isinstance(other, SimpleGraph):
  26. other = other.data
  27. return _wrapper
  28. cdef unsigned int set_item_hash_fn(calg.SetValue data):
  29. """
  30. Hash function for the CAlg set implementation.
  31. https://fragglet.github.io/c-algorithms/doc/set_8h.html#6c7986a2a80d7a3cb7b9d74e1c6fef97
  32. :param SetItem *data: Pointer to a SetItem structure.
  33. """
  34. cdef:
  35. Hash64 hash
  36. term.Buffer sr_data
  37. sr_data.addr = (<SetItem>data).data
  38. sr_data.sz = (<SetItem>data).size
  39. hash64(&sr_data, &hash)
  40. return hash
  41. cdef bint set_item_cmp_fn(calg.SetValue v1, calg.SetValue v2):
  42. """
  43. Compare function for two CAlg set items.
  44. https://fragglet.github.io/c-algorithms/doc/set_8h.html#40fa2c86d5b003c1b0b0e8dd1e4df9f4
  45. """
  46. if (<SetItem *>v1)[0].size != (<SetItem *>v2)[0].size:
  47. return False
  48. return memcmp(
  49. (<SetItem *>v1)[0].data, (<SetItem *>v2)[0].data,
  50. (<SetItem *>v1)[0].size)
  51. cdef class SimpleGraph:
  52. """
  53. Fast and simple implementation of a graph.
  54. Most functions should mimic RDFLib's graph with less overhead. It uses
  55. the same funny but functional slicing notation.
  56. Instances of this class hold a set of pointers to
  57. :py:class:`~lakesuperior.store.ldp_rs.triple.Triple` structures. No data
  58. are copied but care must be taken when freeing the triples pointed to.
  59. A SimpleGraph can be obtained from a
  60. :py:class:`lakesuperior.store.keyset.Keyset` which is convenient bacause
  61. a Keyset can be obtained very efficiently from querying a store, then also
  62. very efficiently filtered and eventually converted into a set of readable
  63. terms.
  64. An instance of this class can also be converted to and from a
  65. ``rdflib.Graph`` instance. TODO verify that this frees Cython pointers.
  66. """
  67. def __cinit__(
  68. self, calg.Set *cdata=NULL, Keyset keyset=None, store=None,
  69. set data=set()):
  70. """
  71. Initialize the graph with pre-existing data or by looking up a store.
  72. One of ``cdata``, ``keyset``, or ``data`` can be provided. If more than
  73. one of these is provided, precedence is given in the mentioned order.
  74. If none of them is specified, an empty graph is initialized.
  75. :param rdflib.URIRef uri: The graph URI.
  76. This will serve as the subject for some queries.
  77. :param calg.Set cdata: Initial data as a C ``Set`` struct.
  78. :param Keyset keyset: Keyset to create the graph from. Keys will be
  79. converted to set elements.
  80. :param lakesuperior.store.ldp_rs.LmdbTripleStore store: store to
  81. look up the keyset. Only used if ``keyset`` is specified. If not
  82. set, the environment store is used.
  83. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  84. :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
  85. E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
  86. Any and all elements may be ``None``.
  87. :param lmdbStore store: the store to look data up.
  88. """
  89. self.store = store or env.app_defaults.rdf_store
  90. cdef:
  91. size_t i = 0
  92. TripleKey spok
  93. term.Buffer pk_t
  94. if cdata is not NULL:
  95. # Get data from provided C set.
  96. self._data = cdata
  97. else:
  98. # Initialize empty data set.
  99. self._data = calg.set_new(set_item_hash_fn, set_item_cmp_fn)
  100. if keyset is not None:
  101. # Populate with triples extracted from provided key set.
  102. while keyset.next(spok):
  103. self.store.lookup_term(spok[:KLEN], &pk_t)
  104. term.deserialize(&pk_t, self._trp.s)
  105. self.store.lookup_term(spok[KLEN:DBL_KLEN], &pk_t)
  106. term.deserialize(&pk_t, self._trp.p)
  107. self.store.lookup_term(spok[DBL_KLEN:TRP_KLEN], &pk_t)
  108. term.deserialize(&pk_t, self._trp.o)
  109. calg.set_insert(self._data, &self._trp)
  110. else:
  111. # Populate with provided Python set.
  112. self._trp = <Triple *>PyMem_Malloc(sizeof(Triple) * len(data))
  113. for s, p, o in data:
  114. term.from_rdflib(s, self._trp[i].s)
  115. term.from_rdflib(p, self._trp[i].p)
  116. term.from_rdflib(o, self._trp[i].o)
  117. calg.set_insert(self._data, self._trp)
  118. def __dealloc__(self):
  119. """
  120. Free the triple pointer.
  121. """
  122. PyMem_Free(self._trp)
  123. # TODO This should free the structs pointed to as well, unless they
  124. # were provided as ``cdata`` in the constructor (i.e. they were
  125. # generated.externally).
  126. @property
  127. def data(self):
  128. """
  129. Triple data as a Python set.
  130. :rtype: set
  131. """
  132. return self._data_as_set()
  133. cdef void _data_from_lookup(
  134. self, LmdbTriplestore store, tuple trp_ptn, ctx=None) except *:
  135. """
  136. Look up triples in the triplestore and load them into ``data``.
  137. :param tuple lookup: 3-tuple of RDFlib terms or ``None``.
  138. :param LmdbTriplestore store: Reference to a LMDB triplestore. This
  139. is normally set to ``lakesuperior.env.app_globals.rdf_store``.
  140. """
  141. cdef:
  142. size_t i
  143. unsigned char spok[TRP_KLEN]
  144. self._data = calg.set_new(set_item_hash_fn, set_item_cmp_fn)
  145. with store.txn_ctx():
  146. keyset = store.triple_keys(trp_ptn, ctx)
  147. for i in range(keyset.ct):
  148. spok = keyset.data + i * TRP_KLEN
  149. self.data.add(store.from_trp_key(spok[: TRP_KLEN]))
  150. strp = serialize_triple(self._trp)
  151. calg.set_insert(self._data, strp)
  152. cdef _data_as_set(self):
  153. """
  154. Convert triple data to a Python set.
  155. :rtype: set
  156. """
  157. pass
  158. # Basic set operations.
  159. def add(self, dataset):
  160. """ Set union. """
  161. self.data.add(dataset)
  162. def remove(self, item):
  163. """
  164. Remove one item from the graph.
  165. :param tuple item: A 3-tuple of RDFlib terms. Only exact terms, i.e.
  166. wildcards are not accepted.
  167. """
  168. self.data.remove(item)
  169. def __len__(self):
  170. """ Number of triples in the graph. """
  171. return len(self.data)
  172. @use_data
  173. def __eq__(self, other):
  174. """ Equality operator between ``SimpleGraph`` instances. """
  175. return self.data == other
  176. def __repr__(self):
  177. """
  178. String representation of the graph.
  179. It provides the number of triples in the graph and memory address of
  180. the instance.
  181. """
  182. return (f'<{self.__class__.__name__} @{hex(id(self))} '
  183. f'length={len(self.data)}>')
  184. def __str__(self):
  185. """ String dump of the graph triples. """
  186. return str(self.data)
  187. @use_data
  188. def __sub__(self, other):
  189. """ Set subtraction. """
  190. return self.data - other
  191. @use_data
  192. def __isub__(self, other):
  193. """ In-place set subtraction. """
  194. self.data -= other
  195. return self
  196. @use_data
  197. def __and__(self, other):
  198. """ Set intersection. """
  199. return self.data & other
  200. @use_data
  201. def __iand__(self, other):
  202. """ In-place set intersection. """
  203. self.data &= other
  204. return self
  205. @use_data
  206. def __or__(self, other):
  207. """ Set union. """
  208. return self.data | other
  209. @use_data
  210. def __ior__(self, other):
  211. """ In-place set union. """
  212. self.data |= other
  213. return self
  214. @use_data
  215. def __xor__(self, other):
  216. """ Set exclusive intersection (XOR). """
  217. return self.data ^ other
  218. @use_data
  219. def __ixor__(self, other):
  220. """ In-place set exclusive intersection (XOR). """
  221. self.data ^= other
  222. return self
  223. def __contains__(self, item):
  224. """
  225. Whether the graph contains a triple.
  226. :rtype: boolean
  227. """
  228. return item in self.data
  229. def __iter__(self):
  230. """ Graph iterator. It iterates over the set triples. """
  231. return self.data.__iter__()
  232. # Slicing.
  233. def __getitem__(self, item):
  234. """
  235. Slicing function.
  236. It behaves similarly to `RDFLib graph slicing
  237. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  238. """
  239. if isinstance(item, slice):
  240. s, p, o = item.start, item.stop, item.step
  241. return self._slice(s, p, o)
  242. else:
  243. raise TypeError(f'Wrong slice format: {item}.')
  244. cpdef void set(self, tuple trp) except *:
  245. """
  246. Set a single value for subject and predicate.
  247. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  248. """
  249. self.remove_triples((trp[0], trp[1], None))
  250. if None in trp:
  251. raise ValueError(f'Invalid triple: {trp}')
  252. self.data.add(trp)
  253. cpdef void remove_triples(self, pattern) except *:
  254. """
  255. Remove triples by pattern.
  256. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  257. """
  258. s, p, o = pattern
  259. for match in self.lookup(s, p, o):
  260. logger.debug(f'Removing from graph: {match}.')
  261. self.data.remove(match)
  262. cpdef object as_rdflib(self):
  263. """
  264. Return the data set as an RDFLib Graph.
  265. :rtype: rdflib.Graph
  266. """
  267. gr = Graph()
  268. for trp in self.data:
  269. gr.add(trp)
  270. return gr
  271. cdef _slice(self, s, p, o):
  272. """
  273. Return terms filtered by other terms.
  274. This behaves like the rdflib.Graph slicing policy.
  275. """
  276. if s is None and p is None and o is None:
  277. return self.data
  278. elif s is None and p is None:
  279. return {(r[0], r[1]) for r in self.data if r[2] == o}
  280. elif s is None and o is None:
  281. return {(r[0], r[2]) for r in self.data if r[1] == p}
  282. elif p is None and o is None:
  283. return {(r[1], r[2]) for r in self.data if r[0] == s}
  284. elif s is None:
  285. return {r[0] for r in self.data if r[1] == p and r[2] == o}
  286. elif p is None:
  287. return {r[1] for r in self.data if r[0] == s and r[2] == o}
  288. elif o is None:
  289. return {r[2] for r in self.data if r[0] == s and r[1] == p}
  290. else:
  291. # all given
  292. return (s,p,o) in self.data
  293. cpdef lookup(self, s, p, o):
  294. """
  295. Look up triples by a pattern.
  296. """
  297. logger.debug(f'Looking up in graph: {s}, {p}, {o}.')
  298. if s is None and p is None and o is None:
  299. return self.data
  300. elif s is None and p is None:
  301. return {r for r in self.data if r[2] == o}
  302. elif s is None and o is None:
  303. return {r for r in self.data if r[1] == p}
  304. elif p is None and o is None:
  305. return {r for r in self.data if r[0] == s}
  306. elif s is None:
  307. return {r for r in self.data if r[1] == p and r[2] == o}
  308. elif p is None:
  309. return {r for r in self.data if r[0] == s and r[2] == o}
  310. elif o is None:
  311. return {r for r in self.data if r[0] == s and r[1] == p}
  312. else:
  313. # all given
  314. return (s,p,o) if (s, p, o) in self.data else set()
  315. cpdef set terms(self, str type):
  316. """
  317. Get all terms of a type: subject, predicate or object.
  318. :param str type: One of ``s``, ``p`` or ``o``.
  319. """
  320. i = 'spo'.index(type)
  321. return {r[i] for r in self.data}
  322. cdef class Imr(SimpleGraph):
  323. """
  324. In-memory resource data container.
  325. This is an extension of :py:class:`~SimpleGraph` that adds a subject URI to
  326. the data set and some convenience methods.
  327. An instance of this class can be converted to a ``rdflib.Resource``
  328. instance.
  329. Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
  330. will create a new ``Imr`` instance with the same subject URI.
  331. """
  332. def __init__(self, str uri, *args, **kwargs):
  333. """
  334. Initialize the graph with pre-existing data or by looking up a store.
  335. Either ``data``, or ``lookup`` *and* ``store``, can be provide.
  336. ``lookup`` and ``store`` have precedence. If none of them is specified,
  337. an empty graph is initialized.
  338. :param rdflib.URIRef uri: The graph URI.
  339. This will serve as the subject for some queries.
  340. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  341. :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
  342. E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
  343. Any and all elements may be ``None``.
  344. :param lmdbStore store: the store to look data up.
  345. """
  346. super().__init__(*args, **kwargs)
  347. self.uri = uri
  348. @property
  349. def identifier(self):
  350. """
  351. IMR URI. For compatibility with RDFLib Resource.
  352. :rtype: string
  353. """
  354. return self.uri
  355. @property
  356. def graph(self):
  357. """
  358. Return a SimpleGraph with the same data.
  359. :rtype: SimpleGraph
  360. """
  361. return SimpleGraph(self.data)
  362. def __repr__(self):
  363. """
  364. String representation of an Imr.
  365. This includes the subject URI, number of triples contained and the
  366. memory address of the instance.
  367. """
  368. return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
  369. f'length={len(self.data)}>')
  370. @use_data
  371. def __sub__(self, other):
  372. """
  373. Set difference. This creates a new Imr with the same subject URI.
  374. """
  375. return self.__class__(uri=self.uri, data=self.data - other)
  376. @use_data
  377. def __and__(self, other):
  378. """
  379. Set intersection. This creates a new Imr with the same subject URI.
  380. """
  381. return self.__class__(uri=self.uri, data=self.data & other)
  382. @use_data
  383. def __or__(self, other):
  384. """
  385. Set union. This creates a new Imr with the same subject URI.
  386. """
  387. return self.__class__(uri=self.uri, data=self.data | other)
  388. @use_data
  389. def __xor__(self, other):
  390. """
  391. Set exclusive OR (XOR). This creates a new Imr with the same subject
  392. URI.
  393. """
  394. return self.__class__(uri=self.uri, data=self.data ^ other)
  395. def __getitem__(self, item):
  396. """
  397. Supports slicing notation.
  398. """
  399. if isinstance(item, slice):
  400. s, p, o = item.start, item.stop, item.step
  401. return self._slice(s, p, o)
  402. elif isinstance(item, Node):
  403. # If a Node is given, return all values for that predicate.
  404. return {
  405. r[2] for r in self.data
  406. if r[0] == self.uri and r[1] == item}
  407. else:
  408. raise TypeError(f'Wrong slice format: {item}.')
  409. def value(self, p, strict=False):
  410. """
  411. Get an individual value.
  412. :param rdflib.termNode p: Predicate to search for.
  413. :param bool strict: If set to ``True`` the method raises an error if
  414. more than one value is found. If ``False`` (the default) only
  415. the first found result is returned.
  416. :rtype: rdflib.term.Node
  417. """
  418. values = self[p]
  419. if strict and len(values) > 1:
  420. raise RuntimeError('More than one value found for {}, {}.'.format(
  421. self.uri, p))
  422. for ret in values:
  423. return ret
  424. return None
  425. cpdef as_rdflib(self):
  426. """
  427. Return the IMR as a RDFLib Resource.
  428. :rtype: rdflib.Resource
  429. """
  430. gr = Graph()
  431. for trp in self.data:
  432. gr.add(trp)
  433. return gr.resource(identifier=self.uri)