graph.pyx 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672
  1. import logging
  2. import rdflib
  3. from lakesuperior import env
  4. from libc.string cimport memcpy
  5. from libc.stdlib cimport free
  6. from cymem.cymem cimport Pool
  7. cimport lakesuperior.cy_include.collections as cc
  8. cimport lakesuperior.model.graph.callbacks as cb
  9. from lakesuperior.model.base cimport Buffer, buffer_dump
  10. from lakesuperior.model.structures.keyset cimport Keyset
  11. from lakesuperior.model.graph cimport term
  12. from lakesuperior.model.graph.triple cimport BufferTriple
  13. from lakesuperior.model.structures.hash cimport term_hash_seed32
  14. logger = logging.getLogger(__name__)
  15. cdef class Graph:
  16. """
  17. Fast and simple implementation of a graph.
  18. Most functions should mimic RDFLib's graph with less overhead. It uses
  19. the same funny but functional slicing notation.
  20. A Graph can be instantiated from a store lookup. This makes it
  21. possible to use a Keyset to perform initial filtering via identity by key,
  22. then the filtered Keyset can be converted into a set of meaningful terms.
  23. An instance of this class can also be converted to and from a
  24. ``rdflib.Graph`` instance.
  25. """
  26. def __cinit__(
  27. self, store, size_t ct=0, str uri=None, set data=set()
  28. ):
  29. """
  30. Initialize the graph, optionally with Python data.
  31. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  32. """
  33. self.pool = Pool()
  34. if not store:
  35. store = env.app_globals.ldprs_store
  36. # Initialize empty data set.
  37. if data:
  38. # Populate with provided Python set.
  39. self.keys = Keyset(len(data))
  40. self.add_triples(data)
  41. else:
  42. self.keys = Keyset(ct)
  43. ## PROPERTIES ##
  44. @property
  45. def uri(self):
  46. """
  47. Get resource identifier as a RDFLib URIRef.
  48. :rtype: rdflib.URIRef.
  49. """
  50. return rdflib.URIRef(self.id)
  51. @property
  52. def data(self):
  53. """
  54. Triple data as a Python set.
  55. :rtype: set
  56. """
  57. cdef TripleKey spok
  58. ret = set()
  59. self.seek()
  60. while self.get_next(&spok):
  61. ret.add((
  62. self.store.from_key(trp[0]),
  63. self.store.from_key(trp[1]),
  64. self.store.from_key(trp[2])
  65. ))
  66. return ret
  67. ## MAGIC METHODS ##
  68. def __len__(self):
  69. """ Number of triples in the graph. """
  70. return self.keys.size()
  71. def __eq__(self, other):
  72. """ Equality operator between ``Graph`` instances. """
  73. return len(self ^ other) == 0
  74. def __repr__(self):
  75. """
  76. String representation of the graph.
  77. This includes the subject URI, number of triples contained and the
  78. memory address of the instance.
  79. """
  80. id_repr = f' id={self.id},' if self.id else ''
  81. return (
  82. f'<{self.__class__.__name__} @0x{id(self):02x}{id_repr} '
  83. f'length={len(self)}>'
  84. )
  85. def __str__(self):
  86. """ String dump of the graph triples. """
  87. return str(self.data)
  88. def __add__(self, other):
  89. """ Alias for set-theoretical union. """
  90. return self.union_(other)
  91. def __iadd__(self, other):
  92. """ Alias for in-place set-theoretical union. """
  93. self.ip_union(other)
  94. return self
  95. def __sub__(self, other):
  96. """ Set-theoretical subtraction. """
  97. return self.subtraction(other)
  98. def __isub__(self, other):
  99. """ In-place set-theoretical subtraction. """
  100. self.ip_subtraction(other)
  101. return self
  102. def __and__(self, other):
  103. """ Set-theoretical intersection. """
  104. return self.intersection(other)
  105. def __iand__(self, other):
  106. """ In-place set-theoretical intersection. """
  107. self.ip_intersection(other)
  108. return self
  109. def __or__(self, other):
  110. """ Set-theoretical union. """
  111. return self.union_(other)
  112. def __ior__(self, other):
  113. """ In-place set-theoretical union. """
  114. self.ip_union(other)
  115. return self
  116. def __xor__(self, other):
  117. """ Set-theoretical exclusive disjunction (XOR). """
  118. return self.xor(other)
  119. def __ixor__(self, other):
  120. """ In-place set-theoretical exclusive disjunction (XOR). """
  121. self.ip_xor(other)
  122. return self
  123. def __contains__(self, trp):
  124. """
  125. Whether the graph contains a triple.
  126. :rtype: boolean
  127. """
  128. cdef TripleKey spok
  129. spok = [
  130. self.store.to_key(trp[0]),
  131. self.store.to_key(trp[1]),
  132. self.store.to_key(trp[2]),
  133. ]
  134. return self.data.contains(&spok)
  135. def __iter__(self):
  136. """ Graph iterator. It iterates over the set triples. """
  137. yield from self.data
  138. # Slicing.
  139. def __getitem__(self, item):
  140. """
  141. Slicing function.
  142. It behaves similarly to `RDFLib graph slicing
  143. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  144. """
  145. if isinstance(item, slice):
  146. s, p, o = item.start, item.stop, item.step
  147. return self._slice(s, p, o)
  148. elif self.id and isinstance(item, rdflib.Node):
  149. # If a Node is given, return all values for that predicate.
  150. return self._slice(self.uri, item, None)
  151. else:
  152. raise TypeError(f'Wrong slice format: {item}.')
  153. def __hash__(self):
  154. """ TODO Bogus """
  155. return self.id
  156. ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##
  157. def value(self, p, strict=False):
  158. """
  159. Get an individual value.
  160. :param rdflib.termNode p: Predicate to search for.
  161. :param bool strict: If set to ``True`` the method raises an error if
  162. more than one value is found. If ``False`` (the default) only
  163. the first found result is returned.
  164. :rtype: rdflib.term.Node
  165. """
  166. if not self.id:
  167. raise ValueError('Cannot use `value` on a non-named graph.')
  168. # TODO use slice.
  169. values = {trp[2] for trp in self.lookup((self.uri, p, None))}
  170. if strict and len(values) > 1:
  171. raise RuntimeError('More than one value found for {}, {}.'.format(
  172. self.id, p))
  173. for ret in values:
  174. return ret
  175. return None
  176. def terms_by_type(self, type):
  177. """
  178. Get all terms of a type: subject, predicate or object.
  179. :param str type: One of ``s``, ``p`` or ``o``.
  180. """
  181. i = 'spo'.index(type)
  182. return {r[i] for r in self.data}
  183. def add_triples(self, trp):
  184. """
  185. Add triples to the graph.
  186. This method checks for duplicates.
  187. :param iterable triples: iterable of 3-tuple triples.
  188. """
  189. for s, p, o in triples:
  190. self.keys.add([
  191. self.store.to_key(s),
  192. self.store.to_key(p),
  193. self.store.to_key(o),
  194. ], True)
  195. def remove(self, pattern):
  196. """
  197. Remove triples by pattern.
  198. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  199. """
  200. self._match_ptn_callback(
  201. pattern, self, cb.del_trp_callback, NULL
  202. )
  203. ## CYTHON-ACCESSIBLE BASIC METHODS ##
  204. cdef Graph empty_copy(self):
  205. """
  206. Create an empty copy carrying over some key properties.
  207. Override in subclasses to accommodate for different init properties.
  208. """
  209. return self.__class__(self.ct, self.store, uri=self.id)
  210. cpdef union_(self, Graph other):
  211. """
  212. Perform set union resulting in a new Graph instance.
  213. TODO Allow union of multiple graphs at a time.
  214. :param Graph other: The other graph to merge.
  215. :rtype: Graph
  216. :return: A new Graph instance.
  217. """
  218. cdef:
  219. void *cur
  220. cc.HashSetIter it
  221. BufferTriple *trp
  222. new_gr = self.empty_copy()
  223. for gr in (self, other):
  224. cc.hashset_iter_init(&it, gr._triples)
  225. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  226. bt = <BufferTriple*>cur
  227. new_gr.add_triple(bt, True)
  228. return new_gr
  229. cdef void ip_union(self, Graph other) except *:
  230. """
  231. Perform an in-place set union that adds triples to this instance
  232. TODO Allow union of multiple graphs at a time.
  233. :param Graph other: The other graph to merge.
  234. :rtype: void
  235. """
  236. cdef:
  237. void *cur
  238. cc.HashSetIter it
  239. cc.hashset_iter_init(&it, other._triples)
  240. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  241. bt = <BufferTriple*>cur
  242. self.add_triple(bt, True)
  243. cpdef intersection(self, Graph other):
  244. """
  245. Graph intersection.
  246. :param Graph other: The other graph to intersect.
  247. :rtype: Graph
  248. :return: A new Graph instance.
  249. """
  250. cdef:
  251. void *cur
  252. cc.HashSetIter it
  253. new_gr = self.empty_copy()
  254. cc.hashset_iter_init(&it, self._triples)
  255. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  256. bt = <BufferTriple*>cur
  257. if other.trp_contains(bt):
  258. new_gr.add_triple(bt, True)
  259. return new_gr
  260. cdef void ip_intersection(self, Graph other) except *:
  261. """
  262. In-place graph intersection.
  263. Triples not in common with another graph are removed from the current
  264. one.
  265. :param Graph other: The other graph to intersect.
  266. :rtype: void
  267. """
  268. cdef:
  269. void *cur
  270. cc.HashSetIter it
  271. cc.hashset_iter_init(&it, self._triples)
  272. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  273. bt = <BufferTriple*>cur
  274. if not other.trp_contains(bt):
  275. self.remove_triple(bt)
  276. cpdef subtraction(self, Graph other):
  277. """
  278. Graph set-theoretical subtraction.
  279. Create a new graph with the triples of this graph minus the ones in
  280. common with the other graph.
  281. :param Graph other: The other graph to subtract to this.
  282. :rtype: Graph
  283. :return: A new Graph instance.
  284. """
  285. cdef:
  286. void *cur
  287. cc.HashSetIter it
  288. new_gr = self.empty_copy()
  289. cc.hashset_iter_init(&it, self._triples)
  290. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  291. bt = <BufferTriple*>cur
  292. if not other.trp_contains(bt):
  293. new_gr.add_triple(bt, True)
  294. return new_gr
  295. cdef void ip_subtraction(self, Graph other) except *:
  296. """
  297. In-place graph subtraction.
  298. Triples in common with another graph are removed from the current one.
  299. :param Graph other: The other graph to intersect.
  300. :rtype: void
  301. """
  302. cdef:
  303. void *cur
  304. cc.HashSetIter it
  305. cc.hashset_iter_init(&it, self._triples)
  306. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  307. bt = <BufferTriple*>cur
  308. if other.trp_contains(bt):
  309. self.remove_triple(bt)
  310. cpdef xor(self, Graph other):
  311. """
  312. Graph Exclusive disjunction (XOR).
  313. :param Graph other: The other graph to perform XOR with.
  314. :rtype: Graph
  315. :return: A new Graph instance.
  316. """
  317. cdef:
  318. void *cur
  319. cc.HashSetIter it
  320. BufferTriple* bt
  321. new_gr = self.empty_copy()
  322. # Add triples in this and not in other.
  323. cc.hashset_iter_init(&it, self._triples)
  324. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  325. bt = <BufferTriple*>cur
  326. if not other.trp_contains(bt):
  327. new_gr.add_triple(bt, True)
  328. # Other way around.
  329. cc.hashset_iter_init(&it, other._triples)
  330. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  331. bt = <BufferTriple*>cur
  332. if not self.trp_contains(bt):
  333. new_gr.add_triple(bt, True)
  334. return new_gr
  335. cdef void ip_xor(self, Graph other) except *:
  336. """
  337. In-place graph XOR.
  338. Triples in common with another graph are removed from the current one,
  339. and triples not in common will be added from the other one.
  340. :param Graph other: The other graph to perform XOR with.
  341. :rtype: void
  342. """
  343. cdef:
  344. void *cur
  345. cc.HashSetIter it
  346. # TODO This could be more efficient to stash values in a simple
  347. # array, but how urgent is it to improve an in-place XOR?
  348. Graph tmp = Graph()
  349. # Add *to the tmp graph* triples in other graph and not in this graph.
  350. cc.hashset_iter_init(&it, other._triples)
  351. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  352. bt = <BufferTriple*>cur
  353. if not self.trp_contains(bt):
  354. tmp.add_triple(bt)
  355. # Remove triples in common.
  356. cc.hashset_iter_init(&it, self._triples)
  357. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  358. bt = <BufferTriple*>cur
  359. if other.trp_contains(bt):
  360. self.remove_triple(bt)
  361. self |= tmp
  362. cdef bint trp_contains(self, const BufferTriple* btrp):
  363. cdef:
  364. cc.HashSetIter it
  365. void* cur
  366. cc.hashset_iter_init(&it, self._triples)
  367. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  368. if self.trp_cmp_fn(cur, btrp) == 0:
  369. return True
  370. return False
  371. cpdef void set(self, tuple trp) except *:
  372. """
  373. Set a single value for subject and predicate.
  374. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  375. """
  376. if None in trp:
  377. raise ValueError(f'Invalid triple: {trp}')
  378. self.remove((trp[0], trp[1], None))
  379. self.add((trp,))
  380. def as_rdflib(self):
  381. """
  382. Return the data set as an RDFLib Graph.
  383. :rtype: rdflib.Graph
  384. """
  385. gr = Graph(identifier=self.id)
  386. for trp in self.data:
  387. gr.add(trp)
  388. return gr
  389. def _slice(self, s, p, o):
  390. """
  391. Return terms filtered by other terms.
  392. This behaves like the rdflib.Graph slicing policy.
  393. """
  394. # If no terms are unbound, check for containment.
  395. if s is not None and p is not None and o is not None: # s p o
  396. return (s, p, o) in self
  397. # If some terms are unbound, do a lookup.
  398. res = self.lookup((s, p, o))
  399. if s is not None:
  400. if p is not None: # s p ?
  401. return {r[2] for r in res}
  402. if o is not None: # s ? o
  403. return {r[1] for r in res}
  404. # s ? ?
  405. return {(r[1], r[2]) for r in res}
  406. if p is not None:
  407. if o is not None: # ? p o
  408. return {r[0] for r in res}
  409. # ? p ?
  410. return {(r[0], r[2]) for r in res}
  411. if o is not None: # ? ? o
  412. return {(r[0], r[1]) for r in res}
  413. # ? ? ?
  414. return res
  415. def lookup(self, pattern):
  416. """
  417. Look up triples by a pattern.
  418. This function converts RDFLib terms into the serialized format stored
  419. in the graph's internal structure and compares them bytewise.
  420. Any and all of the lookup terms msy be ``None``.
  421. :rtype: Graph
  422. "return: New Graph instance with matching triples.
  423. """
  424. cdef:
  425. void* cur
  426. BufferTriple trp
  427. Graph res_gr = Graph()
  428. self._match_ptn_callback(pattern, res_gr, cb.add_trp_callback, NULL)
  429. return res_gr
  430. cdef void _match_ptn_callback(
  431. self, pattern, Graph gr,
  432. lookup_callback_fn_t callback_fn, void* ctx=NULL
  433. ) except *:
  434. """
  435. Execute an arbitrary function on a list of triples matching a pattern.
  436. The arbitrary function is appied to each triple found in the current
  437. graph, and to a discrete graph that can be the current graph itself
  438. or a different one.
  439. """
  440. cdef:
  441. void* cur
  442. Buffer t1, t2
  443. Buffer ss, sp, so
  444. BufferTriple trp
  445. BufferTriple* trp_p
  446. lookup_fn_t cmp_fn
  447. cc.HashSetIter it
  448. TripleKey spok
  449. s, p, o = pattern
  450. # Decide comparison logic outside the loop.
  451. if s is not None and p is not None and o is not None:
  452. # Shortcut for 3-term match.
  453. spok = [
  454. self.store.to_key(s),
  455. self.store.to_key(p),
  456. self.store.to_key(o),
  457. ]
  458. if self.keys.contains(spok):
  459. callback_fn(gr, &spok, ctx)
  460. return
  461. if s is not None:
  462. term.serialize_from_rdflib(s, &t1)
  463. if p is not None:
  464. cmp_fn = cb.lookup_sp_cmp_fn
  465. term.serialize_from_rdflib(p, &t2)
  466. elif o is not None:
  467. cmp_fn = cb.lookup_so_cmp_fn
  468. term.serialize_from_rdflib(o, &t2)
  469. else:
  470. cmp_fn = cb.lookup_s_cmp_fn
  471. elif p is not None:
  472. term.serialize_from_rdflib(p, &t1)
  473. if o is not None:
  474. cmp_fn = cb.lookup_po_cmp_fn
  475. term.serialize_from_rdflib(o, &t2)
  476. else:
  477. cmp_fn = cb.lookup_p_cmp_fn
  478. elif o is not None:
  479. cmp_fn = cb.lookup_o_cmp_fn
  480. term.serialize_from_rdflib(o, &t1)
  481. else:
  482. cmp_fn = cb.lookup_none_cmp_fn
  483. # Iterate over serialized triples.
  484. cc.hashset_iter_init(&it, self._triples)
  485. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  486. trp_p = <BufferTriple*>cur
  487. if cmp_fn(trp_p, &t1, &t2):
  488. callback_fn(gr, trp_p, ctx)