graph.pyx 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793
  1. import logging
  2. from functools import wraps
  3. from rdflib import Graph, URIRef
  4. from rdflib.term import Node
  5. from lakesuperior import env
  6. from libc.string cimport memcpy
  7. from libc.stdlib cimport free
  8. from cymem.cymem cimport Pool
  9. cimport lakesuperior.cy_include.collections as cc
  10. cimport lakesuperior.model.graph.callbacks as cb
  11. from lakesuperior.model.base cimport Buffer, buffer_dump
  12. from lakesuperior.model.structures.keyset import Keyset
  13. from lakesuperior.model.graph cimport term
  14. from lakesuperior.model.graph.triple cimport BufferTriple
  15. from lakesuperior.model.structures.hash cimport term_hash_seed32
  16. logger = logging.getLogger(__name__)
  17. cdef class Graph(Keyset):
  18. """
  19. Fast and simple implementation of a graph.
  20. Most functions should mimic RDFLib's graph with less overhead. It uses
  21. the same funny but functional slicing notation.
  22. A Graph can be instantiated from a store lookup. This makes it
  23. possible to use a Keyset to perform initial filtering via identity by key,
  24. then the filtered Keyset can be converted into a set of meaningful terms.
  25. An instance of this class can also be converted to and from a
  26. ``rdflib.Graph`` instance.
  27. """
  28. def __cinit__(self, *args, str uri=None, set data=set(), **kwargs):
  29. """
  30. Initialize the graph, optionally with Python data.
  31. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  32. """
  33. self.pool = Pool()
  34. # Initialize empty data set.
  35. if data:
  36. # Populate with provided Python set.
  37. self.add(data)
  38. ## PROPERTIES ##
  39. @property
  40. def data(self):
  41. """
  42. Triple data as a Python set.
  43. :rtype: set
  44. """
  45. cdef TripleKey spok
  46. ret = set()
  47. self.seek()
  48. while self.get_next(&spok):
  49. ret.add((
  50. self.store.from_key(trp[0]),
  51. self.store.from_key(trp[1]),
  52. self.store.from_key(trp[2])
  53. ))
  54. return ret
  55. ## MAGIC METHODS ##
  56. def __len__(self):
  57. """ Number of triples in the graph. """
  58. return self._free_i
  59. def __eq__(self, other):
  60. """ Equality operator between ``Graph`` instances. """
  61. return len(self ^ other) == 0
  62. def __repr__(self):
  63. """
  64. String representation of the graph.
  65. It provides the number of triples in the graph and memory address of
  66. the instance.
  67. """
  68. return (
  69. f'<{self.__class__.__name__} @{hex(id(self))} '
  70. f'length={len(self)}>'
  71. )
  72. def __str__(self):
  73. """ String dump of the graph triples. """
  74. return str(self.data)
  75. def __add__(self, other):
  76. """ Alias for set-theoretical union. """
  77. return self.union_(other)
  78. def __iadd__(self, other):
  79. """ Alias for in-place set-theoretical union. """
  80. self.ip_union(other)
  81. return self
  82. def __sub__(self, other):
  83. """ Set-theoretical subtraction. """
  84. return self.subtraction(other)
  85. def __isub__(self, other):
  86. """ In-place set-theoretical subtraction. """
  87. self.ip_subtraction(other)
  88. return self
  89. def __and__(self, other):
  90. """ Set-theoretical intersection. """
  91. return self.intersection(other)
  92. def __iand__(self, other):
  93. """ In-place set-theoretical intersection. """
  94. self.ip_intersection(other)
  95. return self
  96. def __or__(self, other):
  97. """ Set-theoretical union. """
  98. return self.union_(other)
  99. def __ior__(self, other):
  100. """ In-place set-theoretical union. """
  101. self.ip_union(other)
  102. return self
  103. def __xor__(self, other):
  104. """ Set-theoretical exclusive disjunction (XOR). """
  105. return self.xor(other)
  106. def __ixor__(self, other):
  107. """ In-place set-theoretical exclusive disjunction (XOR). """
  108. self.ip_xor(other)
  109. return self
  110. def __contains__(self, trp):
  111. """
  112. Whether the graph contains a triple.
  113. :rtype: boolean
  114. """
  115. cdef:
  116. Buffer ss, sp, so
  117. BufferTriple btrp
  118. btrp.s = &ss
  119. btrp.p = &sp
  120. btrp.o = &so
  121. s, p, o = trp
  122. term.serialize_from_rdflib(s, &ss)
  123. term.serialize_from_rdflib(p, &sp)
  124. term.serialize_from_rdflib(o, &so)
  125. return self.trp_contains(&btrp)
  126. def __iter__(self):
  127. """ Graph iterator. It iterates over the set triples. """
  128. yield from self.data
  129. # Slicing.
  130. def __getitem__(self, item):
  131. """
  132. Slicing function.
  133. It behaves similarly to `RDFLib graph slicing
  134. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  135. """
  136. if isinstance(item, slice):
  137. s, p, o = item.start, item.stop, item.step
  138. return self._slice(s, p, o)
  139. else:
  140. raise TypeError(f'Wrong slice format: {item}.')
  141. def __hash__(self):
  142. return 23465
  143. ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##
  144. def terms_by_type(self, type):
  145. """
  146. Get all terms of a type: subject, predicate or object.
  147. :param str type: One of ``s``, ``p`` or ``o``.
  148. """
  149. i = 'spo'.index(type)
  150. return {r[i] for r in self.data}
  151. def add_triples(self, trp):
  152. """
  153. Add triples to the graph.
  154. :param iterable triples: iterable of 3-tuple triples.
  155. """
  156. for s, p, o in triples:
  157. self.add([
  158. self.store.to_key(s),
  159. self.store.to_key(p),
  160. self.store.to_key(o),
  161. ])
  162. def len_terms(self):
  163. """ Number of terms in the graph. """
  164. return cc.hashset_size(self._terms)
  165. def remove(self, pattern):
  166. """
  167. Remove triples by pattern.
  168. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  169. """
  170. self._match_ptn_callback(
  171. pattern, self, cb.del_trp_callback, NULL
  172. )
  173. ## CYTHON-ACCESSIBLE BASIC METHODS ##
  174. cdef Graph empty_copy(self):
  175. """
  176. Create an empty copy carrying over some key properties.
  177. Override in subclasses to accommodate for different init properties.
  178. """
  179. return self.__class__()
  180. cpdef union_(self, Graph other):
  181. """
  182. Perform set union resulting in a new Graph instance.
  183. TODO Allow union of multiple graphs at a time.
  184. :param Graph other: The other graph to merge.
  185. :rtype: Graph
  186. :return: A new Graph instance.
  187. """
  188. cdef:
  189. void *cur
  190. cc.HashSetIter it
  191. BufferTriple *trp
  192. new_gr = self.empty_copy()
  193. for gr in (self, other):
  194. cc.hashset_iter_init(&it, gr._triples)
  195. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  196. bt = <BufferTriple*>cur
  197. new_gr.add_triple(bt, True)
  198. return new_gr
  199. cdef void ip_union(self, Graph other) except *:
  200. """
  201. Perform an in-place set union that adds triples to this instance
  202. TODO Allow union of multiple graphs at a time.
  203. :param Graph other: The other graph to merge.
  204. :rtype: void
  205. """
  206. cdef:
  207. void *cur
  208. cc.HashSetIter it
  209. cc.hashset_iter_init(&it, other._triples)
  210. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  211. bt = <BufferTriple*>cur
  212. self.add_triple(bt, True)
  213. cpdef intersection(self, Graph other):
  214. """
  215. Graph intersection.
  216. :param Graph other: The other graph to intersect.
  217. :rtype: Graph
  218. :return: A new Graph instance.
  219. """
  220. cdef:
  221. void *cur
  222. cc.HashSetIter it
  223. new_gr = self.empty_copy()
  224. cc.hashset_iter_init(&it, self._triples)
  225. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  226. bt = <BufferTriple*>cur
  227. if other.trp_contains(bt):
  228. new_gr.add_triple(bt, True)
  229. return new_gr
  230. cdef void ip_intersection(self, Graph other) except *:
  231. """
  232. In-place graph intersection.
  233. Triples not in common with another graph are removed from the current
  234. one.
  235. :param Graph other: The other graph to intersect.
  236. :rtype: void
  237. """
  238. cdef:
  239. void *cur
  240. cc.HashSetIter it
  241. cc.hashset_iter_init(&it, self._triples)
  242. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  243. bt = <BufferTriple*>cur
  244. if not other.trp_contains(bt):
  245. self.remove_triple(bt)
  246. cpdef subtraction(self, Graph other):
  247. """
  248. Graph set-theoretical subtraction.
  249. Create a new graph with the triples of this graph minus the ones in
  250. common with the other graph.
  251. :param Graph other: The other graph to subtract to this.
  252. :rtype: Graph
  253. :return: A new Graph instance.
  254. """
  255. cdef:
  256. void *cur
  257. cc.HashSetIter it
  258. new_gr = self.empty_copy()
  259. cc.hashset_iter_init(&it, self._triples)
  260. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  261. bt = <BufferTriple*>cur
  262. if not other.trp_contains(bt):
  263. new_gr.add_triple(bt, True)
  264. return new_gr
  265. cdef void ip_subtraction(self, Graph other) except *:
  266. """
  267. In-place graph subtraction.
  268. Triples in common with another graph are removed from the current one.
  269. :param Graph other: The other graph to intersect.
  270. :rtype: void
  271. """
  272. cdef:
  273. void *cur
  274. cc.HashSetIter it
  275. cc.hashset_iter_init(&it, self._triples)
  276. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  277. bt = <BufferTriple*>cur
  278. if other.trp_contains(bt):
  279. self.remove_triple(bt)
  280. cpdef xor(self, Graph other):
  281. """
  282. Graph Exclusive disjunction (XOR).
  283. :param Graph other: The other graph to perform XOR with.
  284. :rtype: Graph
  285. :return: A new Graph instance.
  286. """
  287. cdef:
  288. void *cur
  289. cc.HashSetIter it
  290. BufferTriple* bt
  291. new_gr = self.empty_copy()
  292. # Add triples in this and not in other.
  293. cc.hashset_iter_init(&it, self._triples)
  294. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  295. bt = <BufferTriple*>cur
  296. if not other.trp_contains(bt):
  297. new_gr.add_triple(bt, True)
  298. # Other way around.
  299. cc.hashset_iter_init(&it, other._triples)
  300. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  301. bt = <BufferTriple*>cur
  302. if not self.trp_contains(bt):
  303. new_gr.add_triple(bt, True)
  304. return new_gr
  305. cdef void ip_xor(self, Graph other) except *:
  306. """
  307. In-place graph XOR.
  308. Triples in common with another graph are removed from the current one,
  309. and triples not in common will be added from the other one.
  310. :param Graph other: The other graph to perform XOR with.
  311. :rtype: void
  312. """
  313. cdef:
  314. void *cur
  315. cc.HashSetIter it
  316. # TODO This could be more efficient to stash values in a simple
  317. # array, but how urgent is it to improve an in-place XOR?
  318. Graph tmp = Graph()
  319. # Add *to the tmp graph* triples in other graph and not in this graph.
  320. cc.hashset_iter_init(&it, other._triples)
  321. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  322. bt = <BufferTriple*>cur
  323. if not self.trp_contains(bt):
  324. tmp.add_triple(bt)
  325. # Remove triples in common.
  326. cc.hashset_iter_init(&it, self._triples)
  327. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  328. bt = <BufferTriple*>cur
  329. if other.trp_contains(bt):
  330. self.remove_triple(bt)
  331. self |= tmp
  332. cdef inline BufferTriple* store_triple(self, const BufferTriple* strp):
  333. """
  334. Store triple data in the graph.
  335. Normally, raw data underlying the triple and terms are only referenced
  336. by pointers. If the destination data are garbage collected before the
  337. graph is, segfaults are bound to happen.
  338. This method copies the data to the graph's memory pool, so they are
  339. managed with the lifecycle of the graph.
  340. Note that this method stores items regardless of whether thwy are
  341. duplicate or not, so there may be some duplication.
  342. """
  343. cdef:
  344. BufferTriple* dtrp = <BufferTriple*>self.pool.alloc(
  345. 1, sizeof(BufferTriple)
  346. )
  347. Buffer* spo = <Buffer*>self.pool.alloc(3, sizeof(Buffer))
  348. if not dtrp:
  349. raise MemoryError()
  350. if not spo:
  351. raise MemoryError()
  352. dtrp.s = spo
  353. dtrp.p = spo + 1
  354. dtrp.o = spo + 2
  355. spo[0].addr = self.pool.alloc(strp.s.sz, 1)
  356. spo[0].sz = strp.s.sz
  357. spo[1].addr = self.pool.alloc(strp.p.sz, 1)
  358. spo[1].sz = strp.p.sz
  359. spo[2].addr = self.pool.alloc(strp.o.sz, 1)
  360. spo[2].sz = strp.o.sz
  361. if not spo[0].addr or not spo[1].addr or not spo[2].addr:
  362. raise MemoryError()
  363. memcpy(dtrp.s.addr, strp.s.addr, strp.s.sz)
  364. memcpy(dtrp.p.addr, strp.p.addr, strp.p.sz)
  365. memcpy(dtrp.o.addr, strp.o.addr, strp.o.sz)
  366. return dtrp
  367. cdef bint trp_contains(self, const BufferTriple* btrp):
  368. cdef:
  369. cc.HashSetIter it
  370. void* cur
  371. cc.hashset_iter_init(&it, self._triples)
  372. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  373. if self.trp_cmp_fn(cur, btrp) == 0:
  374. return True
  375. return False
  376. cpdef void set(self, tuple trp) except *:
  377. """
  378. Set a single value for subject and predicate.
  379. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  380. """
  381. if None in trp:
  382. raise ValueError(f'Invalid triple: {trp}')
  383. self.remove((trp[0], trp[1], None))
  384. self.add((trp,))
  385. def as_rdflib(self):
  386. """
  387. Return the data set as an RDFLib Graph.
  388. :rtype: rdflib.Graph
  389. """
  390. gr = Graph()
  391. for trp in self.data:
  392. gr.add(trp)
  393. return gr
  394. def _slice(self, s, p, o):
  395. """
  396. Return terms filtered by other terms.
  397. This behaves like the rdflib.Graph slicing policy.
  398. """
  399. # If no terms are unbound, check for containment.
  400. if s is not None and p is not None and o is not None: # s p o
  401. return (s, p, o) in self
  402. # If some terms are unbound, do a lookup.
  403. res = self.lookup((s, p, o))
  404. if s is not None:
  405. if p is not None: # s p ?
  406. return {r[2] for r in res}
  407. if o is not None: # s ? o
  408. return {r[1] for r in res}
  409. # s ? ?
  410. return {(r[1], r[2]) for r in res}
  411. if p is not None:
  412. if o is not None: # ? p o
  413. return {r[0] for r in res}
  414. # ? p ?
  415. return {(r[0], r[2]) for r in res}
  416. if o is not None: # ? ? o
  417. return {(r[0], r[1]) for r in res}
  418. # ? ? ?
  419. return res
  420. def lookup(self, pattern):
  421. """
  422. Look up triples by a pattern.
  423. This function converts RDFLib terms into the serialized format stored
  424. in the graph's internal structure and compares them bytewise.
  425. Any and all of the lookup terms msy be ``None``.
  426. :rtype: Graph
  427. "return: New Graph instance with matching triples.
  428. """
  429. cdef:
  430. void* cur
  431. BufferTriple trp
  432. Graph res_gr = Graph()
  433. self._match_ptn_callback(pattern, res_gr, cb.add_trp_callback, NULL)
  434. return res_gr
  435. cdef void _match_ptn_callback(
  436. self, pattern, Graph gr,
  437. lookup_callback_fn_t callback_fn, void* ctx=NULL
  438. ) except *:
  439. """
  440. Execute an arbitrary function on a list of triples matching a pattern.
  441. The arbitrary function is appied to each triple found in the current
  442. graph, and to a discrete graph that can be the current graph itself
  443. or a different one.
  444. """
  445. cdef:
  446. void* cur
  447. Buffer t1, t2
  448. Buffer ss, sp, so
  449. BufferTriple trp
  450. BufferTriple* trp_p
  451. lookup_fn_t cmp_fn
  452. cc.HashSetIter it
  453. s, p, o = pattern
  454. # Decide comparison logic outside the loop.
  455. if s is not None and p is not None and o is not None:
  456. # Shortcut for 3-term match.
  457. trp.s = &ss
  458. trp.p = &sp
  459. trp.o = &so
  460. term.serialize_from_rdflib(s, trp.s, self.pool)
  461. term.serialize_from_rdflib(p, trp.p, self.pool)
  462. term.serialize_from_rdflib(o, trp.o, self.pool)
  463. if cc.hashset_contains(self._triples, &trp):
  464. callback_fn(gr, &trp, ctx)
  465. return
  466. if s is not None:
  467. term.serialize_from_rdflib(s, &t1)
  468. if p is not None:
  469. cmp_fn = cb.lookup_sp_cmp_fn
  470. term.serialize_from_rdflib(p, &t2)
  471. elif o is not None:
  472. cmp_fn = cb.lookup_so_cmp_fn
  473. term.serialize_from_rdflib(o, &t2)
  474. else:
  475. cmp_fn = cb.lookup_s_cmp_fn
  476. elif p is not None:
  477. term.serialize_from_rdflib(p, &t1)
  478. if o is not None:
  479. cmp_fn = cb.lookup_po_cmp_fn
  480. term.serialize_from_rdflib(o, &t2)
  481. else:
  482. cmp_fn = cb.lookup_p_cmp_fn
  483. elif o is not None:
  484. cmp_fn = cb.lookup_o_cmp_fn
  485. term.serialize_from_rdflib(o, &t1)
  486. else:
  487. cmp_fn = cb.lookup_none_cmp_fn
  488. # Iterate over serialized triples.
  489. cc.hashset_iter_init(&it, self._triples)
  490. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  491. trp_p = <BufferTriple*>cur
  492. if cmp_fn(trp_p, &t1, &t2):
  493. callback_fn(gr, trp_p, ctx)
  494. cdef class Imr(Graph):
  495. """
  496. In-memory resource data container.
  497. This is an extension of :py:class:`~Graph` that adds a subject URI to
  498. the data set and some convenience methods.
  499. An instance of this class can be converted to a ``rdflib.Resource``
  500. instance.
  501. Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
  502. will create a new ``Imr`` instance with the same subject URI.
  503. """
  504. def __init__(self, uri, *args, **kwargs):
  505. """
  506. Initialize the graph with pre-existing data or by looking up a store.
  507. Either ``data``, or ``lookup`` *and* ``store``, can be provide.
  508. ``lookup`` and ``store`` have precedence. If none of them is specified,
  509. an empty graph is initialized.
  510. :param rdflib.URIRef uri: The graph URI.
  511. This will serve as the subject for some queries.
  512. :param args: Positional arguments inherited from
  513. ``Graph.__init__``.
  514. :param kwargs: Keyword arguments inherited from
  515. ``Graph.__init__``.
  516. """
  517. self.id = str(uri)
  518. #super().__init(*args, **kwargs)
  519. def __repr__(self):
  520. """
  521. String representation of an Imr.
  522. This includes the subject URI, number of triples contained and the
  523. memory address of the instance.
  524. """
  525. return (f'<{self.__class__.__name__} @{hex(id(self))} id={self.id}, '
  526. f'length={len(self)}>')
  527. def __getitem__(self, item):
  528. """
  529. Supports slicing notation.
  530. """
  531. if isinstance(item, slice):
  532. s, p, o = item.start, item.stop, item.step
  533. return self._slice(s, p, o)
  534. elif isinstance(item, Node):
  535. # If a Node is given, return all values for that predicate.
  536. return self._slice(self.uri, item, None)
  537. else:
  538. raise TypeError(f'Wrong slice format: {item}.')
  539. @property
  540. def uri(self):
  541. """
  542. Get resource identifier as a RDFLib URIRef.
  543. :rtype: rdflib.URIRef.
  544. """
  545. return URIRef(self.id)
  546. cdef Imr empty_copy(self):
  547. """
  548. Create an empty instance carrying over some key properties.
  549. """
  550. return self.__class__(uri=self.id)
  551. def value(self, p, strict=False):
  552. """
  553. Get an individual value.
  554. :param rdflib.termNode p: Predicate to search for.
  555. :param bool strict: If set to ``True`` the method raises an error if
  556. more than one value is found. If ``False`` (the default) only
  557. the first found result is returned.
  558. :rtype: rdflib.term.Node
  559. """
  560. # TODO use slice.
  561. values = {trp[2] for trp in self.lookup((self.uri, p, None))}
  562. if strict and len(values) > 1:
  563. raise RuntimeError('More than one value found for {}, {}.'.format(
  564. self.id, p))
  565. for ret in values:
  566. return ret
  567. return None
  568. cpdef as_rdflib(self):
  569. """
  570. Return the IMR as a RDFLib Resource.
  571. :rtype: rdflib.Resource
  572. """
  573. gr = Graph()
  574. for trp in self.data:
  575. gr.add(trp)
  576. return gr.resource(identifier=self.uri)