graph.pyx 34 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189
  1. import logging
  2. from functools import wraps
  3. from rdflib import Graph
  4. from rdflib.term import Node
  5. from lakesuperior import env
  6. from libc.stdint cimport uint32_t, uint64_t
  7. from libc.string cimport memcmp, memcpy
  8. from libc.stdlib cimport free
  9. from cymem.cymem cimport Pool
  10. from lakesuperior.cy_include cimport cylmdb as lmdb
  11. from lakesuperior.cy_include cimport collections as cc
  12. from lakesuperior.cy_include cimport spookyhash as sph
  13. from lakesuperior.model.graph cimport term
  14. from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
  15. KLEN, DBL_KLEN, TRP_KLEN, TripleKey)
  16. from lakesuperior.model.structures.hash cimport term_hash_seed32
  17. from lakesuperior.model.structures.keyset cimport Keyset
  18. from lakesuperior.model.base cimport Buffer
  19. from lakesuperior.model.graph.triple cimport BufferTriple
  20. from lakesuperior.model.structures.hash cimport hash64
  21. cdef extern from 'spookyhash_api.h':
  22. uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
  23. logger = logging.getLogger(__name__)
  24. cdef int term_cmp_fn(const void* key1, const void* key2):
  25. """
  26. Compare function for two Buffer objects.
  27. :rtype: int
  28. :return: 0 if the byte streams are the same, another integer otherwise.
  29. """
  30. b1 = <Buffer *>key1
  31. b2 = <Buffer *>key2
  32. if b1.sz != b2.sz:
  33. logger.info(f'Sizes differ: {b1.sz} != {b2.sz}. Return 1.')
  34. return 1
  35. cdef int cmp = memcmp(b1.addr, b2.addr, b1.sz)
  36. logger.info(f'term memcmp: {cmp}')
  37. return cmp
  38. cdef int trp_lit_cmp_fn(const void* key1, const void* key2):
  39. """
  40. Compare function for two triples in a set.
  41. s, p, o byte data are compared literally.
  42. :rtype: int
  43. :return: 0 if all three terms point to byte-wise identical data in both
  44. triples.
  45. """
  46. t1 = <BufferTriple *>key1
  47. t2 = <BufferTriple *>key2
  48. #print('Comparing terms: {} {} {}'.format(
  49. # (<unsigned char*>t1.s.addr)[:t1.s.sz],
  50. # (<unsigned char*>t1.p.addr)[:t1.p.sz],
  51. # (<unsigned char*>t1.o.addr)[:t1.o.sz]
  52. #))
  53. #print('With: {} {} {}'.format(
  54. # (<unsigned char*>t2.s.addr)[:t2.s.sz],
  55. # (<unsigned char*>t2.p.addr)[:t2.p.sz],
  56. # (<unsigned char*>t2.o.addr)[:t2.o.sz]
  57. #))
  58. diff = (
  59. term_cmp_fn(t1.o, t2.o) or
  60. term_cmp_fn(t1.s, t2.s) or
  61. term_cmp_fn(t1.p, t2.p)
  62. )
  63. logger.info(f'Triples match: {not(diff)}')
  64. return diff
  65. cdef int trp_cmp_fn(const void* key1, const void* key2):
  66. """
  67. Compare function for two triples in a set.
  68. Here, pointers to terms are compared for s, p, o. The pointers should be
  69. guaranteed to point to unique values (i.e. no two pointers have the same
  70. term value within a graph).
  71. :rtype: int
  72. :return: 0 if the addresses of all terms are the same, 1 otherwise.
  73. """
  74. t1 = <BufferTriple *>key1
  75. t2 = <BufferTriple *>key2
  76. #print('Comparing terms: {} {} {}'.format(
  77. # (<unsigned char*>t1.s.addr)[:t1.s.sz],
  78. # (<unsigned char*>t1.p.addr)[:t1.p.sz],
  79. # (<unsigned char*>t1.o.addr)[:t1.o.sz]
  80. #))
  81. #print('With: {} {} {}'.format(
  82. # (<unsigned char*>t2.s.addr)[:t2.s.sz],
  83. # (<unsigned char*>t2.p.addr)[:t2.p.sz],
  84. # (<unsigned char*>t2.o.addr)[:t2.o.sz]
  85. #))
  86. #print('Comparing addresses: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  87. # <size_t>t1.s, <size_t>t1.p, <size_t>t1.o))
  88. #print('With: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  89. # <size_t>t2.s, <size_t>t2.p, <size_t>t2.o))
  90. cdef int is_not_equal = (
  91. t1.s.addr != t2.s.addr or
  92. t1.p.addr != t2.p.addr or
  93. t1.o.addr != t2.o.addr
  94. )
  95. logger.info(f'Triples match: {not(is_not_equal)}')
  96. return is_not_equal
  97. cdef bint graph_eq_fn(SimpleGraph g1, SimpleGraph g2):
  98. """
  99. Compare 2 graphs for equality.
  100. Note that this returns the opposite value than the triple and term
  101. compare functions: 1 (True) if equal, 0 (False) if not.
  102. """
  103. cdef:
  104. void* el
  105. cc.HashSetIter it
  106. cc.hashset_iter_init(&it, g1._triples)
  107. while cc.hashset_iter_next(&it, &el) != cc.CC_ITER_END:
  108. if cc.hashset_contains(g2._triples, el):
  109. return False
  110. return True
  111. cdef size_t term_hash_fn(const void* key, int l, uint32_t seed):
  112. """
  113. Hash function for serialized terms (:py:class:`Buffer` objects)
  114. """
  115. return <size_t>spookyhash_64((<Buffer*>key).addr, (<Buffer*>key).sz, seed)
  116. cdef size_t trp_lit_hash_fn(const void* key, int l, uint32_t seed):
  117. """
  118. Hash function for sets of (serialized) triples.
  119. This function concatenates the literal terms of the triple as bytes
  120. and computes their hash.
  121. """
  122. trp = <BufferTriple*>key
  123. seed64 = <uint64_t>seed
  124. seed_dummy = seed64
  125. cdef sph.spookyhash_context ctx
  126. sph.spookyhash_context_init(&ctx, seed64, seed_dummy)
  127. sph.spookyhash_update(&ctx, trp.s.addr, trp.s.sz)
  128. sph.spookyhash_update(&ctx, trp.s.addr, trp.p.sz)
  129. sph.spookyhash_update(&ctx, trp.s.addr, trp.o.sz)
  130. sph.spookyhash_final(&ctx, &seed64, &seed_dummy)
  131. return <size_t>seed64
  132. cdef size_t trp_hash_fn(const void* key, int l, uint32_t seed):
  133. """
  134. Hash function for sets of (serialized) triples.
  135. This function computes the hash of the concatenated pointer values in the
  136. s, p, o members of the triple. The triple structure is treated as a byte
  137. string. This is safe in spite of byte-wise struct evaluation being a
  138. frowned-upon practice (due to padding issues), because it is assumed that
  139. the input value is always the same type of structure.
  140. """
  141. return <size_t>spookyhash_64(key, l, seed)
  142. cdef size_t hash_ptr_passthrough(const void* key, int l, uint32_t seed):
  143. """
  144. No-op function that takes a pointer and does *not* hash it.
  145. The pointer value is used as the "hash".
  146. """
  147. return <size_t>key
  148. cdef inline bint lookup_none_cmp_fn(
  149. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  150. ):
  151. """
  152. Dummy callback for queries with all parameters unbound.
  153. This function always returns ``True``
  154. """
  155. return True
  156. cdef inline bint lookup_s_cmp_fn(
  157. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  158. ):
  159. """
  160. Lookup callback compare function for a given s in a triple.
  161. The function returns ``True`` if ``t1`` matches the first term.
  162. ``t2`` is not used and is declared only for compatibility with the
  163. other interchangeable functions.
  164. """
  165. return term_cmp_fn(t1, trp[0].s)
  166. cdef inline bint lookup_p_cmp_fn(
  167. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  168. ):
  169. return term_cmp_fn(t1, trp[0].p)
  170. cdef inline bint lookup_o_cmp_fn(
  171. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  172. ):
  173. return term_cmp_fn(t1, trp[0].o)
  174. cdef inline bint lookup_sp_cmp_fn(
  175. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  176. ):
  177. return (
  178. term_cmp_fn(t1, trp[0].s)
  179. and term_cmp_fn(t2, trp[0].p))
  180. cdef inline bint lookup_so_cmp_fn(
  181. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  182. ):
  183. return (
  184. term_cmp_fn(t1, trp[0].s)
  185. and term_cmp_fn(t2, trp[0].o))
  186. cdef inline bint lookup_po_cmp_fn(
  187. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  188. ):
  189. return (
  190. term_cmp_fn(t1, trp[0].p)
  191. and term_cmp_fn(t2, trp[0].o))
  192. cdef class SimpleGraph:
  193. """
  194. Fast and simple implementation of a graph.
  195. Most functions should mimic RDFLib's graph with less overhead. It uses
  196. the same funny but functional slicing notation. No lookup functions within
  197. the graph are available at this time.
  198. Instances of this class hold a set of
  199. :py:class:`~lakesuperior.store.ldp_rs.term.Term` structures that stores
  200. unique terms within the graph, and a set of
  201. :py:class:`~lakesuperior.store.ldp_rs.triple.Triple` structures referencing
  202. those terms. Therefore, no data duplication occurs and the storage is quite
  203. sparse.
  204. A graph can be instantiated from a store lookup.
  205. A SimpleGraph can also be obtained from a
  206. :py:class:`lakesuperior.store.keyset.Keyset` which is convenient bacause
  207. a Keyset can be obtained very efficiently from querying a store, then also
  208. very efficiently filtered and eventually converted into a set of meaningful
  209. terms.
  210. An instance of this class can also be converted to and from a
  211. ``rdflib.Graph`` instance. TODO verify that this frees Cython pointers.
  212. """
  213. def __cinit__(
  214. self, Keyset keyset=None, store=None, set data=set(), *args, **kwargs):
  215. """
  216. Initialize the graph with pre-existing data or by looking up a store.
  217. One of ``keyset``, or ``data`` can be provided. If more than
  218. one of these is provided, precedence is given in the mentioned order.
  219. If none of them is specified, an empty graph is initialized.
  220. :param rdflib.URIRef uri: The graph URI.
  221. This will serve as the subject for some queries.
  222. :param Keyset keyset: Keyset to create the graph from. Keys will be
  223. converted to set elements.
  224. :param lakesuperior.store.ldp_rs.LmdbTripleStore store: store to
  225. look up the keyset. Only used if ``keyset`` is specified. If not
  226. set, the environment store is used.
  227. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  228. :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
  229. E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
  230. Any and all elements may be ``None``.
  231. :param lmdbStore store: the store to look data up.
  232. """
  233. cdef:
  234. cc.HashSetConf terms_conf, trp_conf
  235. self.term_cmp_fn = &term_cmp_fn
  236. self.trp_cmp_fn = &trp_lit_cmp_fn
  237. cc.hashset_conf_init(&terms_conf)
  238. terms_conf.load_factor = 0.85
  239. terms_conf.hash = &term_hash_fn
  240. terms_conf.hash_seed = term_hash_seed32
  241. terms_conf.key_compare = self.term_cmp_fn
  242. terms_conf.key_length = sizeof(Buffer*)
  243. cc.hashset_conf_init(&trp_conf)
  244. trp_conf.load_factor = 0.75
  245. trp_conf.hash = &trp_lit_hash_fn
  246. trp_conf.hash_seed = term_hash_seed32
  247. trp_conf.key_compare = self.trp_cmp_fn
  248. trp_conf.key_length = sizeof(BufferTriple)
  249. cc.hashset_new_conf(&terms_conf, &self._terms)
  250. cc.hashset_new_conf(&trp_conf, &self._triples)
  251. self.store = store or env.app_globals.rdf_store
  252. self._pool = Pool()
  253. # Initialize empty data set.
  254. if keyset:
  255. # Populate with triples extracted from provided key set.
  256. self._data_from_keyset(keyset)
  257. elif data:
  258. # Populate with provided Python set.
  259. self.add(data)
  260. def __dealloc__(self):
  261. """
  262. Free the triple pointers. TODO use a Cymem pool
  263. """
  264. free(self._triples)
  265. free(self._terms)
  266. @property
  267. def data(self):
  268. """
  269. Triple data as a Python set.
  270. :rtype: set
  271. """
  272. return self._to_pyset()
  273. @property
  274. def terms(self):
  275. return self._get_terms()
  276. # # # BASIC SET OPERATIONS # # #
  277. cpdef SimpleGraph union(self, SimpleGraph other):
  278. """
  279. Perform set union resulting in a new SimpleGraph instance.
  280. TODO Allow union of multiple graphs at a time.
  281. :param SimpleGraph other: The other graph to merge.
  282. :rtype: SimpleGraph
  283. :return: A new SimpleGraph instance.
  284. """
  285. cdef:
  286. void *cur
  287. cc.HashSetIter it
  288. SimpleGraph new_gr = SimpleGraph()
  289. BufferTriple *trp
  290. new_gr.store = self.store
  291. for gr in (self, other):
  292. cc.hashset_iter_init(&it, gr._triples)
  293. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  294. bt = <BufferTriple*>cur
  295. new_gr._add_triple(bt)
  296. return new_gr
  297. cdef void ip_union(self, SimpleGraph other) except *:
  298. """
  299. Perform an in-place set union that adds triples to this instance
  300. TODO Allow union of multiple graphs at a time.
  301. :param SimpleGraph other: The other graph to merge.
  302. :rtype: void
  303. """
  304. cdef:
  305. void *cur
  306. cc.HashSetIter it
  307. cc.hashset_iter_init(&it, other._triples)
  308. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  309. bt = <BufferTriple*>cur
  310. self._add_triple(bt)
  311. cpdef SimpleGraph intersection(self, SimpleGraph other):
  312. """
  313. Graph intersection.
  314. :param SimpleGraph other: The other graph to intersect.
  315. :rtype: SimpleGraph
  316. :return: A new SimpleGraph instance.
  317. """
  318. cdef:
  319. void *cur
  320. cc.HashSetIter it
  321. SimpleGraph new_gr = SimpleGraph()
  322. cc.hashset_iter_init(&it, self._triples)
  323. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  324. bt = <BufferTriple*>cur
  325. #print('Checking: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  326. # <size_t>bt.s, <size_t>bt.p, <size_t>bt.o))
  327. if other._trp_contains(bt):
  328. #print('Adding.')
  329. new_gr._add_triple(bt)
  330. return new_gr
  331. cdef void ip_intersection(self, SimpleGraph other) except *:
  332. """
  333. In-place graph intersection.
  334. Triples not in common with another graph are removed from the current
  335. one.
  336. :param SimpleGraph other: The other graph to intersect.
  337. :rtype: void
  338. """
  339. cdef:
  340. void *cur
  341. cc.HashSetIter it
  342. cc.hashset_iter_init(&it, self._triples)
  343. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  344. bt = <BufferTriple*>cur
  345. if not other._trp_contains(bt):
  346. self._remove_triple(bt)
  347. cpdef SimpleGraph subtraction(self, SimpleGraph other):
  348. """
  349. Graph set-theoretical subtraction.
  350. Create a new graph with the triples of this graph minus the ones in
  351. common with the other graph.
  352. :param SimpleGraph other: The other graph to subtract to this.
  353. :rtype: SimpleGraph
  354. :return: A new SimpleGraph instance.
  355. """
  356. cdef:
  357. void *cur
  358. cc.HashSetIter it
  359. SimpleGraph new_gr = SimpleGraph()
  360. cc.hashset_iter_init(&it, self._triples)
  361. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  362. bt = <BufferTriple*>cur
  363. #print('Checking: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  364. # <size_t>bt.s, <size_t>bt.p, <size_t>bt.o))
  365. if not other._trp_contains(bt):
  366. #print('Adding.')
  367. new_gr._add_triple(bt)
  368. return new_gr
  369. cdef void ip_subtraction(self, SimpleGraph other) except *:
  370. """
  371. In-place graph subtraction.
  372. Triples in common with another graph are removed from the current one.
  373. :param SimpleGraph other: The other graph to intersect.
  374. :rtype: void
  375. """
  376. cdef:
  377. void *cur
  378. cc.HashSetIter it
  379. cc.hashset_iter_init(&it, self._triples)
  380. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  381. bt = <BufferTriple*>cur
  382. if other._trp_contains(bt):
  383. self._remove_triple(bt)
  384. cpdef SimpleGraph xor(self, SimpleGraph other):
  385. """
  386. Graph Exclusive disjunction (XOR).
  387. :param SimpleGraph other: The other graph to perform XOR with.
  388. :rtype: SimpleGraph
  389. :return: A new SimpleGraph instance.
  390. """
  391. cdef:
  392. void *cur
  393. cc.HashSetIter it
  394. SimpleGraph new_gr = SimpleGraph()
  395. BufferTriple* bt
  396. # Add triples in this and not in other.
  397. cc.hashset_iter_init(&it, self._triples)
  398. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  399. bt = <BufferTriple*>cur
  400. if not other._trp_contains(bt):
  401. new_gr._add_triple(bt)
  402. # Other way around.
  403. cc.hashset_iter_init(&it, other._triples)
  404. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  405. bt = <BufferTriple*>cur
  406. if not self._trp_contains(bt):
  407. new_gr._add_triple(bt)
  408. return new_gr
  409. cdef void ip_xor(self, SimpleGraph other) except *:
  410. """
  411. In-place graph XOR.
  412. Triples in common with another graph are removed from the current one,
  413. and triples not in common will be added from the other one.
  414. :param SimpleGraph other: The other graph to perform XOR with.
  415. :rtype: void
  416. """
  417. cdef:
  418. void *cur
  419. cc.HashSetIter it
  420. # TODO This could be more efficient to stash values in a simple
  421. # array, but how urgent is it to improve an in-place XOR?
  422. SimpleGraph tmp = SimpleGraph()
  423. # Add *to the tmp graph* triples in other graph and not in this graph.
  424. cc.hashset_iter_init(&it, other._triples)
  425. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  426. bt = <BufferTriple*>cur
  427. if not self._trp_contains(bt):
  428. tmp._add_triple(bt)
  429. # Remove triples in common.
  430. cc.hashset_iter_init(&it, self._triples)
  431. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  432. bt = <BufferTriple*>cur
  433. if other._trp_contains(bt):
  434. print(self._remove_triple(bt))
  435. self |= tmp
  436. cdef void _data_from_lookup(self, tuple trp_ptn, ctx=None) except *:
  437. """
  438. Look up triples in the triplestore and load them into ``data``.
  439. :param tuple lookup: 3-tuple of RDFlib terms or ``None``.
  440. :param LmdbTriplestore store: Reference to a LMDB triplestore. This
  441. is normally set to ``lakesuperior.env.app_globals.rdf_store``.
  442. """
  443. cdef:
  444. size_t i
  445. unsigned char spok[TRP_KLEN]
  446. with self.store.txn_ctx():
  447. keyset = self.store.triple_keys(trp_ptn, ctx)
  448. self.data_from_keyset(keyset)
  449. cdef void _data_from_keyset(self, Keyset data) except *:
  450. """Populate a graph from a Keyset."""
  451. cdef TripleKey spok
  452. while data.next(spok):
  453. self._add_from_spok(spok)
  454. cdef inline void _add_from_spok(self, TripleKey spok) except *:
  455. """
  456. Add a triple from a TripleKey of term keys.
  457. """
  458. s_spo = <SPOBuffer>self._pool.alloc(3, sizeof(Buffer))
  459. self.store.lookup_term(spok, s_spo)
  460. self.store.lookup_term(spok + KLEN, s_spo + 1)
  461. self.store.lookup_term(spok + DBL_KLEN, s_spo + 2)
  462. trp = <BufferTriple *>self._pool.alloc(1, sizeof(BufferTriple))
  463. trp.s = s_spo
  464. trp.p = s_spo + 1
  465. trp.o = s_spo + 2
  466. self._add_triple(trp)
  467. cdef inline void _add_triple(self, BufferTriple* trp) except *:
  468. """
  469. Add a triple from 3 (TPL) serialized terms.
  470. Each of the terms is added to the term set if not existing. The triple
  471. also is only added if not existing.
  472. """
  473. logger.info('Inserting terms.')
  474. cc.hashset_add(self._terms, trp.s)
  475. cc.hashset_add(self._terms, trp.p)
  476. cc.hashset_add(self._terms, trp.o)
  477. logger.info('inserted terms.')
  478. logger.info(f'Terms set size: {cc.hashset_size(self._terms)}')
  479. cdef size_t trp_sz = cc.hashset_size(self._triples)
  480. logger.info(f'Triples set size before adding: {trp_sz}')
  481. r = cc.hashset_add(self._triples, trp)
  482. #print('Insert triple result:')
  483. #print(r)
  484. trp_sz = cc.hashset_size(self._triples)
  485. logger.info(f'Triples set size after adding: {trp_sz}')
  486. cdef:
  487. cc.HashSetIter ti
  488. void *cur
  489. cdef int _remove_triple(self, BufferTriple* btrp) except -1:
  490. """
  491. Remove one triple from the graph.
  492. """
  493. return cc.hashset_remove(self._triples, btrp, NULL)
  494. cdef bint _trp_contains(self, BufferTriple* btrp):
  495. cdef:
  496. cc.HashSetIter it
  497. void* cur
  498. cc.hashset_iter_init(&it, self._triples)
  499. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  500. if self.trp_cmp_fn(cur, btrp) == 0:
  501. return True
  502. return False
  503. cdef _get_terms(self):
  504. """
  505. Get all terms in the graph.
  506. """
  507. cdef:
  508. cc.HashSetIter it
  509. void *cur
  510. terms = [] # This is intentionally a list to spot issues with the set.
  511. cc.hashset_iter_init(&it, self._terms)
  512. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  513. s_term = <Buffer*>cur
  514. terms.append((f'0x{<size_t>cur:02x}', term.deserialize_to_rdflib(s_term)))
  515. return terms
  516. cdef set _to_pyset(self):
  517. """
  518. Convert triple data to a Python set.
  519. :rtype: set
  520. """
  521. cdef:
  522. void *void_p
  523. cc.HashSetIter ti
  524. term.Term s, p, o
  525. graph_set = set()
  526. # Looping over an empty HashSet results in a segfault. Exit early in
  527. # that case.
  528. #if not cc.hashset_size(self._triples):
  529. # return graph_set
  530. cc.hashset_iter_init(&ti, self._triples)
  531. while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
  532. if void_p == NULL:
  533. logger.warn('Triple is NULL!')
  534. break
  535. trp = <BufferTriple *>void_p
  536. graph_set.add((
  537. term.deserialize_to_rdflib(trp.s),
  538. term.deserialize_to_rdflib(trp.p),
  539. term.deserialize_to_rdflib(trp.o),
  540. ))
  541. return graph_set
  542. # Basic set operations.
  543. def add(self, trp):
  544. """
  545. Add triples to the graph.
  546. :param iterable triples: Set, list or tuple of 3-tuple triples.
  547. """
  548. cdef size_t cur = 0, trp_cur = 0
  549. trp_ct = len(trp)
  550. term_buf = <Buffer*>self._pool.alloc(3 * trp_ct, sizeof(Buffer))
  551. trp_buf = <BufferTriple*>self._pool.alloc(trp_ct, sizeof(BufferTriple))
  552. for s, p, o in trp:
  553. term.serialize_from_rdflib(s, term_buf + cur, self._pool)
  554. term.serialize_from_rdflib(p, term_buf + cur + 1, self._pool)
  555. term.serialize_from_rdflib(o, term_buf + cur + 2, self._pool)
  556. (trp_buf + trp_cur).s = term_buf + cur
  557. (trp_buf + trp_cur).p = term_buf + cur + 1
  558. (trp_buf + trp_cur).o = term_buf + cur + 2
  559. self._add_triple(trp_buf + trp_cur)
  560. trp_cur += 1
  561. cur += 3
  562. def len_terms(self):
  563. """ Number of triples in the graph. """
  564. return cc.hashset_size(self._terms)
  565. def remove(self, trp):
  566. """
  567. Remove one item from the graph.
  568. :param tuple item: A 3-tuple of RDFlib terms. Only exact terms, i.e.
  569. wildcards are not accepted.
  570. """
  571. cdef:
  572. Buffer ss, sp, so
  573. BufferTriple trp_buf
  574. term.serialize_from_rdflib(trp[0], &ss, self._pool)
  575. term.serialize_from_rdflib(trp[1], &sp, self._pool)
  576. term.serialize_from_rdflib(trp[2], &so, self._pool)
  577. trp_buf.s = &ss
  578. trp_buf.p = &sp
  579. trp_buf.o = &so
  580. self._remove_triple(&trp_buf)
  581. def __len__(self):
  582. """ Number of triples in the graph. """
  583. return cc.hashset_size(self._triples)
  584. def __eq__(self, other):
  585. """ Equality operator between ``SimpleGraph`` instances. """
  586. return len(self ^ other) == 0
  587. def __repr__(self):
  588. """
  589. String representation of the graph.
  590. It provides the number of triples in the graph and memory address of
  591. the instance.
  592. """
  593. return (
  594. f'<{self.__class__.__name__} @{hex(id(self))} '
  595. f'length={len(self.data)}>'
  596. )
  597. def __str__(self):
  598. """ String dump of the graph triples. """
  599. return str(self.data)
  600. def __add__(self, other):
  601. """ Alias for set-theoretical union. """
  602. return self.union(other)
  603. def __iadd__(self, other):
  604. """ Alias for in-place set-theoretical union. """
  605. self.ip_union(other)
  606. return self
  607. def __sub__(self, other):
  608. """ Set-theoretical subtraction. """
  609. return self.subtraction(other)
  610. def __isub__(self, other):
  611. """ In-place set-theoretical subtraction. """
  612. self.ip_subtraction(other)
  613. return self
  614. def __and__(self, other):
  615. """ Set-theoretical intersection. """
  616. return self.intersection(other)
  617. def __iand__(self, other):
  618. """ In-place set-theoretical intersection. """
  619. self.ip_intersection(other)
  620. return self
  621. def __or__(self, other):
  622. """ Set-theoretical union. """
  623. return self.union(other)
  624. def __ior__(self, other):
  625. """ In-place set-theoretical union. """
  626. self.ip_union(other)
  627. return self
  628. def __xor__(self, other):
  629. """ Set-theoretical exclusive disjunction (XOR). """
  630. return self.xor(other)
  631. def __ixor__(self, other):
  632. """ In-place set-theoretical exclusive disjunction (XOR). """
  633. self.ip_xor(other)
  634. return self
  635. def __contains__(self, trp):
  636. """
  637. Whether the graph contains a triple.
  638. :rtype: boolean
  639. """
  640. cdef:
  641. Buffer ss, sp, so
  642. BufferTriple btrp
  643. btrp.s = &ss
  644. btrp.p = &sp
  645. btrp.o = &so
  646. s, p, o = trp
  647. term.serialize_from_rdflib(s, &ss)
  648. term.serialize_from_rdflib(p, &sp)
  649. term.serialize_from_rdflib(o, &so)
  650. return self._trp_contains(&btrp)
  651. def __iter__(self):
  652. """ Graph iterator. It iterates over the set triples. """
  653. raise NotImplementedError()
  654. # Slicing.
  655. def __getitem__(self, item):
  656. """
  657. Slicing function.
  658. It behaves similarly to `RDFLib graph slicing
  659. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  660. """
  661. if isinstance(item, slice):
  662. s, p, o = item.start, item.stop, item.step
  663. return self._slice(s, p, o)
  664. else:
  665. raise TypeError(f'Wrong slice format: {item}.')
  666. cpdef void set(self, tuple trp) except *:
  667. """
  668. Set a single value for subject and predicate.
  669. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  670. """
  671. if None in trp:
  672. raise ValueError(f'Invalid triple: {trp}')
  673. self.remove_triples((trp[0], trp[1], None))
  674. self.add((trp,))
  675. cpdef void remove_triples(self, pattern) except *:
  676. """
  677. Remove triples by pattern.
  678. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  679. """
  680. s, p, o = pattern
  681. for match in self.lookup(s, p, o):
  682. logger.debug(f'Removing from graph: {match}.')
  683. self.data.remove(match)
  684. cpdef object as_rdflib(self):
  685. """
  686. Return the data set as an RDFLib Graph.
  687. :rtype: rdflib.Graph
  688. """
  689. gr = Graph()
  690. for trp in self.data:
  691. gr.add(trp)
  692. return gr
  693. def _slice(self, s, p, o):
  694. """
  695. Return terms filtered by other terms.
  696. This behaves like the rdflib.Graph slicing policy.
  697. """
  698. _data = self.data
  699. logger.debug(f'Slicing graph by: {s}, {p}, {o}.')
  700. if s is None and p is None and o is None:
  701. return _data
  702. elif s is None and p is None:
  703. return {(r[0], r[1]) for r in _data if r[2] == o}
  704. elif s is None and o is None:
  705. return {(r[0], r[2]) for r in _data if r[1] == p}
  706. elif p is None and o is None:
  707. return {(r[1], r[2]) for r in _data if r[0] == s}
  708. elif s is None:
  709. return {r[0] for r in _data if r[1] == p and r[2] == o}
  710. elif p is None:
  711. return {r[1] for r in _data if r[0] == s and r[2] == o}
  712. elif o is None:
  713. return {r[2] for r in _data if r[0] == s and r[1] == p}
  714. else:
  715. # all given
  716. return (s,p,o) in _data
  717. def lookup(self, s, p, o):
  718. """
  719. Look up triples by a pattern.
  720. This function converts RDFLib terms into the serialized format stored
  721. in the graph's internal structure and compares them bytewise.
  722. Any and all of the lookup terms can be ``None``.
  723. """
  724. cdef:
  725. void *void_p
  726. BufferTriple trp
  727. BufferTriple *trp_p
  728. cc.HashSetIter ti
  729. Buffer t1
  730. Buffer t2
  731. lookup_fn_t fn
  732. res = set()
  733. # Decide comparison logic outside the loop.
  734. if s is not None and p is not None and o is not None:
  735. # Return immediately if 3-term match is requested.
  736. term.serialize_from_rdflib(s, trp.s, self._pool)
  737. term.serialize_from_rdflib(p, trp.p, self._pool)
  738. term.serialize_from_rdflib(o, trp.o, self._pool)
  739. if cc.hashset_contains(self._triples, &trp):
  740. res.add((s, p, o))
  741. return res
  742. elif s is not None:
  743. term.serialize_from_rdflib(s, &t1)
  744. if p is not None:
  745. fn = lookup_sp_cmp_fn
  746. term.serialize_from_rdflib(p, &t2)
  747. elif o is not None:
  748. fn = lookup_so_cmp_fn
  749. term.serialize_from_rdflib(o, &t2)
  750. else:
  751. fn = lookup_s_cmp_fn
  752. elif p is not None:
  753. term.serialize_from_rdflib(p, &t1)
  754. if o is not None:
  755. fn = lookup_po_cmp_fn
  756. term.serialize_from_rdflib(o, &t2)
  757. else:
  758. fn = lookup_p_cmp_fn
  759. elif o is not None:
  760. fn = lookup_o_cmp_fn
  761. term.serialize_from_rdflib(o, &t1)
  762. else:
  763. fn = lookup_none_cmp_fn
  764. # Iterate over serialized triples.
  765. cc.hashset_iter_init(&ti, self._triples)
  766. while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
  767. if void_p == NULL:
  768. trp_p = <BufferTriple *>void_p
  769. res.add((
  770. term.deserialize_to_rdflib(trp_p[0].s),
  771. term.deserialize_to_rdflib(trp_p[0].p),
  772. term.deserialize_to_rdflib(trp_p[0].o),
  773. ))
  774. return res
  775. #cpdef set terms(self, str type):
  776. # """
  777. # Get all terms of a type: subject, predicate or object.
  778. # :param str type: One of ``s``, ``p`` or ``o``.
  779. # """
  780. # i = 'spo'.index(type)
  781. # return {r[i] for r in self.data}
  782. cdef class Imr(SimpleGraph):
  783. """
  784. In-memory resource data container.
  785. This is an extension of :py:class:`~SimpleGraph` that adds a subject URI to
  786. the data set and some convenience methods.
  787. An instance of this class can be converted to a ``rdflib.Resource``
  788. instance.
  789. Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
  790. will create a new ``Imr`` instance with the same subject URI.
  791. """
  792. def __init__(self, uri, *args, **kwargs):
  793. """
  794. Initialize the graph with pre-existing data or by looking up a store.
  795. Either ``data``, or ``lookup`` *and* ``store``, can be provide.
  796. ``lookup`` and ``store`` have precedence. If none of them is specified,
  797. an empty graph is initialized.
  798. :param rdflib.URIRef uri: The graph URI.
  799. This will serve as the subject for some queries.
  800. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  801. :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
  802. E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
  803. Any and all elements may be ``None``.
  804. :param lmdbStore store: the store to look data up.
  805. """
  806. self.uri = str(uri)
  807. @property
  808. def identifier(self):
  809. """
  810. IMR URI. For compatibility with RDFLib Resource.
  811. :rtype: string
  812. """
  813. return self.uri
  814. @property
  815. def graph(self):
  816. """
  817. Return a SimpleGraph with the same data.
  818. :rtype: SimpleGraph
  819. """
  820. return SimpleGraph(self.data)
  821. def __repr__(self):
  822. """
  823. String representation of an Imr.
  824. This includes the subject URI, number of triples contained and the
  825. memory address of the instance.
  826. """
  827. return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
  828. f'length={len(self.data)}>')
  829. def __sub__(self, other):
  830. """
  831. Set difference. This creates a new Imr with the same subject URI.
  832. """
  833. return self.__class__(uri=self.uri, data=self.data - other)
  834. def __and__(self, other):
  835. """
  836. Set intersection. This creates a new Imr with the same subject URI.
  837. """
  838. return self.__class__(uri=self.uri, data=self.data & other)
  839. def __or__(self, other):
  840. """
  841. Set union. This creates a new Imr with the same subject URI.
  842. """
  843. return self.__class__(uri=self.uri, data=self.data | other)
  844. def __xor__(self, other):
  845. """
  846. Set exclusive OR (XOR). This creates a new Imr with the same subject
  847. URI.
  848. """
  849. return self.__class__(uri=self.uri, data=self.data ^ other)
  850. def __getitem__(self, item):
  851. """
  852. Supports slicing notation.
  853. """
  854. if isinstance(item, slice):
  855. s, p, o = item.start, item.stop, item.step
  856. return self._slice(s, p, o)
  857. elif isinstance(item, Node):
  858. # If a Node is given, return all values for that predicate.
  859. return {
  860. r[2] for r in self.data
  861. if r[0] == self.uri and r[1] == item}
  862. else:
  863. raise TypeError(f'Wrong slice format: {item}.')
  864. def value(self, p, strict=False):
  865. """
  866. Get an individual value.
  867. :param rdflib.termNode p: Predicate to search for.
  868. :param bool strict: If set to ``True`` the method raises an error if
  869. more than one value is found. If ``False`` (the default) only
  870. the first found result is returned.
  871. :rtype: rdflib.term.Node
  872. """
  873. values = self[p]
  874. if strict and len(values) > 1:
  875. raise RuntimeError('More than one value found for {}, {}.'.format(
  876. self.uri, p))
  877. for ret in values:
  878. return ret
  879. return None
  880. cpdef as_rdflib(self):
  881. """
  882. Return the IMR as a RDFLib Resource.
  883. :rtype: rdflib.Resource
  884. """
  885. gr = Graph()
  886. for trp in self.data:
  887. gr.add(trp)
  888. return gr.resource(identifier=self.uri)