graph.pyx 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. import logging
  2. from functools import wraps
  3. from rdflib import Graph
  4. from rdflib.term import Node
  5. from lakesuperior import env
  6. from libc.stdint cimport uint32_t, uint64_t
  7. from libc.string cimport memcmp, memcpy
  8. from libc.stdlib cimport free
  9. from cymem.cymem cimport Pool
  10. from lakesuperior.cy_include cimport cylmdb as lmdb
  11. from lakesuperior.cy_include cimport collections as cc
  12. from lakesuperior.cy_include cimport spookyhash as sph
  13. from lakesuperior.model.base cimport Buffer, buffer_dump
  14. from lakesuperior.model.graph cimport term
  15. from lakesuperior.model.graph.triple cimport BufferTriple
  16. from lakesuperior.model.structures.hash cimport hash64
  17. from lakesuperior.model.structures.hash cimport term_hash_seed32
  18. from lakesuperior.model.structures.keyset cimport Keyset
  19. from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
  20. KLEN, DBL_KLEN, TRP_KLEN, TripleKey)
  21. cdef extern from 'spookyhash_api.h':
  22. uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
  23. logger = logging.getLogger(__name__)
  24. cdef int term_cmp_fn(const void* key1, const void* key2):
  25. """
  26. Compare function for two Buffer objects.
  27. :rtype: int
  28. :return: 0 if the byte streams are the same, another integer otherwise.
  29. """
  30. b1 = <Buffer *>key1
  31. b2 = <Buffer *>key2
  32. if b1.sz != b2.sz:
  33. logger.info(f'Sizes differ: {b1.sz} != {b2.sz}. Return 1.')
  34. return 1
  35. cdef int cmp = memcmp(b1.addr, b2.addr, b1.sz)
  36. logger.info(f'term memcmp: {cmp}')
  37. return cmp
  38. cdef int trp_lit_cmp_fn(const void* key1, const void* key2):
  39. """
  40. Compare function for two triples in a set.
  41. s, p, o byte data are compared literally.
  42. :rtype: int
  43. :return: 0 if all three terms point to byte-wise identical data in both
  44. triples.
  45. """
  46. t1 = <BufferTriple *>key1
  47. t2 = <BufferTriple *>key2
  48. diff = (
  49. term_cmp_fn(t1.o, t2.o) or
  50. term_cmp_fn(t1.s, t2.s) or
  51. term_cmp_fn(t1.p, t2.p)
  52. )
  53. logger.info(f'Triples match: {not(diff)}')
  54. return diff
  55. cdef int trp_cmp_fn(const void* key1, const void* key2):
  56. """
  57. Compare function for two triples in a set.
  58. Here, pointers to terms are compared for s, p, o. The pointers should be
  59. guaranteed to point to unique values (i.e. no two pointers have the same
  60. term value within a graph).
  61. :rtype: int
  62. :return: 0 if the addresses of all terms are the same, 1 otherwise.
  63. """
  64. t1 = <BufferTriple *>key1
  65. t2 = <BufferTriple *>key2
  66. cdef int is_not_equal = (
  67. t1.s.addr != t2.s.addr or
  68. t1.p.addr != t2.p.addr or
  69. t1.o.addr != t2.o.addr
  70. )
  71. logger.info(f'Triples match: {not(is_not_equal)}')
  72. return is_not_equal
  73. cdef bint graph_eq_fn(SimpleGraph g1, SimpleGraph g2):
  74. """
  75. Compare 2 graphs for equality.
  76. Note that this returns the opposite value than the triple and term
  77. compare functions: 1 (True) if equal, 0 (False) if not.
  78. """
  79. cdef:
  80. void* el
  81. cc.HashSetIter it
  82. cc.hashset_iter_init(&it, g1._triples)
  83. while cc.hashset_iter_next(&it, &el) != cc.CC_ITER_END:
  84. if cc.hashset_contains(g2._triples, el):
  85. return False
  86. return True
  87. cdef size_t term_hash_fn(const void* key, int l, uint32_t seed):
  88. """
  89. Hash function for serialized terms (:py:class:`Buffer` objects)
  90. """
  91. return <size_t>spookyhash_64((<Buffer*>key).addr, (<Buffer*>key).sz, seed)
  92. cdef size_t trp_lit_hash_fn(const void* key, int l, uint32_t seed):
  93. """
  94. Hash function for sets of (serialized) triples.
  95. This function concatenates the literal terms of the triple as bytes
  96. and computes their hash.
  97. """
  98. trp = <BufferTriple*>key
  99. seed64 = <uint64_t>seed
  100. seed_dummy = seed64
  101. cdef sph.spookyhash_context ctx
  102. sph.spookyhash_context_init(&ctx, seed64, seed_dummy)
  103. sph.spookyhash_update(&ctx, trp.s.addr, trp.s.sz)
  104. sph.spookyhash_update(&ctx, trp.s.addr, trp.p.sz)
  105. sph.spookyhash_update(&ctx, trp.s.addr, trp.o.sz)
  106. sph.spookyhash_final(&ctx, &seed64, &seed_dummy)
  107. return <size_t>seed64
  108. cdef size_t trp_hash_fn(const void* key, int l, uint32_t seed):
  109. """
  110. Hash function for sets of (serialized) triples.
  111. This function computes the hash of the concatenated pointer values in the
  112. s, p, o members of the triple. The triple structure is treated as a byte
  113. string. This is safe in spite of byte-wise struct evaluation being a
  114. frowned-upon practice (due to padding issues), because it is assumed that
  115. the input value is always the same type of structure.
  116. """
  117. return <size_t>spookyhash_64(key, l, seed)
  118. cdef size_t hash_ptr_passthrough(const void* key, int l, uint32_t seed):
  119. """
  120. No-op function that takes a pointer and does *not* hash it.
  121. The pointer value is used as the "hash".
  122. """
  123. return <size_t>key
  124. cdef inline bint lookup_none_cmp_fn(
  125. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  126. ):
  127. """
  128. Dummy callback for queries with all parameters unbound.
  129. This function always returns ``True``
  130. """
  131. return True
  132. cdef inline bint lookup_s_cmp_fn(
  133. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  134. ):
  135. """
  136. Lookup callback compare function for a given s in a triple.
  137. The function returns ``True`` if ``t1`` matches the first term.
  138. ``t2`` is not used and is declared only for compatibility with the
  139. other interchangeable functions.
  140. """
  141. return term_cmp_fn(t1, trp[0].s)
  142. cdef inline bint lookup_p_cmp_fn(
  143. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  144. ):
  145. return term_cmp_fn(t1, trp[0].p)
  146. cdef inline bint lookup_o_cmp_fn(
  147. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  148. ):
  149. return term_cmp_fn(t1, trp[0].o)
  150. cdef inline bint lookup_sp_cmp_fn(
  151. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  152. ):
  153. return (
  154. term_cmp_fn(t1, trp[0].s)
  155. and term_cmp_fn(t2, trp[0].p))
  156. cdef inline bint lookup_so_cmp_fn(
  157. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  158. ):
  159. return (
  160. term_cmp_fn(t1, trp[0].s)
  161. and term_cmp_fn(t2, trp[0].o))
  162. cdef inline bint lookup_po_cmp_fn(
  163. const BufferTriple *trp, const Buffer *t1, const Buffer *t2
  164. ):
  165. return (
  166. term_cmp_fn(t1, trp[0].p)
  167. and term_cmp_fn(t2, trp[0].o))
  168. cdef class SimpleGraph:
  169. """
  170. Fast and simple implementation of a graph.
  171. Most functions should mimic RDFLib's graph with less overhead. It uses
  172. the same funny but functional slicing notation.
  173. A SimpleGraph can be instantiated from a store lookup or obtained from a
  174. :py:class:`lakesuperior.store.keyset.Keyset`. This makes it possible to use
  175. a Keyset to perform initial filtering via identity by key, then the
  176. filtered Keyset can be converted into a set of meaningful terms.
  177. An instance of this class can also be converted to and from a
  178. ``rdflib.Graph`` instance.
  179. """
  180. def __cinit__(self, set data=set(), *args, **kwargs):
  181. """
  182. Initialize the graph, optionally with Python data.
  183. :param set data: Initial data as a set of 3-tuples of RDFLib terms.
  184. """
  185. cdef:
  186. cc.HashSetConf terms_conf, trp_conf
  187. self.term_cmp_fn = &term_cmp_fn
  188. self.trp_cmp_fn = &trp_lit_cmp_fn
  189. cc.hashset_conf_init(&terms_conf)
  190. terms_conf.load_factor = 0.85
  191. terms_conf.hash = &term_hash_fn
  192. terms_conf.hash_seed = term_hash_seed32
  193. terms_conf.key_compare = self.term_cmp_fn
  194. terms_conf.key_length = sizeof(Buffer*)
  195. cc.hashset_conf_init(&trp_conf)
  196. trp_conf.load_factor = 0.75
  197. trp_conf.hash = &trp_lit_hash_fn
  198. trp_conf.hash_seed = term_hash_seed32
  199. trp_conf.key_compare = self.trp_cmp_fn
  200. trp_conf.key_length = sizeof(BufferTriple)
  201. cc.hashset_new_conf(&terms_conf, &self._terms)
  202. cc.hashset_new_conf(&trp_conf, &self._triples)
  203. self.pool = Pool()
  204. # Initialize empty data set.
  205. if data:
  206. # Populate with provided Python set.
  207. self.add(data)
  208. def __dealloc__(self):
  209. """
  210. Free the triple pointers.
  211. """
  212. free(self._triples)
  213. free(self._terms)
  214. ## PROPERTIES ##
  215. @property
  216. def data(self):
  217. """
  218. Triple data as a Python set.
  219. :rtype: set
  220. """
  221. cdef:
  222. void *void_p
  223. cc.HashSetIter ti
  224. Buffer* ss, sp, so
  225. graph_set = set()
  226. cc.hashset_iter_init(&ti, self._triples)
  227. while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
  228. logger.info(f'Data loop.')
  229. if void_p == NULL:
  230. logger.warn('Triple is NULL!')
  231. break
  232. trp = <BufferTriple *>void_p
  233. print(f'trp.s: {buffer_dump(trp.s)}')
  234. print(f'trp.p: {buffer_dump(trp.p)}')
  235. print(f'trp.o: {buffer_dump(trp.o)}')
  236. graph_set.add((
  237. term.deserialize_to_rdflib(trp.s),
  238. term.deserialize_to_rdflib(trp.p),
  239. term.deserialize_to_rdflib(trp.o),
  240. ))
  241. return graph_set
  242. @property
  243. def stored_terms(self):
  244. """
  245. All terms in the graph with their memory address.
  246. For debugging purposes.
  247. """
  248. cdef:
  249. cc.HashSetIter it
  250. void *cur
  251. terms = set()
  252. cc.hashset_iter_init(&it, self._terms)
  253. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  254. s_term = <Buffer*>cur
  255. terms.add((f'0x{<size_t>cur:02x}', term.deserialize_to_rdflib(s_term)))
  256. return terms
  257. ## MAGIC METHODS ##
  258. def __len__(self):
  259. """ Number of triples in the graph. """
  260. return cc.hashset_size(self._triples)
  261. def __eq__(self, other):
  262. """ Equality operator between ``SimpleGraph`` instances. """
  263. return len(self ^ other) == 0
  264. def __repr__(self):
  265. """
  266. String representation of the graph.
  267. It provides the number of triples in the graph and memory address of
  268. the instance.
  269. """
  270. return (
  271. f'<{self.__class__.__name__} @{hex(id(self))} '
  272. f'length={len(self.data)}>'
  273. )
  274. def __str__(self):
  275. """ String dump of the graph triples. """
  276. return str(self.data)
  277. def __add__(self, other):
  278. """ Alias for set-theoretical union. """
  279. return self.union_(other)
  280. def __iadd__(self, other):
  281. """ Alias for in-place set-theoretical union. """
  282. self.ip_union(other)
  283. return self
  284. def __sub__(self, other):
  285. """ Set-theoretical subtraction. """
  286. return self.subtraction(other)
  287. def __isub__(self, other):
  288. """ In-place set-theoretical subtraction. """
  289. self.ip_subtraction(other)
  290. return self
  291. def __and__(self, other):
  292. """ Set-theoretical intersection. """
  293. return self.intersection(other)
  294. def __iand__(self, other):
  295. """ In-place set-theoretical intersection. """
  296. self.ip_intersection(other)
  297. return self
  298. def __or__(self, other):
  299. """ Set-theoretical union. """
  300. return self.union_(other)
  301. def __ior__(self, other):
  302. """ In-place set-theoretical union. """
  303. self.ip_union(other)
  304. return self
  305. def __xor__(self, other):
  306. """ Set-theoretical exclusive disjunction (XOR). """
  307. return self.xor(other)
  308. def __ixor__(self, other):
  309. """ In-place set-theoretical exclusive disjunction (XOR). """
  310. self.ip_xor(other)
  311. return self
  312. def __contains__(self, trp):
  313. """
  314. Whether the graph contains a triple.
  315. :rtype: boolean
  316. """
  317. cdef:
  318. Buffer ss, sp, so
  319. BufferTriple btrp
  320. btrp.s = &ss
  321. btrp.p = &sp
  322. btrp.o = &so
  323. s, p, o = trp
  324. term.serialize_from_rdflib(s, &ss)
  325. term.serialize_from_rdflib(p, &sp)
  326. term.serialize_from_rdflib(o, &so)
  327. return self.trp_contains(&btrp)
  328. def __iter__(self):
  329. """ Graph iterator. It iterates over the set triples. """
  330. raise NotImplementedError()
  331. # Slicing.
  332. def __getitem__(self, item):
  333. """
  334. Slicing function.
  335. It behaves similarly to `RDFLib graph slicing
  336. <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
  337. """
  338. if isinstance(item, slice):
  339. s, p, o = item.start, item.stop, item.step
  340. return self._slice(s, p, o)
  341. else:
  342. raise TypeError(f'Wrong slice format: {item}.')
  343. def __hash__(self):
  344. return 23465
  345. ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##
  346. def terms_by_type(self, type):
  347. """
  348. Get all terms of a type: subject, predicate or object.
  349. :param str type: One of ``s``, ``p`` or ``o``.
  350. """
  351. i = 'spo'.index(type)
  352. return {r[i] for r in self.data}
  353. def add(self, trp):
  354. """
  355. Add triples to the graph.
  356. :param iterable triples: iterable of 3-tuple triples.
  357. """
  358. cdef size_t cur = 0, trp_cur = 0
  359. trp_ct = len(trp)
  360. term_buf = <Buffer*>self.pool.alloc(3 * trp_ct, sizeof(Buffer))
  361. trp_buf = <BufferTriple*>self.pool.alloc(trp_ct, sizeof(BufferTriple))
  362. for s, p, o in trp:
  363. term.serialize_from_rdflib(s, term_buf + cur, self.pool)
  364. term.serialize_from_rdflib(p, term_buf + cur + 1, self.pool)
  365. term.serialize_from_rdflib(o, term_buf + cur + 2, self.pool)
  366. (trp_buf + trp_cur).s = term_buf + cur
  367. (trp_buf + trp_cur).p = term_buf + cur + 1
  368. (trp_buf + trp_cur).o = term_buf + cur + 2
  369. self.add_triple(trp_buf + trp_cur)
  370. trp_cur += 1
  371. cur += 3
  372. def len_terms(self):
  373. """ Number of terms in the graph. """
  374. return cc.hashset_size(self._terms)
  375. def remove(self, trp):
  376. """
  377. Remove one item from the graph.
  378. :param tuple item: A 3-tuple of RDFlib terms. Only exact terms, i.e.
  379. wildcards are not accepted.
  380. """
  381. cdef:
  382. Buffer ss, sp, so
  383. BufferTriple trp_buf
  384. term.serialize_from_rdflib(trp[0], &ss, self.pool)
  385. term.serialize_from_rdflib(trp[1], &sp, self.pool)
  386. term.serialize_from_rdflib(trp[2], &so, self.pool)
  387. trp_buf.s = &ss
  388. trp_buf.p = &sp
  389. trp_buf.o = &so
  390. self.remove_triple(&trp_buf)
  391. ## CYTHON-ACCESSIBLE BASIC METHODS ##
  392. cdef SimpleGraph empty_copy(self):
  393. """
  394. Create an empty copy carrying over some key properties.
  395. Override in subclasses to accommodate for different init properties.
  396. """
  397. return self.__class__()
  398. cpdef union_(self, SimpleGraph other):
  399. """
  400. Perform set union resulting in a new SimpleGraph instance.
  401. TODO Allow union of multiple graphs at a time.
  402. :param SimpleGraph other: The other graph to merge.
  403. :rtype: SimpleGraph
  404. :return: A new SimpleGraph instance.
  405. """
  406. cdef:
  407. void *cur
  408. cc.HashSetIter it
  409. BufferTriple *trp
  410. new_gr = self.empty_copy()
  411. for gr in (self, other):
  412. cc.hashset_iter_init(&it, gr._triples)
  413. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  414. bt = <BufferTriple*>cur
  415. new_gr.add_triple(bt, True)
  416. return new_gr
  417. cdef void ip_union(self, SimpleGraph other) except *:
  418. """
  419. Perform an in-place set union that adds triples to this instance
  420. TODO Allow union of multiple graphs at a time.
  421. :param SimpleGraph other: The other graph to merge.
  422. :rtype: void
  423. """
  424. cdef:
  425. void *cur
  426. cc.HashSetIter it
  427. cc.hashset_iter_init(&it, other._triples)
  428. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  429. bt = <BufferTriple*>cur
  430. self.add_triple(bt, True)
  431. cpdef intersection(self, SimpleGraph other):
  432. """
  433. Graph intersection.
  434. :param SimpleGraph other: The other graph to intersect.
  435. :rtype: SimpleGraph
  436. :return: A new SimpleGraph instance.
  437. """
  438. cdef:
  439. void *cur
  440. cc.HashSetIter it
  441. new_gr = self.empty_copy()
  442. cc.hashset_iter_init(&it, self._triples)
  443. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  444. bt = <BufferTriple*>cur
  445. #print('Checking: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  446. # <size_t>bt.s, <size_t>bt.p, <size_t>bt.o))
  447. if other.trp_contains(bt):
  448. #print('Adding.')
  449. new_gr.add_triple(bt, True)
  450. return new_gr
  451. cdef void ip_intersection(self, SimpleGraph other) except *:
  452. """
  453. In-place graph intersection.
  454. Triples not in common with another graph are removed from the current
  455. one.
  456. :param SimpleGraph other: The other graph to intersect.
  457. :rtype: void
  458. """
  459. cdef:
  460. void *cur
  461. cc.HashSetIter it
  462. cc.hashset_iter_init(&it, self._triples)
  463. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  464. bt = <BufferTriple*>cur
  465. if not other.trp_contains(bt):
  466. self.remove_triple(bt)
  467. cpdef subtraction(self, SimpleGraph other):
  468. """
  469. Graph set-theoretical subtraction.
  470. Create a new graph with the triples of this graph minus the ones in
  471. common with the other graph.
  472. :param SimpleGraph other: The other graph to subtract to this.
  473. :rtype: SimpleGraph
  474. :return: A new SimpleGraph instance.
  475. """
  476. cdef:
  477. void *cur
  478. cc.HashSetIter it
  479. new_gr = self.empty_copy()
  480. cc.hashset_iter_init(&it, self._triples)
  481. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  482. bt = <BufferTriple*>cur
  483. #print('Checking: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
  484. # <size_t>bt.s, <size_t>bt.p, <size_t>bt.o))
  485. if not other.trp_contains(bt):
  486. #print('Adding.')
  487. new_gr.add_triple(bt, True)
  488. return new_gr
  489. cdef void ip_subtraction(self, SimpleGraph other) except *:
  490. """
  491. In-place graph subtraction.
  492. Triples in common with another graph are removed from the current one.
  493. :param SimpleGraph other: The other graph to intersect.
  494. :rtype: void
  495. """
  496. cdef:
  497. void *cur
  498. cc.HashSetIter it
  499. cc.hashset_iter_init(&it, self._triples)
  500. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  501. bt = <BufferTriple*>cur
  502. if other.trp_contains(bt):
  503. self.remove_triple(bt)
  504. cpdef xor(self, SimpleGraph other):
  505. """
  506. Graph Exclusive disjunction (XOR).
  507. :param SimpleGraph other: The other graph to perform XOR with.
  508. :rtype: SimpleGraph
  509. :return: A new SimpleGraph instance.
  510. """
  511. cdef:
  512. void *cur
  513. cc.HashSetIter it
  514. BufferTriple* bt
  515. new_gr = self.empty_copy()
  516. # Add triples in this and not in other.
  517. cc.hashset_iter_init(&it, self._triples)
  518. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  519. bt = <BufferTriple*>cur
  520. if not other.trp_contains(bt):
  521. new_gr.add_triple(bt, True)
  522. # Other way around.
  523. cc.hashset_iter_init(&it, other._triples)
  524. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  525. bt = <BufferTriple*>cur
  526. if not self.trp_contains(bt):
  527. new_gr.add_triple(bt, True)
  528. return new_gr
  529. cdef void ip_xor(self, SimpleGraph other) except *:
  530. """
  531. In-place graph XOR.
  532. Triples in common with another graph are removed from the current one,
  533. and triples not in common will be added from the other one.
  534. :param SimpleGraph other: The other graph to perform XOR with.
  535. :rtype: void
  536. """
  537. cdef:
  538. void *cur
  539. cc.HashSetIter it
  540. # TODO This could be more efficient to stash values in a simple
  541. # array, but how urgent is it to improve an in-place XOR?
  542. SimpleGraph tmp = SimpleGraph()
  543. # Add *to the tmp graph* triples in other graph and not in this graph.
  544. cc.hashset_iter_init(&it, other._triples)
  545. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  546. bt = <BufferTriple*>cur
  547. if not self.trp_contains(bt):
  548. tmp.add_triple(bt)
  549. # Remove triples in common.
  550. cc.hashset_iter_init(&it, self._triples)
  551. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  552. bt = <BufferTriple*>cur
  553. if other.trp_contains(bt):
  554. self.remove_triple(bt)
  555. self |= tmp
  556. cdef inline BufferTriple* store_triple(self, const BufferTriple* strp):
  557. """
  558. Store triple data in the graph.
  559. Normally, raw data underlying the triple and terms are only referenced
  560. by pointers. If the destination data are garbage collected before the
  561. graph is, segfaults are bound to happen.
  562. This method copies the data to the graph's memory pool, so they are
  563. managed with the lifecycle of the graph.
  564. Note that this method stores items regardless of whether thwy are
  565. duplicate or not, so there may be some duplication.
  566. """
  567. cdef:
  568. BufferTriple* dtrp = <BufferTriple*>self.pool.alloc(
  569. 1, sizeof(BufferTriple)
  570. )
  571. Buffer* spo = <Buffer*>self.pool.alloc(3, sizeof(Buffer))
  572. if not dtrp:
  573. raise MemoryError()
  574. if not spo:
  575. raise MemoryError()
  576. dtrp.s = spo
  577. dtrp.p = spo + 1
  578. dtrp.o = spo + 2
  579. spo[0].addr = self.pool.alloc(strp.s.sz, 1)
  580. spo[0].sz = strp.s.sz
  581. spo[1].addr = self.pool.alloc(strp.p.sz, 1)
  582. spo[1].sz = strp.p.sz
  583. spo[2].addr = self.pool.alloc(strp.o.sz, 1)
  584. spo[2].sz = strp.o.sz
  585. if not spo[0].addr or not spo[1].addr or not spo[2].addr:
  586. raise MemoryError()
  587. memcpy(dtrp.s.addr, strp.s.addr, strp.s.sz)
  588. memcpy(dtrp.p.addr, strp.p.addr, strp.p.sz)
  589. memcpy(dtrp.o.addr, strp.o.addr, strp.o.sz)
  590. return dtrp
  591. cdef inline void add_triple(
  592. self, BufferTriple* trp, bint add=False
  593. ) except *:
  594. """
  595. Add a triple from 3 (TPL) serialized terms.
  596. Each of the terms is added to the term set if not existing. The triple
  597. also is only added if not existing.
  598. :param BufferTriple* trp: The triple to add.
  599. :param bint add: if ``True``, the triple and term data will be
  600. allocated and copied into the graph memory pool.
  601. """
  602. if add:
  603. trp = self.store_triple(trp)
  604. logger.info('Inserting terms.')
  605. cc.hashset_add(self._terms, trp.s)
  606. cc.hashset_add(self._terms, trp.p)
  607. cc.hashset_add(self._terms, trp.o)
  608. logger.info('inserted terms.')
  609. logger.info(f'Terms set size: {cc.hashset_size(self._terms)}')
  610. cdef size_t trp_sz = cc.hashset_size(self._triples)
  611. logger.info(f'Triples set size before adding: {trp_sz}')
  612. r = cc.hashset_add(self._triples, trp)
  613. trp_sz = cc.hashset_size(self._triples)
  614. logger.info(f'Triples set size after adding: {trp_sz}')
  615. cdef:
  616. cc.HashSetIter ti
  617. void *cur
  618. cdef int remove_triple(self, BufferTriple* btrp) except -1:
  619. """
  620. Remove one triple from the graph.
  621. """
  622. return cc.hashset_remove(self._triples, btrp, NULL)
  623. cdef bint trp_contains(self, BufferTriple* btrp):
  624. cdef:
  625. cc.HashSetIter it
  626. void* cur
  627. cc.hashset_iter_init(&it, self._triples)
  628. while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
  629. if self.trp_cmp_fn(cur, btrp) == 0:
  630. return True
  631. return False
  632. cpdef void set(self, tuple trp) except *:
  633. """
  634. Set a single value for subject and predicate.
  635. Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
  636. """
  637. if None in trp:
  638. raise ValueError(f'Invalid triple: {trp}')
  639. self.remove((trp[0], trp[1], None))
  640. self.add((trp,))
  641. cpdef void remove_triples(self, pattern) except *:
  642. """
  643. Remove triples by pattern.
  644. The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
  645. """
  646. s, p, o = pattern
  647. for match in self.lookup(s, p, o):
  648. logger.debug(f'Removing from graph: {match}.')
  649. self.data.remove(match)
  650. def as_rdflib(self):
  651. """
  652. Return the data set as an RDFLib Graph.
  653. :rtype: rdflib.Graph
  654. """
  655. gr = Graph()
  656. for trp in self.data:
  657. gr.add(trp)
  658. return gr
  659. def _slice(self, s, p, o):
  660. """
  661. Return terms filtered by other terms.
  662. This behaves like the rdflib.Graph slicing policy.
  663. """
  664. _data = self.data
  665. logger.debug(f'Slicing graph by: {s}, {p}, {o}.')
  666. if s is None and p is None and o is None:
  667. return _data
  668. elif s is None and p is None:
  669. return {(r[0], r[1]) for r in _data if r[2] == o}
  670. elif s is None and o is None:
  671. return {(r[0], r[2]) for r in _data if r[1] == p}
  672. elif p is None and o is None:
  673. return {(r[1], r[2]) for r in _data if r[0] == s}
  674. elif s is None:
  675. return {r[0] for r in _data if r[1] == p and r[2] == o}
  676. elif p is None:
  677. return {r[1] for r in _data if r[0] == s and r[2] == o}
  678. elif o is None:
  679. return {r[2] for r in _data if r[0] == s and r[1] == p}
  680. else:
  681. # all given
  682. return (s,p,o) in _data
  683. def lookup(self, s, p, o):
  684. """
  685. Look up triples by a pattern.
  686. This function converts RDFLib terms into the serialized format stored
  687. in the graph's internal structure and compares them bytewise.
  688. Any and all of the lookup terms can be ``None``.
  689. """
  690. cdef:
  691. void *void_p
  692. BufferTriple trp
  693. BufferTriple *trp_p
  694. cc.HashSetIter ti
  695. Buffer t1
  696. Buffer t2
  697. lookup_fn_t fn
  698. res = set()
  699. # Decide comparison logic outside the loop.
  700. if s is not None and p is not None and o is not None:
  701. # Return immediately if 3-term match is requested.
  702. term.serialize_from_rdflib(s, trp.s, self.pool)
  703. term.serialize_from_rdflib(p, trp.p, self.pool)
  704. term.serialize_from_rdflib(o, trp.o, self.pool)
  705. if cc.hashset_contains(self._triples, &trp):
  706. res.add((s, p, o))
  707. return res
  708. elif s is not None:
  709. term.serialize_from_rdflib(s, &t1)
  710. if p is not None:
  711. fn = lookup_sp_cmp_fn
  712. term.serialize_from_rdflib(p, &t2)
  713. elif o is not None:
  714. fn = lookup_so_cmp_fn
  715. term.serialize_from_rdflib(o, &t2)
  716. else:
  717. fn = lookup_s_cmp_fn
  718. elif p is not None:
  719. term.serialize_from_rdflib(p, &t1)
  720. if o is not None:
  721. fn = lookup_po_cmp_fn
  722. term.serialize_from_rdflib(o, &t2)
  723. else:
  724. fn = lookup_p_cmp_fn
  725. elif o is not None:
  726. fn = lookup_o_cmp_fn
  727. term.serialize_from_rdflib(o, &t1)
  728. else:
  729. fn = lookup_none_cmp_fn
  730. # Iterate over serialized triples.
  731. cc.hashset_iter_init(&ti, self._triples)
  732. while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
  733. if void_p == NULL:
  734. trp_p = <BufferTriple *>void_p
  735. res.add((
  736. term.deserialize_to_rdflib(trp_p[0].s),
  737. term.deserialize_to_rdflib(trp_p[0].p),
  738. term.deserialize_to_rdflib(trp_p[0].o),
  739. ))
  740. return res
  741. cdef class Imr(SimpleGraph):
  742. """
  743. In-memory resource data container.
  744. This is an extension of :py:class:`~SimpleGraph` that adds a subject URI to
  745. the data set and some convenience methods.
  746. An instance of this class can be converted to a ``rdflib.Resource``
  747. instance.
  748. Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
  749. will create a new ``Imr`` instance with the same subject URI.
  750. """
  751. def __init__(self, uri, *args, **kwargs):
  752. """
  753. Initialize the graph with pre-existing data or by looking up a store.
  754. Either ``data``, or ``lookup`` *and* ``store``, can be provide.
  755. ``lookup`` and ``store`` have precedence. If none of them is specified,
  756. an empty graph is initialized.
  757. :param rdflib.URIRef uri: The graph URI.
  758. This will serve as the subject for some queries.
  759. :param args: Positional arguments inherited from
  760. ``SimpleGraph.__init__``.
  761. :param kwargs: Keyword arguments inherited from
  762. ``SimpleGraph.__init__``.
  763. """
  764. self.uri = str(uri)
  765. #super().__init(*args, **kwargs)
  766. def __repr__(self):
  767. """
  768. String representation of an Imr.
  769. This includes the subject URI, number of triples contained and the
  770. memory address of the instance.
  771. """
  772. return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
  773. f'length={len(self.data)}>')
  774. def __getitem__(self, item):
  775. """
  776. Supports slicing notation.
  777. """
  778. if isinstance(item, slice):
  779. s, p, o = item.start, item.stop, item.step
  780. return self._slice(s, p, o)
  781. elif isinstance(item, Node):
  782. # If a Node is given, return all values for that predicate.
  783. return {
  784. r[2] for r in self.data
  785. if r[0] == self.uri and r[1] == item}
  786. else:
  787. raise TypeError(f'Wrong slice format: {item}.')
  788. cdef Imr empty_copy(self):
  789. """
  790. Create an empty instance carrying over some key properties.
  791. """
  792. return self.__class__(uri=self.uri)
  793. def value(self, p, strict=False):
  794. """
  795. Get an individual value.
  796. :param rdflib.termNode p: Predicate to search for.
  797. :param bool strict: If set to ``True`` the method raises an error if
  798. more than one value is found. If ``False`` (the default) only
  799. the first found result is returned.
  800. :rtype: rdflib.term.Node
  801. """
  802. values = self[p]
  803. if strict and len(values) > 1:
  804. raise RuntimeError('More than one value found for {}, {}.'.format(
  805. self.uri, p))
  806. for ret in values:
  807. return ret
  808. return None
  809. cpdef as_rdflib(self):
  810. """
  811. Return the IMR as a RDFLib Resource.
  812. :rtype: rdflib.Resource
  813. """
  814. gr = Graph()
  815. for trp in self.data:
  816. gr.add(trp)
  817. return gr.resource(identifier=self.uri)