lmdb_triplestore.pyx 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330
  1. import logging
  2. import sys
  3. import rdflib
  4. #from cython.parallel import prange
  5. from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as RDFLIB_DEFAULT_GRAPH_URI
  6. from lakesuperior.store.base_lmdb_store import (
  7. KeyExistsError, KeyNotFoundError, LmdbError)
  8. from libc.stdlib cimport malloc, free
  9. cimport lakesuperior.cy_include.collections as cc
  10. cimport lakesuperior.cy_include.cylmdb as lmdb
  11. from lakesuperior.model.base cimport (
  12. FIRST_KEY, KLEN, DBL_KLEN, TRP_KLEN, QUAD_KLEN,
  13. Key, DoubleKey, TripleKey, QuadKey,
  14. Buffer, buffer_dump
  15. )
  16. from lakesuperior.store.base_lmdb_store cimport (
  17. _check, BaseLmdbStore, data_v, dbi, key_v)
  18. from lakesuperior.model.rdf.graph cimport Graph
  19. from lakesuperior.model.rdf.term cimport (
  20. Term, deserialize_to_rdflib, serialize_from_rdflib)
  21. from lakesuperior.model.rdf.triple cimport BufferTriple
  22. from lakesuperior.model.structures.hash cimport (
  23. HLEN_128 as HLEN, Hash128, hash128)
# Integer keys and values are stored in the system's native byte order.
# Therefore they must be parsed left-to-right if the system is big-endian,
# and right-to-left if little-endian, in order to maintain the correct
# sorting order.
BIG_ENDIAN = sys.byteorder == 'big'
# On big-endian systems no reversal flag is applied (the value is 0).
LSUP_REVERSEKEY = 0 if BIG_ENDIAN else lmdb.MDB_REVERSEKEY
LSUP_REVERSEDUP = 0 if BIG_ENDIAN else lmdb.MDB_REVERSEDUP

# Flag combinations used as values of ``LmdbTriplestore.dbi_flags``.
# Integer keys, single value per key.
INT_KEY_MASK = lmdb.MDB_INTEGERKEY | LSUP_REVERSEKEY
# Integer keys, multiple fixed-size sorted values per key.
INT_DUP_KEY_MASK = (
    lmdb.MDB_DUPSORT | lmdb.MDB_DUPFIXED | lmdb.MDB_INTEGERKEY
    | LSUP_REVERSEKEY | LSUP_REVERSEDUP
)
# Multiple fixed-size sorted values per key, compared as integers.
INT_DUP_MASK = (
    lmdb.MDB_DUPSORT | lmdb.MDB_DUPFIXED | lmdb.MDB_INTEGERDUP
    | LSUP_REVERSEKEY | LSUP_REVERSEDUP
)

lookup_rank = [0, 2, 1]
"""
Order in which keys are looked up if two terms are bound.
The indices with the smallest average number of values per key should be
looked up first.
0 = s:po
1 = p:so
2 = o:sp
If we want to get fancy, this can be rebalanced from time to time by
looking up the number of keys in (s:po, p:so, o:sp).
"""

# Position of each term in an SPO triple key for the single-term indices.
# Row ``n`` lists, in index order (key first, then the two values), which
# slot of the triple key each term fills.
lookup_ordering = [
    [0, 1, 2], # spo
    [1, 0, 2], # pso
    [2, 0, 1], # osp
]

# Same as above for the two-term indices: the first two entries are the SPO
# positions of the terms forming the key, the third is the position of the
# value.
lookup_ordering_2bound = [
    [1, 2, 0], # po:s
    [0, 2, 1], # so:p
    [0, 1, 2], # sp:o
]

logger = logging.getLogger(__name__)
cdef class LmdbTriplestore(BaseLmdbStore):
    """
    Low-level storage layer.

    This class extends the RDFLib-compatible :py:class:`BaseLmdbStore` and
    maps triples and contexts to key-value records in LMDB.

    This class uses the original LMDB C API rather than the Python bindings,
    because several data manipulations happen after retrieval from the store,
    which are more efficiently performed at the C level.
    """

    # Labels of all the databases used by this store.
    dbi_labels = [
        # Main data
        # Term key to serialized term content
        't:st',
        # Joined triple keys to context key
        'spo:c',
        # This has empty values and is used to keep track of empty contexts.
        'c:',
        # Prefix to namespace
        'pfx:ns',

        # Indices
        # Namespace to prefix
        'ns:pfx',
        # Term hash to triple key
        'th:t',
        # Lookups
        's:po',
        'p:so',
        'o:sp',
        'po:s',
        'so:p',
        'sp:o',
        'c:spo',
    ]

    # Lookup index labels. The first three are keyed by one term and the
    # last three by two terms; methods in this class address the latter as
    # ``lookup_indices[i + 3]``.
    lookup_indices = [
        b's:po',
        b'p:so',
        b'o:sp',
        b'po:s',
        b'so:p',
        b'sp:o',
    ]

    # LMDB flags by database label.
    # NOTE(review): the first key is 'c' while the label listed in
    # ``dbi_labels`` is 'c:'. If flags are looked up by label, this entry is
    # presumably never applied. Confirm before changing, since the flags
    # affect how existing data is ordered on disk.
    dbi_flags = {
        'c': INT_KEY_MASK,
        't:st': INT_KEY_MASK,
        's:po': INT_DUP_KEY_MASK,
        'p:so': INT_DUP_KEY_MASK,
        'o:sp': INT_DUP_KEY_MASK,
        'po:s': INT_DUP_MASK,
        'so:p': INT_DUP_MASK,
        'sp:o': INT_DUP_MASK,
        'c:spo': INT_DUP_KEY_MASK,
        'spo:c': INT_DUP_MASK,
    }
    # Runs once, when the class body is evaluated.
    logger.debug(f'DBI flags: {dbi_flags}')

    flags = 0

    options = {
        'map_size': 1024 ** 4 # 1Tb.
    }
  120. # DB management methods.
  121. cpdef dict stats(self):
  122. """
  123. Gather statistics about the database."""
  124. st = self._stats()
  125. st['num_triples'] = st['db_stats']['spo:c']['ms_entries']
  126. return st
  127. cpdef size_t _len(self, context=None) except -1:
  128. """
  129. Return the length of the dataset.
  130. The RDFLib interface defines `__len__` in a nonstandard way that
  131. causes a Cython compilation error, so this method is called by the
  132. `__len__` method of its Python counterpart.
  133. """
  134. cdef:
  135. size_t ct
  136. Key ck
  137. if context is not None:
  138. ck = self.to_key(context)
  139. key_v.mv_data = &ck
  140. key_v.mv_size = KLEN
  141. cur = self._cur_open('c:spo')
  142. try:
  143. _check(lmdb.mdb_cursor_get(
  144. cur, &key_v, NULL, lmdb.MDB_SET))
  145. _check(lmdb.mdb_cursor_count(cur, &ct))
  146. except KeyNotFoundError:
  147. return 0
  148. else:
  149. return ct
  150. finally:
  151. #pass
  152. self._cur_close(cur)
  153. else:
  154. return self.stats()['num_triples']
  155. ## PRIVATE METHODS ##
  156. # Triple and graph methods.
    cpdef void add(self, triple, context=None, quoted=False) except *:
        """
        Add a triple and start indexing.

        Each of the four terms (subject, predicate, object, context) is
        serialized and hashed. Terms whose hash is not yet in ``th:t`` are
        appended to ``t:st`` and indexed. The triple is then associated with
        the context in ``spo:c`` and ``c:spo``, and the lookup indices are
        updated.

        :param tuple(rdflib.Identifier) triple: Tuple of three identifiers.
        :param context: Context identifier. ``None`` inserts in the default
            graph.
        :type context: rdflib.Identifier or None
        :param bool quoted: Not used.
        """
        cdef:
            lmdb.MDB_cursor *icur
            lmdb.MDB_val spo_v, c_v, null_v, key_v, data_v
            unsigned char i
            Hash128 thash
            # Keys of s, p, o and c, in this order.
            QuadKey spock
            Buffer pk_t

        c = self._normalize_context(context)
        if c is None:
            c = RDFLIB_DEFAULT_GRAPH_URI

        s, p, o = triple

        icur = self._cur_open('th:t')
        try:
            for i, term_obj in enumerate((s, p, o, c)):
                serialize_from_rdflib(term_obj, &pk_t)
                hash128(&pk_t, &thash)
                try:
                    # Look up the term key by the hash of the term.
                    key_v.mv_data = thash
                    key_v.mv_size = HLEN
                    _check(lmdb.mdb_get(
                        self.txn, self.get_dbi('th:t'), &key_v, &data_v))
                    spock[i] = (<Key*>data_v.mv_data)[0]
                except KeyNotFoundError:
                    # If term_obj is not found, add it...
                    logger.debug('Hash {} not found. Adding to DB.'.format(
                        thash[: HLEN]))
                    spock[i] = self._append(&pk_t, dblabel=b't:st')

                    # ...and index it.
                    key_v.mv_data = thash
                    key_v.mv_size = HLEN
                    data_v.mv_data = spock + i
                    data_v.mv_size = KLEN
                    _check(
                        lmdb.mdb_cursor_put(icur, &key_v, &data_v, 0),
                        'Error setting key {}.'.format(thash))
        finally:
            self._cur_close(icur)

        spo_v.mv_data = spock # address of sk in spock
        spo_v.mv_size = TRP_KLEN # Grab 3 keys
        c_v.mv_data = spock + 3 # address of ck in spock
        c_v.mv_size = KLEN
        null_v.mv_data = b''
        null_v.mv_size = 0

        # Register the context. An existing entry is left untouched.
        try:
            _check(lmdb.mdb_put(
                self.txn, self.get_dbi('c:'), &c_v, &null_v,
                lmdb.MDB_NOOVERWRITE))
        except KeyExistsError:
            pass
        try:
            # Add triple:context association.
            _check(lmdb.mdb_put(
                self.txn, self.get_dbi('spo:c'), &spo_v, &c_v,
                lmdb.MDB_NODUPDATA))
        except KeyExistsError:
            pass
        try:
            # Index context:triple association.
            _check(lmdb.mdb_put(
                self.txn, self.get_dbi('c:spo'), &c_v, &spo_v,
                lmdb.MDB_NODUPDATA))
        except KeyExistsError:
            pass

        self._index_triple(IDX_OP_ADD, [spock[0], spock[1], spock[2]])
  230. cpdef void add_graph(self, c) except *:
  231. """
  232. Add a graph (context) to the database.
  233. This creates an empty graph by associating the graph URI with the
  234. pickled `None` value. This prevents from removing the graph when all
  235. triples are removed.
  236. :param rdflib.URIRef graph: URI of the named graph to add.
  237. """
  238. logger.exception('Called add_graph.')
  239. cdef:
  240. lmdb.MDB_txn *_txn
  241. Buffer _sc
  242. Key ck
  243. if isinstance(c, rdflib.Graph):
  244. c = c.identifier
  245. ck = self.to_key(c)
  246. if not self._key_exists(<unsigned char*>ck, KLEN, b'c:'):
  247. # Insert context term if not existing.
  248. if self.is_txn_rw:
  249. _txn = self.txn
  250. else:
  251. _check(lmdb.mdb_txn_begin(self.dbenv, NULL, 0, &_txn))
  252. # Open new R/W transactions.
  253. #logger.debug('Opening a temporary RW transaction.')
  254. try:
  255. #logger.debug('Working in existing RW transaction.')
  256. # Add to list of contexts.
  257. key_v.mv_data = &ck
  258. key_v.mv_size = KLEN
  259. data_v.mv_data = &ck # Whatever, length is zero anyways
  260. data_v.mv_size = 0
  261. _check(lmdb.mdb_put(
  262. _txn, self.get_dbi(b'c:'), &key_v, &data_v, 0
  263. ))
  264. if not self.is_txn_rw:
  265. _check(lmdb.mdb_txn_commit(_txn))
  266. except:
  267. if not self.is_txn_rw:
  268. lmdb.mdb_txn_abort(_txn)
  269. raise
  270. cpdef void _remove(self, tuple triple_pattern, context=None) except *:
  271. cdef:
  272. lmdb.MDB_val spok_v, ck_v
  273. TripleKey spok_cur
  274. Key ck
  275. if context is not None:
  276. try:
  277. ck = self.to_key(context)
  278. except KeyNotFoundError:
  279. # If context is specified but not found, return to avoid
  280. # deleting the wrong triples.
  281. return
  282. # Get the matching pattern.
  283. match_set = self.triple_keys(triple_pattern, context)
  284. dcur = self._cur_open('spo:c')
  285. icur = self._cur_open('c:spo')
  286. try:
  287. spok_v.mv_size = TRP_KLEN
  288. # If context was specified, remove only associations with that context.
  289. match_set.seek()
  290. if context is not None:
  291. ck_v.mv_data = &ck
  292. ck_v.mv_size = KLEN
  293. while match_set.keys.get_next(&spok_cur):
  294. spok_v.mv_data = spok_cur
  295. # Delete spo:c entry.
  296. try:
  297. _check(lmdb.mdb_cursor_get(
  298. dcur, &spok_v, &ck_v, lmdb.MDB_GET_BOTH))
  299. except KeyNotFoundError:
  300. pass
  301. else:
  302. _check(lmdb.mdb_cursor_del(dcur, 0))
  303. # Restore ck after delete.
  304. ck_v.mv_data = &ck
  305. # Delete c:spo entry.
  306. try:
  307. _check(lmdb.mdb_cursor_get(
  308. icur, &ck_v, &spok_v, lmdb.MDB_GET_BOTH))
  309. except KeyNotFoundError:
  310. pass
  311. else:
  312. _check(lmdb.mdb_cursor_del(icur, 0))
  313. # Delete lookup indices, only if no other context
  314. # association is present.
  315. # spok_v has changed on mdb_cursor_del. Restore.
  316. spok_v.mv_data = spok_cur
  317. try:
  318. _check(lmdb.mdb_cursor_get(
  319. dcur, &spok_v, NULL, lmdb.MDB_SET))
  320. except KeyNotFoundError:
  321. self._index_triple(IDX_OP_REMOVE, spok_cur)
  322. # If no context is specified, remove all associations.
  323. else:
  324. logger.debug('Removing triples in all contexts.')
  325. # Loop over all SPO matching the triple pattern.
  326. while match_set.keys.get_next(&spok_cur):
  327. spok_v.mv_data = spok_cur
  328. # Loop over all context associations for this SPO.
  329. try:
  330. _check(lmdb.mdb_cursor_get(
  331. dcur, &spok_v, &ck_v, lmdb.MDB_SET_KEY))
  332. except KeyNotFoundError:
  333. # Move on to the next SPO.
  334. continue
  335. else:
  336. ck = (<Key*>ck_v.mv_data)[0]
  337. while True:
  338. # Delete c:spo association.
  339. try:
  340. _check(lmdb.mdb_cursor_get(
  341. icur, &ck_v, &spok_v, lmdb.MDB_GET_BOTH))
  342. except KeyNotFoundError:
  343. pass
  344. else:
  345. lmdb.mdb_cursor_del(icur, 0)
  346. # Restore the pointer to the deleted SPO.
  347. spok_v.mv_data = spok_cur
  348. # Move on to next associated context.
  349. try:
  350. _check(lmdb.mdb_cursor_get(
  351. dcur, &spok_v, &ck_v, lmdb.MDB_NEXT_DUP))
  352. except KeyNotFoundError:
  353. break
  354. # Then delete the spo:c association.
  355. try:
  356. _check(lmdb.mdb_cursor_get(
  357. dcur, &spok_v, &ck_v, lmdb.MDB_SET))
  358. except KeyNotFoundError:
  359. pass
  360. else:
  361. lmdb.mdb_cursor_del(dcur, lmdb.MDB_NODUPDATA)
  362. self._index_triple(IDX_OP_REMOVE, spok_cur)
  363. finally:
  364. self._cur_close(dcur)
  365. self._cur_close(icur)
  366. cdef void _index_triple(self, int op, TripleKey spok) except *:
  367. """
  368. Update index for a triple and context (add or remove).
  369. :param str op: one of ``IDX_OP_ADD`` or ``IDX_OP_REMOVE``.
  370. :param TripleKey spok: Triple key to index.
  371. """
  372. cdef:
  373. DoubleKey dbl_keys[3]
  374. size_t i = 0
  375. lmdb.MDB_val key_v, dbl_key_v
  376. dbl_keys = [
  377. [spok[1], spok[2]], # pok
  378. [spok[0], spok[2]], # sok
  379. [spok[0], spok[1]], # spk
  380. ]
  381. #logger.debug(f'''Indices:
  382. #spok: {[spok[0], spok[1], spok[2]]}
  383. #sk: {spok[0]}
  384. #pk: {spok[1]}
  385. #ok: {spok[2]}
  386. #pok: {dbl_keys[0]}
  387. #sok: {dbl_keys[1]}
  388. #spk: {dbl_keys[2]}
  389. #''')
  390. key_v.mv_size = KLEN
  391. dbl_key_v.mv_size = DBL_KLEN
  392. #logger.debug('Start indexing: {}.'.format(spok[: TRP_KLEN]))
  393. if op == IDX_OP_REMOVE:
  394. logger.debug(f'Remove {spok[0]} from indices.')
  395. else:
  396. logger.debug(f'Add {spok[0]} to indices.')
  397. while i < 3:
  398. cur1 = self._cur_open(self.lookup_indices[i]) # s:po, p:so, o:sp
  399. cur2 = self._cur_open(self.lookup_indices[i + 3])# po:s, so:p, sp:o
  400. try:
  401. key_v.mv_data = spok + i
  402. dbl_key_v.mv_data = dbl_keys[i]
  403. # Removal op indexing.
  404. if op == IDX_OP_REMOVE:
  405. try:
  406. _check(lmdb.mdb_cursor_get(
  407. cur1, &key_v, &dbl_key_v, lmdb.MDB_GET_BOTH))
  408. except KeyNotFoundError:
  409. pass
  410. else:
  411. _check(lmdb.mdb_cursor_del(cur1, 0))
  412. # Restore pointers after delete.
  413. key_v.mv_data = spok + i
  414. dbl_key_v.mv_data = dbl_keys[i]
  415. try:
  416. _check(lmdb.mdb_cursor_get(
  417. cur2, &dbl_key_v, &key_v, lmdb.MDB_GET_BOTH))
  418. except KeyNotFoundError:
  419. pass
  420. else:
  421. _check(lmdb.mdb_cursor_del(cur2, 0))
  422. # Addition op indexing.
  423. elif op == IDX_OP_ADD:
  424. try:
  425. _check(lmdb.mdb_cursor_put(
  426. cur1, &key_v, &dbl_key_v, lmdb.MDB_NODUPDATA))
  427. except KeyExistsError:
  428. pass
  429. try:
  430. _check(lmdb.mdb_cursor_put(
  431. cur2, &dbl_key_v, &key_v, lmdb.MDB_NODUPDATA))
  432. except KeyExistsError:
  433. pass
  434. else:
  435. raise ValueError(
  436. 'Index operation \'{}\' is not supported.'.format(op))
  437. i += 1
  438. finally:
  439. #pass
  440. self._cur_close(cur1)
  441. self._cur_close(cur2)
    cpdef void _remove_graph(self, object gr_uri) except *:
        """
        Delete a context.

        This removes all the triples in the context, the triples in any
        context having the graph URI as their subject or object, and then
        the records of the graph term itself. If the graph URI has no key,
        the method returns without doing anything.

        :param gr_uri: URI of the graph to remove.
        """
        cdef:
            Hash128 chash
            Key ck
            lmdb.MDB_val ck_v, chash_v
            Buffer pk_c

        # Gather information on the graph prior to deletion.
        try:
            ck = self.to_key(gr_uri)
        except KeyNotFoundError:
            return

        # Remove all triples and indices associated with the graph.
        self._remove((None, None, None), gr_uri)
        # Remove the graph if it is in triples.
        self._remove((gr_uri, None, None))
        self._remove((None, None, gr_uri))

        # Clean up all terms related to the graph.
        serialize_from_rdflib(gr_uri, &pk_c)
        hash128(&pk_c, &chash)

        ck_v.mv_size = KLEN
        chash_v.mv_size = HLEN
        # NOTE(review): the three deletions share one `try` block, so if a
        # record is missing the following deletions are skipped. Confirm
        # that this is intended.
        try:
            ck_v.mv_data = &ck
            _check(lmdb.mdb_del(self.txn, self.get_dbi(b'c:'), &ck_v, NULL))
            ck_v.mv_data = &ck
            _check(lmdb.mdb_del(self.txn, self.get_dbi(b't:st'), &ck_v, NULL))
            chash_v.mv_data = chash
            _check(lmdb.mdb_del(self.txn, self.get_dbi(b'th:t'), &chash_v, NULL))
        except KeyNotFoundError:
            pass
  475. # Lookup methods.
  476. def contexts(self, triple=None):
  477. """
  478. Get a list of all contexts.
  479. :rtype: set(URIRef)
  480. """
  481. cdef:
  482. size_t sz, i
  483. Key* match
  484. try:
  485. self.all_contexts(&match, &sz, triple)
  486. ret = set()
  487. for i in range(sz):
  488. ret.add(self.from_key(match[i]))
  489. finally:
  490. free(match)
  491. return ret
  492. def triples(self, triple_pattern, context=None):
  493. """
  494. Generator over matching triples.
  495. :param tuple triple_pattern: 3 RDFLib terms
  496. :param context: Context graph, if available.
  497. :type context: rdflib.Graph or None
  498. :rtype: Iterator
  499. :return: Generator over triples and contexts in which each result has
  500. the following format::
  501. (s, p, o), generator(contexts)
  502. Where the contexts generator lists all context that the triple appears
  503. in.
  504. """
  505. cdef:
  506. size_t i = 0
  507. TripleKey it_cur
  508. lmdb.MDB_val key_v, data_v
  509. # This sounds strange, RDFLib should be passing None at this point,
  510. # but anyway...
  511. context = self._normalize_context(context)
  512. logger.debug(
  513. 'Getting triples for: {}, {}'.format(triple_pattern, context))
  514. rset = self.triple_keys(triple_pattern, context)
  515. #logger.debug('Triple keys found: {}'.format(rset.data[:rset.size]))
  516. cur = self._cur_open('spo:c')
  517. try:
  518. key_v.mv_size = TRP_KLEN
  519. rset.keys.seek()
  520. while rset.keys.get_next(&it_cur):
  521. key_v.mv_data = it_cur
  522. # Get contexts associated with each triple.
  523. contexts = []
  524. # This shall never be MDB_NOTFOUND.
  525. _check(lmdb.mdb_cursor_get(cur, &key_v, &data_v, lmdb.MDB_SET))
  526. while True:
  527. c_uri = self.from_key((<Key*>data_v.mv_data)[0])
  528. contexts.append(rdflib.Graph(self, uri=c_uri, store=self))
  529. try:
  530. _check(lmdb.mdb_cursor_get(
  531. cur, &key_v, &data_v, lmdb.MDB_NEXT_DUP))
  532. except KeyNotFoundError:
  533. break
  534. yield (
  535. (
  536. self.from_key((<Key*>key_v.mv_data)[0]),
  537. self.from_key((<Key*>key_v.mv_data)[1]),
  538. self.from_key((<Key*>key_v.mv_data)[2]),
  539. ),
  540. tuple(contexts)
  541. )
  542. finally:
  543. self._cur_close(cur)
  544. cpdef Graph triple_keys(
  545. self, tuple triple_pattern, context=None, uri=None
  546. ):
  547. """
  548. Top-level lookup method.
  549. This method is used by `triples` which returns native Python tuples,
  550. as well as by other methods that need to iterate and filter triple
  551. keys without incurring in the overhead of converting them to triples.
  552. :param tuple triple_pattern: 3 RDFLib terms
  553. :param context: Context graph or URI, or None.
  554. :type context: rdflib.term.Identifier or None
  555. """
  556. cdef:
  557. size_t ct = 0, i = 0
  558. lmdb.MDB_cursor *icur
  559. lmdb.MDB_val key_v, data_v
  560. Key tk, ck
  561. TripleKey spok
  562. Graph flt_res, ret
  563. if context is not None:
  564. try:
  565. ck = self.to_key(context)
  566. except KeyNotFoundError:
  567. # Context not found.
  568. return Graph(self, uri=uri)
  569. icur = self._cur_open('c:spo')
  570. try:
  571. key_v.mv_data = &ck
  572. key_v.mv_size = KLEN
  573. # s p o c
  574. if all(triple_pattern):
  575. for i, term in enumerate(triple_pattern):
  576. try:
  577. tk = self.to_key(term)
  578. except KeyNotFoundError:
  579. # A term key was not found.
  580. return Graph(self, uri=uri)
  581. spok[i] = tk
  582. data_v.mv_data = spok
  583. data_v.mv_size = TRP_KLEN
  584. try:
  585. _check(lmdb.mdb_cursor_get(
  586. icur, &key_v, &data_v, lmdb.MDB_GET_BOTH))
  587. except KeyNotFoundError:
  588. # Triple not found.
  589. #logger.debug('spok / ck pair not found.')
  590. return Graph(self, uri=uri)
  591. ret = Graph(self, 1, uri=uri)
  592. ret.keys.add(&spok)
  593. return ret
  594. # ? ? ? c
  595. elif not any(triple_pattern):
  596. # Get all triples from the context
  597. try:
  598. _check(lmdb.mdb_cursor_get(
  599. icur, &key_v, &data_v, lmdb.MDB_SET))
  600. except KeyNotFoundError:
  601. # Triple not found.
  602. return Graph(self, uri=uri)
  603. _check(lmdb.mdb_cursor_count(icur, &ct))
  604. ret = Graph(self, ct, uri=uri)
  605. _check(lmdb.mdb_cursor_get(
  606. icur, &key_v, &data_v, lmdb.MDB_GET_MULTIPLE))
  607. while True:
  608. # Loop over page data.
  609. spok_page = <TripleKey*>data_v.mv_data
  610. for i in range(data_v.mv_size // TRP_KLEN):
  611. ret.keys.add(spok_page + i)
  612. try:
  613. # Get next page.
  614. _check(lmdb.mdb_cursor_get(
  615. icur, &key_v, &data_v, lmdb.MDB_NEXT_MULTIPLE))
  616. except KeyNotFoundError:
  617. return ret
  618. # Regular lookup. Filter _lookup() results by context.
  619. else:
  620. try:
  621. res = self._lookup(triple_pattern)
  622. except KeyNotFoundError:
  623. return Graph(self, uri=uri)
  624. key_v.mv_data = &ck
  625. key_v.mv_size = KLEN
  626. data_v.mv_size = TRP_KLEN
  627. flt_res = Graph(self, res.capacity, uri=uri)
  628. res.seek()
  629. while res.keys.get_next(&spok):
  630. data_v.mv_data = spok
  631. try:
  632. # Verify that the triple is associated with the
  633. # context being searched.
  634. _check(lmdb.mdb_cursor_get(
  635. icur, &key_v, &data_v, lmdb.MDB_GET_BOTH))
  636. except KeyNotFoundError:
  637. continue
  638. else:
  639. flt_res.keys.add(&spok)
  640. return flt_res
  641. finally:
  642. self._cur_close(icur)
  643. # Unfiltered lookup. No context checked.
  644. else:
  645. try:
  646. res = self._lookup(triple_pattern)
  647. except KeyNotFoundError:
  648. return Graph(self, uri=uri)
  649. return res
    cdef Graph _lookup(self, tuple triple_pattern):
        """
        Look up triples in the indices based on a triple pattern.

        The lookup strategy depends on which terms are bound (not ``None``):
        a direct ``spo:c`` lookup if all three are, a two-term or one-term
        index lookup if two or one are, and a full scan of ``spo:c`` if none
        is.

        :param tuple triple_pattern: 3 RDFLib terms, or ``None`` for unbound
            terms.

        :rtype: Graph
        :return: Matching triple keys. The graph is empty if a bound term has
            no key or if the fully bound triple is not found.
        """
        cdef:
            size_t ct = 0
            lmdb.MDB_stat db_stat
            lmdb.MDB_val spok_v, ck_v
            TripleKey spok
            Key sk, pk, ok, tk1, tk2, tk3

        s, p, o = triple_pattern

        # Resolve the keys of the bound terms. If any of them is unknown, no
        # triple can match.
        try:
            if s is not None:
                sk = self.to_key(s)
            if p is not None:
                pk = self.to_key(p)
            if o is not None:
                ok = self.to_key(o)
        except KeyNotFoundError:
            return Graph(self)

        if s is not None:
            tk1 = sk
            if p is not None:
                tk2 = pk
                # s p o
                if o is not None:
                    tk3 = ok
                    spok_v.mv_data = spok
                    spok_v.mv_size = TRP_KLEN
                    try:
                        spok = [tk1, tk2, tk3]
                        _check(lmdb.mdb_get(
                            self.txn, self.get_dbi('spo:c'), &spok_v, &ck_v))
                    except KeyNotFoundError:
                        return Graph(self)

                    matches = Graph(self, 1)
                    matches.keys.add(&spok)
                    return matches

                # s p ?
                return self._lookup_2bound(0, 1, [tk1, tk2])

            if o is not None: # s ? o
                tk2 = ok
                return self._lookup_2bound(0, 2, [tk1, tk2])

            # s ? ?
            return self._lookup_1bound(0, tk1)

        if p is not None:
            tk1 = pk
            if o is not None: # ? p o
                tk2 = ok
                return self._lookup_2bound(1, 2, [tk1, tk2])

            # ? p ?
            return self._lookup_1bound(1, tk1)

        if o is not None: # ? ? o
            tk1 = ok
            return self._lookup_1bound(2, tk1)

        # ? ? ?
        # Get all triples in the database.
        dcur = self._cur_open('spo:c')

        try:
            # The number of entries is used as the capacity of the result.
            _check(
                lmdb.mdb_stat(
                    self.txn, lmdb.mdb_cursor_dbi(dcur), &db_stat
                ), 'Error gathering DB stats.'
            )
            ct = db_stat.ms_entries
            ret = Graph(self, ct)
            #logger.debug(f'Triples found: {ct}')
            if ct == 0:
                return Graph(self)

            _check(lmdb.mdb_cursor_get(
                dcur, &key_v, &data_v, lmdb.MDB_FIRST))
            while True:
                spok = <TripleKey>key_v.mv_data
                ret.keys.add(&spok)

                # MDB_NEXT_NODUP skips the other contexts of the same
                # triple.
                try:
                    _check(lmdb.mdb_cursor_get(
                        dcur, &key_v, &data_v, lmdb.MDB_NEXT_NODUP))
                except KeyNotFoundError:
                    break

            return ret
        finally:
            self._cur_close(dcur)
  734. cdef Graph _lookup_1bound(self, unsigned char idx, Key luk):
  735. """
  736. Lookup triples for a pattern with one bound term.
  737. :param str idx_name: The index to look up as one of the keys of
  738. ``_lookup_ordering``.
  739. :param rdflib.URIRef term: Bound term to search for.
  740. :rtype: Iterator(bytes)
  741. :return: SPO keys matching the pattern.
  742. """
  743. cdef:
  744. unsigned int dbflags
  745. unsigned char term_order[3]
  746. size_t ct, i
  747. lmdb.MDB_cursor *icur
  748. lmdb.MDB_val key_v, data_v
  749. TripleKey spok
  750. logger.debug(f'lookup 1bound: {idx}, {luk}')
  751. term_order = lookup_ordering[idx]
  752. icur = self._cur_open(self.lookup_indices[idx])
  753. logging.debug(f'DB label: {self.lookup_indices[idx]}')
  754. logging.debug('term order: {}'.format(term_order[: 3]))
  755. try:
  756. key_v.mv_data = &luk
  757. key_v.mv_size = KLEN
  758. _check(lmdb.mdb_cursor_get(icur, &key_v, &data_v, lmdb.MDB_SET))
  759. _check(lmdb.mdb_cursor_count(icur, &ct))
  760. # Allocate memory for results.
  761. ret = Graph(self, ct)
  762. _check(lmdb.mdb_cursor_get(icur, &key_v, &data_v, lmdb.MDB_SET))
  763. _check(lmdb.mdb_cursor_get(
  764. icur, &key_v, &data_v, lmdb.MDB_GET_MULTIPLE))
  765. while True:
  766. lu_dset = <DoubleKey*>data_v.mv_data
  767. for i in range(data_v.mv_size // DBL_KLEN):
  768. spok[term_order[0]] = luk
  769. spok[term_order[1]] = lu_dset[i][0]
  770. spok[term_order[2]] = lu_dset[i][1]
  771. ret.keys.add(&spok)
  772. try:
  773. # Get results by the page.
  774. _check(lmdb.mdb_cursor_get(
  775. icur, &key_v, &data_v, lmdb.MDB_NEXT_MULTIPLE))
  776. except KeyNotFoundError:
  777. return ret
  778. finally:
  779. self._cur_close(icur)
  780. cdef Graph _lookup_2bound(
  781. self, unsigned char idx1, unsigned char idx2, DoubleKey tks
  782. ):
  783. """
  784. Look up triples for a pattern with two bound terms.
  785. :param str idx1: The index to look up as one of the keys of
  786. ``lookup_ordering_2bound``.
  787. :param rdflib.URIRef term1: First bound term to search for.
  788. :rtype: Iterator(bytes)
  789. :return: SPO keys matching the pattern.
  790. """
  791. cdef:
  792. unsigned char luk1_offset, luk2_offset
  793. unsigned int dbflags
  794. unsigned char term_order[3] # Lookup ordering
  795. size_t ct, i = 0
  796. lmdb.MDB_cursor* icur
  797. Graph ret
  798. DoubleKey luk
  799. TripleKey spok
  800. for i in range(3):
  801. if (
  802. idx1 in lookup_ordering_2bound[i][: 2]
  803. and idx2 in lookup_ordering_2bound[i][: 2]):
  804. term_order = lookup_ordering_2bound[i]
  805. if term_order[0] == idx1:
  806. luk1_offset = 0
  807. luk2_offset = 1
  808. else:
  809. luk1_offset = 1
  810. luk2_offset = 0
  811. dblabel = self.lookup_indices[i + 3] # skip 1bound index labels
  812. break
  813. if i == 2:
  814. raise ValueError(
  815. 'Indices {} and {} not found in LU keys.'.format(
  816. idx1, idx2))
  817. # Compose term keys in lookup key.
  818. luk[luk1_offset] = tks[0]
  819. luk[luk2_offset] = tks[1]
  820. icur = self._cur_open(dblabel)
  821. try:
  822. key_v.mv_data = luk
  823. key_v.mv_size = DBL_KLEN
  824. # Count duplicates for key and allocate memory for result set.
  825. _check(lmdb.mdb_cursor_get(icur, &key_v, &data_v, lmdb.MDB_SET))
  826. _check(lmdb.mdb_cursor_count(icur, &ct))
  827. ret = Graph(self, ct)
  828. _check(lmdb.mdb_cursor_get(icur, &key_v, &data_v, lmdb.MDB_SET))
  829. _check(lmdb.mdb_cursor_get(
  830. icur, &key_v, &data_v, lmdb.MDB_GET_MULTIPLE))
  831. while True:
  832. lu_dset = <Key*>data_v.mv_data
  833. for i in range(data_v.mv_size // KLEN):
  834. spok[term_order[0]] = luk[0]
  835. spok[term_order[1]] = luk[1]
  836. spok[term_order[2]] = lu_dset[i]
  837. ret.keys.add(&spok)
  838. try:
  839. # Get results by the page.
  840. _check(lmdb.mdb_cursor_get(
  841. icur, &key_v, &data_v, lmdb.MDB_NEXT_MULTIPLE))
  842. except KeyNotFoundError:
  843. return ret
  844. finally:
  845. self._cur_close(icur)
  cdef void _all_term_keys(self, term_type, cc.HashSet** tkeys) except *:
      """
      Collect all distinct keys of a (``s:po``, ``p:so``, ``o:sp``) index.

      The index scanned is the entry of ``self.lookup_indices`` found at
      the position of ``term_type`` within the string ``'spo'``.

      :param str term_type: One of ``s``, ``p`` or ``o``.
      :param cc.HashSet** tkeys: Output. Receives a hash set created by this
          method. The set stores the ``mv_data`` pointers handed back by
          LMDB, not copies of the keys. This method never frees the set.
      :raise ValueError: If ``term_type`` is not found in ``'spo'``.
      """
      cdef:
          lmdb.MDB_cursor* icur
          lmdb.MDB_val key_v
          cc.HashSetConf tkeys_conf

      idx_label = self.lookup_indices['spo'.index(term_type)]

      icur = self._cur_open(idx_label)
      try:
          cc.hashset_conf_init(&tkeys_conf)
          tkeys_conf.initial_capacity = 1024
          tkeys_conf.load_factor = .75
          tkeys_conf.key_length = KLEN
          tkeys_conf.key_compare = cc.CC_CMP_POINTER
          tkeys_conf.hash = cc.POINTER_HASH

          cc.hashset_new_conf(&tkeys_conf, tkeys)

          try:
              _check(lmdb.mdb_cursor_get(
                  icur, &key_v, NULL, lmdb.MDB_FIRST))
              while True:
                  cc.hashset_add(tkeys[0], key_v.mv_data)
                  # Jump straight to the next distinct key, skipping
                  # duplicate values stored under the current one.
                  _check(lmdb.mdb_cursor_get(
                      icur, &key_v, NULL, lmdb.MDB_NEXT_NODUP))
          except KeyNotFoundError:
              # Either the index is empty or the last key was reached.
              return
      finally:
          self._cur_close(icur)
  def all_terms(self, term_type):
      """
      Return all terms of a type (``s``, ``p``, or ``o``) in the store.

      :param str term_type: One of ``s``, ``p`` or ``o``.
      :rtype: set
      :return: The terms obtained by converting every key collected by
          ``_all_term_keys`` with ``from_key``.
      """
      cdef:
          void* cur = NULL
          # Must start out as NULL: if ``_all_term_keys`` raises before it
          # creates the set, the ``finally`` clause below would otherwise
          # test and free an uninitialized pointer.
          cc.HashSet* tkeys = NULL
          cc.HashSetIter it

      ret = set()

      try:
          self._all_term_keys(term_type, &tkeys)
          cc.hashset_iter_init(&it, tkeys)
          while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
              ret.add(self.from_key((<Key*>cur)[0]))
      finally:
          # NOTE(review): plain free() releases only the set structure;
          # confirm whether the hash set library's own destructor should be
          # used here to release its internal table as well.
          if tkeys is not NULL:
              free(tkeys)

      return ret
  cpdef tuple all_namespaces(self):
      """
      Return all registered namespaces.

      :rtype: tuple(tuple(str, str))
      :return: One ``(key, value)`` pair per entry of the ``pfx:ns``
          database, both decoded to strings (presumably prefix and
          namespace, going by the database label). An empty tuple if the
          database has no entries.
      """
      cdef:
          lmdb.MDB_cursor* dcur
          lmdb.MDB_val key_v, data_v

      ret = []

      dcur = self._cur_open('pfx:ns')
      try:
          try:
              _check(lmdb.mdb_cursor_get(
                  dcur, &key_v, &data_v, lmdb.MDB_FIRST))
              while True:
                  ret.append((
                      (<unsigned char *>key_v.mv_data)[: key_v.mv_size].decode(),
                      (<unsigned char *>data_v.mv_data)[: data_v.mv_size].decode()))
                  _check(lmdb.mdb_cursor_get(
                      dcur, &key_v, &data_v, lmdb.MDB_NEXT))
          except KeyNotFoundError:
              # Empty database, or the cursor ran past the last entry.
              pass

          return tuple(ret)
      finally:
          self._cur_close(dcur)
  cdef void all_contexts(
      self, Key** ctx, size_t* sz, triple=None
  ) except *:
      """
      Get a list of all contexts.

      If ``triple`` is given and all of its members are truthy, the result
      is restricted to the values stored under that triple's key in the
      ``spo:c`` index. Otherwise every key of the ``c:`` index is returned.

      :param Key** ctx: Output. Receives a ``malloc``'d array of keys, or
          ``NULL`` if nothing was found. This method does not free the
          array on success, so the caller presumably owns it.
      :param size_t* sz: Output. Number of keys written to ``ctx[0]``;
          always set, and 0 when ``ctx[0]`` is ``NULL``.
      :param tuple triple: Optional triple of terms used to filter contexts.
      :raise MemoryError: If the result array cannot be allocated.
      """
      cdef:
          lmdb.MDB_cursor* cur
          lmdb.MDB_stat stat
          lmdb.MDB_val key_v, data_v
          TripleKey spok
          bint by_triple

      # Give both outputs a defined value on every exit path, including the
      # "nothing found" returns and any exception raised below.
      ctx[0] = NULL
      sz[0] = 0

      by_triple = bool(triple and all(triple))

      if by_triple:
          cur = self._cur_open('spo:c')
      else:
          cur = self._cur_open('c:')

      try:
          # The total entry count of the database is an upper bound for the
          # number of keys that can be collected from it.
          _check(lmdb.mdb_stat(
              self.txn, lmdb.mdb_cursor_dbi(cur), &stat))

          if by_triple:
              spok = [
                  self.to_key(triple[0]),
                  self.to_key(triple[1]),
                  self.to_key(triple[2]),
              ]
              key_v.mv_data = spok
              key_v.mv_size = TRP_KLEN
              try:
                  _check(lmdb.mdb_cursor_get(
                      cur, &key_v, &data_v, lmdb.MDB_SET_KEY))
              except KeyNotFoundError:
                  return

              ctx[0] = <Key*>malloc(stat.ms_entries * KLEN)
              if ctx[0] is NULL:
                  raise MemoryError()

              while True:
                  ctx[0][sz[0]] = (<Key*>data_v.mv_data)[0]
                  sz[0] += 1
                  try:
                      _check(lmdb.mdb_cursor_get(
                          cur, &key_v, &data_v, lmdb.MDB_NEXT_DUP))
                  except KeyNotFoundError:
                      break

          else:
              try:
                  _check(lmdb.mdb_cursor_get(
                      cur, &key_v, &data_v, lmdb.MDB_FIRST))
              except KeyNotFoundError:
                  return

              ctx[0] = <Key*>malloc(stat.ms_entries * KLEN)
              if ctx[0] is NULL:
                  raise MemoryError()

              while True:
                  ctx[0][sz[0]] = (<Key*>key_v.mv_data)[0]
                  sz[0] += 1
                  try:
                      _check(lmdb.mdb_cursor_get(
                          cur, &key_v, NULL, lmdb.MDB_NEXT))
                  except KeyNotFoundError:
                      break

      except:
          # Do not leak, or hand out, a partially filled array when the
          # caller is about to see an exception. free(NULL) is a no-op.
          free(ctx[0])
          ctx[0] = NULL
          sz[0] = 0
          raise

      finally:
          self._cur_close(cur)
  988. # Key conversion methods.
  cdef inline void lookup_term(self, const Key tk, Buffer* data) except *:
      """
      Fetch the serialized term stored under a key in the ``t:st`` database.

      :param Key tk: The key to be looked up.
      :param Buffer* data: Output. Its ``addr`` and ``sz`` members are set to
          the address and size of the value returned by LMDB.
      """
      cdef:
          int rc
          lmdb.MDB_val key_v, data_v

      key_v.mv_size = KLEN
      key_v.mv_data = &tk

      rc = lmdb.mdb_get(self.txn, self.get_dbi('t:st'), &key_v, &data_v)
      _check(rc, f'Error getting data for key \'{tk}\'.')

      # Expose the value through the output buffer without copying it.
      data.sz = data_v.mv_size
      data.addr = data_v.mv_data
  cdef object from_key(self, const Key tk):
      """
      Convert a single key into one term.

      The serialized term is fetched with ``lookup_term`` and handed to
      ``deserialize_to_rdflib``.

      :param Key tk: The key to be converted.
      :return: Whatever ``deserialize_to_rdflib`` produces for the stored
          term.
      """
      cdef Buffer term_buf

      logger.info(f'From key:{tk}')

      self.lookup_term(tk, &term_buf)
      logger.info(f'from_key buffer: {buffer_dump(&term_buf)}')

      # TODO Make Term a class and return that.
      return deserialize_to_rdflib(&term_buf)
  cdef Key to_key(self, term) except? 0:
      """
      Convert a term into a key, inserting it in the term key store if
      it is not there yet.

      The term is serialized and hashed, and the hash is looked up in the
      ``th:t`` index. If it is found, the stored key is returned. Otherwise
      the serialized term is appended to ``t:st`` and its hash is indexed
      in ``th:t``. When the current transaction is not read-write, the
      insertion happens in a temporary write transaction, after which
      ``self.txn`` is reset and renewed so that it sees the new term.

      :param rdflib.Term term: An RDFLib term (URIRef, BNode, Literal).

      :rtype: Key
      :return: The key under which the term is stored.
      """
      cdef:
          lmdb.MDB_txn *_txn
          lmdb.MDB_val key_v, data_v
          Hash128 thash
          Buffer pk_t
          Key tk
          bint own_txn
          # True only while a temporary transaction opened here still has
          # to be disposed of by this method.
          bint own_txn_open = False

      serialize_from_rdflib(term, &pk_t)
      hash128(&pk_t, &thash)
      key_v.mv_data = thash
      key_v.mv_size = HLEN

      try:
          _check(lmdb.mdb_get(
              self.txn, self.get_dbi(b'th:t'), &key_v, &data_v)
          )
          return (<Key*>data_v.mv_data)[0]
      except KeyNotFoundError:
          # Key is not in the store: fall through and add it.
          pass

      own_txn = not self.is_txn_rw
      if own_txn:
          # Open new R/W transaction.
          _check(lmdb.mdb_txn_begin(self.dbenv, NULL, 0, &_txn))
          own_txn_open = True
      else:
          # Use existing R/W transaction.
          _txn = self.txn

      try:
          # Main entry.
          tk = self._append(&pk_t, b't:st', txn=_txn)

          # Index.
          data_v.mv_data = &tk
          data_v.mv_size = KLEN
          _check(lmdb.mdb_put(
              _txn, self.get_dbi(b'th:t'), &key_v, &data_v, 0
          ))

          if own_txn:
              # mdb_txn_commit() frees the transaction handle whether it
              # succeeds or fails, so from here on the handle must not be
              # aborted by the error handler below.
              own_txn_open = False
              _check(lmdb.mdb_txn_commit(_txn))
              # Kick the main transaction to see the new terms.
              lmdb.mdb_txn_reset(self.txn)
              _check(lmdb.mdb_txn_renew(self.txn))

          return tk
      except:
          if own_txn_open:
              lmdb.mdb_txn_abort(_txn)
          raise
  cdef Key _append(
      self, Buffer *value,
      unsigned char *dblabel=b'', lmdb.MDB_txn *txn=NULL,
      unsigned int flags=0
  ) except? 0:
      """
      Append one value to the end of a database under a newly minted key.

      The new key is the last key of the database plus one, or
      ``FIRST_KEY`` if the database has no entries.

      :param Buffer* value: Value to append; ``addr`` and ``sz`` describe
          the bytes that are written.
      :param unsigned char* dblabel: Label of the database to append to.
      :param lmdb.MDB_txn* txn: Transaction to operate in. Defaults to
          ``self.txn`` when ``NULL``.
      :param unsigned int flags: Extra write flags, combined with
          ``MDB_APPEND``.

      :rtype: Key
      :return: Key inserted.
      """
      cdef:
          lmdb.MDB_cursor *cur
          Key new_idx
          lmdb.MDB_val key_v, data_v

      if txn is NULL:
          txn = self.txn

      cur = self._cur_open(dblabel, txn=txn)

      try:
          _check(lmdb.mdb_cursor_get(cur, &key_v, NULL, lmdb.MDB_LAST))
      except KeyNotFoundError:
          new_idx = FIRST_KEY
      else:
          new_idx = (<Key*>key_v.mv_data)[0] + 1
      finally:
          self._cur_close(cur)

      key_v.mv_data = &new_idx
      logger.debug(f'New index: {new_idx}')
      logger.debug('Key data inserted: {}'.format((<unsigned char*>key_v.mv_data)[:KLEN]))
      key_v.mv_size = KLEN
      data_v.mv_data = value.addr
      data_v.mv_size = value.sz

      # Check the result: a failed write must not be reported to the caller
      # as a successfully inserted key.
      _check(lmdb.mdb_put(
          txn, self.get_dbi(dblabel), &key_v, &data_v,
          flags | lmdb.MDB_APPEND))

      return new_idx