keyset.pyx 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. import logging
  2. from libc.string cimport memcmp, memcpy
  3. from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
  4. from cython.parallel import prange
  5. cimport lakesuperior.model.callbacks as cb
  6. from lakesuperior.model.base cimport NULL_TRP, TRP_KLEN, TripleKey
  7. logger = logging.getLogger(__name__)
  8. cdef class Keyset:
  9. """
  10. Memory-contiguous array of ``TripleKey``s.
  11. The keys are ``size_t`` values that are linked to terms in the triplestore.
  12. Therefore, a triplestore lookup is necessary to view or use the terms, but
  13. several types of manipulation and filtering can be done very efficiently
  14. without looking at the term values.
  15. The set is not checked for duplicates all the time: e.g., when creating
  16. from a single set of triples coming from the store, the duplicate check is
  17. turned off for efficiency and because the source is guaranteed to provide
  18. unique values. When merging with other sets, duplicate checking should be
  19. turned on.
  20. Since this class is based on a contiguous block of memory, it is best not
  21. to do targeted manipulation. Several operations involve copying the whole
  22. data block, so e.g. bulk removal and intersection are much more efficient
  23. than individual record operations.
  24. """
  25. def __cinit__(self, size_t capacity=0, float expand_ratio=.75):
  26. """
  27. Initialize and allocate memory for the data set.
  28. :param size_t capacity: Number of elements to be accounted for.
  29. :param float expand_ratio: by how much, relatively to the current
  30. size, the memory block is expanded when full. A value of 0
  31. disables automatic expansion, and inserting beyond capacity will
  32. raise an error.
  33. """
  34. self.capacity = capacity
  35. self.expand_ratio = expand_ratio
  36. self.data = <TripleKey*>PyMem_Malloc(self.capacity * TRP_KLEN)
  37. if capacity and not self.data:
  38. raise MemoryError('Error allocating Keyset data.')
  39. self.cur = 0
  40. self.free_i = 0
  41. def __dealloc__(self):
  42. """
  43. Free the memory.
  44. This is called when the Python instance is garbage collected, which
  45. makes it handy to safely pass a Keyset instance across functions.
  46. """
  47. PyMem_Free(self.data)
  48. # Access methods.
  49. cdef void seek(self, size_t idx=0):
  50. """
  51. Place the cursor at a given index, 0 by default.
  52. :param size_t idx: Position to place the cursor. The position can be
  53. at maximum the next unused slot, any value higher than that will
  54. position the cursor at the next unused slot.
  55. """
  56. self.cur = min(idx, self.free_i)
  57. cdef size_t size(self):
  58. """
  59. Size of the object as the number of occupied data slots.
  60. Note that this is different from :py:data:`capacity`_, which indicates
  61. the number of allocated items in memory.
  62. """
  63. return self.free_i
  64. cdef size_t tell(self):
  65. """
  66. Tell the position of the cursor in the keyset.
  67. """
  68. return self.cur
  69. cdef inline bint get_next(self, TripleKey* val):
  70. """
  71. Get the current value and advance the cursor by 1.
  72. :param void *val: Addres of value returned. It is NULL if
  73. the end of the buffer was reached.
  74. :rtype: bint
  75. :return: True if a value was found, False if the end of the buffer
  76. has been reached.
  77. """
  78. if self.cur >= self.free_i:
  79. return False
  80. val[0] = self.data[self.cur]
  81. self.cur += 1
  82. return True
  83. cdef inline int add(
  84. self, const TripleKey* val, bint check_dup=False, bint check_cap=True
  85. ) except -1:
  86. """
  87. Add a triple key to the array.
  88. """
  89. # Check for deleted triples and optionally duplicates.
  90. if check_dup and self.contains(val):
  91. return 1
  92. if check_cap and self.free_i >= self.capacity:
  93. if self.expand_ratio > 0:
  94. # In some casees, a very small initial value and ratio may
  95. # round down to a zero increase, so the baseline increase is
  96. # 1 element.
  97. self.resize(
  98. 1 + <size_t>(self.capacity * (1 + self.expand_ratio))
  99. )
  100. else:
  101. raise MemoryError('No space left in key set.')
  102. self.data[self.free_i] = val[0]
  103. self.free_i += 1
  104. return 0
  105. cdef void remove(self, const TripleKey* val) except *:
  106. """
  107. Remove a triple key.
  108. This method replaces a triple with NULL_TRP if found. It
  109. does not reclaim space. Therefore, if many removal operations are
  110. forseen, using :py:meth:`subtract`_ is advised.
  111. """
  112. cdef:
  113. TripleKey stored_val
  114. self.seek()
  115. while self.get_next(&stored_val):
  116. #logger.info(f'Looking up for removal: {stored_val}')
  117. if memcmp(val, stored_val, TRP_KLEN) == 0:
  118. memcpy(&stored_val, NULL_TRP, TRP_KLEN)
  119. return
  120. cdef bint contains(self, const TripleKey* val):
  121. """
  122. Whether a value exists in the set.
  123. """
  124. cdef size_t i
  125. for i in range(self.free_i):
  126. # o is least likely to match.
  127. if (
  128. val[0][2] == self.data[i][2] and
  129. val[0][0] == self.data[i][0] and
  130. val[0][1] == self.data[i][1]
  131. ):
  132. return True
  133. return False
  134. cdef Keyset copy(self):
  135. """
  136. Copy a Keyset.
  137. """
  138. cdef Keyset new_ks = Keyset(
  139. self.capacity, expand_ratio=self.expand_ratio
  140. )
  141. memcpy(new_ks.data, self.data, self.capacity * TRP_KLEN)
  142. new_ks.seek()
  143. new_ks.free_i = self.free_i
  144. return new_ks
  145. cdef Keyset sparse_copy(self):
  146. """
  147. Copy a Keyset and plug holes.
  148. ``NULL_TRP`` values left from removing triple keys are skipped in the
  149. copy and the set is shrunk to its used size.
  150. """
  151. cdef:
  152. TripleKey val
  153. Keyset new_ks = Keyset(self.capacity, self.expand_ratio)
  154. self.seek()
  155. while self.get_next(&val):
  156. if val != NULL_TRP:
  157. new_ks.add(&val)
  158. new_ks.resize()
  159. return new_ks
  160. cdef void resize(self, size_t size=0) except *:
  161. """
  162. Change the array capacity.
  163. :param size_t size: The new capacity size. If not specified or 0, the
  164. array is shrunk to the last used item. The resulting size
  165. therefore will always be greater than 0. The only exception
  166. to this is if the specified size is 0 and no items have been added
  167. to the array, in which case the array will be effectively shrunk
  168. to 0.
  169. """
  170. if not size:
  171. size = self.free_i
  172. tmp = <TripleKey*>PyMem_Realloc(self.data, size * TRP_KLEN)
  173. if not tmp:
  174. raise MemoryError('Could not reallocate Keyset data.')
  175. self.data = tmp
  176. self.capacity = size
  177. self.seek()
  178. cdef Keyset lookup(self, const Key sk, const Key pk, const Key ok):
  179. """
  180. Look up triple keys.
  181. This works in a similar way that the ``Graph`` and ``LmdbStore``
  182. methods work.
  183. Any and all the terms may be NULL. A NULL term is treated as unbound.
  184. :param const Key* sk: s key pointer.
  185. :param const Key* pk: p key pointer.
  186. :param const Key* ok: o key pointer.
  187. """
  188. cdef:
  189. TripleKey spok
  190. Keyset ret = Keyset(self.capacity)
  191. Key k1, k2
  192. key_cmp_fn_t cmp_fn
  193. if sk and pk and ok: # s p o
  194. pass # TODO
  195. elif sk:
  196. k1 = sk
  197. if pk: # s p ?
  198. k2 = pk
  199. cmp_fn = cb.lookup_skpk_cmp_fn
  200. elif ok: # s ? o
  201. k2 = ok
  202. cmp_fn = cb.lookup_skok_cmp_fn
  203. else: # s ? ?
  204. cmp_fn = cb.lookup_sk_cmp_fn
  205. elif pk:
  206. k1 = pk
  207. if ok: # ? p o
  208. k2 = ok
  209. cmp_fn = cb.lookup_pkok_cmp_fn
  210. else: # ? p ?
  211. cmp_fn = cb.lookup_pk_cmp_fn
  212. elif ok: # ? ? o
  213. k1 = ok
  214. cmp_fn = cb.lookup_ok_cmp_fn
  215. else: # ? ? ?
  216. return self.copy()
  217. self.seek()
  218. while self.get_next(&spok):
  219. if cmp_fn(&spok, k1, k2):
  220. ret.add(&spok)
  221. ret.resize()
  222. return ret
  223. ## Boolean operations.
  224. cdef Keyset merge(Keyset ks1, Keyset ks2):
  225. """
  226. Create a Keyset by merging an``ks2`` Keyset with the current one.
  227. :rtype: Keyset
  228. """
  229. cdef:
  230. TripleKey val
  231. Keyset ks3 = ks1.copy()
  232. ks2.seek()
  233. while ks2.get_next(&val):
  234. ks3.add(&val, True)
  235. ks3.resize()
  236. return ks3
  237. cdef Keyset subtract(Keyset ks1, Keyset ks2):
  238. """
  239. Create a Keyset by subtracting an``ks2`` Keyset from the current one.
  240. :rtype: Keyset
  241. """
  242. cdef:
  243. TripleKey val
  244. Keyset ks3 = Keyset(ks1.capacity)
  245. ks1.seek()
  246. while ks1.get_next(&val):
  247. if val != NULL_TRP and not ks2.contains(&val):
  248. ks3.add(&val)
  249. ks3.resize()
  250. return ks3
  251. cdef Keyset intersect(Keyset ks1, Keyset ks2):
  252. """
  253. Create a Keyset by intersection with an``ks2`` Keyset.
  254. :rtype: Keyset
  255. """
  256. cdef:
  257. TripleKey val
  258. Keyset ks3 = Keyset(ks1.capacity)
  259. ks1.seek()
  260. while ks1.get_next(&val):
  261. if val != NULL_TRP and ks2.contains(&val):
  262. ks3.add(&val)
  263. ks3.resize()
  264. return ks3
  265. cdef Keyset xor(Keyset ks1, Keyset ks2):
  266. """
  267. Create a Keyset by disjunction (XOR) with an``ks2`` Keyset.
  268. :rtype: Keyset
  269. """
  270. cdef:
  271. TripleKey val
  272. Keyset ks3 = Keyset(ks1.capacity + ks2.capacity)
  273. ks1.seek()
  274. while ks1.get_next(&val):
  275. if val != NULL_TRP and not ks2.contains(&val):
  276. ks3.add(&val)
  277. ks2.seek()
  278. while ks2.get_next(&val):
  279. if val != NULL_TRP and not ks1.contains(&val):
  280. ks3.add(&val)
  281. ks3.resize()
  282. return ks3