瀏覽代碼

Start working on alt graph implementation.

Stefano Cossu 6 年之前
父節點
當前提交
0566859899
共有 2 個文件被更改,包括 93 次插入0 次删除
  1. 79 0
      lakesuperior/model/graph/graph.pyx
  2. 14 0
      lakesuperior/store/ldp_rs/keyset.pyx

+ 79 - 0
lakesuperior/model/graph/graph.pyx

@@ -0,0 +1,79 @@
+from libc.string cimport memcpy
+from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
+
+from lakesuperior.util.hash cimport HLEN_32, Hash32, hash32
+
+# Untyped pointer to the raw bytes handed to ``VarSet.add``.
+ctypedef void *SetItem
+# Boundaries of the items packed in a blob: ``addr`` points to ``ct`` values,
+# which ``VarSet.add`` reads as the end offset of each item within the blob.
+ctypedef struct Index:
+    size_t *addr
+    size_t ct
+
+cdef class VarSet:
+    """
+    Variable-size set of variable-size values.
+
+    Item contents are appended to a single memory blob; a keyset holding one
+    ``Hash32`` per stored item is used to reject duplicates.
+
+    NOTE(review): ``Keyset`` and ``Buffer`` are used below but are not
+    cimported by the visible module header; they must be brought into scope
+    for this module to compile.
+    """
+    cdef:
+        # Data blob. Stored contiguously in memory, and found by index.
+        void *_data
+        # Total size of data, in bytes.
+        size_t _data_sz
+        # Index used to find start and end of each item.
+        # NOTE(review): no method of this class populates this index yet.
+        Index _index
+        # Keyset of hashes of the set items.
+        Keyset _hashes
+
+    def __cinit__(self):
+        self._data_sz = 0
+        self._index.addr = NULL
+        self._index.ct = 0
+        self._hashes = Keyset(0, sizeof(Hash32))
+        self._data = PyMem_Malloc(0)
+        if not self._data:
+            raise MemoryError('Unable to allocate memory for set data.')
+
+
+    def __dealloc__(self):
+        # PyMem_Free accepts NULL, so a failed ``__cinit__`` is safe here.
+        PyMem_Free(self._data)
+
+
+    cdef int add(self, const SetItem data, Index *idx) except -1:
+        """
+        Add a number of items.
+
+        The items' content as a blob and their end boundaries must be given
+        as an array of ``size_t``.
+
+        :param data: Blob holding the contents of all the candidate items,
+            one after the other.
+        :param idx: ``idx.addr[i]`` is the end offset of item ``i`` within
+            ``data``; ``idx.ct`` is the number of items.
+
+        :return: 0 on success (-1 is reserved for a raised exception).
+        :raise MemoryError: if the data blob cannot be resized.
+        """
+        cdef:
+            size_t i, cur = 0, hash_ct
+            void *_tmp_data
+            Hash32 item_hash
+            Buffer msg
+
+        # Nothing to add. This also keeps ``idx.ct - 1`` from wrapping around.
+        if idx.ct == 0:
+            return 0
+
+        # Resize data sets to the maximum possible size for this function
+        # call: what is already stored plus every incoming byte.
+        _tmp_data = PyMem_Realloc(
+                self._data, self._data_sz + idx.addr[idx.ct - 1])
+        if not _tmp_data:
+            raise MemoryError('Unable to allocate memory for set data.')
+        # The previous pointer is invalid after a successful realloc; store
+        # the new one right away so that an exception raised further down
+        # cannot leave ``__dealloc__`` with a dangling pointer.
+        self._data = _tmp_data
+
+        # Number of hashes actually stored, as opposed to allocated.
+        hash_ct = self._hashes.ct
+        self._hashes.resize(hash_ct + idx.ct)
+
+        for i in range(idx.ct):
+            # Iterate over the items in the index and add those that are not
+            # duplicates.
+            msg.addr = <unsigned char *>data + cur
+            msg.sz = idx.addr[i] - cur
+            hash32(&msg, &item_hash)
+
+            # NOTE(review): the keyset has already been grown above, so if
+            # ``contains`` scans all ``ct`` slots it also reads slots that
+            # have not been written yet. Confirm against Keyset.
+            if not self._hashes.contains(item_hash):
+                # Append to the data, right after the bytes already stored.
+                memcpy(
+                        <unsigned char *>self._data + self._data_sz,
+                        msg.addr, msg.sz)
+                # Append to the hashes keyset.
+                # Assumes Keyset exposes its buffer as ``data`` -- TODO
+                # confirm against the Keyset declaration.
+                memcpy(
+                        self._hashes.data + hash_ct * HLEN_32,
+                        item_hash, HLEN_32)
+                hash_ct += 1
+                # Record the memory expansion.
+                self._data_sz += msg.sz
+
+            cur = idx.addr[i]
+
+        # Shrink data back to their actual size.
+        self._hashes.resize(hash_ct)
+        _tmp_data = PyMem_Realloc(self._data, self._data_sz)
+        if not _tmp_data:
+            raise MemoryError('Unable to allocate memory for set data.')
+        self._data = _tmp_data
+
+        return 0
+

+ 14 - 0
lakesuperior/store/ldp_rs/keyset.pyx

@@ -1,3 +1,4 @@
+from libc.string cimport memcmp
 from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 
 cdef class Keyset:
@@ -157,3 +158,16 @@ cdef class Keyset:
 
         return True
 
+
+    cdef bint contains(self, const void *val):
+        """
+        Whether a value exists in the set.
+
+        :param val: Pointer to the value to look for. ``self.itemsize`` bytes
+            are compared.
+
+        :return: ``True`` as soon as a comparison matches, ``False`` if the
+            loop ends without a match.
+        """
+        cdef void *cval
+
+        self.reset()
+        # NOTE(review): ``cval`` is never assigned, and ``next(val)`` is the
+        # Python builtin applied to the searched value rather than an
+        # iteration over this keyset. This presumably should advance the
+        # keyset cursor and point ``cval`` at the current stored item; the
+        # iteration method is not visible here, so confirm against the rest
+        # of the class.
+        while next(val):
+            if memcmp(val, cval, self.itemsize) == 0:
+                return True
+        return False
+