Ver Fonte

Merge pull request #84 from scossu/size_t_keys

Size t keys
Stefano Cossu há 6 anos atrás
pai
commit
b59a2224d6

+ 1 - 1
.travis.yml

@@ -10,7 +10,7 @@ matrix:
       sudo: true
 
 install:
-  - pip install Cython==0.29 cymem
+  - pip install Cython==0.29.6 cymem
   - pip install -e .
 script:
   - python setup.py test

+ 1 - 1
ext/lmdb

@@ -1 +1 @@
-Subproject commit 2a5eaad6919ce6941dec4f0d5cce370707a00ba7
+Subproject commit 5033a08c86fb6ef0adddabad327422a1c0c0069a

+ 0 - 88
lakesuperior/cy_include/calg.pxd

@@ -1,88 +0,0 @@
-cdef extern from 'set.h':
-    #ctypedef _Set Set
-    #ctypedef _SetEntry SetEntry
-    ctypedef void *SetValue
-
-    ctypedef unsigned int (*SetHashFunc)(SetValue value)
-    ctypedef bint (*SetEqualFunc)(SetValue value1, SetValue value2)
-    ctypedef void (*SetFreeFunc)(SetValue value)
-
-    ctypedef struct SetEntry:
-        SetValue data
-        SetEntry *next
-
-    ctypedef struct _Set:
-        SetEntry **table
-        unsigned int entries
-        unsigned int table_size
-        unsigned int prime_index
-        SetHashFunc hash_func
-        SetEqualFunc equal_func
-        SetFreeFunc free_func
-
-    ctypedef _Set Set
-
-    ctypedef struct SetIterator:
-        pass
-
-    Set *set_new(SetHashFunc hash_func, SetEqualFunc equal_func)
-    void set_free(Set *set)
-    # TODO This should return an int, ideally. See
-    # https://github.com/fragglet/c-algorithms/issues/20
-    bint set_insert(Set *set, SetValue data)
-    bint set_insert_or_assign(Set *set, SetValue *data)
-    bint set_query(Set *set, SetValue data)
-    bint set_enlarge(Set *set)
-    unsigned int set_num_entries(Set *set)
-    SetValue *set_to_array(Set *set)
-    Set *set_union(Set *set1, Set *set2)
-    Set *set_intersection(Set *set1, Set *set2)
-    void set_iterate(Set *set, SetIterator *iter)
-    bint set_iter_has_more(SetIterator *iterator)
-    SetValue set_iter_next(SetIterator *iterator)
-
-
-cdef extern from 'hash-table.h':
-    ctypedef void *HashTableKey
-    ctypedef void *HashTableValue
-
-    ctypedef struct HashTablePair:
-        HashTableKey key
-        HashTableKey value
-
-    ctypedef struct HashTableEntry:
-        HashTablePair pair
-        HashTableEntry *next
-
-    ctypedef struct HashTable:
-        HashTableEntry **table
-        unsigned int table_size
-        unsigned int entries
-        unsigned int prime_index
-
-    ctypedef struct HashTableIterator:
-        pass
-
-    ctypedef unsigned int (*HashTableHashFunc)(HashTableKey value)
-    ctypedef bint (*HashTableEqualFunc)(
-            HashTableKey value1, HashTableKey value2)
-    ctypedef void (*HashTableKeyFreeFunc)(HashTableKey value)
-    ctypedef void (*HashTableValueFreeFunc)(HashTableValue value)
-
-
-    HashTable *hash_table_new(
-            HashTableHashFunc hash_func, HashTableEqualFunc equal_func)
-    void hash_table_free(HashTable *hash_table)
-    void hash_table_register_free_functions(
-            HashTable *hash_table, HashTableKeyFreeFunc key_free_func,
-            HashTableValueFreeFunc value_free_func)
-    int hash_table_insert(
-            HashTable *hash_table, HashTableKey key, HashTableValue value)
-    HashTableValue hash_table_lookup(
-            HashTable *hash_table, HashTableKey key)
-    bint hash_table_remove(HashTable *hash_table, HashTableKey key)
-    unsigned int hash_table_num_entries(HashTable *hash_table)
-    void hash_table_iterate(HashTable *hash_table, HashTableIterator *iter)
-    bint hash_table_iter_has_more(HashTableIterator *iterator)
-    HashTablePair hash_table_iter_next(HashTableIterator *iterator)
-

+ 40 - 33
lakesuperior/cy_include/collections.pxd

@@ -9,7 +9,7 @@ ctypedef int (*key_compare_ft)(const void* key1, const void* key2)
 
 cdef extern from "common.h":
 
-    cdef enum cc_stat:
+    enum cc_stat:
         CC_OK
         CC_ERR_ALLOC
         CC_ERR_INVALID_CAPACITY
@@ -19,47 +19,50 @@ cdef extern from "common.h":
         CC_ERR_VALUE_NOT_FOUND
         CC_ERR_OUT_OF_RANGE
         CC_ITER_END
+
+    key_compare_ft CC_CMP_STRING
+    key_compare_ft CC_CMP_POINTER
 #
 #    int cc_common_cmp_str(const void* key1, const void* key2)
 #
 #    int cc_common_cmp_ptr(const void* key1, const void* key2)
 
-#cdef extern from "array.h":
+cdef extern from "array.h":
+
+    ctypedef struct Array:
+        pass
+
+    ctypedef struct ArrayConf:
+        size_t          capacity
+        float           exp_factor
+        mem_alloc_ft  mem_alloc
+        mem_calloc_ft mem_calloc
+        mem_free_ft   mem_free
+
+    ctypedef struct ArrayIter:
+        Array* ar
+        size_t index
+        bint last_removed
 
-#    ctypedef struct Array:
-#        pass
-#
-#    ctypedef struct ArrayConf:
-#        size_t          capacity
-#        float           exp_factor
-#        mem_alloc_ft  mem_alloc
-#        mem_calloc_ft mem_calloc
-#        mem_free_ft   mem_free
-#
-#    ctypedef struct ArrayIter:
-#        Array* ar
-#        size_t index
-#        bint last_removed
-#
 #    ctypedef struct ArrayZipIter:
 #        Array* ar1
 #        Array* ar2
 #        size_t index
 #        bint last_removed
 #
-#    cc_stat array_new(Array** out)
-#
-#    cc_stat array_new_conf(ArrayConf* conf, Array** out)
-#
-#    void array_conf_init(ArrayConf* conf)
-#
-#    void array_destroy(Array* ar)
-#
+    cc_stat array_new(Array** out)
+
+    cc_stat array_new_conf(ArrayConf* conf, Array** out)
+
+    void array_conf_init(ArrayConf* conf)
+
+    void array_destroy(Array* ar)
+
 #    ctypedef void (*_array_destroy_cb_cb_ft)(void*)
 #
 #    void array_destroy_cb(Array* ar, _array_destroy_cb_cb_ft cb)
 #
-#    #cc_stat array_add(Array* ar, void* element)
+    cc_stat array_add(Array* ar, void* element)
 #
 #    #cc_stat array_add_at(Array* ar, void* element, size_t index)
 #
@@ -125,9 +128,9 @@ cdef extern from "common.h":
 #
 #    cc_stat array_filter(Array* ar, _array_filter_predicate_ft predicate, Array** out)
 #
-#    void array_iter_init(ArrayIter* iter, Array* ar)
-#
-#    cc_stat array_iter_next(ArrayIter* iter, void** out)
+    void array_iter_init(ArrayIter* iter, Array* ar)
+
+    cc_stat array_iter_next(ArrayIter* iter, void** out)
 #
 #    cc_stat array_iter_remove(ArrayIter* iter, void** out)
 #
@@ -181,6 +184,10 @@ cdef extern from "hashtable.h":
         TableEntry* prev_entry
         TableEntry* next_entry
 
+    hash_ft GENERAL_HASH
+    hash_ft STRING_HASH
+    hash_ft POINTER_HASH
+
 #    size_t get_table_index(HashTable *table, void *key)
 #
 #    void hashtable_conf_init(HashTableConf* conf)
@@ -209,10 +216,10 @@ cdef extern from "hashtable.h":
 #
 #    cc_stat hashtable_get_values(HashTable* table, Array** out)
 #
-#    size_t hashtable_hash_string(void* key, int len, uint32_t seed)
-#
-#    size_t hashtable_hash(void* key, int len, uint32_t seed)
-#
+    size_t hashtable_hash_string(void* key, int len, uint32_t seed)
+
+    size_t hashtable_hash(void* key, int len, uint32_t seed)
+
     size_t hashtable_hash_ptr(void* key, int len, uint32_t seed)
 #
 #    ctypedef void (*_hashtable_foreach_key_op_ft)(void*)

+ 15 - 1
lakesuperior/model/base.pxd

@@ -1,5 +1,19 @@
-from lakesuperior.cy_include cimport cytpl as tpl
+cimport lakesuperior.cy_include.cytpl as tpl
 
 ctypedef tpl.tpl_bin Buffer
 
+# NOTE This may change in the future, e.g. if a different key size is to
+# be forced.
+ctypedef size_t Key
+
+ctypedef Key DoubleKey[2]
+ctypedef Key TripleKey[3]
+ctypedef Key QuadKey[4]
+
+cdef enum:
+    KLEN = sizeof(Key)
+    DBL_KLEN = 2 * sizeof(Key)
+    TRP_KLEN = 3 * sizeof(Key)
+    QUAD_KLEN = 4 * sizeof(Key)
+
 cdef bytes buffer_dump(Buffer* buf)

+ 1 - 0
lakesuperior/model/graph/callbacks.pxd

@@ -13,6 +13,7 @@ cdef:
     int trp_cmp_fn(const void* key1, const void* key2)
     size_t term_hash_fn(const void* key, int l, uint32_t seed)
     size_t trp_hash_fn(const void* key, int l, uint32_t seed)
+
     bint lookup_none_cmp_fn(
         const BufferTriple *trp, const Buffer *t1, const Buffer *t2
     )

+ 3 - 2
lakesuperior/model/graph/callbacks.pyx

@@ -3,8 +3,9 @@ import logging
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memcmp
 
-from lakesuperior.cy_include cimport collections as cc
-from lakesuperior.cy_include cimport spookyhash as sph
+cimport lakesuperior.cy_include.collections as cc
+cimport lakesuperior.cy_include.spookyhash as sph
+
 from lakesuperior.model.base cimport Buffer, buffer_dump
 from lakesuperior.model.graph cimport graph
 from lakesuperior.model.graph.triple cimport BufferTriple

+ 2 - 1
lakesuperior/model/graph/graph.pxd

@@ -2,7 +2,8 @@ from libc.stdint cimport uint32_t, uint64_t
 
 from cymem.cymem cimport Pool
 
-from lakesuperior.cy_include cimport collections as cc
+cimport lakesuperior.cy_include.collections as cc
+
 from lakesuperior.model.base cimport Buffer
 from lakesuperior.model.graph.triple cimport BufferTriple
 

+ 6 - 6
lakesuperior/model/graph/graph.pyx

@@ -12,9 +12,10 @@ from libc.stdlib cimport free
 
 from cymem.cymem cimport Pool
 
-from lakesuperior.cy_include cimport collections as cc
+cimport lakesuperior.cy_include.collections as cc
+cimport lakesuperior.model.graph.callbacks as cb
+
 from lakesuperior.model.base cimport Buffer, buffer_dump
-from lakesuperior.model.graph cimport callbacks as cb
 from lakesuperior.model.graph cimport term
 from lakesuperior.model.graph.triple cimport BufferTriple
 from lakesuperior.model.structures.hash cimport term_hash_seed32
@@ -29,10 +30,9 @@ cdef class SimpleGraph:
     Most functions should mimic RDFLib's graph with less overhead. It uses
     the same funny but functional slicing notation.
 
-    A SimpleGraph can be instantiated from a store lookup or obtained from a
-    :py:class:`lakesuperior.store.keyset.Keyset`. This makes it possible to use
-    a Keyset to perform initial filtering via identity by key, then the
-    filtered Keyset can be converted into a set of meaningful terms.
+    A SimpleGraph can be instantiated from a store lookup. This makes it
+    possible to use a Keyset to perform initial filtering via identity by key,
+    then the filtered Keyset can be converted into a set of meaningful terms.
 
     An instance of this class can also be converted to and from a
     ``rdflib.Graph`` instance.

+ 21 - 0
lakesuperior/model/structures/callbacks.pxd

@@ -0,0 +1,21 @@
+from lakesuperior.model.base cimport Key, TripleKey
+
+cdef:
+    bint lookup_sk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )
+    bint lookup_pk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )
+    bint lookup_ok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )
+    bint lookup_skpk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )
+    bint lookup_skok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )
+    bint lookup_pkok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    )

+ 33 - 0
lakesuperior/model/structures/callbacks.pyx

@@ -0,0 +1,33 @@
+from lakesuperior.model.base cimport Key, TripleKey
+
+cdef bint lookup_sk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[0] == k1
+
+cdef bint lookup_pk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[1] == k1
+
+cdef bint lookup_ok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[2] == k1
+
+cdef bint lookup_skpk_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[0] == k1 and spok[1] == k2
+
+cdef bint lookup_skok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[0] == k1 and spok[2] == k2
+
+cdef bint lookup_pkok_cmp_fn(
+        const TripleKey* spok, const Key* k1, const Key* k2
+    ):
+    return spok[1] == k1 and spok[2] == k2
+
+

+ 23 - 9
lakesuperior/model/structures/keyset.pxd

@@ -1,12 +1,26 @@
+from lakesuperior.model.base cimport (
+    Key, Key, DoubleKey, TripleKey, Buffer
+)
+
+ctypedef bint (*key_cmp_fn_t)(
+    const TripleKey* spok, const Key* k1, const Key* k2
+)
+
 cdef class Keyset:
     cdef:
-        readonly unsigned char *data
-        readonly unsigned char itemsize
-        readonly size_t ct, size
-        size_t _cur
-
-        void resize(self, size_t ct) except *
-        unsigned char *get_item(self, i)
-        bint iter_next(self, unsigned char** val)
-        bint contains(self, const void *val)
+        TripleKey* data
+        size_t ct
+        size_t _cur # Index cursor used to look up values.
+        size_t _free_i # Index of next free slot.
 
+        void seek(self, size_t idx=*)
+        size_t tell(self)
+        bint get_at(self, size_t i, TripleKey* item)
+        bint get_next(self, TripleKey* item)
+        void add(self, const TripleKey* val) except *
+        bint contains(self, const TripleKey* val)
+        Keyset copy(self)
+        void resize(self, size_t size=*) except *
+        Keyset lookup(
+            self, const Key* sk, const Key* pk, const Key* ok
+        )

+ 149 - 101
lakesuperior/model/structures/keyset.pyx

@@ -1,47 +1,34 @@
-from libc.string cimport memcmp
+import logging
+
+from libc.string cimport memcmp, memcpy
 from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 
-cdef class Keyset:
-    """
-    Pre-allocated result set.
+cimport lakesuperior.model.structures.callbacks as cb
+
+from lakesuperior.model.base cimport TripleKey, TRP_KLEN
+
 
-    Data in the set are stored as a 1D contiguous array of characters.
-    Access to elements at an arbitrary index position is achieved by using the
-    ``itemsize`` property multiplied by the index number.
+logger = logging.getLogger(__name__)
 
-    Key properties:
 
-    ``ct``: number of elements in the set.
-    ``itemsize``: size of each element, in bytes. All elements have the same
-        size.
-    ``size``: Total size, in bytes, of the data set. This is the product of
-        ``itemsize`` and ``ct``.
+cdef class Keyset:
+    """
+    Pre-allocated array (not set, as the name may suggest) of ``TripleKey``s.
     """
-    def __cinit__(self, size_t ct, unsigned char itemsize):
+    def __cinit__(self, size_t ct=0):
         """
         Initialize and allocate memory for the data set.
 
         :param size_t ct: Number of elements to be accounted for.
-        :param unsigned char itemsize: Size of an individual item.
-            Note that the ``itemsize`` is an unsigned char,
-            i.e. an item can be at most 255 bytes. This is for economy reasons,
-            since many multiplications are done between ``itemsize`` and other
-            char variables.
         """
         self.ct = ct
-        self.itemsize = itemsize
-        self.size = self.itemsize * self.ct
-        self._cur = 0
+        self.data = <TripleKey*>PyMem_Malloc(self.ct * TRP_KLEN)
+        logger.info(f'data address: 0x{<size_t>self.data:02x}')
+        if ct and not self.data:
+            raise MemoryError('Error allocating Keyset data.')
 
-        #logger.debug('Got malloc sizes: {}, {}'.format(ct, itemsize))
-        #logger.debug(
-        #    'Allocating {0} ({1}x{2}) bytes of Keyset data...'.format(
-        #        self.size, self.ct, self.itemsize))
-        self.data = <unsigned char *>PyMem_Malloc(ct * itemsize)
-        if not self.data:
-            raise MemoryError()
-        #logger.debug('...done allocating @ {0:x}.'.format(
-        #        <unsigned long>self.data))
+        self._cur = 0
+        self._free_i = 0
 
 
     def __dealloc__(self):
@@ -53,92 +40,44 @@ cdef class Keyset:
         """
         #logger.debug(
         #    'Releasing {0} ({1}x{2}) bytes of Keyset @ {3:x}...'.format(
-        #        self.size, self.ct, self.itemsize,
+        #        self.size, self.conf.capacity, self.itemsize,
         #        <unsigned long>self.data))
         PyMem_Free(self.data)
         #logger.debug('...done releasing.')
 
 
-    cdef void resize(self, size_t ct) except *:
-        """
-        Resize the result set. Uses ``PyMem_Realloc``.
-
-        Note that resizing to a smaller size does not copy or reallocate the
-        data, resizing to a larger size does.
-
-        Also, note that only the number of items can be changed, the item size
-        cannot.
-
-        :param size_t ct: Number of items in the result set.
-        """
-        cdef unsigned char *tmp
-        self.ct = ct
-        self.size = self.itemsize * self.ct
-
-        #logger.debug(
-        #    'Resizing Keyset to {0} ({1}x{2}) bytes @ {3:x}...'.format(
-        #        self.itemsize * ct, ct, self.itemsize,
-        #        <unsigned long>self.data))
-        tmp = <unsigned char *>PyMem_Realloc(self.data, ct * self.itemsize)
-        if not tmp:
-            raise MemoryError()
-        #logger.debug('...done resizing.')
-
-        self.data = tmp
-
-
     # Access methods.
 
-    def to_tuple(self):
-        """
-        Return the data set as a Python tuple.
-
-        :rtype: tuple
-        """
-        return tuple(
-                self.data[i: i + self.itemsize]
-                for i in range(0, self.size, self.itemsize))
-
-
-    def get_item_obj(self, i):
+    cdef void seek(self, size_t idx=0):
         """
-        Get an item at a given index position.
-
-        :rtype: bytes
-        """
-        if i >= self.ct:
-            raise ValueError(f'Index {i} out of range.')
-
-        return self.get_item(i)[: self.itemsize]
-
-
-    def iter_init(self):
+        Place the cursor at a certain index, 0 by default.
         """
-        Reset the cursor to the initial position.
-        """
-        self._cur = 0
+        self._cur = idx
 
 
-    def tell(self):
+    cdef size_t tell(self):
         """
         Tell the position of the cursor in the keyset.
         """
         return self._cur
 
 
-    cdef unsigned char *get_item(self, i):
+    cdef bint get_at(self, size_t i, TripleKey* item):
         """
         Get an item at a given index position. Cython-level method.
 
-        The item size is known by the ``itemsize`` property of the object.
-
-        :rtype: unsigned char*
+        :rtype: TripleKey
         """
+        if i >= self._free_i:
+            return False
+
         self._cur = i
-        return self.data + self.itemsize * i
+        item[0] = self.data[i]
+
+        return True
 
 
-    cdef bint iter_next(self, unsigned char** val):
+    cdef bint get_next(self, TripleKey* item):
         """
         Populate the current value and advance the cursor by 1.
 
@@ -149,25 +88,134 @@ cdef class Keyset:
         :return: True if a value was found, False if the end of the buffer
             has been reached.
         """
-        if self._cur >= self.ct:
-            val = NULL
+        if self._cur >= self._free_i:
             return False
 
-        val[0] = self.data + self.itemsize * self._cur
+        item[0] = self.data[self._cur]
         self._cur += 1
 
         return True
 
 
-    cdef bint contains(self, const void *val):
+    cdef void add(self, const TripleKey* val) except *:
+        """
+        Add a triple key to the array.
+        """
+        if self._free_i >= self.ct:
+            raise MemoryError('No slots left in key set.')
+
+        self.data[self._free_i] = val[0]
+
+        self._free_i += 1
+
+
+    cdef bint contains(self, const TripleKey* val):
         """
         Whether a value exists in the set.
         """
-        cdef unsigned char* stored_val
+        cdef TripleKey stored_val
 
-        self.iter_init()
-        while self.iter_next(&stored_val):
-            if memcmp(val, stored_val, self.itemsize) == 0:
+        self.seek()
+        while self.get_next(&stored_val):
+            if memcmp(val, stored_val, TRP_KLEN) == 0:
                 return True
         return False
 
+
+    cdef Keyset copy(self):
+        """
+        Copy a Keyset.
+        """
+        cdef Keyset new_ks = Keyset(self.ct)
+        memcpy(new_ks.data, self.data, self.ct * TRP_KLEN)
+        new_ks.seek()
+
+        return new_ks
+
+
+    cdef void resize(self, size_t size=0) except *:
+        """
+        Change the array capacity.
+
+        :param size_t size: The new capacity size. If not specified or 0, the
+            array is shrunk to the last used item. The resulting size
+            therefore will always be greater than 0. The only exception
+            to this is if the specified size is 0 and no items have been added
+            to the array, in which case the array will be effectively shrunk
+            to 0.
+        """
+        if not size:
+            size = self._free_i
+
+        tmp = <TripleKey*>PyMem_Realloc(self.data, size * TRP_KLEN)
+
+        if not tmp:
+            raise MemoryError('Could not reallocate Keyset data.')
+
+        self.data = tmp
+        self.ct = size
+        self.seek()
+
+
+    cdef Keyset lookup(
+            self, const Key* sk, const Key* pk, const Key* ok
+    ):
+        """
+        Look up triple keys.
+
+        This works in a similar way that the ``SimpleGraph`` and ``LmdbStore``
+        methods work.
+
+        Any and all the terms may be NULL. A NULL term is treated as unbound.
+
+        :param const Key* sk: s key pointer.
+        :param const Key* pk: p key pointer.
+        :param const Key* ok: o key pointer.
+        """
+        cdef:
+            TripleKey spok
+            Keyset ret = Keyset(self.ct)
+            Key* k1 = NULL
+            Key* k2 = NULL
+            key_cmp_fn_t cmp_fn
+
+        if sk and pk and ok: # s p o
+            pass # TODO
+
+        elif sk:
+            k1 = sk
+            if pk: # s p ?
+                k2 = pk
+                cmp_fn = cb.lookup_skpk_cmp_fn
+
+            elif ok: # s ? o
+                k2 = ok
+                cmp_fn = cb.lookup_skok_cmp_fn
+
+            else: # s ? ?
+                cmp_fn = cb.lookup_sk_cmp_fn
+
+        elif pk:
+            k1 = pk
+            if ok: # ? p o
+                k2 = ok
+                cmp_fn = cb.lookup_pkok_cmp_fn
+
+            else: # ? p ?
+                cmp_fn = cb.lookup_pk_cmp_fn
+
+        elif ok: # ? ? o
+            k1 = ok
+            cmp_fn = cb.lookup_ok_cmp_fn
+
+        else: # ? ? ?
+            return self.copy()
+
+        self.seek()
+        while self.get_next(&spok):
+            if cmp_fn(<TripleKey*>spok, k1, k2):
+                ret.add(&spok)
+
+        ret.resize()
+
+        return ret

+ 2 - 1
lakesuperior/store/base_lmdb_store.pxd

@@ -1,4 +1,4 @@
-cimport lakesuperior.cy_include.cylmdb as lmdb
+from lakesuperior.cy_include cimport cylmdb as lmdb
 
 cdef:
     int rc
@@ -13,6 +13,7 @@ cdef:
 cdef class BaseLmdbStore:
     cdef:
         readonly bint is_txn_open
+        readonly bint is_txn_rw
         public bint _open
         unsigned int _readers
         readonly str env_path

+ 25 - 58
lakesuperior/store/ldp_rs/lmdb_triplestore.pxd

@@ -1,59 +1,25 @@
+cimport lakesuperior.cy_include.collections as cc
 cimport lakesuperior.cy_include.cylmdb as lmdb
 cimport lakesuperior.cy_include.cytpl as tpl
 
-from lakesuperior.model.base cimport Buffer
+from lakesuperior.model.base cimport (
+    Key, DoubleKey, TripleKey, Buffer
+)
 from lakesuperior.model.graph.graph cimport SimpleGraph
 from lakesuperior.model.structures.keyset cimport Keyset
 from lakesuperior.store.base_lmdb_store cimport BaseLmdbStore
 
-#Fixed length for term keys.
-#
-#4 or 5 is a safe range. 4 allows for ~4 billion (256 ** 4) unique terms
-#in the store. 5 allows ~1 trillion terms. While these numbers may seem
-#huge (the total number of Internet pages indexed by Google as of 2018 is 45
-#billions), it must be reminded that the keys cannot be reused, so a
-#repository that deletes a lot of triples may burn through a lot of terms.
-#
-#If a repository runs ot of keys it can no longer store new terms and must
-#be migrated to a new database, which will regenerate and compact the keys.
-#
-#For smaller repositories it should be safe to set this value to 4, which
-#could improve performance since keys make up the vast majority of record
-#exchange between the store and the application. However it is sensible not
-#to expose this value as a configuration option.
-#
-#TODO: Explore the option to use size_t (8 bits, or in some architectures,
-#4 bits). If the overhead of handling 8
-#vs. 5 bytes is not huge (and maybe counterbalanced by x86_64 arch optimizations
-#for 8-byte words) it may be worth using those instead of char[5] to simplify
-#the code significantly.
-DEF _KLEN = 5
-DEF _DBL_KLEN = _KLEN * 2
-DEF _TRP_KLEN = _KLEN * 3
-DEF _QUAD_KLEN = _KLEN * 4
-# Lexical sequence start. ``\\x01`` is fine since no special characters are
-# used, but it's good to leave a spare for potential future use.
-DEF _KEY_START = b'\x01'
-
-cdef enum:
-    KLEN = _KLEN
-    DBL_KLEN = _DBL_KLEN
-    TRP_KLEN = _TRP_KLEN
-    QUAD_KLEN = _QUAD_KLEN
-
-ctypedef unsigned char Key[KLEN]
-ctypedef unsigned char DoubleKey[DBL_KLEN]
-ctypedef unsigned char TripleKey[TRP_KLEN]
-ctypedef unsigned char QuadKey[QUAD_KLEN]
-
 cdef:
-    unsigned char KEY_START = _KEY_START
-    unsigned char FIRST_KEY[KLEN]
+    enum:
+        IDX_OP_ADD = 1
+        IDX_OP_REMOVE = -1
+
     unsigned char lookup_rank[3]
     unsigned char lookup_ordering[3][3]
     unsigned char lookup_ordering_2bound[3][3]
 
 
+
 cdef class LmdbTriplestore(BaseLmdbStore):
     cpdef dict stats(self)
     cpdef size_t _len(self, context=*) except -1
@@ -62,27 +28,28 @@ cdef class LmdbTriplestore(BaseLmdbStore):
     cpdef void _remove(self, tuple triple_pattern, context=*) except *
     cpdef void _remove_graph(self, object gr_uri) except *
     cpdef tuple all_namespaces(self)
-    cpdef tuple all_contexts(self, triple=*)
     cpdef SimpleGraph graph_lookup(
         self, triple_pattern, context=*, uri=*, copy=*
     )
 
     cdef:
-        void _add_graph(self, Buffer *pk_gr) except *
-        void _index_triple(self, str op, TripleKey spok) except *
+        void _add_graph(self, Buffer* pk_gr) except *
+        void _index_triple(self, int op, TripleKey spok) except *
         Keyset triple_keys(self, tuple triple_pattern, context=*)
-        Keyset _all_term_keys(self, term_type)
-        inline void lookup_term(self, const Key key, Buffer* data) except *
+        void _all_term_keys(self, term_type, cc.HashSet** tkeys) except *
+        void lookup_term(self, const Key* tk, Buffer* data) except *
         Keyset _lookup(self, tuple triple_pattern)
-        Keyset _lookup_1bound(self, unsigned char idx, term)
+        Keyset _lookup_1bound(self, unsigned char idx, Key luk)
         Keyset _lookup_2bound(
-                self, unsigned char idx1, term1, unsigned char idx2, term2)
-        object from_key(self, const Key key)
-        tuple from_trp_key(self, TripleKey key)
-        inline void _to_key(self, term, Key *key) except *
-        inline void _to_triple_key(self, tuple terms, TripleKey *tkey) except *
-        void _append(
-                self, Buffer *value, Key *nkey,
+            self, unsigned char idx1, unsigned char idx2, DoubleKey tks
+        )
+        object from_key(self, const Key tk)
+        Key _to_key_idx(self, term) except -1
+        void all_contexts(self, Key** ctx, size_t* sz, triple=*) except *
+        Key _append(
+                self, Buffer *value,
                 unsigned char *dblabel=*, lmdb.MDB_txn *txn=*,
-                unsigned int flags=*) except *
-        void _next_key(self, const Key key, Key *nkey) except *
+                unsigned int flags=*)
+
+        #Key bytes_to_idx(self, const unsigned char* bs)
+        #unsigned char* idx_to_bytes(Key idx)

Diff do ficheiro suprimidas por serem muito extensas
+ 318 - 287
lakesuperior/store/ldp_rs/lmdb_triplestore.pyx


+ 106 - 29
lakesuperior/util/benchmark.py

@@ -1,11 +1,14 @@
 #!/usr/bin/env python3
 
+import logging
 import sys
 
+from os import path
 from uuid import uuid4
 
 import arrow
 import click
+import rdflib
 import requests
 
 from matplotlib import pyplot as plt
@@ -17,16 +20,32 @@ __doc__ = '''
 Benchmark script to measure write performance.
 '''
 
+def_mode = 'ldp'
 def_endpoint = 'http://localhost:8000/ldp'
 def_ct = 10000
 def_parent = '/pomegranate'
 def_gr_size = 200
 
+logging.disable(logging.WARN)
+
 
 @click.command()
+@click.option(
+    '--mode', '-m', default=def_mode,
+    help=(
+        'Mode of ingestion. One of `ldp`, `python`. With the former, the '
+        'HTTP/LDP web server is used. With the latter, the Python API is '
+        'used, in which case the server need not be running. '
+        f'Default: {def_endpoint}'
+    )
+)
 @click.option(
     '--endpoint', '-e', default=def_endpoint,
-    help=f'LDP endpoint. Default: {def_endpoint}')
+    help=(
+        'LDP endpoint. Only meaningful with `ldp` mode. '
+        f'Default: {def_endpoint}'
+    )
+)
 @click.option(
     '--count', '-c', default=def_ct,
     help='Number of resources to ingest. Default: {def_ct}')
@@ -40,9 +59,12 @@ def_gr_size = 200
     help='Delete container resource and its children if already existing. By '
     'default, the container is not deleted and new resources are added to it.')
 @click.option(
-    '--method', '-m', default='put',
-    help='HTTP method to use. Case insensitive. Either PUT '
-    f'or POST. Default: PUT')
+    '--method', '-X', default='put',
+    help=(
+        'HTTP method to use. Case insensitive. Either PUT or POST. '
+        'Default: PUT'
+    )
+)
 @click.option(
     '--graph-size', '-s', default=def_gr_size,
     help=f'Number of triples in each graph. Default: {def_gr_size}')
@@ -52,47 +74,73 @@ def_gr_size = 200
     '`n` (only  LDP-NR, i.e. binaries), or `b` (50/50% of both). '
     'Default: r')
 @click.option(
-    '--graph', '-g', is_flag=True, help='Plot a graph of ingest timings. '
+    '--plot', '-P', is_flag=True, help='Plot a graph of ingest timings. '
     'The graph figure is displayed on screen with basic manipulation and save '
     'options.')
 
 def run(
-        endpoint, count, parent, method, delete_container,
-        graph_size, resource_type, graph):
-
-    container_uri = endpoint + parent
+    mode, endpoint, count, parent, method, delete_container,
+    graph_size, resource_type, plot
+):
+    """
+    Run the benchmark.
+    """
 
     method = method.lower()
     if method not in ('post', 'put'):
-        raise ValueError(f'HTTP method not supported: {method}')
+        raise ValueError(f'Insertion method not supported: {method}')
+
+    mode = mode.lower()
+    if mode == 'ldp':
+        parent = '{}/{}'.format(endpoint.strip('/'), parent.strip('/'))
+
+        if delete_container:
+            requests.delete(parent, headers={'prefer': 'no-tombstone'})
+        requests.put(parent)
 
-    if delete_container:
-        requests.delete(container_uri, headers={'prefer': 'no-tombstone'})
-    requests.put(container_uri)
+    elif mode == 'python':
+        from lakesuperior import env_setup
+        from lakesuperior.api import resource as rsrc_api
+
+        if delete_container:
+            rsrc_api.delete(parent, soft=False)
+        rsrc_api.create_or_replace(parent)
+    else:
+        raise ValueError(f'Mode not supported: {mode}')
 
-    print(f'Inserting {count} children under {container_uri}.')
 
     # URI used to establish an in-repo relationship. This is set to
     # the most recently created resource in each loop.
-    ref = container_uri
+    ref = parent
+
+    print(f'Inserting {count} children under {parent}.')
 
     wclock_start = arrow.utcnow()
-    if graph:
+    if plot:
         print('Results will be plotted.')
         # Plot coordinates: X is request count, Y is request timing.
         px = []
         py = []
         plt.xlabel('Requests')
         plt.ylabel('ms per request')
-        plt.title('FCREPO Benchmark')
+        plt.title('Lakesuperior / FCREPO Benchmark')
 
     try:
         for i in range(1, count + 1):
-            url = '{}/{}'.format(container_uri, uuid4()) if method == 'put' \
-                    else container_uri
+            #import pdb; pdb.set_trace()
+            if mode == 'ldp':
+                dest = (
+                    f'{parent}/{uuid4()}' if method == 'put'
+                    else parent
+                )
+            else:
+                dest = (
+                    path.join(parent, str(uuid4()))
+                    if method == 'put' else parent
+                )
 
             if resource_type == 'r' or (resource_type == 'b' and i % 2 == 0):
-                data = random_graph(graph_size, ref).serialize(format='ttl')
+                data = random_graph(graph_size, ref)
                 headers = {'content-type': 'text/turtle'}
             else:
                 img = random_image(name=uuid4(), ts=16, ims=512)
@@ -103,19 +151,21 @@ def run(
                         'content-disposition': 'attachment; filename="{}"'
                             .format(uuid4())}
 
-            #import pdb; pdb.set_trace()
             # Start timing after generating the data.
             ckpt = arrow.utcnow()
             if i == 1:
                 tcounter = ckpt - ckpt
                 prev_tcounter = tcounter
 
-            rsp = requests.request(method, url, data=data, headers=headers)
-            tdelta = arrow.utcnow() - ckpt
-            tcounter += tdelta
+            ref = (
+                _ingest_graph_ldp(
+                    method, dest, data.serialize(format='ttl'), headers, ref
+                )
+                if mode == 'ldp'
+                else _ingest_graph_py(method, dest, data, ref)
+            )
+            tcounter += (arrow.utcnow() - ckpt)
 
-            rsp.raise_for_status()
-            ref = rsp.headers['location']
             if i % 10 == 0:
                 avg10 = (tcounter - prev_tcounter) / 10
                 print(
@@ -123,7 +173,7 @@ def run(
                     f'Per resource: {avg10}')
                 prev_tcounter = tcounter
 
-                if graph:
+                if plot:
                     px.append(i)
                     # Divide by 1000 for µs → ms
                     py.append(avg10.microseconds // 1000)
@@ -136,7 +186,7 @@ def run(
     print(f'Total time spent ingesting resources: {tcounter}')
     print(f'Average time per resource: {tcounter.total_seconds()/i}')
 
-    if graph:
+    if plot:
         if resource_type == 'r':
             type_label = 'LDP-RS'
         elif resource_type == 'n':
@@ -144,12 +194,39 @@ def run(
         else:
             type_label = 'LDP-RS + LDP-NR'
         label = (
-            f'{container_uri}; {method.upper()}; {graph_size} trp/graph; '
+            f'{parent}; {method.upper()}; {graph_size} trp/graph; '
             f'{type_label}')
         plt.plot(px, py, label=label)
         plt.legend()
         plt.show()
 
 
+def _ingest_graph_ldp(method, uri, data, headers, ref):
+    """
+    Ingest the graph via HTTP/LDP.
+    """
+    rsp = requests.request(method, uri, data=data, headers=headers)
+    rsp.raise_for_status()
+    return rsp.headers['location']
+
+
+def _ingest_graph_py(method, dest, data, ref):
+    from lakesuperior.api import resource as rsrc_api
+
+    kwargs = {}
+    if isinstance(data, rdflib.Graph):
+        kwargs['graph'] = data
+    else:
+        kwargs['stream'] = data
+        kwargs['mimetype'] = 'image/png'
+
+    if method == 'put':
+        _, rsrc = rsrc_api.create_or_replace(dest, **kwargs)
+    else:
+        _, rsrc = rsrc_api.create(dest, **kwargs)
+
+    return rsrc.uid
+
+
 if __name__ == '__main__':
     run()

+ 2 - 1
requirements_dev.txt

@@ -1,5 +1,5 @@
 CoilMQ>=1.0.1
-Cython==0.29
+Cython==0.29.6
 Flask>=0.12.2
 HiYaPyCo>=0.4.11
 Pillow>=4.3.0
@@ -9,6 +9,7 @@ click-log>=0.2.1
 click>=6.7
 gevent>=1.3.6
 gunicorn>=19.7.1
+matplotlib
 numpy>=1.15.1
 pytest-flask
 pytest>=3.2.2

+ 11 - 3
setup.py

@@ -16,7 +16,7 @@ from os import path
 import lakesuperior
 
 # Use this version to build C files from .pyx sources.
-CYTHON_VERSION='0.29'
+CYTHON_VERSION='0.29.6'
 
 KLEN = 5 # TODO Move somewhere else (config?)
 
@@ -91,8 +91,8 @@ extensions = [
             path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join(coll_src_dir, 'common.c'),
             path.join(coll_src_dir, 'array.c'),
-            path.join(coll_src_dir, 'hashset.c'),
             path.join(coll_src_dir, 'hashtable.c'),
+            path.join(coll_src_dir, 'hashset.c'),
             path.join('lakesuperior', 'model', 'structures', f'*.{ext}'),
         ],
         include_dirs=include_dirs,
@@ -108,8 +108,8 @@ extensions = [
             path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join(coll_src_dir, 'common.c'),
             path.join(coll_src_dir, 'array.c'),
-            path.join(coll_src_dir, 'hashset.c'),
             path.join(coll_src_dir, 'hashtable.c'),
+            path.join(coll_src_dir, 'hashset.c'),
             path.join('lakesuperior', 'model', 'graph', f'*.{ext}'),
         ],
         include_dirs=include_dirs,
@@ -119,6 +119,10 @@ extensions = [
     Extension(
         'lakesuperior.store.base_lmdb_store',
         [
+            path.join(coll_src_dir, 'common.c'),
+            path.join(coll_src_dir, 'array.c'),
+            path.join(coll_src_dir, 'hashtable.c'),
+            path.join(coll_src_dir, 'hashset.c'),
             path.join(tpl_src_dir, 'tpl.c'),
             path.join(lmdb_src_dir, 'mdb.c'),
             path.join(lmdb_src_dir, 'midl.c'),
@@ -129,6 +133,10 @@ extensions = [
     Extension(
         'lakesuperior.store.ldp_rs.lmdb_triplestore',
         [
+            path.join(coll_src_dir, 'common.c'),
+            path.join(coll_src_dir, 'array.c'),
+            path.join(coll_src_dir, 'hashtable.c'),
+            path.join(coll_src_dir, 'hashset.c'),
             path.join(lmdb_src_dir, 'mdb.c'),
             path.join(lmdb_src_dir, 'midl.c'),
             path.join(

+ 43 - 7
tests/1_store/test_lmdb_store.py

@@ -258,6 +258,42 @@ class TestBasicOps:
 
 
 
+@pytest.mark.usefixtures('store', 'bogus_trp')
+class TestExtendedOps:
+    '''
+    Test additional store operations.
+    '''
+
+    def test_all_terms(self, store, bogus_trp):
+        """
+        Test the "all terms" methods.
+        """
+        with store.txn_ctx(True):
+            for trp in bogus_trp:
+                store.add(trp)
+
+        with store.txn_ctx():
+            all_s = store.all_terms('s')
+            all_p = store.all_terms('p')
+            all_o = store.all_terms('o')
+
+        assert len(all_s) == 1
+        assert len(all_p) == 100
+        assert len(all_o) == 1000
+
+        assert URIRef('urn:test_mp:s1') in all_s
+        assert URIRef('urn:test_mp:s1') not in all_p
+        assert URIRef('urn:test_mp:s1') not in all_o
+
+        assert URIRef('urn:test_mp:p10') not in all_s
+        assert URIRef('urn:test_mp:p10') in all_p
+        assert URIRef('urn:test_mp:p10') not in all_o
+
+        assert URIRef('urn:test_mp:o99') not in all_s
+        assert URIRef('urn:test_mp:o99') not in all_p
+        assert URIRef('urn:test_mp:o99') in all_o
+
+
 @pytest.mark.usefixtures('store', 'bogus_trp')
 class TestEntryCount:
     '''
@@ -649,7 +685,7 @@ class TestContext:
 
         with store.txn_ctx(True):
             store.add_graph(gr_uri)
-            assert gr_uri in {gr.uri for gr in store.contexts()}
+            assert gr_uri in store.contexts()
 
 
     def test_add_graph_with_triple(self, store):
@@ -664,7 +700,7 @@ class TestContext:
             store.add(trp, ctx_uri)
 
         with store.txn_ctx():
-            assert ctx_uri in {gr.uri for gr in store.contexts(trp)}
+            assert ctx_uri in store.contexts(trp)
 
 
     def test_empty_context(self, store):
@@ -675,10 +711,10 @@ class TestContext:
 
         with store.txn_ctx(True):
             store.add_graph(gr_uri)
-            assert gr_uri in {gr.uri for gr in store.contexts()}
+            assert gr_uri in store.contexts()
         with store.txn_ctx(True):
             store.remove_graph(gr_uri)
-            assert gr_uri not in {gr.uri for gr in store.contexts()}
+            assert gr_uri not in store.contexts()
 
 
     def test_context_ro_txn(self, store):
@@ -698,10 +734,10 @@ class TestContext:
         # allow a lookup in the same transaction, but this does not seem to be
         # possible.
         with store.txn_ctx():
-            assert gr_uri in {gr.uri for gr in store.contexts()}
+            assert gr_uri in store.contexts()
         with store.txn_ctx(True):
             store.remove_graph(gr_uri)
-            assert gr_uri not in {gr.uri for gr in store.contexts()}
+            assert gr_uri not in store.contexts()
 
 
     def test_add_trp_to_ctx(self, store):
@@ -732,7 +768,7 @@ class TestContext:
             assert len(set(store.triples((None, None, None), gr_uri))) == 3
             assert len(set(store.triples((None, None, None), gr2_uri))) == 1
 
-            assert gr2_uri in {gr.uri for gr in store.contexts()}
+            assert gr2_uri in store.contexts()
             assert trp1 in _clean(store.triples((None, None, None)))
             assert trp1 not in _clean(store.triples((None, None, None),
                     RDFLIB_DEFAULT_GRAPH_URI))

Some files were not shown because too many files changed in this diff