Kaynağa Gözat

Bunch of changes:

* Add intersection and xor, and related tests;
* Replace crazy pointer-based comparison and hash with literal methods;
* Revert to almost-vanilla Collections-C;
* Create PXD file for Spookyhash.
Stefano Cossu 5 yıl önce
ebeveyn
işleme
b948067b63

+ 1 - 1
ext/collections-c

@@ -1 +1 @@
-Subproject commit 06660db3b3834a2119c3cccff04268f91f660604
+Subproject commit 402d5fa7d29000a578dbaba425179d45115e7f10

+ 1 - 1
lakesuperior/cy_include/collections.pxd

@@ -19,7 +19,7 @@ cdef extern from "common.h":
         CC_ERR_VALUE_NOT_FOUND
         CC_ERR_VALUE_NOT_FOUND
         CC_ERR_OUT_OF_RANGE
         CC_ERR_OUT_OF_RANGE
         CC_ITER_END
         CC_ITER_END
-        CC_DUP_KEY
+        #CC_DUP_KEY
 #
 #
 #    int cc_common_cmp_str(const void* key1, const void* key2)
 #    int cc_common_cmp_str(const void* key1, const void* key2)
 #
 #

+ 20 - 0
lakesuperior/cy_include/spookyhash.pxd

@@ -0,0 +1,20 @@
+from libc.stdint cimport uint32_t, uint64_t
+
+cdef extern from 'spookyhash_api.h':
+    # Cython declarations for the SpookyHash C functions declared in
+    # ``spookyhash_api.h``.
+
+    # Hashing context. No fields are declared here, so Cython code treats
+    # it as opaque and only passes its address to the functions below.
+    ctypedef struct spookyhash_context:
+        pass
+
+    # Context-based API: a context is initialized with two 64-bit seeds,
+    # fed one or more input buffers, then finalized into two 64-bit values
+    # written through ``hash_1`` and ``hash_2``.
+    void spookyhash_context_init(
+            spookyhash_context *context, uint64_t seed_1, uint64_t seed_2)
+    void spookyhash_update(
+            spookyhash_context *context, const void *input, size_t input_size)
+    void spookyhash_final(
+            spookyhash_context *context, uint64_t *hash_1, uint64_t *hash_2)
+
+    # Single-call functions hashing one input buffer with a seed.
+    uint32_t spookyhash_32(const void *input, size_t input_size, uint32_t seed)
+    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
+    # NOTE(review): callers in hash.pyx pass the seeds in through ``hash_1``
+    # and ``hash_2`` and read the digest back from the same two variables.
+    void spookyhash_128(
+            const void *input, size_t input_size, uint64_t *hash_1,
+            uint64_t *hash_2)
+

+ 11 - 6
lakesuperior/model/graph/graph.pxd

@@ -37,20 +37,25 @@ cdef class SimpleGraph:
         void _data_from_lookup(self, tuple trp_ptn, ctx=*) except *
         void _data_from_lookup(self, tuple trp_ptn, ctx=*) except *
         void _data_from_keyset(self, Keyset data) except *
         void _data_from_keyset(self, Keyset data) except *
         inline void _add_from_spok(self, const TripleKey spok) except *
         inline void _add_from_spok(self, const TripleKey spok) except *
-        inline void _add_triple(
-            self, Buffer *ss, Buffer *sp, Buffer *so
-        ) except *
+        inline void _add_triple(self, BufferTriple *trp) except *
         int _remove_triple(self, BufferTriple* trp_buf) except -1
         int _remove_triple(self, BufferTriple* trp_buf) except -1
         bint _trp_contains(self, BufferTriple* btrp)
         bint _trp_contains(self, BufferTriple* btrp)
+        _get_terms(self)
         set _to_pyset(self)
         set _to_pyset(self)
 
 
+        # Basic graph operations.
+        void ip_union(self, SimpleGraph other) except *
+        void ip_intersection(self, SimpleGraph other) except *
+        void ip_xor(self, SimpleGraph other) except *
+
+    cpdef SimpleGraph xor(self, SimpleGraph other)
+    cpdef SimpleGraph intersection(self, SimpleGraph other)
+    cpdef SimpleGraph union(self, SimpleGraph other)
     cpdef void set(self, tuple trp) except *
     cpdef void set(self, tuple trp) except *
     cpdef void remove_triples(self, pattern) except *
     cpdef void remove_triples(self, pattern) except *
     cpdef object as_rdflib(self)
     cpdef object as_rdflib(self)
-    cpdef set terms(self, str type)
+    #cpdef set terms(self, str type)
 
 
-    cpdef SimpleGraph union(self, SimpleGraph other)
-    cpdef void ip_union(self, SimpleGraph other) except *
 
 
 cdef class Imr(SimpleGraph):
 cdef class Imr(SimpleGraph):
     cdef:
     cdef:

+ 272 - 132
lakesuperior/model/graph/graph.pyx

@@ -15,6 +15,7 @@ from cymem.cymem cimport Pool
 
 
 from lakesuperior.cy_include cimport cylmdb as lmdb
 from lakesuperior.cy_include cimport cylmdb as lmdb
 from lakesuperior.cy_include cimport collections as cc
 from lakesuperior.cy_include cimport collections as cc
+from lakesuperior.cy_include cimport spookyhash as sph
 from lakesuperior.model.graph cimport term
 from lakesuperior.model.graph cimport term
 from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
 from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
         KLEN, DBL_KLEN, TRP_KLEN, TripleKey)
         KLEN, DBL_KLEN, TRP_KLEN, TripleKey)
@@ -30,19 +31,6 @@ cdef extern from 'spookyhash_api.h':
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
 
 
 
 
-def use_data(fn):
-    """
-    Decorator to indicate that a set operation between two SimpleGraph
-    instances should use the ``data`` property of the second term. The second
-    term can also be a simple set.
-    """
-    @wraps(fn)
-    def _wrapper(self, other):
-        if isinstance(other, SimpleGraph):
-            other = other.data
-    return _wrapper
-
-
 cdef int term_cmp_fn(const void* key1, const void* key2):
 cdef int term_cmp_fn(const void* key1, const void* key2):
     """
     """
     Compare function for two Buffer objects.
     Compare function for two Buffer objects.
@@ -62,6 +50,45 @@ cdef int term_cmp_fn(const void* key1, const void* key2):
     return cmp
     return cmp
 
 
 
 
+cdef int trp_lit_cmp_fn(const void* key1, const void* key2):
+    """
+    Compare function for two triples in a set.
+
+    s, p, o byte data are compared literally, by passing each pair of terms
+    to ``term_cmp_fn``.
+
+    :param key1: Pointer to the first ``BufferTriple``.
+    :param key2: Pointer to the second ``BufferTriple``.
+
+    :rtype: int
+    :return: 0 if all three terms point to byte-wise identical data in both
+        triples; otherwise the first non-zero result among the object,
+        subject and predicate comparisons, in that order.
+    """
+    cdef:
+        BufferTriple *t1 = <BufferTriple *>key1
+        BufferTriple *t2 = <BufferTriple *>key2
+        int diff
+
+    # This function is used as the key comparator of the triple hash set, so
+    # it produces no console or log output, and each pair of terms is
+    # compared at most once. Evaluation order (o, s, p) is unchanged.
+    diff = term_cmp_fn(t1.o, t2.o)
+    if diff == 0:
+        diff = term_cmp_fn(t1.s, t2.s)
+    if diff == 0:
+        diff = term_cmp_fn(t1.p, t2.p)
+
+    return diff
+
+
 cdef int trp_cmp_fn(const void* key1, const void* key2):
 cdef int trp_cmp_fn(const void* key1, const void* key2):
     """
     """
     Compare function for two triples in a set.
     Compare function for two triples in a set.
@@ -75,10 +102,20 @@ cdef int trp_cmp_fn(const void* key1, const void* key2):
     """
     """
     t1 = <BufferTriple *>key1
     t1 = <BufferTriple *>key1
     t2 = <BufferTriple *>key2
     t2 = <BufferTriple *>key2
-    print('Comparing: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
-        <unsigned long>t1.s, <unsigned long>t1.p, <unsigned long>t1.o))
-    print('With:      <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
-        <unsigned long>t2.s, <unsigned long>t2.p, <unsigned long>t2.o))
+    print('Comparing terms: {} {} {}'.format(
+        (<unsigned char*>t1.s.addr)[:t1.s.sz],
+        (<unsigned char*>t1.p.addr)[:t1.p.sz],
+        (<unsigned char*>t1.o.addr)[:t1.o.sz]
+    ))
+    print('With:            {} {} {}'.format(
+        (<unsigned char*>t2.s.addr)[:t2.s.sz],
+        (<unsigned char*>t2.p.addr)[:t2.p.sz],
+        (<unsigned char*>t2.o.addr)[:t2.o.sz]
+    ))
+    print('Comparing addresses: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
+        <size_t>t1.s, <size_t>t1.p, <size_t>t1.o))
+    print('With:                <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
+        <size_t>t2.s, <size_t>t2.p, <size_t>t2.o))
 
 
     cdef int is_not_equal = (
     cdef int is_not_equal = (
         t1.s.addr != t2.s.addr or
         t1.s.addr != t2.s.addr or
@@ -86,7 +123,7 @@ cdef int trp_cmp_fn(const void* key1, const void* key2):
         t1.o.addr != t2.o.addr
         t1.o.addr != t2.o.addr
     )
     )
 
 
-    logger.info(f'Triples are NOT equal and will be added: {is_not_equal}')
+    logger.info(f'Triples match: {not(is_not_equal)}')
     return is_not_equal
     return is_not_equal
 
 
 
 
@@ -116,6 +153,28 @@ cdef size_t term_hash_fn(const void* key, int l, uint32_t seed):
     return <size_t>spookyhash_64((<Buffer*>key).addr, (<Buffer*>key).sz, seed)
     return <size_t>spookyhash_64((<Buffer*>key).addr, (<Buffer*>key).sz, seed)
 
 
 
 
+cdef size_t trp_lit_hash_fn(const void* key, int l, uint32_t seed):
+    """
+    Hash function for sets of (serialized) triples.
+
+    The byte data of the subject, predicate and object terms is fed, in
+    that order, into a single SpookyHash context, so the hash is computed
+    from the contents of the terms.
+
+    :param key: Pointer to a ``BufferTriple``.
+    :param l: Not used by this function.
+    :param seed: 32-bit seed; widened to 64 bits and used for both seeds of
+        the hashing context.
+
+    :rtype: size_t
+    :return: The first of the two 64-bit values produced by
+        ``spookyhash_final``, cast to ``size_t``.
+    """
+    cdef:
+        BufferTriple *trp = <BufferTriple*>key
+        uint64_t hash_1 = <uint64_t>seed
+        uint64_t hash_2 = <uint64_t>seed
+        sph.spookyhash_context ctx
+
+    sph.spookyhash_context_init(&ctx, hash_1, hash_2)
+    # Each update reads from the address of the same term whose size it
+    # passes. Reading p and o from the subject address would hash the wrong
+    # bytes and could read past the end of the subject buffer.
+    sph.spookyhash_update(&ctx, trp.s.addr, trp.s.sz)
+    sph.spookyhash_update(&ctx, trp.p.addr, trp.p.sz)
+    sph.spookyhash_update(&ctx, trp.o.addr, trp.o.sz)
+    sph.spookyhash_final(&ctx, &hash_1, &hash_2)
+
+    return <size_t>hash_1
+
+
 cdef size_t trp_hash_fn(const void* key, int l, uint32_t seed):
 cdef size_t trp_hash_fn(const void* key, int l, uint32_t seed):
     """
     """
     Hash function for sets of (serialized) triples.
     Hash function for sets of (serialized) triples.
@@ -254,7 +313,7 @@ cdef class SimpleGraph:
             cc.HashSetConf terms_conf, trp_conf
             cc.HashSetConf terms_conf, trp_conf
 
 
         self.term_cmp_fn = &term_cmp_fn
         self.term_cmp_fn = &term_cmp_fn
-        self.trp_cmp_fn = &trp_cmp_fn
+        self.trp_cmp_fn = &trp_lit_cmp_fn
 
 
         cc.hashset_conf_init(&terms_conf)
         cc.hashset_conf_init(&terms_conf)
         terms_conf.load_factor = 0.85
         terms_conf.load_factor = 0.85
@@ -265,7 +324,7 @@ cdef class SimpleGraph:
 
 
         cc.hashset_conf_init(&trp_conf)
         cc.hashset_conf_init(&trp_conf)
         trp_conf.load_factor = 0.75
         trp_conf.load_factor = 0.75
-        trp_conf.hash = &trp_hash_fn
+        trp_conf.hash = &trp_lit_hash_fn
         trp_conf.hash_seed = term_hash_seed32
         trp_conf.hash_seed = term_hash_seed32
         trp_conf.key_compare = self.trp_cmp_fn
         trp_conf.key_compare = self.trp_cmp_fn
         trp_conf.key_length = sizeof(BufferTriple)
         trp_conf.key_length = sizeof(BufferTriple)
@@ -284,9 +343,6 @@ cdef class SimpleGraph:
             # Populate with provided Python set.
             # Populate with provided Python set.
             self.add(data)
             self.add(data)
 
 
-        print(len(self))
-        print('SimpleGraph cinit complete.')
-
 
 
     def __dealloc__(self):
     def __dealloc__(self):
         """
         """
@@ -305,6 +361,10 @@ cdef class SimpleGraph:
         """
         """
         return self._to_pyset()
         return self._to_pyset()
 
 
+    @property
+    def terms(self):
+        """
+        Terms in the graph, as returned by :py:meth:`_get_terms`.
+        """
+        return self._get_terms()
+
 
 
     # # # BASIC SET OPERATIONS # # #
     # # # BASIC SET OPERATIONS # # #
 
 
@@ -331,12 +391,12 @@ cdef class SimpleGraph:
             cc.hashset_iter_init(&it, gr._triples)
             cc.hashset_iter_init(&it, gr._triples)
             while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
             while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
                 bt = <BufferTriple*>cur
                 bt = <BufferTriple*>cur
-                new_gr._add_triple(bt.s, bt.p, bt.o)
+                new_gr._add_triple(bt)
 
 
         return new_gr
         return new_gr
 
 
 
 
-    cpdef void ip_union(self, SimpleGraph other) except *:
+    cdef void ip_union(self, SimpleGraph other) except *:
         """
         """
         Perform an in-place set union that adds triples to this instance
         Perform an in-place set union that adds triples to this instance
 
 
@@ -349,12 +409,128 @@ cdef class SimpleGraph:
         cdef:
         cdef:
             void *cur
             void *cur
             cc.HashSetIter it
             cc.HashSetIter it
-            BufferTriple *trp
 
 
         cc.hashset_iter_init(&it, other._triples)
         cc.hashset_iter_init(&it, other._triples)
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
             bt = <BufferTriple*>cur
             bt = <BufferTriple*>cur
-            self._add_triple(bt.s, bt.p, bt.o)
+            self._add_triple(bt)
+
+
+    cpdef SimpleGraph intersection(self, SimpleGraph other):
+        """
+        Graph intersection.
+
+        Every triple of this graph that is also found in ``other`` is added
+        to a new graph. Neither operand is modified.
+
+        :param SimpleGraph other: The other graph to intersect.
+
+        :rtype: SimpleGraph
+        :return: A new SimpleGraph instance.
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+            BufferTriple *bt
+            SimpleGraph new_gr = SimpleGraph()
+
+        # No console output here: this is a library method, not a debugging
+        # aid.
+        cc.hashset_iter_init(&it, self._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            if other._trp_contains(bt):
+                new_gr._add_triple(bt)
+
+        return new_gr
+
+
+    cdef void ip_intersection(self, SimpleGraph other) except *:
+        """
+        In-place graph intersection.
+
+        Triples of the current graph that are not found in the other graph
+        are removed, so that only the triples in common are kept.
+
+        :param SimpleGraph other: The other graph to intersect.
+
+        :rtype: void
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+
+        # NOTE(review): triples are removed from ``self._triples`` while that
+        # same set is being iterated. Confirm that the Collections-C iterator
+        # tolerates this, or use the iterator's own removal function.
+        cc.hashset_iter_init(&it, self._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            # Keep only what the other graph also contains.
+            if not other._trp_contains(bt):
+                self._remove_triple(bt)
+
+
+    cpdef SimpleGraph xor(self, SimpleGraph other):
+        """
+        Graph Exclusive disjunction (XOR).
+
+        The result holds the triples found in exactly one of the two graphs.
+        Neither operand is modified.
+
+        :param SimpleGraph other: The other graph to perform XOR with.
+
+        :rtype: SimpleGraph
+        :return: A new SimpleGraph instance.
+        """
+        cdef:
+            void *cur_self
+            void *cur_other
+            cc.HashSetIter it_self, it_other
+            SimpleGraph new_gr = SimpleGraph()
+            BufferTriple* bt
+
+        # No console output here: this is a library method, not a debugging
+        # aid.
+
+        # Add triples in this and not in other.
+        cc.hashset_iter_init(&it_self, self._triples)
+        while cc.hashset_iter_next(&it_self, &cur_self) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur_self
+            if not other._trp_contains(bt):
+                new_gr._add_triple(bt)
+
+        # Other way around: add triples in other and not in this.
+        cc.hashset_iter_init(&it_other, other._triples)
+        while cc.hashset_iter_next(&it_other, &cur_other) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur_other
+            if not self._trp_contains(bt):
+                new_gr._add_triple(bt)
+
+        return new_gr
+
+
+    cdef void ip_xor(self, SimpleGraph other) except *:
+        """
+        In-place graph XOR.
+
+        Triples in common with another graph are removed from the current one,
+        and triples not in common will be added from the other one.
+
+        :param SimpleGraph other: The other graph to perform XOR with.
+
+        :rtype: void
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+            BufferTriple *bt
+
+        # A single pass over the other graph toggles each of its triples in
+        # this graph. Removing the common triples in one pass and adding the
+        # missing ones in a second pass would re-add what was just removed,
+        # because after the first pass those triples are no longer found in
+        # this graph, and the result would be the union instead of the XOR.
+        cc.hashset_iter_init(&it, other._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            if self._trp_contains(bt):
+                # Triple in common: remove it.
+                self._remove_triple(bt)
+            else:
+                # Triple only in the other graph: add it.
+                self._add_triple(bt)
 
 
 
 
     cdef void _data_from_lookup(self, tuple trp_ptn, ctx=None) except *:
     cdef void _data_from_lookup(self, tuple trp_ptn, ctx=None) except *:
@@ -387,96 +563,53 @@ cdef class SimpleGraph:
         """
         """
         Add a triple from a TripleKey of term keys.
         Add a triple from a TripleKey of term keys.
         """
         """
-        cdef:
-            SPOBuffer s_spo
-            BufferTriple trp
-
         s_spo = <SPOBuffer>self._pool.alloc(3, sizeof(Buffer))
         s_spo = <SPOBuffer>self._pool.alloc(3, sizeof(Buffer))
 
 
         self.store.lookup_term(spok, s_spo)
         self.store.lookup_term(spok, s_spo)
         self.store.lookup_term(spok + KLEN, s_spo + 1)
         self.store.lookup_term(spok + KLEN, s_spo + 1)
         self.store.lookup_term(spok + DBL_KLEN, s_spo + 2)
         self.store.lookup_term(spok + DBL_KLEN, s_spo + 2)
 
 
-        self._add_triple(s_spo, s_spo + 1, s_spo + 2)
+        trp = <BufferTriple *>self._pool.alloc(1, sizeof(BufferTriple))
+        trp.s = s_spo
+        trp.p = s_spo + 1
+        trp.o = s_spo + 2
+
+        self._add_triple(trp)
 
 
 
 
-    cdef inline void _add_triple(
-        self, BufferPtr ss, BufferPtr sp, BufferPtr so
-    ) except *:
+    cdef inline void _add_triple(self, BufferTriple* trp) except *:
         """
         """
         Add a triple from 3 (TPL) serialized terms.
         Add a triple from 3 (TPL) serialized terms.
 
 
         Each of the terms is added to the term set if not existing. The triple
         Each of the terms is added to the term set if not existing. The triple
         also is only added if not existing.
         also is only added if not existing.
         """
         """
-        trp = <BufferTriple *>self._pool.alloc(1, sizeof(BufferTriple))
-
         logger.info('Inserting terms.')
         logger.info('Inserting terms.')
-        logger.info(f'ss addr: 0x{<size_t>ss.addr:02x}')
-        logger.info(f'ss sz: {ss.sz}')
-        #logger.info('ss:')
-        #logger.info((<unsigned char *>ss.addr)[:ss.sz])
-        #print('Insert ss: @0x{:02x}'.format(<unsigned long>ss))
-        cc.hashset_add_or_get(self._terms, <void **>&ss)
-        #print('Now ss is: @0x{:02x}'.format(<unsigned long>ss))
-
-        #print('Insert sp: @0x{:02x}'.format(<unsigned long>sp))
-        cc.hashset_add_or_get(self._terms, <void **>&sp)
-        #print('Now sp is: @0x{:02x}'.format(<unsigned long>sp))
-
-        #print('Insert so: @0x{:02x}'.format(<unsigned long>so))
-        cc.hashset_add_or_get(self._terms, <void **>&so)
-        #print('Now so is: @0x{:02x}'.format(<unsigned long>so))
+        cc.hashset_add(self._terms, trp.s)
+        cc.hashset_add(self._terms, trp.p)
+        cc.hashset_add(self._terms, trp.o)
         logger.info('inserted terms.')
         logger.info('inserted terms.')
-        cdef size_t terms_sz = cc.hashset_size(self._terms)
-        logger.info(f'Terms set size: {terms_sz}')
+        logger.info(f'Terms set size: {cc.hashset_size(self._terms)}')
 
 
-        trp.s = ss
-        trp.p = sp
-        trp.o = so
         cdef size_t trp_sz = cc.hashset_size(self._triples)
         cdef size_t trp_sz = cc.hashset_size(self._triples)
         logger.info(f'Triples set size before adding: {trp_sz}')
         logger.info(f'Triples set size before adding: {trp_sz}')
 
 
         r = cc.hashset_add(self._triples, trp)
         r = cc.hashset_add(self._triples, trp)
-        #print('Insert triple result:')
-        #print(r)
+        print('Insert triple result:')
+        print(r)
 
 
         trp_sz = cc.hashset_size(self._triples)
         trp_sz = cc.hashset_size(self._triples)
         logger.info(f'Triples set size after adding: {trp_sz}')
         logger.info(f'Triples set size after adding: {trp_sz}')
 
 
         cdef:
         cdef:
             cc.HashSetIter ti
             cc.HashSetIter ti
-            BufferTriple *test_trp
             void *cur
             void *cur
 
 
-        cc.hashset_iter_init(&ti, self._triples)
-        while cc.hashset_iter_next(&ti, &cur) != cc.CC_ITER_END:
-            test_trp = <BufferTriple *>cur
-            print('Triple in set: 0x{:02x} 0x{:02x} 0x{:02x}'.format(
-                    <size_t>test_trp.s, <size_t>test_trp.p, <size_t>test_trp.o))
-
 
 
     cdef int _remove_triple(self, BufferTriple* trp_buf) except -1:
     cdef int _remove_triple(self, BufferTriple* trp_buf) except -1:
         """
         """
         Remove one triple from the graph.
         Remove one triple from the graph.
         """
         """
-        cdef:
-            cc.HashSetIter ti
-            void* cur
-
-        if (
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.o)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.s)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.p)
-            ) == cc.CC_ERR_KEY_NOT_FOUND
-        ):
-            return cc.CC_ERR_KEY_NOT_FOUND
-
         return cc.hashset_remove(self._triples, trp_buf, NULL)
         return cc.hashset_remove(self._triples, trp_buf, NULL)
 
 
 
 
@@ -485,22 +618,6 @@ cdef class SimpleGraph:
             cc.HashSetIter it
             cc.HashSetIter it
             void* cur
             void* cur
 
 
-        # First check if any term is not in the set.
-        # Also assign addresses of terms in set with matching input terms.
-        if (
-            # Starting with o which is most likely to be missing.
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.o)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.s)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.p)
-            ) == cc.CC_ERR_KEY_NOT_FOUND
-        ):
-            return False
-
         cc.hashset_iter_init(&it, self._triples)
         cc.hashset_iter_init(&it, self._triples)
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
             if self.trp_cmp_fn(cur, btrp) == 0:
             if self.trp_cmp_fn(cur, btrp) == 0:
@@ -508,6 +625,24 @@ cdef class SimpleGraph:
         return False
         return False
 
 
 
 
+    cdef _get_terms(self):
+        """
+        Get all terms in the graph.
+
+        The term set is iterated and one entry is produced per item in it.
+
+        :rtype: list
+        :return: List of 2-tuples: the address of the item in the term set,
+            formatted as a hexadecimal string, and the value returned by
+            ``term.deserialize_to_rdflib`` for that item.
+        """
+        cdef:
+            cc.HashSetIter it
+            void *cur
+
+        terms = [] # This is intentionally a list to spot issues with the set.
+
+        cc.hashset_iter_init(&it, self._terms)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            # Items of the term set are treated as pointers to Buffer.
+            s_term = <Buffer*>cur
+            terms.append((f'0x{<size_t>cur:02x}', term.deserialize_to_rdflib(s_term)))
+
+        return terms
+
+
     cdef set _to_pyset(self):
     cdef set _to_pyset(self):
         """
         """
         Convert triple data to a Python set.
         Convert triple data to a Python set.
@@ -517,13 +652,16 @@ cdef class SimpleGraph:
         cdef:
         cdef:
             void *void_p
             void *void_p
             cc.HashSetIter ti
             cc.HashSetIter ti
-            BufferTriple *trp
             term.Term s, p, o
             term.Term s, p, o
 
 
         graph_set = set()
         graph_set = set()
+        # Looping over an empty HashSet results in a segfault. Exit early in
+        # that case.
+        #if not cc.hashset_size(self._triples):
+        #    return graph_set
 
 
         cc.hashset_iter_init(&ti, self._triples)
         cc.hashset_iter_init(&ti, self._triples)
-        while cc.hashset_iter_next(&ti, &void_p) == cc.CC_OK:
+        while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
             if void_p == NULL:
             if void_p == NULL:
                 logger.warn('Triple is NULL!')
                 logger.warn('Triple is NULL!')
                 break
                 break
@@ -546,21 +684,24 @@ cdef class SimpleGraph:
 
 
         :param iterable triples: Set, list or tuple of 3-tuple triples.
         :param iterable triples: Set, list or tuple of 3-tuple triples.
         """
         """
-        cdef size_t cur = 0
+        cdef size_t cur = 0, trp_cur = 0
 
 
         trp_ct = len(trp)
         trp_ct = len(trp)
-        trp_buf = <Buffer *>self._pool.alloc(3 * trp_ct, sizeof(Buffer))
+        term_buf = <Buffer*>self._pool.alloc(3 * trp_ct, sizeof(Buffer))
+        trp_buf = <BufferTriple*>self._pool.alloc(trp_ct, sizeof(BufferTriple))
 
 
         for s, p, o in trp:
         for s, p, o in trp:
-            term.serialize_from_rdflib(s, trp_buf + cur, self._pool)
-            term.serialize_from_rdflib(p, trp_buf + cur + 1, self._pool)
-            term.serialize_from_rdflib(o, trp_buf + cur + 2, self._pool)
-
-            self._add_triple(
-                trp_buf + cur,
-                trp_buf + cur + 1,
-                trp_buf + cur + 2
-            )
+            term.serialize_from_rdflib(s, term_buf + cur, self._pool)
+            term.serialize_from_rdflib(p, term_buf + cur + 1, self._pool)
+            term.serialize_from_rdflib(o, term_buf + cur + 2, self._pool)
+
+            (trp_buf + trp_cur).s = term_buf + cur
+            (trp_buf + trp_cur).p = term_buf + cur + 1
+            (trp_buf + trp_cur).o = term_buf + cur + 2
+
+            self._add_triple(trp_buf + trp_cur)
+
+            trp_cur += 1
             cur += 3
             cur += 3
 
 
 
 
@@ -596,7 +737,6 @@ cdef class SimpleGraph:
         return cc.hashset_size(self._triples)
         return cc.hashset_size(self._triples)
 
 
 
 
-    @use_data
     def __eq__(self, other):
     def __eq__(self, other):
         """ Equality operator between ``SimpleGraph`` instances. """
         """ Equality operator between ``SimpleGraph`` instances. """
         return graph_eq_fn(self, other)
         return graph_eq_fn(self, other)
@@ -609,8 +749,10 @@ cdef class SimpleGraph:
         It provides the number of triples in the graph and memory address of
         It provides the number of triples in the graph and memory address of
             the instance.
             the instance.
         """
         """
-        return (f'<{self.__class__.__name__} @{hex(id(self))} '
-            f'length={len(self.data)}>')
+        return (
+            f'<{self.__class__.__name__} @{hex(id(self))} '
+            f'length={len(self.data)}>'
+        )
 
 
 
 
     def __str__(self):
     def __str__(self):
@@ -630,14 +772,15 @@ cdef class SimpleGraph:
 
 
     def __and__(self, other):
     def __and__(self, other):
         """ Set intersection. """
         """ Set intersection. """
-        return self.intersect(other)
+        return self.intersection(other)
 
 
 
 
     def __iand__(self, other):
     def __iand__(self, other):
         """ In-place set intersection. """
         """ In-place set intersection. """
-        self.ip_intersect(other)
+        self.ip_intersection(other)
         return self
         return self
 
 
+
     def __or__(self, other):
     def __or__(self, other):
         """ Set union. """
         """ Set union. """
         return self.union(other)
         return self.union(other)
@@ -648,14 +791,15 @@ cdef class SimpleGraph:
         self.ip_union(other)
         self.ip_union(other)
         return self
         return self
 
 
+
     def __xor__(self, other):
     def __xor__(self, other):
-        """ Set exclusive intersection (XOR). """
-        return self.xintersect(other)
+        """ Set exclusive disjunction (XOR). """
+        return self.xor(other)
 
 
 
 
     def __ixor__(self, other):
     def __ixor__(self, other):
-        """ In-place set exclusive intersection (XOR). """
-        self.ip_xintersect(other)
+        """ In-place set exclusive disjunction (XOR). """
+        self.ip_xor(other)
         return self
         return self
 
 
 
 
@@ -836,14 +980,14 @@ cdef class SimpleGraph:
         return res
         return res
 
 
 
 
-    cpdef set terms(self, str type):
-        """
-        Get all terms of a type: subject, predicate or object.
+    #cpdef set terms(self, str type):
+    #    """
+    #    Get all terms of a type: subject, predicate or object.
 
 
-        :param str type: One of ``s``, ``p`` or ``o``.
-        """
-        i = 'spo'.index(type)
-        return {r[i] for r in self.data}
+    #    :param str type: One of ``s``, ``p`` or ``o``.
+    #    """
+    #    i = 'spo'.index(type)
+    #    return {r[i] for r in self.data}
 
 
 
 
 
 
@@ -910,28 +1054,24 @@ cdef class Imr(SimpleGraph):
         return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
         return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
             f'length={len(self.data)}>')
             f'length={len(self.data)}>')
 
 
-    @use_data
     def __sub__(self, other):
     def __sub__(self, other):
         """
         """
         Set difference. This creates a new Imr with the same subject URI.
         Set difference. This creates a new Imr with the same subject URI.
         """
         """
         return self.__class__(uri=self.uri, data=self.data - other)
         return self.__class__(uri=self.uri, data=self.data - other)
 
 
-    @use_data
     def __and__(self, other):
     def __and__(self, other):
         """
         """
         Set intersection. This creates a new Imr with the same subject URI.
         Set intersection. This creates a new Imr with the same subject URI.
         """
         """
         return self.__class__(uri=self.uri, data=self.data & other)
         return self.__class__(uri=self.uri, data=self.data & other)
 
 
-    @use_data
     def __or__(self, other):
     def __or__(self, other):
         """
         """
         Set union. This creates a new Imr with the same subject URI.
         Set union. This creates a new Imr with the same subject URI.
         """
         """
         return self.__class__(uri=self.uri, data=self.data | other)
         return self.__class__(uri=self.uri, data=self.data | other)
 
 
-    @use_data
     def __xor__(self, other):
     def __xor__(self, other):
         """
         """
         Set exclusive OR (XOR). This creates a new Imr with the same subject
         Set exclusive OR (XOR). This creates a new Imr with the same subject

+ 5 - 12
lakesuperior/model/structures/hash.pyx

@@ -1,21 +1,14 @@
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memcpy
 from libc.string cimport memcpy
 
 
 from lakesuperior.model.base cimport Buffer
 from lakesuperior.model.base cimport Buffer
+from lakesuperior.cy_include cimport spookyhash as sph
 
 
 
 
 memcpy(&term_hash_seed32, TERM_HASH_SEED, HLEN_32)
 memcpy(&term_hash_seed32, TERM_HASH_SEED, HLEN_32)
 memcpy(&term_hash_seed64_1, TERM_HASH_SEED, HLEN_64)
 memcpy(&term_hash_seed64_1, TERM_HASH_SEED, HLEN_64)
 memcpy(&term_hash_seed64_2, TERM_HASH_SEED + HLEN_64, HLEN_64)
 memcpy(&term_hash_seed64_2, TERM_HASH_SEED + HLEN_64, HLEN_64)
 
 
-# We only need a few basic functions from spookyhash. No need for a pxd file.
-cdef extern from 'spookyhash_api.h':
-    uint32_t spookyhash_32(const void *input, size_t input_size, uint32_t seed)
-    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
-    void spookyhash_128(
-            const void *input, size_t input_size, uint64_t *hash_1,
-            uint64_t *hash_2)
-
 
 
 cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
 cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
     """
     """
@@ -23,7 +16,7 @@ cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
     """
     """
     cdef uint32_t seed = term_hash_seed64_1
     cdef uint32_t seed = term_hash_seed64_1
 
 
-    hash[0] = spookyhash_32(message[0].addr, message[0].sz, seed)
+    hash[0] = sph.spookyhash_32(message[0].addr, message[0].sz, seed)
 
 
 
 
 cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
 cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
@@ -32,7 +25,7 @@ cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
     """
     """
     cdef uint64_t seed = term_hash_seed32
     cdef uint64_t seed = term_hash_seed32
 
 
-    hash[0] = spookyhash_64(message[0].addr, message[0].sz, seed)
+    hash[0] = sph.spookyhash_64(message[0].addr, message[0].sz, seed)
 
 
 
 
 cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
 cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
@@ -54,7 +47,7 @@ cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
         DoubleHash64 seed = [term_hash_seed64_1, term_hash_seed64_2]
         DoubleHash64 seed = [term_hash_seed64_1, term_hash_seed64_2]
         Hash128 digest
         Hash128 digest
 
 
-    spookyhash_128(message[0].addr, message[0].sz, seed, seed + 1)
+    sph.spookyhash_128(message[0].addr, message[0].sz, seed, seed + 1)
 
 
     # This casts the 2 contiguous uint64_t's into a char[16] pointer.
     # This casts the 2 contiguous uint64_t's into a char[16] pointer.
     hash[0] = <Hash128>seed
     hash[0] = <Hash128>seed

+ 2 - 0
setup.py

@@ -103,6 +103,8 @@ extensions = [
         'lakesuperior.model.graph.*',
         'lakesuperior.model.graph.*',
         [
         [
             path.join(tpl_src_dir, 'tpl.c'),
             path.join(tpl_src_dir, 'tpl.c'),
+            path.join(spookyhash_src_dir, 'context.c'),
+            path.join(spookyhash_src_dir, 'globals.c'),
             path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join(coll_src_dir, 'common.c'),
             path.join(coll_src_dir, 'common.c'),
             path.join(coll_src_dir, 'array.c'),
             path.join(coll_src_dir, 'array.c'),

+ 71 - 3
tests/0_data_structures/test_graph.py

@@ -8,7 +8,10 @@ from lakesuperior.model.graph.graph import SimpleGraph, Imr
 def trp():
 def trp():
     return (
     return (
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
+        # Exact same as [0].
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
+        # NOTE: s and o are in reversed order.
+        (URIRef('urn:o:0'), URIRef('urn:p:0'), URIRef('urn:s:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:1')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:1')),
         (URIRef('urn:s:1'), URIRef('urn:p:1'), URIRef('urn:o:1')),
         (URIRef('urn:s:1'), URIRef('urn:p:1'), URIRef('urn:o:1')),
@@ -37,7 +40,7 @@ class TestGraphOps:
         assert len(gr) == 2
         assert len(gr) == 2
 
 
         gr.add(trp)
         gr.add(trp)
-        assert len(gr) == 5
+        assert len(gr) == 6
 
 
 
 
     def test_dup(self, trp):
     def test_dup(self, trp):
@@ -59,8 +62,73 @@ class TestGraphOps:
         gr = SimpleGraph()
         gr = SimpleGraph()
 
 
         gr.add(trp)
         gr.add(trp)
-        gr.remove(trp[1])
-        assert len(gr) == 4
+        gr.remove(trp[0])
+        assert len(gr) == 5
         assert trp[0] not in gr
         assert trp[0] not in gr
         assert trp[1] not in gr
         assert trp[1] not in gr
 
 
+        # This is the duplicate triple.
+        gr.remove(trp[1])
+        assert len(gr) == 5
+
+        # This is the triple in reverse order.
+        gr.remove(trp[2])
+        assert len(gr) == 4
+
+        gr.remove(trp[4])
+        assert len(gr) == 3
+
+
+    def test_union(self, trp):
+        """
+        Test graph union.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        gr1.add(trp[0:3])
+        gr2.add(trp[2:6])
+
+        gr3 = gr1 | gr2
+
+        assert len(gr3) == 5
+        assert trp[0] in gr3
+        assert trp[4] in gr3
+
+
+    def test_iunion(self, trp):
+        """
+        Test in-place graph union.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        gr1.add(trp[0:3])
+        gr2.add(trp[2:6])
+
+        gr1 |= gr2
+
+        assert len(gr1) == 5
+        assert trp[0] in gr1
+        assert trp[4] in gr1
+
+
+    def test_intersect(self, trp):
+        """
+        Test graph intersection.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        gr1.add(trp[0:4])
+        gr2.add(trp[2:6])
+
+        gr3 = gr1 & gr2
+
+        assert len(gr3) == 2
+        assert trp[2] in gr3
+        assert trp[3] in gr3
+        assert trp[0] not in gr3
+        assert trp[5] not in gr3
+
+