Browse Source

Bunch of changes:

* Add intersection and xor, and related tests;
* Replace crazy pointer-based comparison and hash with literal methods;
* Revert to almost-vanilla Collections-C;
* Create PXD file for Spookyhash.
Stefano Cossu 5 years ago
parent
commit
b948067b63

+ 1 - 1
ext/collections-c

@@ -1 +1 @@
-Subproject commit 06660db3b3834a2119c3cccff04268f91f660604
+Subproject commit 402d5fa7d29000a578dbaba425179d45115e7f10

+ 1 - 1
lakesuperior/cy_include/collections.pxd

@@ -19,7 +19,7 @@ cdef extern from "common.h":
         CC_ERR_VALUE_NOT_FOUND
         CC_ERR_OUT_OF_RANGE
         CC_ITER_END
-        CC_DUP_KEY
+        #CC_DUP_KEY
 #
 #    int cc_common_cmp_str(const void* key1, const void* key2)
 #

+ 20 - 0
lakesuperior/cy_include/spookyhash.pxd

@@ -0,0 +1,20 @@
+# Cython declarations for the SpookyHash C functions found in
+# spookyhash_api.h.
+from libc.stdint cimport uint32_t, uint64_t
+
+cdef extern from 'spookyhash_api.h':
+
+    # State passed to the init/update/final functions below. No fields are
+    # declared here, so Cython code cannot access its members.
+    ctypedef struct spookyhash_context:
+        pass
+
+    # Context-based interface: a context is initialized with two 64-bit
+    # seeds, fed input buffers, and finalized into two 64-bit outputs.
+    void spookyhash_context_init(
+            spookyhash_context *context, uint64_t seed_1, uint64_t seed_2)
+    void spookyhash_update(
+            spookyhash_context *context, const void *input, size_t input_size)
+    void spookyhash_final(
+            spookyhash_context *context, uint64_t *hash_1, uint64_t *hash_2)
+
+    # One-call interface over a single input buffer.
+    uint32_t spookyhash_32(const void *input, size_t input_size, uint32_t seed)
+    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
+    void spookyhash_128(
+            const void *input, size_t input_size, uint64_t *hash_1,
+            uint64_t *hash_2)
+

+ 11 - 6
lakesuperior/model/graph/graph.pxd

@@ -37,20 +37,25 @@ cdef class SimpleGraph:
         void _data_from_lookup(self, tuple trp_ptn, ctx=*) except *
         void _data_from_keyset(self, Keyset data) except *
         inline void _add_from_spok(self, const TripleKey spok) except *
-        inline void _add_triple(
-            self, Buffer *ss, Buffer *sp, Buffer *so
-        ) except *
+        inline void _add_triple(self, BufferTriple *trp) except *
         int _remove_triple(self, BufferTriple* trp_buf) except -1
         bint _trp_contains(self, BufferTriple* btrp)
+        _get_terms(self)
         set _to_pyset(self)
 
+        # Basic graph operations.
+        void ip_union(self, SimpleGraph other) except *
+        void ip_intersection(self, SimpleGraph other) except *
+        void ip_xor(self, SimpleGraph other) except *
+
+    cpdef SimpleGraph xor(self, SimpleGraph other)
+    cpdef SimpleGraph intersection(self, SimpleGraph other)
+    cpdef SimpleGraph union(self, SimpleGraph other)
     cpdef void set(self, tuple trp) except *
     cpdef void remove_triples(self, pattern) except *
     cpdef object as_rdflib(self)
-    cpdef set terms(self, str type)
+    #cpdef set terms(self, str type)
 
-    cpdef SimpleGraph union(self, SimpleGraph other)
-    cpdef void ip_union(self, SimpleGraph other) except *
 
 cdef class Imr(SimpleGraph):
     cdef:

+ 272 - 132
lakesuperior/model/graph/graph.pyx

@@ -15,6 +15,7 @@ from cymem.cymem cimport Pool
 
 from lakesuperior.cy_include cimport cylmdb as lmdb
 from lakesuperior.cy_include cimport collections as cc
+from lakesuperior.cy_include cimport spookyhash as sph
 from lakesuperior.model.graph cimport term
 from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
         KLEN, DBL_KLEN, TRP_KLEN, TripleKey)
@@ -30,19 +31,6 @@ cdef extern from 'spookyhash_api.h':
 logger = logging.getLogger(__name__)
 
 
-def use_data(fn):
-    """
-    Decorator to indicate that a set operation between two SimpleGraph
-    instances should use the ``data`` property of the second term. The second
-    term can also be a simple set.
-    """
-    @wraps(fn)
-    def _wrapper(self, other):
-        if isinstance(other, SimpleGraph):
-            other = other.data
-    return _wrapper
-
-
 cdef int term_cmp_fn(const void* key1, const void* key2):
     """
     Compare function for two Buffer objects.
@@ -62,6 +50,45 @@ cdef int term_cmp_fn(const void* key1, const void* key2):
     return cmp
 
 
+cdef int trp_lit_cmp_fn(const void* key1, const void* key2):
+    """
+    Compare function for two triples in a set.
+
+    s, p, o byte data are compared literally by passing each pair of terms to
+    ``term_cmp_fn``. Terms are compared in o, s, p order and the first
+    non-zero result is returned.
+
+    :param key1: Pointer to the first triple; cast to ``BufferTriple *``.
+    :param key2: Pointer to the second triple; cast to ``BufferTriple *``.
+
+    :rtype: int
+    :return: 0 if all three terms point to byte-wise identical data in both
+        triples.
+    """
+    t1 = <BufferTriple *>key1
+    t2 = <BufferTriple *>key2
+    # NOTE(review): the print calls below write to stdout on every comparison
+    # and look like leftover debugging output; the third one also runs every
+    # term comparison an extra time.
+    print('Comparing terms: {} {} {}'.format(
+        (<unsigned char*>t1.s.addr)[:t1.s.sz],
+        (<unsigned char*>t1.p.addr)[:t1.p.sz],
+        (<unsigned char*>t1.o.addr)[:t1.o.sz]
+    ))
+    print('With:            {} {} {}'.format(
+        (<unsigned char*>t2.s.addr)[:t2.s.sz],
+        (<unsigned char*>t2.p.addr)[:t2.p.sz],
+        (<unsigned char*>t2.o.addr)[:t2.o.sz]
+    ))
+
+    print('Term comparison results: {}, {}, {}'.format(
+        term_cmp_fn(t1.o, t2.o),
+        term_cmp_fn(t1.s, t2.s),
+        term_cmp_fn(t1.p, t2.p)
+    ))
+
+    # ``or`` short-circuits: s is only compared when o matches, and p only
+    # when both o and s match.
+    diff = (
+        term_cmp_fn(t1.o, t2.o) or
+        term_cmp_fn(t1.s, t2.s) or
+        term_cmp_fn(t1.p, t2.p)
+    )
+
+    logger.info(f'Triples match: {not(diff)}')
+    return diff
+
+
 cdef int trp_cmp_fn(const void* key1, const void* key2):
     """
     Compare function for two triples in a set.
@@ -75,10 +102,20 @@ cdef int trp_cmp_fn(const void* key1, const void* key2):
     """
     t1 = <BufferTriple *>key1
     t2 = <BufferTriple *>key2
-    print('Comparing: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
-        <unsigned long>t1.s, <unsigned long>t1.p, <unsigned long>t1.o))
-    print('With:      <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
-        <unsigned long>t2.s, <unsigned long>t2.p, <unsigned long>t2.o))
+    print('Comparing terms: {} {} {}'.format(
+        (<unsigned char*>t1.s.addr)[:t1.s.sz],
+        (<unsigned char*>t1.p.addr)[:t1.p.sz],
+        (<unsigned char*>t1.o.addr)[:t1.o.sz]
+    ))
+    print('With:            {} {} {}'.format(
+        (<unsigned char*>t2.s.addr)[:t2.s.sz],
+        (<unsigned char*>t2.p.addr)[:t2.p.sz],
+        (<unsigned char*>t2.o.addr)[:t2.o.sz]
+    ))
+    print('Comparing addresses: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
+        <size_t>t1.s, <size_t>t1.p, <size_t>t1.o))
+    print('With:                <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
+        <size_t>t2.s, <size_t>t2.p, <size_t>t2.o))
 
     cdef int is_not_equal = (
         t1.s.addr != t2.s.addr or
@@ -86,7 +123,7 @@ cdef int trp_cmp_fn(const void* key1, const void* key2):
         t1.o.addr != t2.o.addr
     )
 
-    logger.info(f'Triples are NOT equal and will be added: {is_not_equal}')
+    logger.info(f'Triples match: {not(is_not_equal)}')
     return is_not_equal
 
 
@@ -116,6 +153,28 @@ cdef size_t term_hash_fn(const void* key, int l, uint32_t seed):
     return <size_t>spookyhash_64((<Buffer*>key).addr, (<Buffer*>key).sz, seed)
 
 
+cdef size_t trp_lit_hash_fn(const void* key, int l, uint32_t seed):
+    """
+    Hash function for sets of (serialized) triples.
+
+    This function feeds the literal bytes of the s, p and o terms of the
+    triple, in that order, to one SpookyHash context and returns the first
+    64-bit output of the finalized hash.
+
+    :param key: Pointer to the triple to hash; cast to ``BufferTriple *``.
+    :param l: Not used by this function.
+    :param seed: 32-bit seed; widened to 64 bits and used for both seeds of
+        the hash context.
+
+    :rtype: size_t
+    """
+    trp = <BufferTriple*>key
+    seed64 = <uint64_t>seed
+    seed_dummy = seed64
+
+    cdef sph.spookyhash_context ctx
+
+    sph.spookyhash_context_init(&ctx, seed64, seed_dummy)
+    # Each term must be read from its own buffer with its own size: pairing
+    # the s address with the p or o size leaves p and o out of the hash and
+    # can read past the end of the s buffer.
+    sph.spookyhash_update(&ctx, trp.s.addr, trp.s.sz)
+    sph.spookyhash_update(&ctx, trp.p.addr, trp.p.sz)
+    sph.spookyhash_update(&ctx, trp.o.addr, trp.o.sz)
+    # seed64 and seed_dummy are reused as the two output slots.
+    sph.spookyhash_final(&ctx, &seed64, &seed_dummy)
+
+    return <size_t>seed64
+
+
 cdef size_t trp_hash_fn(const void* key, int l, uint32_t seed):
     """
     Hash function for sets of (serialized) triples.
@@ -254,7 +313,7 @@ cdef class SimpleGraph:
             cc.HashSetConf terms_conf, trp_conf
 
         self.term_cmp_fn = &term_cmp_fn
-        self.trp_cmp_fn = &trp_cmp_fn
+        self.trp_cmp_fn = &trp_lit_cmp_fn
 
         cc.hashset_conf_init(&terms_conf)
         terms_conf.load_factor = 0.85
@@ -265,7 +324,7 @@ cdef class SimpleGraph:
 
         cc.hashset_conf_init(&trp_conf)
         trp_conf.load_factor = 0.75
-        trp_conf.hash = &trp_hash_fn
+        trp_conf.hash = &trp_lit_hash_fn
         trp_conf.hash_seed = term_hash_seed32
         trp_conf.key_compare = self.trp_cmp_fn
         trp_conf.key_length = sizeof(BufferTriple)
@@ -284,9 +343,6 @@ cdef class SimpleGraph:
             # Populate with provided Python set.
             self.add(data)
 
-        print(len(self))
-        print('SimpleGraph cinit complete.')
-
 
     def __dealloc__(self):
         """
@@ -305,6 +361,10 @@ cdef class SimpleGraph:
         """
         return self._to_pyset()
 
+    @property
+    def terms(self):
+        """
+        All terms in the graph, as returned by ``_get_terms``.
+        """
+        return self._get_terms()
+
 
     # # # BASIC SET OPERATIONS # # #
 
@@ -331,12 +391,12 @@ cdef class SimpleGraph:
             cc.hashset_iter_init(&it, gr._triples)
             while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
                 bt = <BufferTriple*>cur
-                new_gr._add_triple(bt.s, bt.p, bt.o)
+                new_gr._add_triple(bt)
 
         return new_gr
 
 
-    cpdef void ip_union(self, SimpleGraph other) except *:
+    cdef void ip_union(self, SimpleGraph other) except *:
         """
         Perform an in-place set union that adds triples to this instance
 
@@ -349,12 +409,128 @@ cdef class SimpleGraph:
         cdef:
             void *cur
             cc.HashSetIter it
-            BufferTriple *trp
 
         cc.hashset_iter_init(&it, other._triples)
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
             bt = <BufferTriple*>cur
-            self._add_triple(bt.s, bt.p, bt.o)
+            self._add_triple(bt)
+
+
+    cpdef SimpleGraph intersection(self, SimpleGraph other):
+        """
+        Graph intersection.
+
+        :param SimpleGraph other: The other graph to intersect.
+
+        :rtype: SimpleGraph
+        :return: A new SimpleGraph instance holding the triples of this graph
+            that are also found in ``other``.
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+            SimpleGraph new_gr = SimpleGraph()
+
+        # Walk this graph's triples and keep those that ``other`` contains.
+        cc.hashset_iter_init(&it, self._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            # NOTE(review): the print calls look like leftover debugging
+            # output.
+            print('Checking: <0x{:02x}> <0x{:02x}> <0x{:02x}>'.format(
+                <size_t>bt.s, <size_t>bt.p, <size_t>bt.o))
+            if other._trp_contains(bt):
+                print('Adding.')
+                new_gr._add_triple(bt)
+
+        return new_gr
+
+
+    cdef void ip_intersection(self, SimpleGraph other) except *:
+        """
+        In-place graph intersection.
+
+        Triples that are not in common with another graph are removed from
+        the current one.
+
+        :param SimpleGraph other: The other graph to intersect.
+
+        :rtype: void
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+
+        # NOTE(review): triples are removed from ``self._triples`` while it is
+        # being iterated. Confirm that the Collections-C iterator tolerates
+        # this; the library appears to offer ``hashset_iter_remove`` for
+        # removal during iteration -- TODO confirm.
+        cc.hashset_iter_init(&it, self._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            if not other._trp_contains(bt):
+                self._remove_triple(bt)
+
+
+    cpdef SimpleGraph xor(self, SimpleGraph other):
+        """
+        Graph Exclusive disjunction (XOR).
+
+        :param SimpleGraph other: The other graph to perform XOR with.
+
+        :rtype: SimpleGraph
+        :return: A new SimpleGraph instance holding the triples found in
+            exactly one of the two graphs.
+        """
+        cdef:
+            void *cur_self
+            void *cur_other
+            cc.HashSetIter it_self, it_other
+            SimpleGraph new_gr = SimpleGraph()
+            BufferTriple* bt
+
+        # NOTE(review): the print calls in this method look like leftover
+        # debugging output.
+        # Add triples in this and not in other.
+        print('Comparing with this.')
+        cc.hashset_iter_init(&it_self, self._triples)
+        while cc.hashset_iter_next(&it_self, &cur_self) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur_self
+            if not other._trp_contains(bt):
+                print('Adding from this.')
+                new_gr._add_triple(bt)
+
+        # Other way around.
+        print('Comparing with that.')
+        cc.hashset_iter_init(&it_other, other._triples)
+        while cc.hashset_iter_next(&it_other, &cur_other) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur_other
+            print('Checking on that.')
+            if not self._trp_contains(bt):
+                print('Adding from that.')
+                new_gr._add_triple(bt)
+            else:
+                print('Triple exists. Not adding.')
+
+        return new_gr
+
+
+    cdef void ip_xor(self, SimpleGraph other) except *:
+        """
+        In-place graph XOR.
+
+        Triples in common with another graph are removed from the current one,
+        and triples not in common will be added from the other one.
+
+        :param SimpleGraph other: The other graph to perform XOR with.
+
+        :rtype: void
+        """
+        cdef:
+            void *cur
+            cc.HashSetIter it
+            BufferTriple *bt
+
+        # Toggle each triple of the other graph in this one, in a single pass.
+        # Removing the common triples first and then adding what is missing
+        # would put the removed triples straight back, giving a union.
+        # Iterating ``other`` also means this graph's set is never modified
+        # while it is the one being iterated.
+        # NOTE(review): that no longer holds if ``other`` is ``self`` -- TODO
+        # confirm whether callers can pass the same graph.
+        cc.hashset_iter_init(&it, other._triples)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            bt = <BufferTriple*>cur
+            if self._trp_contains(bt):
+                # Triple in common: drop it.
+                self._remove_triple(bt)
+            else:
+                # Triple only in the other graph: take it.
+                self._add_triple(bt)
 
 
     cdef void _data_from_lookup(self, tuple trp_ptn, ctx=None) except *:
@@ -387,96 +563,53 @@ cdef class SimpleGraph:
         """
         Add a triple from a TripleKey of term keys.
         """
-        cdef:
-            SPOBuffer s_spo
-            BufferTriple trp
-
         s_spo = <SPOBuffer>self._pool.alloc(3, sizeof(Buffer))
 
         self.store.lookup_term(spok, s_spo)
         self.store.lookup_term(spok + KLEN, s_spo + 1)
         self.store.lookup_term(spok + DBL_KLEN, s_spo + 2)
 
-        self._add_triple(s_spo, s_spo + 1, s_spo + 2)
+        trp = <BufferTriple *>self._pool.alloc(1, sizeof(BufferTriple))
+        trp.s = s_spo
+        trp.p = s_spo + 1
+        trp.o = s_spo + 2
+
+        self._add_triple(trp)
 
 
-    cdef inline void _add_triple(
-        self, BufferPtr ss, BufferPtr sp, BufferPtr so
-    ) except *:
+    cdef inline void _add_triple(self, BufferTriple* trp) except *:
         """
         Add a triple from 3 (TPL) serialized terms.
 
         Each of the terms is added to the term set if not existing. The triple
         also is only added if not existing.
         """
-        trp = <BufferTriple *>self._pool.alloc(1, sizeof(BufferTriple))
-
         logger.info('Inserting terms.')
-        logger.info(f'ss addr: 0x{<size_t>ss.addr:02x}')
-        logger.info(f'ss sz: {ss.sz}')
-        #logger.info('ss:')
-        #logger.info((<unsigned char *>ss.addr)[:ss.sz])
-        #print('Insert ss: @0x{:02x}'.format(<unsigned long>ss))
-        cc.hashset_add_or_get(self._terms, <void **>&ss)
-        #print('Now ss is: @0x{:02x}'.format(<unsigned long>ss))
-
-        #print('Insert sp: @0x{:02x}'.format(<unsigned long>sp))
-        cc.hashset_add_or_get(self._terms, <void **>&sp)
-        #print('Now sp is: @0x{:02x}'.format(<unsigned long>sp))
-
-        #print('Insert so: @0x{:02x}'.format(<unsigned long>so))
-        cc.hashset_add_or_get(self._terms, <void **>&so)
-        #print('Now so is: @0x{:02x}'.format(<unsigned long>so))
+        cc.hashset_add(self._terms, trp.s)
+        cc.hashset_add(self._terms, trp.p)
+        cc.hashset_add(self._terms, trp.o)
         logger.info('inserted terms.')
-        cdef size_t terms_sz = cc.hashset_size(self._terms)
-        logger.info(f'Terms set size: {terms_sz}')
+        logger.info(f'Terms set size: {cc.hashset_size(self._terms)}')
 
-        trp.s = ss
-        trp.p = sp
-        trp.o = so
         cdef size_t trp_sz = cc.hashset_size(self._triples)
         logger.info(f'Triples set size before adding: {trp_sz}')
 
         r = cc.hashset_add(self._triples, trp)
-        #print('Insert triple result:')
-        #print(r)
+        print('Insert triple result:')
+        print(r)
 
         trp_sz = cc.hashset_size(self._triples)
         logger.info(f'Triples set size after adding: {trp_sz}')
 
         cdef:
             cc.HashSetIter ti
-            BufferTriple *test_trp
             void *cur
 
-        cc.hashset_iter_init(&ti, self._triples)
-        while cc.hashset_iter_next(&ti, &cur) != cc.CC_ITER_END:
-            test_trp = <BufferTriple *>cur
-            print('Triple in set: 0x{:02x} 0x{:02x} 0x{:02x}'.format(
-                    <size_t>test_trp.s, <size_t>test_trp.p, <size_t>test_trp.o))
-
 
     cdef int _remove_triple(self, BufferTriple* trp_buf) except -1:
         """
         Remove one triple from the graph.
         """
-        cdef:
-            cc.HashSetIter ti
-            void* cur
-
-        if (
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.o)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.s)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(trp_buf.p)
-            ) == cc.CC_ERR_KEY_NOT_FOUND
-        ):
-            return cc.CC_ERR_KEY_NOT_FOUND
-
         return cc.hashset_remove(self._triples, trp_buf, NULL)
 
 
@@ -485,22 +618,6 @@ cdef class SimpleGraph:
             cc.HashSetIter it
             void* cur
 
-        # First check if any term is not in the set.
-        # Also assign addresses of terms in set with matching input terms.
-        if (
-            # Starting with o which is most likely to be missing.
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.o)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.s)
-            ) == cc.CC_ERR_KEY_NOT_FOUND or
-            cc.hashset_get(
-                self._terms, <void**>&(btrp.p)
-            ) == cc.CC_ERR_KEY_NOT_FOUND
-        ):
-            return False
-
         cc.hashset_iter_init(&it, self._triples)
         while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
             if self.trp_cmp_fn(cur, btrp) == 0:
@@ -508,6 +625,24 @@ cdef class SimpleGraph:
         return False
 
 
+    cdef _get_terms(self):
+        """
+        Get all terms in the graph.
+
+        :rtype: list
+        :return: One ``(address, term)`` tuple per entry of the term set,
+            where ``address`` is the entry's pointer formatted as a hex
+            string and ``term`` is the result of
+            ``term.deserialize_to_rdflib`` on that entry.
+        """
+        cdef:
+            cc.HashSetIter it
+            void *cur
+
+        terms = [] # This is intentionally a list to spot issues with the set.
+
+        cc.hashset_iter_init(&it, self._terms)
+        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
+            s_term = <Buffer*>cur
+            terms.append((f'0x{<size_t>cur:02x}', term.deserialize_to_rdflib(s_term)))
+
+        return terms
+
+
     cdef set _to_pyset(self):
         """
         Convert triple data to a Python set.
@@ -517,13 +652,16 @@ cdef class SimpleGraph:
         cdef:
             void *void_p
             cc.HashSetIter ti
-            BufferTriple *trp
             term.Term s, p, o
 
         graph_set = set()
+        # Looping over an empty HashSet results in a segfault. Exit early in
+        # that case.
+        #if not cc.hashset_size(self._triples):
+        #    return graph_set
 
         cc.hashset_iter_init(&ti, self._triples)
-        while cc.hashset_iter_next(&ti, &void_p) == cc.CC_OK:
+        while cc.hashset_iter_next(&ti, &void_p) != cc.CC_ITER_END:
             if void_p == NULL:
                 logger.warn('Triple is NULL!')
                 break
@@ -546,21 +684,24 @@ cdef class SimpleGraph:
 
         :param iterable triples: Set, list or tuple of 3-tuple triples.
         """
-        cdef size_t cur = 0
+        cdef size_t cur = 0, trp_cur = 0
 
         trp_ct = len(trp)
-        trp_buf = <Buffer *>self._pool.alloc(3 * trp_ct, sizeof(Buffer))
+        term_buf = <Buffer*>self._pool.alloc(3 * trp_ct, sizeof(Buffer))
+        trp_buf = <BufferTriple*>self._pool.alloc(trp_ct, sizeof(BufferTriple))
 
         for s, p, o in trp:
-            term.serialize_from_rdflib(s, trp_buf + cur, self._pool)
-            term.serialize_from_rdflib(p, trp_buf + cur + 1, self._pool)
-            term.serialize_from_rdflib(o, trp_buf + cur + 2, self._pool)
-
-            self._add_triple(
-                trp_buf + cur,
-                trp_buf + cur + 1,
-                trp_buf + cur + 2
-            )
+            term.serialize_from_rdflib(s, term_buf + cur, self._pool)
+            term.serialize_from_rdflib(p, term_buf + cur + 1, self._pool)
+            term.serialize_from_rdflib(o, term_buf + cur + 2, self._pool)
+
+            (trp_buf + trp_cur).s = term_buf + cur
+            (trp_buf + trp_cur).p = term_buf + cur + 1
+            (trp_buf + trp_cur).o = term_buf + cur + 2
+
+            self._add_triple(trp_buf + trp_cur)
+
+            trp_cur += 1
             cur += 3
 
 
@@ -596,7 +737,6 @@ cdef class SimpleGraph:
         return cc.hashset_size(self._triples)
 
 
-    @use_data
     def __eq__(self, other):
         """ Equality operator between ``SimpleGraph`` instances. """
         return graph_eq_fn(self, other)
@@ -609,8 +749,10 @@ cdef class SimpleGraph:
         It provides the number of triples in the graph and memory address of
             the instance.
         """
-        return (f'<{self.__class__.__name__} @{hex(id(self))} '
-            f'length={len(self.data)}>')
+        return (
+            f'<{self.__class__.__name__} @{hex(id(self))} '
+            f'length={len(self.data)}>'
+        )
 
 
     def __str__(self):
@@ -630,14 +772,15 @@ cdef class SimpleGraph:
 
     def __and__(self, other):
         """ Set intersection. """
-        return self.intersect(other)
+        return self.intersection(other)
 
 
     def __iand__(self, other):
         """ In-place set intersection. """
-        self.ip_intersect(other)
+        self.ip_intersection(other)
         return self
 
+
     def __or__(self, other):
         """ Set union. """
         return self.union(other)
@@ -648,14 +791,15 @@ cdef class SimpleGraph:
         self.ip_union(other)
         return self
 
+
     def __xor__(self, other):
-        """ Set exclusive intersection (XOR). """
-        return self.xintersect(other)
+        """ Set exclusive disjunction (XOR). """
+        return self.xor(other)
 
 
     def __ixor__(self, other):
-        """ In-place set exclusive intersection (XOR). """
-        self.ip_xintersect(other)
+        """ In-place set exclusive disjunction (XOR). """
+        self.ip_xor(other)
         return self
 
 
@@ -836,14 +980,14 @@ cdef class SimpleGraph:
         return res
 
 
-    cpdef set terms(self, str type):
-        """
-        Get all terms of a type: subject, predicate or object.
+    #cpdef set terms(self, str type):
+    #    """
+    #    Get all terms of a type: subject, predicate or object.
 
-        :param str type: One of ``s``, ``p`` or ``o``.
-        """
-        i = 'spo'.index(type)
-        return {r[i] for r in self.data}
+    #    :param str type: One of ``s``, ``p`` or ``o``.
+    #    """
+    #    i = 'spo'.index(type)
+    #    return {r[i] for r in self.data}
 
 
 
@@ -910,28 +1054,24 @@ cdef class Imr(SimpleGraph):
         return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
             f'length={len(self.data)}>')
 
-    @use_data
     def __sub__(self, other):
         """
         Set difference. This creates a new Imr with the same subject URI.
         """
         return self.__class__(uri=self.uri, data=self.data - other)
 
-    @use_data
     def __and__(self, other):
         """
         Set intersection. This creates a new Imr with the same subject URI.
         """
         return self.__class__(uri=self.uri, data=self.data & other)
 
-    @use_data
     def __or__(self, other):
         """
         Set union. This creates a new Imr with the same subject URI.
         """
         return self.__class__(uri=self.uri, data=self.data | other)
 
-    @use_data
     def __xor__(self, other):
         """
         Set exclusive OR (XOR). This creates a new Imr with the same subject

+ 5 - 12
lakesuperior/model/structures/hash.pyx

@@ -1,21 +1,14 @@
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memcpy
 
 from lakesuperior.model.base cimport Buffer
+from lakesuperior.cy_include cimport spookyhash as sph
 
 
 memcpy(&term_hash_seed32, TERM_HASH_SEED, HLEN_32)
 memcpy(&term_hash_seed64_1, TERM_HASH_SEED, HLEN_64)
 memcpy(&term_hash_seed64_2, TERM_HASH_SEED + HLEN_64, HLEN_64)
 
-# We only need a few basic functions from spookyhash. No need for a pxd file.
-cdef extern from 'spookyhash_api.h':
-    uint32_t spookyhash_32(const void *input, size_t input_size, uint32_t seed)
-    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
-    void spookyhash_128(
-            const void *input, size_t input_size, uint64_t *hash_1,
-            uint64_t *hash_2)
-
 
 cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
     """
@@ -23,7 +16,7 @@ cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
     """
     cdef uint32_t seed = term_hash_seed64_1
 
-    hash[0] = spookyhash_32(message[0].addr, message[0].sz, seed)
+    hash[0] = sph.spookyhash_32(message[0].addr, message[0].sz, seed)
 
 
 cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
@@ -32,7 +25,7 @@ cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
     """
     cdef uint64_t seed = term_hash_seed32
 
-    hash[0] = spookyhash_64(message[0].addr, message[0].sz, seed)
+    hash[0] = sph.spookyhash_64(message[0].addr, message[0].sz, seed)
 
 
 cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
@@ -54,7 +47,7 @@ cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
         DoubleHash64 seed = [term_hash_seed64_1, term_hash_seed64_2]
         Hash128 digest
 
-    spookyhash_128(message[0].addr, message[0].sz, seed, seed + 1)
+    sph.spookyhash_128(message[0].addr, message[0].sz, seed, seed + 1)
 
     # This casts the 2 contiguous uint64_t's into a char[16] pointer.
     hash[0] = <Hash128>seed

+ 2 - 0
setup.py

@@ -103,6 +103,8 @@ extensions = [
         'lakesuperior.model.graph.*',
         [
             path.join(tpl_src_dir, 'tpl.c'),
+            path.join(spookyhash_src_dir, 'context.c'),
+            path.join(spookyhash_src_dir, 'globals.c'),
             path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join(coll_src_dir, 'common.c'),
             path.join(coll_src_dir, 'array.c'),

+ 71 - 3
tests/0_data_structures/test_graph.py

@@ -8,7 +8,10 @@ from lakesuperior.model.graph.graph import SimpleGraph, Imr
 def trp():
     return (
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
+        # Exact same as [0].
         (URIRef('urn:s:0'), URIRef('urn:p:0'), URIRef('urn:o:0')),
+        # NOTE: s and o are in reversed order.
+        (URIRef('urn:o:0'), URIRef('urn:p:0'), URIRef('urn:s:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:0')),
         (URIRef('urn:s:0'), URIRef('urn:p:1'), URIRef('urn:o:1')),
         (URIRef('urn:s:1'), URIRef('urn:p:1'), URIRef('urn:o:1')),
@@ -37,7 +40,7 @@ class TestGraphOps:
         assert len(gr) == 2
 
         gr.add(trp)
-        assert len(gr) == 5
+        assert len(gr) == 6
 
 
     def test_dup(self, trp):
@@ -59,8 +62,73 @@ class TestGraphOps:
         gr = SimpleGraph()
 
         gr.add(trp)
-        gr.remove(trp[1])
-        assert len(gr) == 4
+        gr.remove(trp[0])
+        assert len(gr) == 5
         assert trp[0] not in gr
         assert trp[1] not in gr
 
+        # This is the duplicate triple.
+        gr.remove(trp[1])
+        assert len(gr) == 5
+
+        # This is the triple in reverse order.
+        gr.remove(trp[2])
+        assert len(gr) == 4
+
+        gr.remove(trp[4])
+        assert len(gr) == 3
+
+
+    def test_union(self, trp):
+        """
+        Test graph union.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        # trp[1] duplicates trp[0], and trp[2] is in both slices, so the
+        # union is expected to hold trp[0], trp[2], trp[3], trp[4], trp[5].
+        gr1.add(trp[0:3])
+        gr2.add(trp[2:6])
+
+        gr3 = gr1 | gr2
+
+        assert len(gr3) == 5
+        assert trp[0] in gr3
+        assert trp[4] in gr3
+
+
+    def test_iunion(self, trp):
+        """
+        Test in-place graph union.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        gr1.add(trp[0:3])
+        gr2.add(trp[2:6])
+
+        gr1 |= gr2
+
+        assert len(gr1) == 5
+        assert trp[0] in gr1
+        assert trp[4] in gr1
+
+
+    def test_intersect(self, trp):
+        """
+        Test graph intersection.
+        """
+        gr1 = SimpleGraph()
+        gr2 = SimpleGraph()
+
+        # The two slices overlap on trp[2] and trp[3] only.
+        gr1.add(trp[0:4])
+        gr2.add(trp[2:6])
+
+        gr3 = gr1 & gr2
+
+        assert len(gr3) == 2
+        assert trp[2] in gr3
+        assert trp[3] in gr3
+        assert trp[0] not in gr3
+        assert trp[5] not in gr3
+
+