Browse Source

Hash term and triple.

Stefano Cossu 3 years ago
parent
commit
ea8a0b6725
12 changed files with 152 additions and 73 deletions
  1. 11 8
      TODO.md
  2. 19 4
      cpython/py_graph.h
  3. 6 0
      cpython/py_term.h
  4. 16 20
      cpython/py_triple.h
  5. 26 0
      include/buffer.h
  6. 9 20
      include/term.h
  7. 32 0
      include/triple.h
  8. 4 0
      src/buffer.c
  9. 17 14
      src/store_mdb.c
  10. 1 3
      src/term.c
  11. 6 0
      src/triple.c
  12. 5 4
      test/assets.h

+ 11 - 8
TODO.md

@@ -1,18 +1,21 @@
 # Quick TODO list
 
-## Critical
+P = pending; W = working on it; D = done.
 
-- LMDB back end
-- N3 serialization / deserialization
-- Turtle serialization / deserialization
-- Python bindings
-- Better error handling
+## Critical for MVP
 
+- *D* LMDB back end
+- *W* Python bindings
+- *W* Better error handling
+- *P* Hash table back end
+- *P* Namespace manager
+- *P* N3 serialization / deserialization
+- *P* Turtle serialization / deserialization
 
-## Non-critical
+
+## Non-critical for MVP
 
 - Logging
-- Optimize hash maps for arbitrary key lengths (macros)
 - Term and triple validation
 
 

+ 19 - 4
cpython/py_graph.h

@@ -91,16 +91,15 @@ Graph_copy (PyTypeObject *cls, PyObject *src)
 static PyObject *
 Graph_richcmp (PyObject *self, PyObject *other, int op)
 {
+    // Only equality and non-equality supported.
     if (op != Py_EQ && op != Py_NE) Py_RETURN_NOTIMPLEMENTED;
 
     Py_RETURN_TRUE; // TODO use graph xor == 0
     /*
-    PyObject *res = NULL;
-
     LSUP_Graph *t1 = ((GraphObject *) self)->ob_struct;
     LSUP_Graph *t2 = ((GraphObject *) other)->ob_struct;
 
-    if (LSUP_graph_equals (t1, t2) && op == Py_EQ) Py_RETURN_TRUE;
+    if (LSUP_graph_equals (t1, t2) ^ (op == Py_NE)) Py_RETURN_TRUE;
     Py_RETURN_FALSE;
     */
  }
@@ -231,6 +230,22 @@ static PyNumberMethods Graph_number_methods = {
 };
 
 
+static int
+Graph_contains (PyObject *self, PyObject *value)
+{
+    if (!PyObject_TypeCheck (value, &TripleType)) {
+        PyErr_SetString (PyExc_ValueError, "Error parsing input value.");
+        return -1;
+    }
+
+    int rc = LSUP_graph_contains (
+            ((GraphObject *) self)->ob_struct,
+            ((TripleObject *) value)->ob_struct);
+
+    return rc;
+}
+
+
 static Py_ssize_t
 Graph_get_size (PyObject *self)
 { return LSUP_graph_size (((GraphObject *) self)->ob_struct); }
@@ -238,7 +253,7 @@ Graph_get_size (PyObject *self)
 
 static PySequenceMethods Graph_seq_methods = {
     .sq_length = (lenfunc) Graph_get_size,
-    //.sq_contains = Graph_contains, // TODO
+    .sq_contains = (objobjproc) Graph_contains,
 };
 
 

+ 6 - 0
cpython/py_term.h

@@ -110,6 +110,11 @@ static PyObject *
 Term_richcmp (PyObject *obj1, PyObject *obj2, int op);
 
 
+static Py_hash_t
+Term_hash (PyObject *self)
+{ return LSUP_term_hash (((TermObject *)self)->ob_struct); }
+
+
 PyTypeObject TermType = {
     PyVarObject_HEAD_INIT(NULL, 0)
     .tp_name = "term.Term",
@@ -122,6 +127,7 @@ PyTypeObject TermType = {
     .tp_dealloc = (destructor) Term_dealloc,
     .tp_getset = Term_getsetters,
     .tp_richcompare = Term_richcmp,
+    .tp_hash = Term_hash,
 };
 
 

+ 16 - 20
cpython/py_triple.h

@@ -77,32 +77,27 @@ static PyMemberDef Triple_members[] = {
 static PyObject *
 Triple_richcmp (PyObject *obj1, PyObject *obj2, int op)
 {
-    PyObject *result = NULL;
+    // Only equality and non-equality supported.
+    if (op != Py_EQ && op != Py_NE) Py_RETURN_NOTIMPLEMENTED;
 
-    int c = 0;
     LSUP_Triple *t1 = ((TripleObject *) obj1)->ob_struct;
     LSUP_Triple *t2 = ((TripleObject *) obj2)->ob_struct;
 
-    switch (op) {
-        case Py_LT: result = Py_NotImplemented; break;
-        case Py_LE: result = Py_NotImplemented; break;
-        case Py_EQ: c = (
-                            LSUP_term_equals (t1->s, t2->s) &&
-                            LSUP_term_equals (t1->p, t2->p) &&
-                            LSUP_term_equals (t1->o, t2->o)); break;
-        case Py_NE: c = (!
-                            LSUP_term_equals (t1->s, t2->s) &&
-                            LSUP_term_equals (t1->p, t2->p) &&
-                            LSUP_term_equals (t1->o, t2->o)); break;
-        case Py_GT: result = Py_NotImplemented; break;
-        case Py_GE: result = Py_NotImplemented; break;
-    }
+    if (
+        (
+            LSUP_term_equals (t1->s, t2->s) &&
+            LSUP_term_equals (t1->p, t2->p) &&
+            LSUP_term_equals (t1->o, t2->o)
+        ) ^ (op == Py_NE)
+    ) Py_RETURN_TRUE;
+
+    Py_RETURN_FALSE;
+}
 
-    if (!result) result = c ? Py_True : Py_False;
 
-    Py_INCREF(result);
-    return result;
- }
+static Py_hash_t
+Triple_hash (PyObject *self)
+{ return LSUP_triple_hash (((TripleObject *)self)->ob_struct); }
 
 
 PyTypeObject TripleType = {
@@ -117,6 +112,7 @@ PyTypeObject TripleType = {
     .tp_dealloc = (destructor) Triple_dealloc,
     .tp_members = Triple_members,
     .tp_richcompare = Triple_richcmp,
+    .tp_hash = Triple_hash,
 };
 
 #endif

+ 26 - 0
include/buffer.h

@@ -1,8 +1,17 @@
 #ifndef _LSUP_BUFFER_H
 #define _LSUP_BUFFER_H
 
+#include "xxhash.h"
+
 #include "core.h"
 
+#ifndef HASH_SEED
+/** @brief Seed used for all hashing. Compile-time configurable.
+ */
+#define HASH_SEED 0
+#endif
+
+
 /** @brief General-purpose data buffer.
  *
  * The structure is transparently exposed so that the related API only defines
@@ -82,6 +91,23 @@ void LSUP_buffer_done (LSUP_Buffer *buf);
  */
 void LSUP_buffer_free (LSUP_Buffer *buf);
 
+
+/** @brief Hash a buffer.
+ */
+inline LSUP_Key
+LSUP_buffer_hash (const LSUP_Buffer *buf)
+{ return XXH64(buf->addr, buf->size, HASH_SEED); }
+
+
+/** @brief Hash two buffers into one key: b2 is hashed with the hash of b1 as seed.
+ *
+ * TODO Adapt to different sizes of LSUP_Key.
+ */
+inline LSUP_Key LSUP_btriple_hash (
+        const LSUP_Buffer *b1, const LSUP_Buffer *b2)
+{ return XXH64 (b2->addr, b2->size, XXH64 (b1->addr, b1->size, HASH_SEED)); }
+
+
 /** @brief Print a byte string of a given length in a human-readable format.
  *
  * The string is printed in Python style: printable characters are output

+ 9 - 20
include/term.h

@@ -4,14 +4,11 @@
 #include <assert.h>
 #include <regex.h>
 
-#include "xxhash.h"
-
 #include "buffer.h"
 
 // URI parsing regular expression. Conforms to RFC3986.
 #define URI_REGEX_STR \
     "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"
-#define SEED 0 // TODO Make configurable.
 #define LANG_SIZE 8 // Size in chars of lang tag
 
 // "NULL" key, a value that is never user-provided. Used to mark special
@@ -167,32 +164,24 @@ LSUP_rc
 LSUP_term_deserialize (const LSUP_Buffer *sterm, LSUP_Term *term);
 
 
-inline LSUP_Key
-LSUP_sterm_to_key (const LSUP_Buffer *sterm)
-{
-    if (UNLIKELY (sterm == NULL)) return NULL_KEY;
-
-    return XXH64(sterm->addr, sterm->size, SEED);
-}
-
-
-/** @brief Hash a term into a key.
- *
- * If NULL is passed, the result is NULL_KEY.
+/** @brief Hash a term.
  */
 inline LSUP_Key
-LSUP_term_to_key (const LSUP_Term *term)
+LSUP_term_hash (const LSUP_Term *term)
 {
-    if (UNLIKELY (term == NULL)) return NULL_KEY;
+    LSUP_Buffer *buf;
 
-    LSUP_Buffer *sterm = LSUP_buffer_new_from_term (term);
-    LSUP_Key key = XXH64(sterm->addr, sterm->size, SEED);
+    if (UNLIKELY (!term)) buf = BUF_DUMMY;
+    else buf = LSUP_buffer_new_from_term (term);
 
-    LSUP_buffer_free (sterm);
+    LSUP_Key key = LSUP_buffer_hash (buf);
+
+    LSUP_buffer_free (buf);
 
     return key;
 }
 
+
 /**
  * Compare two terms.
  */

+ 32 - 0
include/triple.h

@@ -161,4 +161,36 @@ LSUP_striple_pos (const LSUP_SerTriple *trp, LSUP_TriplePos n)
 { _FN_BODY }
 #undef _FN_BODY
 
+
+/** @brief Hash a serialized triple (a triple of buffers).
+ *
+ * TODO This doesn't handle blank nodes correctly.
+ */
+inline LSUP_Key
+LSUP_striple_hash (const LSUP_SerTriple *strp)
+{
+    return XXH64 (
+        strp->s->addr, strp->s->size,
+        XXH64 (
+            strp->p->addr, strp->p->size,
+            XXH64 (strp->o->addr, strp->o->size, HASH_SEED)
+        )
+    );
+}
+
+
+/** @brief Hash a triple.
+ *
+ * TODO This doesn't handle blank nodes correctly.
+ */
+inline LSUP_Key
+LSUP_triple_hash (const LSUP_Triple *trp)
+{
+    LSUP_SerTriple *strp = LSUP_striple_new_from_triple (trp);
+    LSUP_Key hash = LSUP_striple_hash (strp);
+    LSUP_striple_free (strp);
+
+    return hash;
+}
+
 #endif

+ 4 - 0
src/buffer.c

@@ -85,3 +85,7 @@ void LSUP_buffer_free (LSUP_Buffer *buf)
     free (buf);
 }
 
+
+/* Extern inline prototypes. */
+
+inline LSUP_Key LSUP_buffer_hash (const LSUP_Buffer *buf);

+ 17 - 14
src/store_mdb.c

@@ -357,7 +357,7 @@ LSUP_mdbstore_add_init (LSUP_MDBStore *store, const LSUP_Buffer *sc)
     if (store->default_ctx != NULL) {
         if (sc == NULL) sc = store->default_ctx;
 
-        it->ck = LSUP_sterm_to_key (sc);
+        it->ck = LSUP_buffer_hash (sc);
 
         // Insert t:st for context.
         //TRACE ("Adding context: %s", sc);
@@ -393,7 +393,7 @@ LSUP_mdbstore_add_iter (MDBIterator *it, const LSUP_SerTriple *sspo)
         printf ("\n");
 #endif
 
-        spok[i] = LSUP_sterm_to_key (st);
+        spok[i] = LSUP_buffer_hash (st);
 
         it->key.mv_data = spok + i;
         it->key.mv_size = KLEN;
@@ -496,7 +496,7 @@ sterm_to_key (
         LSUP_MDBStore *store, const LSUP_Buffer *sterm)
 {
     // TODO this will be replaced by a lookup when 128-bit hash is introduced.
-    return LSUP_sterm_to_key (sterm);
+    return LSUP_buffer_hash (sterm);
 }
 
 
@@ -543,16 +543,16 @@ LSUP_mdbstore_lookup(
         const LSUP_Buffer *sc, size_t *ct)
 {
     LSUP_TripleKey spok = {
-        LSUP_sterm_to_key (sspo->s),
-        LSUP_sterm_to_key (sspo->p),
-        LSUP_sterm_to_key (sspo->o),
+        LSUP_buffer_hash (sspo->s),
+        LSUP_buffer_hash (sspo->p),
+        LSUP_buffer_hash (sspo->o),
     };
 
     LSUP_MDBIterator *it = malloc (sizeof (*it));
     if (UNLIKELY (!it)) return NULL;
 
     it->store = store;
-    it->ck = store->default_ctx ? LSUP_sterm_to_key (sc) : NULL_KEY;
+    it->ck = store->default_ctx ? LSUP_buffer_hash (sc) : NULL_KEY;
 
     if (ct) *ct = 0;
 
@@ -625,7 +625,6 @@ mdbiter_next_key (LSUP_MDBIterator *it)
 
     LSUP_rc rc;
 
-    it->i++;
     it->iter_op_fn (it);
 
     if (it->ck) {
@@ -723,7 +722,7 @@ LSUP_mdbstore_remove(
 
     if (store->default_ctx != NULL) {
         if (sc == NULL) sc = store->default_ctx;
-        ck = LSUP_sterm_to_key (sc);
+        ck = LSUP_buffer_hash (sc);
     }
 
     MDB_txn *txn;
@@ -944,7 +943,7 @@ it_next_1bound (MDBIterator *it)
     it->spok[it->term_order[2]] = lu_dset[it->i][1];
 
     TRACE(
-            "Composed triple: {%lu %lu %lu}",
+            "Composed triple: {%lx %lx %lx}",
             it->spok[0], it->spok[1], it->spok[2]);
 
     // Ensure next block within the same page is not beyond the last.
@@ -958,7 +957,8 @@ it_next_1bound (MDBIterator *it)
         // move cursor to beginning of next page.
         it->i = 0;
         TRACE ("Reset page cursor to %lu.", it->i);
-        it->rc = mdb_cursor_get (it->cur, &it->key, &it->data, MDB_NEXT_MULTIPLE);
+        it->rc = mdb_cursor_get (
+                it->cur, &it->key, &it->data, MDB_NEXT_MULTIPLE);
         TRACE ("it->rc: %d", it->rc);
     }
 }
@@ -1056,7 +1056,7 @@ lookup_1bound (MDBStore *store, uint8_t idx0, MDBIterator *it, size_t *ct)
 {
     it->term_order = (const uint8_t*)lookup_ordering_1bound[idx0];
 
-    TRACE ("Looking up 1 bound term: %lu\n", it->luk[0]);
+    TRACE ("Looking up 1 bound term: %lx\n", it->luk[0]);
 
     if (!it->txn) {
         if (store->txn) it->txn = store->txn;
@@ -1075,6 +1075,7 @@ lookup_1bound (MDBStore *store, uint8_t idx0, MDBIterator *it, size_t *ct)
         // If a context is specified, the only way to count triples matching
         // the context is to loop over them.
         if (it->ck != NULL_KEY) {
+            TRACE ("Counting in context: %lx\n", it->ck);
             MDBIterator *ct_it = malloc (sizeof (MDBIterator));
             if (UNLIKELY (!ct_it)) return LSUP_MEM_ERR;
 
@@ -1089,10 +1090,12 @@ lookup_1bound (MDBStore *store, uint8_t idx0, MDBIterator *it, size_t *ct)
             ct_it->key = it->key;
             ct_it->data = it->data;
             ct_it->i = 0;
-            lookup_1bound (store, idx0, ct_it, NULL);
+
+            LSUP_rc rc = lookup_1bound (store, idx0, ct_it, NULL);
+            if (rc < 0) return rc;
 
             while (LSUP_mdbiter_next (ct_it, NULL) != LSUP_END) {
-                ct[0] ++;
+                (*ct)++;
                 TRACE ("Counter increased to %lu.", *ct);
             }
 

+ 1 - 3
src/term.c

@@ -287,7 +287,5 @@ void LSUP_term_free (LSUP_Term *term)
 
 // Extern inline functions.
 
-LSUP_Key LSUP_sterm_to_key (const LSUP_Buffer *sterm);
-
-LSUP_Key LSUP_term_to_key (const LSUP_Term *term);
+LSUP_Key LSUP_term_hash (const LSUP_Term *term);
 

+ 6 - 0
src/triple.c

@@ -168,3 +168,9 @@ LSUP_striple_free (LSUP_SerTriple *sspo)
 
     free (sspo);
 }
+
+
+/* Inline extern prototypes. */
+
+inline LSUP_Key LSUP_striple_hash (const LSUP_SerTriple *strp);
+inline LSUP_Key LSUP_triple_hash (const LSUP_Triple *trp);

+ 5 - 4
test/assets.h

@@ -8,7 +8,8 @@
 LSUP_Triple *create_triples()
 {
     LSUP_Triple *trp;
-    CRITICAL (trp = malloc (NUM_TRP * sizeof (LSUP_Triple)));
+    trp = malloc (NUM_TRP * sizeof (LSUP_Triple));
+    if (!trp) abort();
 
     // These constitute overall 10 individual triples, 8 unique.
 
@@ -51,9 +52,9 @@ LSUP_Triple *create_triples()
     trp[6] = {terms[6][0], terms[6][1], terms[6][2]};
     */
 
-    trp[0].s = LSUP_uri_new ("urn:s:0"),
-    trp[0].p = LSUP_uri_new ("urn:p:0"),
-    trp[0].o = LSUP_uri_new ("urn:o:0"),
+    trp[0].s = LSUP_uri_new ("urn:s:0");
+    trp[0].p = LSUP_uri_new ("urn:p:0");
+    trp[0].o = LSUP_uri_new ("urn:o:0");
 
     trp[1].s = LSUP_uri_new ("urn:s:1");
     trp[1].p = LSUP_uri_new ("urn:p:1");