Browse Source

Experimental implementation of type-agnostic hash table.

Stefano Cossu 4 năm trước cách đây
mục cha
commit
242c427501
3 tập tin đã thay đổi với 335 bổ sung và 23 xóa
  1. 9 0
      include/store_htable.h
  2. 79 23
      src/htable.c
  3. 247 0
      src/store_htable.c

+ 9 - 0
include/store_htable.h

@@ -0,0 +1,9 @@
+#ifndef _LSUP_STORE_HTABLE_H
+#define _LSUP_STORE_HTABLE_H
+
+#include "triple.h"
+#include "htable.h"
+
+typedef struct HTStore LSUP_HTStore;
+
+#endif  // _LSUP_STORE_HTABLE_H

+ 79 - 23
src/htable.c

@@ -22,14 +22,54 @@
 #define    APPROX_40_PERCENT(x)     (((x) * 409) >> 10)
 
 #define MIN_HT_SIZE         (1 << 3)    // Parenthesized so the value (8) survives expansion inside larger expressions.
+#define MAX_KEY_SIZE        64
 
+/** @brief Bucket types.
+ * Table columns are: bucket type tag; key type; value type.
+ * NOTE This macro can be redefined BEFORE including this library in order to
+ * redefine the bucket types for general purpose use.
+ *
+ * The table is an X-macro list: each user defines ENTRY(tag, k, v), expands
+ * LSUP_HTABLE_BUCKET_TYPES and then undefines ENTRY. In this file it is
+ * expanded to generate, among others, the bucket_<tag>_t structs, the
+ * BT_<tag> enumerators and the members of the bucket_t union.
+ */
+#ifndef LSUP_HTABLE_BUCKET_TYPES
+#define LSUP_HTABLE_BUCKET_TYPES                                    \
+    ENTRY(  ks,         LSUP_Key,            void*              )   \
+    ENTRY(  kt,         LSUP_Key,            LSUP_TripleKey     )   \
+    ENTRY(  tk,         LSUP_TripleKey,      LSUP_Key           )   \
+
+#endif
 
-typedef struct {
-    LSUP_TripleKey  key;                // TODO Make configurable but
-                                        // statically allocated via macros
-    void *          val;
-    uint64_t        hash;
-    uint16_t        psl;
+//#ifdef LSUP_BIG_HT // TODO
+typedef uint64_t ht_hash_t;
+//else
+//typedef uint32_t ht_hash_t;
+//#endif
+
+/** @brief Bucket structure.
+ *
+ * One bucket_<tag>_t struct is generated per row of LSUP_HTABLE_BUCKET_TYPES.
+ * Every variant starts with the same two members (hash, psl), followed by
+ * the key and then the value.
+ *
+ * Note that the address of "key" is reliably findable across multiple bucket
+ * types in a union, since hash and psl don't change. The address of the value,
+ * however, cannot be determined without knowing the bucket type or key size.
+ *
+ * NOTE(review): the compiler may insert padding between psl and key to
+ * satisfy the alignment of the key type, so the key offset is only identical
+ * across variants whose key types share the same alignment requirement.
+ * Confirm this holds for every key type listed in the table.
+ */
+#define ENTRY(tag, k, v) \
+typedef struct {                \
+    ht_hash_t       hash;       \
+    uint16_t        psl;        \
+    k               key;        \
+    v               val;        \
+} bucket_##tag##_t;
+LSUP_HTABLE_BUCKET_TYPES
+#undef ENTRY
+/* Bucket type tags: one BT_<tag> enumerator is generated per row of
+ * LSUP_HTABLE_BUCKET_TYPES, in table order.
+ */
+typedef enum {
+#define ENTRY(tag, k, v) BT_##tag,
+LSUP_HTABLE_BUCKET_TYPES
+#undef ENTRY
+} BucketTypeTag;
+/* Typed views of a bucket array: one pointer member per bucket type, each
+ * named after its tag and pointing to the matching bucket_<tag>_t struct.
+ * All members share the same storage.
+ *
+ * NOTE(review): HTable declares its `buckets` member as `bucket_t *`, i.e. a
+ * pointer to this union of pointers rather than the union itself. Confirm
+ * that this extra level of indirection is intended.
+ */
+typedef union {
+#define ENTRY(tag, k, v) \
+    bucket_##tag##_t *  tag;
+LSUP_HTABLE_BUCKET_TYPES
+#undef ENTRY
 } bucket_t;
 
 typedef struct htable_t {
@@ -37,7 +77,9 @@ typedef struct htable_t {
     htsize_t        nitems;
     unsigned        flags;
     uint64_t        divinfo;
+
     bucket_t *      buckets;
+    BucketTypeTag   bucket_type;
     uint64_t        seed;
 
     key_hash_fn_t   key_hash_fn;
@@ -45,16 +87,32 @@ typedef struct htable_t {
 
     ksize_t         ksize;
     vsize_t         vsize;
-
-    void *          del_marker;         // Used to fill deleted buckets.
 } HTable;
 
+/* All-zero reference key, used to fill and compare empty buckets.
+ *
+ * A bucket counts as empty when the first ht->ksize bytes of its key equal
+ * this marker, so an all-zero key cannot be told apart from an empty bucket.
+ *
+ * NOTE(review): callers read ht->ksize bytes from this array; no check that
+ * ksize <= MAX_KEY_SIZE is visible in this chunk.
+ */
+static const unsigned char del_marker[MAX_KEY_SIZE] = {0};
+
+/*
+ * Byte offset for key address in a bucket.
+ *
+ * Computed as the sum of the sizes of the two leading bucket members (hash
+ * and psl). is_empty_bucket() adds it to a raw bucket address to reach the
+ * key.
+ *
+ * This is applicable to any type of bucket if handled through an untyped
+ * pointer (the original wording said "null pointer"; presumably "void
+ * pointer" was meant).
+ *
+ * NOTE(review): this sum ignores any padding the compiler inserts before the
+ * key member. If the key type needs an alignment larger than two bytes, the
+ * real member offset will be larger than this value. Verify against the
+ * actual struct layout.
+ */
+static const unsigned int k_offset = sizeof(ht_hash_t) + sizeof(uint16_t);
 
+/** @brief Specific bucket access in union.
+ *
+ * ENTRY is expanded once per row of LSUP_HTABLE_BUCKET_TYPES into a test of
+ * the table's bucket_type tag, so the function returns the union member
+ * whose tag matches.
+ *
+ * @param ht Hash table whose bucket_type selects the union member.
+ * @param b Bucket union to read from.
+ *
+ * @return The pointer stored in the matching union member, or NULL if
+ *  bucket_type matches none of the known tags. Previously the function fell
+ *  off its end in that case without returning a value.
+ */
+#define ENTRY(tag, k, v) if (ht->bucket_type == BT_##tag) return b->tag;
+static inline void *bkey(LSUP_HTable *ht, bucket_t *b)
+{
+    LSUP_HTABLE_BUCKET_TYPES
+
+    return NULL;    // Unknown bucket type tag.
+}
+#undef ENTRY
+/* Presumably intended to select the bucket union member for a given tag.
+ *
+ * NOTE(review): as written, `##` pastes `->` with the tag argument, which
+ * does not form a single valid preprocessing token, and `b.ht` treats the
+ * third parameter as a member name of `b`. No use of this macro is visible
+ * in this chunk; confirm the intended expansion before relying on it.
+ */
+#define HT_BUCKET(tag, b, ht) b.ht->##tag
 
 /* * * GENERIC UTILITIES * * */
 
-static inline bool is_empty_bucket(const HTable *ht, const bucket_t *bucket)
-{ return memcmp(bucket->key, ht->del_marker, ht->ksize) == 0; }
+/*
+ * Tell whether a bucket is empty.
+ *
+ * The key is located k_offset bytes past the bucket address and its first
+ * ht->ksize bytes are compared with the all-zero del_marker.
+ *
+ * The address is computed through a byte pointer: arithmetic directly on a
+ * void pointer is a compiler extension, not standard C.
+ *
+ * Returns true if the key bytes are all zero.
+ */
+static inline bool is_empty_bucket(const HTable *ht, const void *bucket)
+{
+    const unsigned char *key_addr = (const unsigned char *)bucket + k_offset;
+
+    return memcmp(key_addr, del_marker, ht->ksize) == 0;
+}
 
 /*
  * Find first bit.
@@ -110,15 +168,16 @@ fast_rem32(uint32_t v, uint32_t div, uint64_t divinfo)
 { return v - div * fast_div32(v, div, divinfo); }
 
 
+/*
 static int __attribute__((__unused__))
 //static int
-validate_psl_p(const HTable *ht, const bucket_t *bucket, unsigned i)
+validate_psl_p(const HTable *ht, unsigned i)
 {
     unsigned base_i = fast_rem32(bucket->hash, ht->size, ht->divinfo);
     unsigned diff = (base_i > i) ? ht->size - base_i + i : i - base_i;
-    return is_empty_bucket(ht, bucket) || diff == bucket->psl;
+    return is_empty_bucket(ht, ht->buckets + i) || diff == bucket->psl;
 }
-
+*/
 
 /* * * PUBLIC API * * */
 
@@ -136,8 +195,6 @@ HTable *LSUP_htable_new(
     ht->flags = flags;
     ht->size = 0;
 
-    CRITICAL(ht->del_marker = calloc(1, ksize));
-
     LSUP_htable_resize(ht, size);
 
     return ht;
@@ -222,7 +279,7 @@ int LSUP_htable_insert(HTable *ht, const void *key, void *val)
 
         if(is_empty_bucket(ht, ht->buckets + i)) break;
 
-        ASSERT(validate_psl_p(ht, bucket, i));
+        //ASSERT(validate_psl_p(ht, i));
 
         // There is a key in the bucket.
         TRACE("Entry key: {%lu, %lu, %lu}; bucket key: {%lu, %lu, %lu}", entry.key[0], entry.key[1], entry.key[2], bucket->key[0], bucket->key[1], bucket->key[2]);
@@ -252,7 +309,7 @@ int LSUP_htable_insert(HTable *ht, const void *key, void *val)
         entry.psl++;
 
         /* Continue to the next bucket. */
-        ASSERT(validate_psl_p(ht, bucket, i));
+        //ASSERT(validate_psl_p(ht, bucket, i));
         i = fast_rem32(i + 1, ht->size, ht->divinfo);
     }
 
@@ -264,7 +321,7 @@ int LSUP_htable_insert(HTable *ht, const void *key, void *val)
     memcpy(bucket, &entry, sizeof(bucket_t)); // copy
     ht->nitems++;
 
-    ASSERT(validate_psl_p(ht, bucket, i));
+    //ASSERT(validate_psl_p(ht, bucket, i));
 
     return LSUP_OK;
 }
@@ -309,7 +366,7 @@ int LSUP_htable_get(const HTable *ht, const void *key, void **valp)
      */
     for(;;) {
         bucket_t *bucket = ht->buckets + i;
-        ASSERT(validate_psl_p(ht, bucket, i));
+        //ASSERT(validate_psl_p(ht, bucket, i));
 
         if (ht->key_eq_fn(bucket->key, key, ht->ksize)) {
             // Key found within max probe length.
@@ -357,7 +414,7 @@ int LSUP_htable_del(HTable *ht, const void *key)
         if (is_empty_bucket(ht, bucket) || n > bucket->psl)
             return LSUP_NOACTION;
 
-        ASSERT(validate_psl_p(ht, bucket, i));
+        //ASSERT(validate_psl_p(ht, bucket, i));
 
         if (!ht->key_eq_fn(bucket->key, key, ht->ksize)) {
             /* Continue to the next bucket. */
@@ -376,11 +433,11 @@ int LSUP_htable_del(HTable *ht, const void *key)
     while(1) {
         bucket_t *nbucket;
 
-        memcpy(bucket->key, ht->del_marker, ht->ksize);
+        memcpy(bucket->key, del_marker, ht->ksize);
 
         i = fast_rem32(i + 1, ht->size, ht->divinfo);
         nbucket = ht->buckets + i;
-        ASSERT(validate_psl_p(ht, nbucket, i));
+        //ASSERT(validate_psl_p(ht, nbucket, i));
 
         /*
          * Stop if we reach an empty bucket or hit a key which
@@ -433,7 +490,6 @@ extern int LSUP_htable_iter(
 void LSUP_htable_done(HTable *ht)
 {
     if(LIKELY(ht->buckets != NULL)) free(ht->buckets);
-    free(ht->del_marker);
 }
 
 

+ 247 - 0
src/store_htable.c

@@ -0,0 +1,247 @@
+#include "store_htable.h"
+#include "khash.h"
+
+// Assume VERY coarsely that the number of unique terms will be in general
+// 1.7 times the number of triples. This is conservative, to keep the load
+// factor low.
+#define IDX_SIZE_RATIO 1.7
+
+
+/* Hash-table-backed store, made of two hash tables that are both created by
+ * LSUP_htstore_new() and released by LSUP_htstore_free().
+ */
+typedef struct HTStore {
+    LSUP_HTable *keys;           // Created with TRP_KLEN-sized keys and zero-sized values; presumably the set of triple keys.
+    LSUP_HTable *idx;            // Dictionary of keys to serialized terms
+} HTStore;
+/* Iterator handle allocated by LSUP_htstore_lookup().
+ *
+ * NOTE(review): neither member is assigned anywhere in the code visible in
+ * this chunk; the descriptions below are assumptions to be confirmed.
+ */
+typedef struct HTIterator {
+    LSUP_HTable *ht;             // Presumably the table being iterated.
+    size_t *cur;                 // Presumably the current iteration position.
+} HTIterator;
+
+
+/**
+ * Identity hashing function.
+ *
+ * The key is read as a uint64_t and returned unchanged: since the key is
+ * already a strong hash, it is reused for bucket allocation. The size and
+ * seed arguments are ignored.
+ */
+static inline uint64_t id_hash_fn(const void *key, ksize_t size, uint64_t seed)
+{
+    const uint64_t *key_as_hash = (uint64_t*)key;
+
+    return *key_as_hash;
+}
+
+
+/**
+ * General XX64 hash. Strong (non-crypto) and extremely fast.
+ *
+ * Thin adapter that forwards key, size and seed to XXH64() unchanged.
+ */
+static inline uint64_t xx64_hash_fn(
+        const void *key, ksize_t size, uint64_t seed)
+{
+    uint64_t digest = XXH64(key, size, seed);
+
+    return digest;
+}
+
+
+/**
+ * Byte-wise equality of two buffers over their first `size` bytes.
+ */
+static inline bool buffer_eq_fn(const void *a, const void *b, ksize_t size)
+{
+    int diff = memcmp(a, b, size);
+
+    return diff == 0;
+}
+
+
+/* * * CALLBACKS * * */
+
+/**
+ * Callback type for key comparison.
+ *
+ * @param spok Pointer to the triple key being tested.
+ * @param k1 First key to compare against.
+ * @param k2 Second key to compare against; the callbacks in this file that
+ *  test a single position do not read it.
+ *
+ * @return true if the triple key satisfies the callback's condition.
+ */
+typedef bool (*LSUP_key_cmp_fn_t)(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2);
+
+
+/**
+ * Dummy callback for queries with all parameters unbound.
+ *
+ * Every argument is ignored and the result is always true.
+ */
+static bool lookup_none_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    (void) spok;
+    (void) k1;
+    (void) k2;
+
+    return true;
+}
+
+/**
+ * Keyset lookup for S key: true if the first position of the triple key
+ * equals k1. k2 is not used.
+ */
+static bool lookup_sk_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    const LSUP_Key s_key = (*spok)[0];
+
+    return s_key == k1;
+}
+
+/**
+ * Keyset lookup for P key: true if the second position of the triple key
+ * equals k1. k2 is not used.
+ */
+static bool lookup_pk_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    const LSUP_Key p_key = (*spok)[1];
+
+    return p_key == k1;
+}
+
+/**
+ * Keyset lookup for O key: true if the third position of the triple key
+ * equals k1. k2 is not used.
+ */
+static bool lookup_ok_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    const LSUP_Key o_key = (*spok)[2];
+
+    return o_key == k1;
+}
+
+/**
+ * Keyset lookup for S and P keys: true if the first position of the triple
+ * key equals k1 and the second equals k2.
+ */
+static bool lookup_skpk_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    if ((*spok)[0] != k1) return false;
+
+    return (*spok)[1] == k2;
+}
+
+/**
+ * Keyset lookup for S and O keys: true if the first position of the triple
+ * key equals k1 and the third equals k2.
+ */
+static bool lookup_skok_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    if ((*spok)[0] != k1) return false;
+
+    return (*spok)[2] == k2;
+}
+
+/**
+ * Keyset lookup for P and O keys: true if the second position of the triple
+ * key equals k1 and the third equals k2.
+ */
+static bool lookup_pkok_cmp_fn(
+        const LSUP_TripleKey* spok, const LSUP_Key k1, const LSUP_Key k2)
+{
+    if ((*spok)[1] != k1) return false;
+
+    return (*spok)[2] == k2;
+}
+
+
+/* * * API * * */
+
+/**
+ * Create a new hash-table-backed store.
+ *
+ * Allocates the store handle, then creates its two tables: `keys`, with
+ * TRP_KLEN-sized keys and zero-sized values, and `idx`, with uint64_t-sized
+ * keys and pointer-sized values. Both use xx64_hash_fn for hashing and
+ * buffer_eq_fn for key comparison.
+ *
+ * @param capacity Initial size requested for the `keys` table. The `idx`
+ *  table is requested at capacity * IDX_SIZE_RATIO.
+ * @param ht_p Receives the newly allocated store handle, to be released
+ *  with LSUP_htstore_free().
+ *
+ * @return LSUP_OK. Previously the function ended without a return
+ *  statement although it is declared to return LSUP_rc.
+ */
+LSUP_rc
+LSUP_htstore_new(size_t capacity, HTStore **ht_p)
+{
+    HTStore *ht;
+    CRITICAL(ht = malloc(sizeof(*ht)));
+    *ht_p = ht;
+
+    ht->keys = LSUP_htable_new(
+            capacity, TRP_KLEN, 0, xx64_hash_fn, buffer_eq_fn, 0);
+    ht->idx = LSUP_htable_new(
+        capacity * IDX_SIZE_RATIO, sizeof(uint64_t), sizeof(uintptr_t),
+        xx64_hash_fn, buffer_eq_fn, 0);
+
+    return LSUP_OK;
+}
+
+/**
+ * Release a store and everything it holds.
+ *
+ * The key table is freed first; then each value of the index table is passed
+ * to LSUP_buffer_done() before the index table itself and the store handle
+ * are freed. A NULL handle is accepted and ignored.
+ */
+void
+LSUP_htstore_free(HTStore *ht)
+{
+    if (ht == NULL) return;
+
+    LSUP_htable_free(ht->keys);
+
+    // Walk the index and release each stored term buffer.
+    htsize_t pos = 0;
+    LSUP_TripleKey idx_key;
+    LSUP_Buffer *term_buf;
+    for (;;) {
+        int iter_rc = LSUP_htable_iter(
+                ht->idx, &pos, (void**)&idx_key, (void**)&term_buf);
+        if (iter_rc != LSUP_OK) break;
+
+        TRACE("Freeing indexed term buffer #%d at %p", pos, term_buf);
+        LSUP_buffer_done(term_buf);
+    }
+
+    LSUP_htable_free(ht->idx);
+    free(ht);
+}
+
+/**
+ * Look up triples in the store by a pattern of bound and unbound terms.
+ *
+ * Each term of spo is converted to a key with LSUP_term_to_key(). Positions
+ * whose key equals NULL_KEY are treated as unbound. Depending on which
+ * positions are bound, one of the lookup_*_cmp_fn callbacks and its k1/k2
+ * arguments are selected, and the callback is applied to every key yielded
+ * by iterating gr->keys.
+ *
+ * @param gr Store to search.
+ * @param spo Triple pattern to match.
+ * @param it_p Receives a newly allocated HTIterator.
+ * @param ct Not referenced in the body as visible here.
+ *
+ * @return LSUP_NOACTION if the key table is empty; otherwise LSUP_OK, or the
+ *  result of LSUP_graph_copy() when no term is bound.
+ *
+ * NOTE(review): this function looks like work in progress. It references
+ * names that are not declared in it (res, match_cond, callback_fn, ctx), the
+ * allocated iterator is never filled in, and the fully-bound path reaches
+ * the final loop without assigning cmp_fn, k1 or k2.
+ */
+LSUP_rc
+LSUP_htstore_lookup(
+        HTStore *gr, const LSUP_Triple *spo, HTIterator **it_p, size_t *ct)
+{
+    if (LSUP_htable_size(gr->keys) == 0)
+        return LSUP_NOACTION;
+
+    htsize_t cur = 0;
+    LSUP_Key k1, k2; // k2 is left unassigned by the single-key branches.
+    LSUP_key_cmp_fn_t cmp_fn; // NOTE(review): not assigned on the fully-bound path.
+    LSUP_TripleKey i_spok;
+
+    LSUP_TripleKey spok = {
+        LSUP_term_to_key(spo->s),
+        LSUP_term_to_key(spo->p),
+        LSUP_term_to_key(spo->o),
+    };
+
+    HTIterator *it; // NOTE(review): allocated below but its members are never set here.
+    CRITICAL(it = malloc(sizeof(HTIterator)));
+    *it_p = it;
+
+    if (spok[0] != NULL_KEY && spok[1] != NULL_KEY && spok[2] != NULL_KEY) {
+        int rc = LSUP_htable_get(gr->keys, spok, NULL); // NOTE(review): rc is not used afterwards.
+        /* * * /
+        if (match_cond == true) {
+            // Shortcut for 3-term match—only if match_cond is true.
+            LSUP_graph_init(res, 1, NULL, LSUP_STORE_MEM);
+            int rc = LSUP_htable_get(gr->keys, spok, NULL);
+            if(rc == LSUP_OK) {
+                callback_fn(gr, res, &spok, ctx);
+                return LSUP_OK;
+            } else {
+                return LSUP_NOACTION;
+            }
+        } else {
+            // For negative condition (i.e. "apply this function to all triples
+            // except the matching one")
+            int rc = LSUP_NOACTION;
+            while (LSUP_htable_iter(
+                        gr->keys, &cur, (void**)&i_spok, NULL) == LSUP_OK) {
+                if (LIKELY(
+                    i_spok[2] != spok[2] ||
+                    i_spok[0] != spok[0] ||
+                    i_spok[1] != spok[1]
+                )) {
+                    rc = callback_fn(gr, res, &i_spok, ctx);
+                }
+            }
+
+            return rc;
+        }
+        */
+
+    } else if (spok[0] != NULL_KEY) {
+        k1 = spok[0];
+
+        if (spok[1] != NULL_KEY) { // s p ?
+            k2 = spok[1];
+            cmp_fn = lookup_skpk_cmp_fn;
+
+        } else if (spok[2] != NULL_KEY) { // s ? o
+            k2 = spok[2];
+            cmp_fn = lookup_skok_cmp_fn;
+
+        } else { // s ? ?
+            cmp_fn = lookup_sk_cmp_fn;
+
+        }
+
+    } else if (spok[1] != NULL_KEY) {
+        k1 = spok[1];
+
+        if (spok[2] != NULL_KEY) { // ? p o
+            k2 = spok[2];
+            cmp_fn = lookup_pkok_cmp_fn;
+
+        } else { // ? p ?
+            cmp_fn = lookup_pk_cmp_fn;
+        }
+
+    } else if (spok[2] != NULL_KEY) { // ? ? o
+        k1 = spok[2];
+        cmp_fn = lookup_ok_cmp_fn;
+
+    } else {
+        printf("WARNING: no bound terms, making a compact copy.\n");
+        return LSUP_graph_copy(res, &gr); // NOTE(review): `res` is not declared in this function; `it` is not released on this path.
+    }
+
+    while (LSUP_htable_iter(gr->keys, &cur, (void**)&i_spok, NULL) == LSUP_OK) {
+        if (cmp_fn(&i_spok, k1, k2) == match_cond) // NOTE(review): match_cond, callback_fn, res and ctx are not declared here.
+            callback_fn(gr, res, &i_spok, ctx);
+    }
+
+    return LSUP_OK;
+}
+
+
+/*
+int LSUP_graph_lookup(LSUP_Graph *gr, LSUP_Graph *res, const LSUP_Triple *spo)
+{
+    LSUP_graph_init(res, LOOKUP_GR_INIT_SIZE, NULL, LSUP_STORE_MEM);
+
+    return LSUP_graph_match_callback(gr, res, spo, &match_add_fn, true, NULL);
+}
+*/
+