Browse Source

WIP Rewrite klib hashmap.

Stefano Cossu 4 years ago
parent
commit
324720c061
3 changed files with 584 additions and 3 deletions
  1. 77 0
      include/htable.h
  2. 498 0
      src/htable.c
  3. 9 3
      test/test_graph.c

+ 77 - 0
include/htable.h

@@ -0,0 +1,77 @@
+/**
+ * Hash table implementation.
+ *
+ * This code is hack...ahem, built upon Klib:
+ * https://github.com/attractivechaos/klib/blob/master/khash.h
+ *
+ * After trying several hash map implementations, none met all the requirements
+ * (small, single-file; accept arbitrarily-sized elements; not an unsightly
+ * macro mess; reasonably fast), so I decided to expand a KLib macro and adapt
+ * it to a data type agnostic model.
+ *
+ * This table stores keys and, optionally, values as untyped data (accessed
+ * through void pointers) of arbitrary, but fixed, size. For small keys and
+ * values of unusual size this is convenient, because it avoids creating (and
+ * having to manage) a pointer for each key and value: all data are stored
+ * inline. The data types are set by casting on retrieval.
+ *
+ * For larger or variably-sized keys or values, or ones that are not convenient
+ * to copy into the table, pointers can obviously be used by specifying ptr_t
+ * key and/or value size.
+ */
+
+#ifndef _LSUP_HTABLE_H
+#define _LSUP_HTABLE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "include/core.h"
+
+#ifdef LSUP_BIG_HTABLE
+typedef size_t ht_size_t;
+#else
+typedef uint32_t ht_size_t;
+#endif
+
+/**
+ * Key hashing function.
+ *
+ * Takes a void pointer. The key length is calculated from the ksize value in
+ * the table.
+ */
+typedef uint64_t (*key_hash_fn_t)(const void *key);
+
+/**
+ * Key equality function (true: keys are equal).
+ *
+ * Takes two void pointers. The key lengths are calculated from the ksize value
+ * in the table.
+ */
+typedef bool (*key_eq_fn_t)(const void *a, const void *b);
+
+/**
+ * Hash table type.
+ *
+ * Supports up to UINT32_MAX (~4 billion) entries by default.
+ *
+ * If compiled with -DLSUP_BIG_HTABLE it supports up to size_t entries
+ * for extremely large in-memory graphs.
+ */
+typedef struct HTable LSUP_HTable;
+
+/**
+ * Initialize a hash table.
+ *
+ * Stores the key / value sizes and the hashing and equality callbacks in the
+ * handle, then sizes the table for `size` buckets.
+ *
+ * @param ht            Table handle to initialize.
+ * @param size          Requested number of buckets.
+ * @param ksize         Size of each key, in bytes.
+ * @param vsize         Size of each value, in bytes; 0 for a hash set.
+ * @param key_hash_fn   Function computing the hash of a key.
+ * @param key_eq_fn     Function comparing two keys for equality.
+ *
+ * @return The return value of the internal resize (0 on success).
+ */
+extern int LSUP_htable_init(
+        LSUP_HTable *ht, ht_size_t size, uint32_t ksize, uint32_t vsize,
+        key_hash_fn_t key_hash_fn, key_eq_fn_t key_eq_fn);
+
+/**
+ * Free the key, value and flag storage of a table. A NULL handle is ignored.
+ * The handle itself is not freed.
+ */
+extern void LSUP_htable_done(LSUP_HTable *ht);
+
+/**
+ * Look up a key.
+ *
+ * @return Index of the bucket holding the key, or the number of buckets in
+ *  the table if the key is not found (0 for a table without buckets).
+ */
+extern ht_size_t LSUP_htable_get(const LSUP_HTable *ht, void *key);
+
+/**
+ * Insert a key.
+ *
+ * NOTE(review): the implementation is still being ported from khash. There,
+ * `ret` receives -1 on failure, 0 if the key was already present, 1 if it
+ * went into a never-used bucket and 2 if it reused a deleted bucket, and the
+ * bucket index is returned. Confirm once the port is complete.
+ */
+extern ht_size_t LSUP_htable_put(LSUP_HTable *h, void *key, int *ret);
+
+/**
+ * Remove the entry stored in bucket `x` (an index as returned by
+ * LSUP_htable_get or LSUP_htable_put).
+ */
+extern void LSUP_htable_del(LSUP_HTable *h, ht_size_t x);
+
+#endif

+ 498 - 0
src/htable.c

@@ -0,0 +1,498 @@
+#include "include/htable.h"
+
+// Per-bucket state flags. A bucket with neither flag set holds a live entry.
+// The expansions are parenthesised so that the macros can be used safely
+// inside larger expressions (e.g. with `~` or `+`).
+#define BUCKET_EMPTY        (1 << 0)
+#define BUCKET_DELETED      (1 << 1)
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+static const double __ac_HASH_UPPER = 0.77;
+
+typedef uint8_t flags_t;
+
+/**
+ * Hash table state.
+ *
+ * Keys and values live in two flat memory blocks; the entry of bucket i
+ * starts at byte offset i * ksize (keys) and i * vsize (values). Each bucket
+ * also has one flags_t in `flags` describing its state.
+ */
+typedef struct HTable {
+        ht_size_t n_buckets;            // # of buckets. Up to UINT_MAX
+        ht_size_t size;                 // # of entries in the table.
+        ht_size_t n_occupied;           // # of occupied buckets. Reset to
+                                        // `size` on every resize.
+        ht_size_t upper_bound;          // Occupancy threshold; resize sets it
+                                        // to n_buckets * __ac_HASH_UPPER,
+                                        // rounded to the nearest integer.
+
+        flags_t *flags;                 // Flags for each bucket.
+
+        key_hash_fn_t key_hash_fn;      // Function to compute hash of a key.
+        key_eq_fn_t key_eq_fn;          // Function to evaluate equality of keys.
+
+        uint32_t ksize;                 // Key size, in bytes.
+        uint32_t vsize;                 // Value size, in bytes. Resize only
+                                        // allocates `vals` if this is > 0.
+
+        void *keys;                     // Key data. This is a void pointer to
+                                        // a memory block that is looked up in
+                                        // "steps" determined by the ksize
+                                        // value.
+        void *vals;                     // Value data, same layout as keys.
+} HTable;
+
+
+/* * * Static prototypes * * */
+
+static int LSUP_htable_resize(HTable *ht, ht_size_t new_n_buckets);
+
+
+/* * * API * * */
+
+/**
+ * Initialize a hash table handle.
+ *
+ * @param ht            Handle to initialize. Its previous contents are
+ *                      ignored and overwritten.
+ * @param size          Requested number of buckets.
+ * @param ksize         Key size, in bytes.
+ * @param vsize         Value size, in bytes (0 for a hash set).
+ * @param key_hash_fn   Function computing the hash of a key.
+ * @param key_eq_fn     Function comparing two keys for equality.
+ *
+ * @return The return value of LSUP_htable_resize().
+ */
+int LSUP_htable_init(
+        HTable *ht, ht_size_t size, uint32_t ksize, uint32_t vsize,
+        key_hash_fn_t key_hash_fn, key_eq_fn_t key_eq_fn)
+{
+    // Start from a well-defined empty table: the resize below reads the
+    // counters and hands the storage pointers to realloc() and free(), so
+    // none of them may be left uninitialized.
+    ht->n_buckets = 0;
+    ht->size = 0;
+    ht->n_occupied = 0;
+    ht->upper_bound = 0;
+
+    ht->flags = NULL;
+    ht->keys = NULL;
+    ht->vals = NULL;
+
+    ht->key_hash_fn = key_hash_fn;
+    ht->key_eq_fn = key_eq_fn;
+
+    ht->ksize = ksize;
+    ht->vsize = vsize;
+
+    return LSUP_htable_resize(ht, size);
+}
+
+
+/**
+ * Release the storage owned by a hash table.
+ *
+ * Frees the flag, value and key arrays. The handle itself is not freed, and
+ * a NULL handle is ignored.
+ */
+void LSUP_htable_done(HTable *ht)
+{
+    if (!LIKELY(ht != NULL)) return;
+
+    free(ht->flags);
+    free(ht->vals);
+    free(ht->keys);
+}
+
+
+/**
+ * Look up a key in the table.
+ *
+ * Probing starts at hash(key) & (n_buckets - 1) and advances by an
+ * increment that grows by one at each step, until it reaches an empty
+ * bucket, a live bucket whose key compares equal, or the starting bucket
+ * again.
+ *
+ * @param ht    Table to search.
+ * @param key   Pointer to the key to look for (ksize bytes).
+ *
+ * @return Index of the bucket holding the key, or ht->n_buckets if the key
+ *  is not in the table. For a table without buckets this is 0.
+ */
+ht_size_t LSUP_htable_get(const HTable *ht, void *key)
+{
+    if (ht->n_buckets == 0) return 0;
+
+    // Address keys through a byte pointer: arithmetic on void * is not
+    // standard C, and the offset is computed as size_t so that it cannot
+    // wrap around at 32 bits on large tables.
+    const unsigned char *keys = ht->keys;
+
+    ht_size_t mask = ht->n_buckets - 1;
+    ht_size_t i = (ht_size_t)(ht->key_hash_fn(key) & mask);
+    ht_size_t last = i, step = 0;
+
+    while (
+            !CHK_FLAG(ht->flags[i], BUCKET_EMPTY) &&
+            (
+                CHK_FLAG(ht->flags[i], BUCKET_DELETED) ||
+                !ht->key_eq_fn(keys + (size_t)i * ht->ksize, key)
+            )) {
+        i = (i + (++step)) & mask;
+
+        // Back at the starting bucket: the key is not in the table.
+        if (i == last) return ht->n_buckets;
+    }
+
+    return CHK_FLAG(ht->flags[i], (BUCKET_EMPTY | BUCKET_DELETED)) ?
+        ht->n_buckets : i;
+}
+
+
+
+
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+
+/**
+ * Round a bucket count up to the next power of two.
+ *
+ * Works for any width of ht_size_t. 0, and values above the largest
+ * representable power of two, wrap around to 0.
+ */
+static ht_size_t htable_roundup_pow2(ht_size_t x)
+{
+    --x;
+    for (unsigned shift = 1; shift < sizeof(x) * CHAR_BIT; shift <<= 1)
+        x |= x >> shift;
+
+    return ++x;
+}
+
+
+/**
+ * Resize (expand or shrink) the table and rehash all live entries.
+ *
+ * The requested size is rounded up to a power of two, with a minimum of 4.
+ * If the current entries would not fit in the new size within the
+ * __ac_HASH_UPPER load factor, the table is left untouched.
+ *
+ * Entries are rehashed in place: keys and values stay in the same memory
+ * blocks and only a new flags array (one flags_t per bucket) plus a small
+ * scratch buffer for two entries are allocated. Deleted buckets are dropped
+ * in the process.
+ *
+ * @param ht             Table to resize.
+ * @param new_n_buckets  Requested number of buckets.
+ *
+ * @return 0. NOTE(review): allocation failures are delegated to CRITICAL();
+ *  confirm that it does not return on failure.
+ */
+int LSUP_htable_resize(HTable *ht, ht_size_t new_n_buckets)
+{
+    new_n_buckets = htable_roundup_pow2(new_n_buckets);
+    if (new_n_buckets < 4) new_n_buckets = 4;
+
+    // Requested size is too small for the current entries: nothing to do.
+    if (ht->size >= (ht_size_t)(new_n_buckets * __ac_HASH_UPPER + 0.5))
+        return 0;
+
+    // Sizes as size_t, so that byte offsets are not computed in 32 bits.
+    const size_t ksize = ht->ksize;
+    const size_t vsize = ht->vsize;
+    const size_t esize = ksize + vsize;
+    const ht_size_t new_mask = new_n_buckets - 1;
+
+    // In the new layout every bucket starts out empty.
+    flags_t *new_flags;
+    CRITICAL(new_flags = malloc(new_n_buckets * sizeof(flags_t)));
+    memset(new_flags, (BUCKET_EMPTY), new_n_buckets * sizeof(flags_t));
+
+    // Scratch space for two entries (key followed by value): `carry` is the
+    // entry being relocated, `spare` is swap space for the kick-out below.
+    unsigned char *carry;
+    CRITICAL(carry = malloc(esize > 0 ? 2 * esize : 1));
+    unsigned char *spare = carry + esize;
+
+    if (ht->n_buckets < new_n_buckets) {
+        // Expand.
+        CRITICAL(ht->keys = realloc(ht->keys, new_n_buckets * ksize));
+
+        if (vsize > 0) {
+            // Not for hash sets.
+            CRITICAL(ht->vals = realloc(ht->vals, new_n_buckets * vsize));
+        }
+    }
+
+    unsigned char *keys = ht->keys;
+    unsigned char *vals = ht->vals;
+
+    for (ht_size_t j = 0; j != ht->n_buckets; ++j) {
+        // Only live entries are moved.
+        if (ht->flags[j] & (BUCKET_EMPTY | BUCKET_DELETED)) continue;
+
+        memcpy(carry, keys + j * ksize, ksize);
+        if (vsize > 0) memcpy(carry + ksize, vals + j * vsize, vsize);
+
+        // The entry has left its old position.
+        ht->flags[j] |= (BUCKET_DELETED);
+
+        // Kick-out process; sort of like in Cuckoo hashing.
+        for (;;) {
+            ht_size_t step = 0;
+            ht_size_t i = (ht_size_t)(ht->key_hash_fn(carry) & new_mask);
+
+            while (!(new_flags[i] & (BUCKET_EMPTY)))
+                i = (i + (++step)) & new_mask;
+
+            new_flags[i] &= (flags_t)~(BUCKET_EMPTY);
+
+            if (
+                    i < ht->n_buckets &&
+                    !(ht->flags[i] & (BUCKET_EMPTY | BUCKET_DELETED))) {
+                // The target still holds an entry that has not been moved
+                // yet: swap it with the carried one and relocate it next.
+                memcpy(spare, keys + i * ksize, ksize);
+                memcpy(keys + i * ksize, carry, ksize);
+
+                if (vsize > 0) {
+                    memcpy(spare + ksize, vals + i * vsize, vsize);
+                    memcpy(vals + i * vsize, carry + ksize, vsize);
+                }
+                memcpy(carry, spare, esize);
+
+                // Mark it as deleted in the old layout.
+                ht->flags[i] |= (BUCKET_DELETED);
+
+            } else {
+                // Free target: write the entry and move on.
+                memcpy(keys + i * ksize, carry, ksize);
+                if (vsize > 0) memcpy(vals + i * vsize, carry + ksize, vsize);
+
+                break;
+            }
+        }
+    }
+
+    free(carry);
+
+    if (ht->n_buckets > new_n_buckets) {
+        // Shrink.
+        CRITICAL(ht->keys = realloc(ht->keys, new_n_buckets * ksize));
+
+        if (vsize > 0) {
+            CRITICAL(ht->vals = realloc(ht->vals, new_n_buckets * vsize));
+        }
+    }
+
+    // Swap in the new flags and refresh the bookkeeping.
+    free(ht->flags);
+    ht->flags = new_flags;
+    ht->n_buckets = new_n_buckets;
+    ht->n_occupied = ht->size;
+    ht->upper_bound = (ht_size_t)(ht->n_buckets * __ac_HASH_UPPER + 0.5);
+
+    return 0;
+}
+/*
+ * NOTE(review): unported khash template. This is still the body of the
+ * kh_put_##name() macro: SCOPE, ##name, khint_t, khkey_t and the __ac_* and
+ * __hash_* helpers are not defined in the visible part of this file, so it
+ * cannot compile as is. The header declares the intended replacement as
+ * LSUP_htable_put(LSUP_HTable *h, void *key, int *ret).
+ *
+ * What the code does: once n_occupied reaches upper_bound it resizes the
+ * table (same size to clear deleted buckets if more than twice `size`
+ * buckets exist, larger otherwise). It then probes from hash(key) & mask
+ * for the key, remembering the last deleted bucket seen as a candidate
+ * insertion site.
+ *
+ * *ret is set to -1 if the resize failed, 0 if the key was already present,
+ * 1 if it was stored in a never-used bucket, 2 if it was stored in a deleted
+ * bucket. Returns the bucket index, or n_buckets if the resize failed.
+ *
+ * When porting: `keys` is a void * in HTable, so `h->keys[x] = key` has to
+ * become a copy of ksize bytes; the printf() calls look like debugging
+ * output - TODO confirm and remove.
+ */
+SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret)
+{
+    khint_t x;
+    if (h->n_occupied >= h->upper_bound) { /* update the hash table */
+        if (h->n_buckets > (h->size<<1)) {
+            if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */
+                *ret = -1; return h->n_buckets;
+            }
+        } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */
+            *ret = -1; return h->n_buckets;
+        }
+        printf("capacity now %d\n\n", h->n_buckets);
+    } /* TODO: to implement automatically shrinking; resize() already support shrinking */
+    {
+        khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0;
+        x = site = h->n_buckets; k = __hash_func(key); i = k & mask;
+        if (__ac_isempty(h->flags, i)) x = i; /* for speed up */
+        else {
+            last = i;
+            printf("Duplicate\n");
+            printf("Keys[i]: %lx\n", (size_t)h->keys[i]);
+            printf("Key: %lx\n", (size_t)key);
+            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) {
+                if (__ac_isdel(h->flags, i)) site = i;
+                i = (i + (++step)) & mask;
+                if (i == last) { x = site; break; }
+            }
+            if (x == h->n_buckets) {
+                if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site;
+                else x = i;
+            }
+        }
+    }
+    if (__ac_isempty(h->flags, x)) { /* not present at all */
+        h->keys[x] = key;
+        __ac_set_isboth_false(h->flags, x);
+        ++h->size; ++h->n_occupied;
+        *ret = 1;
+    } else if (__ac_isdel(h->flags, x)) { /* deleted */
+        h->keys[x] = key;
+        __ac_set_isboth_false(h->flags, x);
+        ++h->size;
+        *ret = 2;
+    } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */
+    return x;
+}
+/*
+ * NOTE(review): unported khash template (SCOPE, ##name, khint_t and the
+ * __ac_* helpers are not defined in the visible part of this file). The
+ * header declares the intended replacement as LSUP_htable_del().
+ *
+ * Marks bucket x as deleted and decrements the entry count, provided x is
+ * not the end iterator (n_buckets) and the bucket currently holds an entry.
+ * The key and value data are left in place.
+ */
+SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)
+{
+    if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {
+        __ac_set_isdel_true(h->flags, x);
+        --h->size;
+    }
+}
+
+/*
+            printf("Not empty: %d\n", !__ac_isempty(h->flags, i));
+            printf("deleted: %d\n", __ac_isdel(h->flags, i));
+            printf("Hash does not match: %d\n\n", !__hash_equal(h->keys[i], key));
+*/
+/*
+ * khash instantiation macros. Each definition spans several lines, so every
+ * line but the last must end with a continuation backslash; without it the
+ * macro body ends up as stray code at file scope.
+ *
+ * NOTE(review): __KHASH_TYPE, __KHASH_PROTOTYPES and __KHASH_IMPL are not
+ * defined in the visible part of this file.
+ */
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+    __KHASH_TYPE(name, khkey_t, khval_t) \
+    __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+    __KHASH_TYPE(name, khkey_t, khval_t) \
+    __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+    KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+@abstract     Integer hash function
+@param  key   The integer [ht_size_t]
+@return       The hash value [khint_t]
+*/
+#define kh_int_hash_func(key) (ht_size_t)(key)
+/*! @function
+@abstract     Integer comparison function
+*/
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+@abstract     64-bit integer hash function
+@param  key   The integer [khint64_t]
+@return       The hash value [khint_t]
+*/
+#define kh_int64_hash_func(key) (ht_size_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+@abstract     64-bit integer comparison function
+*/
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+@abstract     const char* hash function
+@param  s     Pointer to a null terminated string
+@return       The hash value
+*/
+/* X31 string hash: h = h * 31 + c over the characters of a null-terminated
+ * string, seeded with the first character. The empty string hashes to 0. */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+    khint_t h = (khint_t)*s;
+
+    if (h == 0) return h;
+
+    while (*++s) {
+        h = (h << 5) - h + (khint_t)*s;
+    }
+
+    return h;
+}
+/*! @function
+@abstract     Another interface to const char* hash function
+@param  key   Pointer to a null terminated string [const char*]
+@return       The hash value [khint_t]
+*/
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+@abstract     Const char* comparison function
+*/
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* Integer mixing function: scrambles the bits of `key` through a fixed
+ * sequence of shift-add and shift-xor rounds and returns the result. */
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    khint_t h = key;
+
+    h += ~(h << 15);
+    h ^=  (h >> 10);
+    h +=  (h << 3);
+    h ^=  (h >> 6);
+    h += ~(h << 11);
+    h ^=  (h >> 16);
+
+    return h;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*
+ * NOTE(review): the macros below are the original khash front end. The
+ * function-like ones forward to kh_<op>_<name>() functions, which are not
+ * defined in the visible part of this file, and the documentation still
+ * refers to khint_t and khash_t(name) rather than ht_size_t and HTable.
+ */
+
+/*!
+@abstract Type of the hash table.
+@param  name  Name of the hash table [symbol]
+*/
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+@abstract     Initiate a hash table.
+@param  name  Name of the hash table [symbol]
+@return       Pointer to the hash table [khash_t(name)*]
+*/
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+@abstract     Destroy a hash table.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+*/
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+@abstract     Reset a hash table without deallocating memory.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+*/
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+@abstract     Resize a hash table.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  s     New size [khint_t]
+*/
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+@abstract     Insert a key to the hash table.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  k     Key [type of keys]
+@param  r     Extra return code: -1 if the operation failed;
+            0 if the key is present in the hash table;
+            1 if the bucket is empty (never used); 2 if the element in
+            the bucket has been deleted [int*]
+@return       Iterator to the inserted element [khint_t]
+*/
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+@abstract     Retrieve a key from the hash table.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  k     Key [type of keys]
+@return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+*/
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+@abstract     Remove a key from the hash table.
+@param  name  Name of the hash table [symbol]
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  k     Iterator to the element to be deleted [khint_t]
+*/
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+@abstract     Test whether a bucket contains data.
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  x     Iterator to the bucket [khint_t]
+@return       1 if containing data; 0 otherwise [int]
+*/
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+@abstract     Get key given an iterator
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  x     Iterator to the bucket [khint_t]
+@return       Key [type of keys]
+*/
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+@abstract     Get value given an iterator
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  x     Iterator to the bucket [khint_t]
+@return       Value [type of values]
+@discussion   For hash sets, calling this results in segfault.
+*/
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+@abstract     Alias of kh_val()
+*/
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+@abstract     Get the start iterator
+@param  h     Pointer to the hash table [khash_t(name)*]
+@return       The start iterator [khint_t]
+*/
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+@abstract     Get the end iterator
+@param  h     Pointer to the hash table [khash_t(name)*]
+@return       The end iterator [khint_t]
+*/
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+@abstract     Get the number of elements in the hash table
+@param  h     Pointer to the hash table [khash_t(name)*]
+@return       Number of elements in the hash table [khint_t]
+*/
+#define kh_size(h) ((h)->size)
+
+/*! @function
+@abstract     Get the number of buckets in the hash table
+@param  h     Pointer to the hash table [khash_t(name)*]
+@return       Number of buckets in the hash table [khint_t]
+*/
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+@abstract     Iterate over the entries in the hash table
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  kvar  Variable to which key will be assigned
+@param  vvar  Variable to which value will be assigned
+@param  code  Block of code to execute
+*/
+/* Every line but the last ends with a continuation backslash, so that the
+ * whole loop belongs to the macro body. */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+    if (!kh_exist(h,__i)) continue; \
+    (kvar) = kh_key(h,__i); \
+    (vvar) = kh_val(h,__i); \
+    code; \
+} }
+
+/*! @function
+@abstract     Iterate over the values in the hash table
+@param  h     Pointer to the hash table [khash_t(name)*]
+@param  vvar  Variable to which value will be assigned
+@param  code  Block of code to execute
+*/
+/* Every line but the last ends with a continuation backslash, so that the
+ * whole loop belongs to the macro body. */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+    if (!kh_exist(h,__i)) continue; \
+    (vvar) = kh_val(h,__i); \
+    code; \
+} }
+
+/* More convenient interfaces */
+
+/*! @function
+@abstract     Instantiate a hash set containing integer keys
+@param  name  Name of the hash table [symbol]
+*/
+#define KHASH_SET_INIT_INT(name) \
+    KHASH_INIT(name, ht_size_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+@abstract     Instantiate a hash map containing integer keys
+@param  name  Name of the hash table [symbol]
+@param  khval_t  Type of values [type]
+*/
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+    KHASH_INIT(name, ht_size_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+@abstract     Instantiate a hash set containing 64-bit integer keys
+@param  name  Name of the hash table [symbol]
+*/
+#define KHASH_SET_INIT_INT64(name) \
+    KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+@abstract     Instantiate a hash map containing 64-bit integer keys
+@param  name  Name of the hash table [symbol]
+@param  khval_t  Type of values [type]
+*/
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+    KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+@abstract     Instantiate a hash set containing const char* keys
+@param  name  Name of the hash table [symbol]
+*/
+#define KHASH_SET_INIT_STR(name) \
+    KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+@abstract     Instantiate a hash map containing const char* keys
+@param  name  Name of the hash table [symbol]
+@param  khval_t  Type of values [type]
+*/
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+    KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+

+ 9 - 3
test/test_graph.c

@@ -94,6 +94,12 @@ static int test_graph_add()
 
     LSUP_graph_add(gr, trp, NUM_TRP);
 
+    // Check each of the NUM_TRP triples that were just added. sizeof(trp)
+    // is a size in bytes, not the number of triples, so it must not be used
+    // as the loop bound.
+    for (int i = 0; i < NUM_TRP; i++) {
+        printf("checking triple #%d... ", i);
+        ASSERT(LSUP_graph_contains(gr, trp + i), "Triple not in graph!");
+        printf("OK.\n");
+    }
+
     _free_triples(trp); // gr takes ownership of data.
 
     EXPECT_INT_EQ(LSUP_graph_capacity(gr), 16);
@@ -107,7 +113,7 @@ static int test_graph_add()
 
 static int test_graph_add_100k()
 {
-    size_t nt = 10000;
+    size_t nt = 100000;
 
     LSUP_Triple *trp;
     CRITICAL(trp = malloc(nt * sizeof(LSUP_Triple)));
@@ -137,8 +143,8 @@ static int test_graph_add_100k()
 
 int graph_tests()
 {
-    RUN(test_graph_heap);
-    //RUN(test_graph_add);
+    //RUN(test_graph_heap);
+    RUN(test_graph_add);
     RUN(test_graph_add_100k);
     return 0;
 }