Selaa lähdekoodia

Refactor hash module and add hash32 function.

Stefano Cossu 6 vuotta sitten
vanhempi
commit
e347f97929

+ 1 - 1
lakesuperior/store/ldp_rs/lmdb_triplestore.pyx

@@ -19,7 +19,7 @@ from lakesuperior.store.base_lmdb_store cimport (
 from lakesuperior.store.ldp_rs.keyset cimport Keyset
 from lakesuperior.store.ldp_rs.term cimport (
         Buffer, deserialize_to_rdflib, serialize_from_rdflib)
-from lakesuperior.util.hash cimport HLEN, Hash128, hash128
+from lakesuperior.util.hash cimport HLEN_128 as HLEN, Hash128, hash128
 
 
 FIRST_KEY = <bytes>KEY_START * KLEN

+ 12 - 9
lakesuperior/util/hash.pxd

@@ -1,30 +1,33 @@
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
 
 from lakesuperior.store.ldp_rs.term cimport Buffer
 
 
-DEF _SEED_LEN = 8 # sizeof(uint64_t)
-DEF _HLEN = _SEED_LEN * 2
-
 # Seed for computing the term hash.
 #
 # This is a 16-byte string that will be split up into two ``uint64``
 # numbers to make up the ``spookyhash_128`` seeds.
+#
+# TODO This should be made configurable.
 DEF _TERM_HASH_SEED = b'\xff\xf2Q\xf2j\x0bG\xc1\x8a}\xca\x92\x98^y\x12'
 
 cdef enum:
-    SEED_LEN = _SEED_LEN
-    HLEN = _HLEN
+    HLEN_32 = sizeof(uint32_t)
+    HLEN_64 = sizeof(uint64_t)
+    HLEN_128 = sizeof(uint64_t) * 2
 
+ctypedef uint32_t Hash32
 ctypedef uint64_t Hash64
 ctypedef uint64_t DoubleHash64[2]
-ctypedef unsigned char Hash128[_HLEN]
+ctypedef unsigned char Hash128[HLEN_128]
 
 cdef:
-    uint64_t term_hash_seed1, term_hash_seed2
+    uint32_t term_hash_seed32
+    uint64_t term_hash_seed64_1, term_hash_seed64_2
     unsigned char TERM_HASH_SEED[16]
 
-    int hash128(const Buffer *message, Hash128 *hash) except -1
+    int hash32(const Buffer *message, Hash32 *hash) except -1
     int hash64(const Buffer *message, Hash64 *hash) except -1
+    int hash128(const Buffer *message, Hash128 *hash) except -1
 
 TERM_HASH_SEED = _TERM_HASH_SEED

+ 28 - 17
lakesuperior/util/hash.pyx

@@ -4,15 +4,35 @@ from libc.string cimport memcpy
 from lakesuperior.store.ldp_rs.term cimport Buffer
 
 
-memcpy(&term_hash_seed1, TERM_HASH_SEED, SEED_LEN)
-memcpy(&term_hash_seed2, TERM_HASH_SEED + SEED_LEN, SEED_LEN)
+memcpy(&term_hash_seed32, TERM_HASH_SEED, HLEN_32)
+memcpy(&term_hash_seed64_1, TERM_HASH_SEED, HLEN_64)
+memcpy(&term_hash_seed64_2, TERM_HASH_SEED + HLEN_64, HLEN_64)
 
-# We only need a couple of functions from spookyhash. No need for a pxd file.
+# We only need a few basic functions from spookyhash. No need for a pxd file.
 cdef extern from 'spookyhash_api.h':
+    uint32_t spookyhash_32(const void *input, size_t input_size, uint32_t seed)
+    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
     void spookyhash_128(
             const void *input, size_t input_size, uint64_t *hash_1,
             uint64_t *hash_2)
-    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
+
+
+cdef inline int hash32(const Buffer *message, Hash32 *hash) except -1:
+    """
+    Store the 32-bit (unsigned int) SpookyHash of ``message`` in ``hash[0]``.
+    """
+    cdef uint32_t seed = term_hash_seed32  # Native 32-bit seed, no narrowing.
+
+    hash[0] = spookyhash_32(message[0].addr, message[0].sz, seed)
+
+
+cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
+    """
+    Store the 64-bit (unsigned long) SpookyHash of ``message`` in ``hash[0]``.
+    """
+    cdef uint64_t seed = term_hash_seed64_1  # Full 64-bit seed, not the 32-bit one.
+
+    hash[0] = spookyhash_64(message[0].addr, message[0].sz, seed)
 
 
 cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
@@ -23,27 +43,18 @@ cdef inline int hash128(const Buffer *message, Hash128 *hash) except -1:
     <http://burtleburtle.net/bob/hash/spooky.html>`_ which produces 128-bit
     (16-byte) digests.
 
+    Note that this returns a char array while the smaller functions return
+    numeric types (uint, ulong).
+
     The initial seeds are determined in the application configuration.
 
     :rtype: Hash128
     """
     cdef:
-        DoubleHash64 seed = [term_hash_seed1, term_hash_seed2]
+        DoubleHash64 seed = [term_hash_seed64_1, term_hash_seed64_2]
         Hash128 digest
 
     spookyhash_128(message[0].addr, message[0].sz, seed, seed + 1)
 
     # This casts the 2 contiguous uint64_t's into a char[16] pointer.
     hash[0] = <Hash128>seed
-
-
-cdef inline int hash64(const Buffer *message, Hash64 *hash) except -1:
-    """
-    Get a 64-bit (unsigned long) hash value of a byte string.
-
-    This function also uses SpookyHash. Note that this returns a UInt64 while
-    the 128-bit function returns a char array.
-    """
-    cdef uint64_t seed = term_hash_seed1
-
-    hash[0] = spookyhash_64(message[0].addr, message[0].sz, seed)