Selaa lähdekoodia

Move hash functions to separate module.

Stefano Cossu 5 vuotta sitten
vanhempi
commit
d37bdd7edb

+ 1 - 18
lakesuperior/store/ldp_rs/term.pxd

@@ -1,6 +1,5 @@
-from libc.stdint cimport uint64_t
+from lakesuperior.cy_include cimport cytpl as tpl
 
-# cdefs for serialize and deserialize methods
 cdef:
     #unsigned char *pack_data
     unsigned char term_type
@@ -23,19 +22,3 @@ cdef:
     int serialize(term, unsigned char **pack_data, size_t *pack_size) except -1
     deserialize(unsigned char *data, size_t size)
 
-
-# cdefs for hash methods
-DEF _HLEN = 16
-
-ctypedef uint64_t Hash_128[2]
-ctypedef unsigned char Hash[_HLEN]
-
-cdef:
-    uint64_t term_hash_seed1
-    uint64_t term_hash_seed2
-    unsigned char *term_hash_seed
-    size_t SEED_LEN
-    size_t HLEN
-
-    void hash_(
-        const unsigned char *message, size_t message_size, Hash *digest)

+ 0 - 43
lakesuperior/store/ldp_rs/term.pyx

@@ -3,9 +3,7 @@ from rdflib import URIRef, BNode, Literal
 #from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.stdint cimport uint64_t
 from libc.stdlib cimport malloc, free
-from libc.string cimport memcpy
 
-#from lakesuperior.cy_include.cyspookyhash cimport spookyhash_128
 from lakesuperior.cy_include cimport cytpl as tpl
 
 
@@ -16,29 +14,6 @@ DEF LSUP_PK_FMT_ID = b'S(cs)'
 DEF LSUP_PK_FMT_LIT = b'S(csss)'
 
 
-DEF _SEED_LEN = 8
-DEF _HLEN = 16
-
-HLEN = _HLEN
-SEED_LEN = _SEED_LEN
-
-term_hash_seed = b'\xff\xf2Q\xf2j\x0bG\xc1\x8a}\xca\x92\x98^y\x12'
-"""
-Seed for computing the term hash.
-
-This is a 16-byte string that will be split up into two ``uint64``
-numbers to make up the ``spookyhash_128`` seeds.
-"""
-memcpy(&term_hash_seed1, term_hash_seed, SEED_LEN)
-memcpy(&term_hash_seed2, term_hash_seed + SEED_LEN, SEED_LEN)
-
-# We only need one function from spookyhash. No need for a pxd file.
-cdef extern from 'spookyhash_api.h':
-    void spookyhash_128(
-            const void *input, size_t input_size, uint64_t *hash_1,
-            uint64_t *hash_2)
-
-
 cdef int serialize(
         term, unsigned char **pack_data, size_t *pack_size) except -1:
     cdef:
@@ -114,21 +89,3 @@ cdef deserialize(const unsigned char *data, const size_t data_size):
         free(term_lang)
         free(_pk)
         free(fmt)
-
-
-cdef inline void hash_(
-        const unsigned char *message, size_t message_size, Hash *digest):
-    """
-    Get the hash value of a serialized object.
-
-    The hashing algorithm is `SpookyHash
-    <http://burtleburtle.net/bob/hash/spooky.html>`_ which produces 128-bit
-    (16-byte) digests.
-
-    The initial seeds are determined in the application configuration.
-    """
-    cdef Hash_128 seed = [term_hash_seed1, term_hash_seed2]
-
-    spookyhash_128(message, message_size, seed, seed + 1)
-
-    memcpy(digest, seed, sizeof(Hash))

+ 24 - 0
lakesuperior/util/hash.pxd

@@ -0,0 +1,24 @@
+from libc.stdint cimport uint64_t
+
+
+DEF _SEED_LEN = 8 # sizeof(uint64_t)
+DEF _HLEN = _SEED_LEN * 2
+
+cdef enum:
+    SEED_LEN = _SEED_LEN
+    HLEN = _HLEN
+
+ctypedef uint64_t Hash64
+ctypedef uint64_t DoubleHash64[2]
+ctypedef unsigned char Hash128[_HLEN]
+
+cdef:
+    uint64_t term_hash_seed1  # first SEED_LEN bytes of term_hash_seed
+    uint64_t term_hash_seed2  # next SEED_LEN bytes of term_hash_seed
+    unsigned char *term_hash_seed
+
+    Hash128 hash128(
+        const unsigned char *message, size_t message_size)
+    Hash64 hash64(
+        const unsigned char *message, size_t message_size)
+

+ 55 - 0
lakesuperior/util/hash.pyx

@@ -0,0 +1,55 @@
+from libc.stdint cimport uint64_t
+from libc.string cimport memcpy
+
+term_hash_seed = b'\xff\xf2Q\xf2j\x0bG\xc1\x8a}\xca\x92\x98^y\x12'
+"""
+Seed for computing the term hash.
+
+This is a 16-byte string that will be split up into two ``uint64``
+numbers to make up the ``spookyhash_128`` seeds.
+"""
+memcpy(&term_hash_seed1, term_hash_seed, SEED_LEN)
+memcpy(&term_hash_seed2, term_hash_seed + SEED_LEN, SEED_LEN)
+
+# We only need a couple of functions from spookyhash. No need for a pxd file.
+cdef extern from 'spookyhash_api.h':
+    void spookyhash_128(
+            const void *input, size_t input_size, uint64_t *hash_1,
+            uint64_t *hash_2)
+    uint64_t spookyhash_64(const void *input, size_t input_size, uint64_t seed)
+
+
+cdef inline Hash128 hash128(
+        const unsigned char *message, size_t message_size):
+    """
+    Get the hash value of a byte string with a defined size.
+
+    The hashing algorithm is `SpookyHash
+    <http://burtleburtle.net/bob/hash/spooky.html>`_ which produces 128-bit
+    (16-byte) digests.
+
+    The initial seeds are determined in the application configuration.
+
+    :rtype: Hash128
+    """
+    cdef:
+        DoubleHash64 seed = [term_hash_seed1, term_hash_seed2]
+        Hash128 digest
+
+    spookyhash_128(message, message_size, seed, seed + 1)
+
+    # This casts the 2 contiguous uint64_t's into a char pointer.
+    return <Hash128>seed
+
+
+cdef inline Hash64 hash64(
+        const unsigned char *message, size_t message_size):
+    """
+    Get a 64-bit (unsigned long) hash value of a byte string.
+
+    This function also uses SpookyHash. Note that this returns a UInt64 while
+    the 128-bit function returns a char array.
+    """
+    cdef uint64_t seed = term_hash_seed1
+
+    return spookyhash_64(message, message_size, seed)

+ 8 - 1
setup.py

@@ -80,7 +80,6 @@ extensions = [
         'lakesuperior.store.ldp_rs.term',
         [
             path.join(tpl_src_dir, 'tpl.c'),
-            path.join(spookyhash_src_dir, 'spookyhash.c'),
             path.join('lakesuperior', 'store', 'ldp_rs', f'term.{ext}'),
         ],
         include_dirs=include_dirs,
@@ -108,6 +107,14 @@ extensions = [
         extra_compile_args=['-fopenmp'],
         extra_link_args=['-fopenmp']
     ),
+    Extension(
+        'lakesuperior.util.hash',
+        [
+            path.join(spookyhash_src_dir, 'spookyhash.c'),
+            path.join('lakesuperior', 'util', f'hash.{ext}'),
+        ],
+        include_dirs=include_dirs,
+    ),
     Extension(
         'lakesuperior.store.ldp_rs.graph',
         [