Browse source code

Merge remote-tracking branch 'ktx/benchmark' into development

Stefano Cossu 5 years ago
parent
commit
2e1453dc4d

+ 28 - 63
docs/performance.rst

@@ -4,38 +4,20 @@ Performance Benchmark Report
 The purpose of this document is to provide very broad performance measurements
 and comparison between Lakesuperior and Fedora/Modeshape implementations.
 
-Lakesuperior v1.0a17 and v1.0a18 were taken into consideration. This is because
-of the extensive reworking of the whole architecture and complete rewrite
-of the storage layer, that led to significant performance gains.
-
 Environment
 -----------
 
 Hardware
 ~~~~~~~~
 
-‘Rather Snappy’ Laptop
-^^^^^^^^^^^^^^^^^^^^^^
-
--  Dell Latitude 7490 Laptop
--  8x Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz
+-  MacBook Pro14,2
+-  1x Intel(R) Core(TM) i5 @ 3.1GHz
 -  16Gb RAM
 -  SSD
--  Arch Linux OS
--  glibc 2.26-11
--  python 3.7.0
+-  OS X 10.13
+-  python 3.7.2
 -  lmdb 0.9.22
 
-The laptop was left alone during the process, but some major applications
-(browser, email client, etc.) were left open.
-
-‘Ole Workhorse’ server
-^^^^^^^^^^^^^^^^^^^^^^
-
--  8x Intel(R) Xeon(R) CPU X5550 @ 2.67GHz
--  16Gb RAM
--  Magnetic drive, XXX RPM
-
 Benchmark script
 ~~~~~~~~~~~~~~~~
 
@@ -45,7 +27,7 @@ The script was run with default values: respectively 10,000 and 100,000
 children under the same parent. PUT and POST requests were tested separately.
 
 The script calculates only the timings used for the PUT or POST requests, not
-counting the time used to generate the graphs.
+counting the time used to generate the random data.
 
 Data Set
 ~~~~~~~~
@@ -101,26 +83,21 @@ IPython console::
 
    In [1]: from lakesuperior import env_setup
    In [2]: from lakesuperior.api import resource as rsrc_api
-   In [3]: %timeit x = rsrc_api.get('/pomegranate').imr
+   In [3]: %timeit x = rsrc_api.get('/pomegranate').imr.as_rdflib
 
 Results
 -------
 
-.. _rather-snappy-laptop-1:
-
-‘Rather Snappy’ Laptop
-~~~~~~~~~~~~~~~~~~~~~~
-
 10K Resources
 ^^^^^^^^^^^^^
 
-=========================  ============  ============  ============  ============  ================
-System                     PUT           Store         GET           SPARQL Query  Py-API retrieval
-=========================  ============  ============  ============  ============  ================
-FCREPO / Modeshape 4.7.5   49ms (100%)   3.7Gb (100%)  6.2s (100%)   N/A           N/A
-Lakesuperior 1.0a17        78ms (159%)   298Mb (8%)    2.8s          0m1.194s      Not measured
-Lakesuperior 1.0a18        62ms (126%)   789Mb (21%)   2.2s          0m2.214s      66ms
-=========================  ============  ============  ============  ============  ================
+===============================  =============  =============  ============  ============  ============
+System                           PUT            POST           Store         GET           SPARQL Query
+===============================  =============  =============  ============  ============  ============
+FCREPO / Modeshape 4.7.5         68ms (100%)    XXms (100%)    3.9Gb (100%)  6.2s (100%)   N/A         
+Lakesuperior 1.0a20 REST API     105ms (154%)   XXXms (XXX%)   298Mb (8%)    2.1s          XXXXXXXs
+Lakesuperior 1.0a20 Python API   53ms (78%)     XXms (XXX%)    789Mb (21%)   381ms         N/A
+===============================  =============  =============  ============  ============  ============
 
 **Notes:**
 
@@ -138,36 +115,24 @@ Lakesuperior 1.0a18        62ms (126%)   789Mb (21%)   2.2s          0m2.214s
 100K Resources
 ^^^^^^^^^^^^^^
 
-=========================  ===============  =============  =============  ===============  ============  ================
-System                     PUT              POST           Store          GET              Query         Py-API retrieval
-=========================  ===============  =============  =============  ===============  ============  ================
-FCREPO / Modeshape 4.7.5   500ms* (100%)    38ms (100%)    13Gb (100%)    2m6.7s (100%)    N/A           N/A
-Lakesuperior 1.0a17        104ms (21%)      104ms (273%)   5.3Gb (40%)    0m17.0s (13%)    0m12.481s     3810ms
-Lakesuperior 1.0a18        79ms (15%)       79ms  (207%)   7.5Gb (58%)    0m14.2s (11%)    0m4.214s**    905ms
-=========================  ===============  =============  =============  ===============  ============  ================
-
-\* POST was stopped at 50K resources. From looking at ingest timings over time
-we can easily infer that ingest time would further increase. This is the
-manifestation of the "many members" issue. The "Store" value is for the PUT
-operation which ran regularly with 100K resources.
-
-\*\* Timing based on a warm cache. The first query timed at 0m22.2s.
+===============================  ===============  ===============  =============  ===============  ==============
+System                           PUT              POST             Store          GET              SPARQL Query  
+===============================  ===============  ===============  =============  ===============  ==============
+FCREPO / Modeshape 4.7.5         500+ms*          65ms (100%)\*\*  12Gb (100%)    3m41s (100%)     N/A           
+Lakesuperior 1.0a20 REST API     104ms (100%)     123ms (189%)     8.7Gb (72%)    30s (14%)        XXXXXXXXs     
+Lakesuperior 1.0a20 Python API   69ms (66%)       XXms  (XXX%)     8.7Gb (72%)    6s (2.7%)        XXXXXXXs\*\*\*
+===============================  ===============  ===============  =============  ===============  ==============
 
-.. _ole-workhorse-server-1:
+\* POST was stopped at 30K resources after the ingest time reached >1s per
+resource. This is the manifestation of the "many members" issue which is
+visible in the graph below. The "Store" value is for the PUT operation which
+ran regularly with 100K resources.
 
-‘Ole Workhorse’ server
-~~~~~~~~~~~~~~~~~~~~~~
-
-10K Resources
-^^^^^^^^^^^^^
+\*\* The POST test with 100K resources was conducted with Fedora 4.7.5 because
+5.0 would not automatically create a pairtree, thereby resulting in the same
+performance as the PUT method.
 
-=========================  ==============  ==============  ==============  ==============  ==================
-System                     PUT             Store           GET             SPARQL Query    Py-API retrieval
-=========================  ==============  ==============  ==============  ==============  ==================
-FCREPO / Modeshape 4.7.5   285ms (100%)    3.7Gb (100%)    9.6s (100%)     N/A             N/A
-Lakesuperior 1.0a17        446ms           298Mb           5.6s (58%)      0m1.194s        Not measured
-Lakesuperior 1.0a18        Not measured    Not measured    Not measured    Not measured    Not measured
-=========================  ==============  ==============  ==============  ==============  ==================
+\*\*\* Timing based on a warm cache. The first query timed at 0m22.2s.
 
 Conclusions
 -----------

+ 8 - 2
lakesuperior/api/admin.py

@@ -27,8 +27,14 @@ def stats():
     """
     import lakesuperior.env_setup
     with env.app_globals.rdf_store.txn_ctx():
-        repo_stats = {'rsrc_stats': env.app_globals.rdfly.count_rsrc()}
-        repo_stats['store_stats'] = env.app_globals.rdf_store.stats()
+        repo_stats = {
+            'rsrc_stats': env.app_globals.rdfly.count_rsrc(),
+            'store_stats': env.app_globals.rdf_store.stats(),
+            'nonrdf_stats': {
+                'ct': env.app_globals.nonrdfly.file_ct,
+                'size': env.app_globals.nonrdfly.store_size,
+            },
+        }
 
     return repo_stats
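
For illustration, a minimal sketch of consuming the extended stats dict (the key names are taken from this commit and the stats.html template below; assumes a configured Lakesuperior environment):

    # Sketch only; assumes a configured Lakesuperior environment.
    from lakesuperior.api import admin as admin_api
    from lakesuperior.util.toolbox import fsize_fmt

    stats = admin_api.stats()
    print('Resources:', stats['rsrc_stats']['main'])
    print('Triples:', stats['store_stats']['num_triples'])
    print('Binaries:', stats['nonrdf_stats']['ct'],
          fsize_fmt(stats['nonrdf_stats']['size']))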
 

+ 1 - 19
lakesuperior/endpoints/admin.py

@@ -5,6 +5,7 @@ from flask import Blueprint, jsonify, render_template
 from lakesuperior.api import admin as admin_api
 from lakesuperior.exceptions import (
     ChecksumValidationError, ResourceNotExistsError, TombstoneError)
+from lakesuperior.util.toolbox import fsize_fmt
 
 
 # Admin interface and REST API.
@@ -18,25 +19,6 @@ def stats():
     """
     Get repository statistics.
     """
-    def fsize_fmt(num, suffix='b'):
-        """
-        Format an integer into 1024-block file size format.
-
-        Adapted from Python 2 code on
-        https://stackoverflow.com/a/1094933/3758232
-
-        :param int num: Size value in bytes.
-        :param str suffix: Suffix label (defaults to ``b``).
-
-        :rtype: str
-        :return: Formatted size to largest fitting unit.
-        """
-        for unit in ['','K','M','G','T','P','E','Z']:
-            if abs(num) < 1024.0:
-                return f'{num:3.1f} {unit}{suffix}'
-            num /= 1024.0
-        return f'{num:.1f} Y{suffix}'
-
     repo_stats = admin_api.stats()
 
     return render_template(

+ 9 - 8
lakesuperior/endpoints/ldp.py

@@ -29,7 +29,8 @@ from lakesuperior.model.ldp.ldp_factory import LdpFactory
 from lakesuperior.model.ldp.ldp_nr import LdpNr
 from lakesuperior.model.ldp.ldp_rs import LdpRs
 from lakesuperior.model.ldp.ldpr import Ldpr
-from lakesuperior.toolbox import Toolbox
+from lakesuperior.util import toolbox
+from lakesuperior.util.toolbox import RequestUtils
 
 
 DEFAULT_RDF_MIMETYPE = 'text/turtle'
@@ -112,7 +113,7 @@ def log_request_start():
 
 @ldp.before_request
 def instantiate_req_vars():
-    g.tbox = Toolbox()
+    g.tbox = RequestUtils()
 
 
 @ldp.after_request
@@ -161,7 +162,7 @@ def get_resource(uid, out_fmt=None):
     # Then, business as usual.
     # Evaluate which representation is requested.
     if 'prefer' in request.headers:
-        prefer = g.tbox.parse_rfc7240(request.headers['prefer'])
+        prefer = toolbox.parse_rfc7240(request.headers['prefer'])
         logger.debug('Parsed Prefer header: {}'.format(pformat(prefer)))
         if 'return' in prefer:
             repr_options = parse_repr_options(prefer['return'])
@@ -178,7 +179,7 @@ def get_resource(uid, out_fmt=None):
         out_headers.update(_headers_from_metadata(rsrc, out_fmt))
         uri = g.tbox.uid_to_uri(uid)
 
-# RDF output.
+        # RDF output.
         if out_fmt == 'rdf':
             if locals().get('rdf_mimetype', None) is None:
                 rdf_mimetype = DEFAULT_RDF_MIMETYPE
@@ -187,7 +188,7 @@ def get_resource(uid, out_fmt=None):
             return _negotiate_content(
                     ggr, rdf_mimetype, out_headers, uid=uid, uri=uri)
 
-# Datastream.
+        # Datastream.
         else:
             if not getattr(rsrc, 'local_path', False):
                 return ('{} has no binary content.'.format(rsrc.uid), 404)
@@ -445,7 +446,7 @@ def delete_resource(uid):
     headers = std_headers.copy()
 
     if 'prefer' in request.headers:
-        prefer = g.tbox.parse_rfc7240(request.headers['prefer'])
+        prefer = toolbox.parse_rfc7240(request.headers['prefer'])
         leave_tstone = 'no-tombstone' not in prefer
     else:
         leave_tstone = True
@@ -608,13 +609,13 @@ def set_post_put_params():
     """
     handling = 'strict'
     if 'prefer' in request.headers:
-        prefer = g.tbox.parse_rfc7240(request.headers['prefer'])
+        prefer = toolbox.parse_rfc7240(request.headers['prefer'])
         logger.debug('Parsed Prefer header: {}'.format(prefer))
         if 'handling' in prefer:
             handling = prefer['handling']['value']
 
     try:
-        disposition = g.tbox.parse_rfc7240(
+        disposition = toolbox.parse_rfc7240(
                 request.headers['content-disposition'])
     except KeyError:
         disposition = None

+ 0 - 1
lakesuperior/endpoints/query.py

@@ -10,7 +10,6 @@ from lakesuperior import env
 from lakesuperior.api import query as query_api
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
-from lakesuperior.toolbox import Toolbox
 
 # Query endpoint. raw SPARQL queries exposing the underlying layout can be made
 # available. Also convenience methods that allow simple lookups based on simple

+ 9 - 30
lakesuperior/endpoints/templates/stats.html

@@ -4,38 +4,17 @@
 {% block title %}System Statistics{% endblock %}
 {% block content %}
     <h2>Repository</h2>
-    <p>Current resources: <strong>{{ '{:,}'.format(rsrc_stats['main']) }}</strong></p>
+    <p>Current resource count: <strong>{{ '{:,}'.format(rsrc_stats['main']) }}</strong></p>
     <p>Historic snapshots: <strong>{{ '{:,}'.format(rsrc_stats['hist']) }}</strong></p>
-    <p>Triples: <strong>{{ '{:,}'.format(store_stats['num_triples']) }}</strong></p>
-    <h2>LMDB Store</h2>
-    <p>Overall size on disk: <strong>{{ fsize_fmt(
-        store_stats['idx_db_size'] + store_stats['data_db_size']
-    )}}</strong></p>
-    <h3>Data</h3>
-    <p>Size on disk: <strong>{{ fsize_fmt(store_stats['data_db_size']) }}</strong></p>
+    <p>Total triples: <strong>{{ '{:,}'.format(store_stats['num_triples']) }}</strong></p>
+    <h2>Binary Store</h2>
+    <p>Current file count: <strong>{{ '{:,}'.format(nonrdf_stats['ct']) }}</strong></p>
+    <p>Current size on disk: <strong>{{ fsize_fmt(nonrdf_stats['size']) }}</strong></p>
+    <h2>RDF Store</h2>
+    <p>Overall size on disk: <strong>{{ fsize_fmt(store_stats['store_size']) }}</strong></p>
+    <h3>LMDB Store Details</h3>
     <p>Refer to the <a href="http://lmdb.readthedocs.io/en/release/#lmdb.Environment.stat">LMDB API documentation</a> for details about the parameters below.</p>
-    {% for db_label, db in store_stats['data_db_stats'].items() %}
-    <h4>{{ db_label }}</h4>
-    <table class="table table-striped">
-        <thead>
-            <tr>
-                <td>Property</td>
-                <td>Value</td>
-            </tr>
-        </thead>
-        <tbody>
-        {% for p, v in db.items() | sort %}
-            <tr>
-                <td>{{ p }}</td>
-                <td>{{ v }}</td>
-            </tr>
-        {% endfor %}
-        </tbody>
-    </table>
-    {% endfor %}
-    <h3>Indices</h3>
-    <p>Size on disk: <strong>{{ fsize_fmt(store_stats['idx_db_size']) }}</strong></p>
-    {% for db_label, db in store_stats['idx_db_stats'].items() %}
+    {% for db_label, db in store_stats['db_stats'].items() %}
     <h4>{{ db_label }}</h4>
     <table class="table table-striped">
         <thead>

+ 2 - 3
lakesuperior/model/ldp/ldpr.py

@@ -29,7 +29,7 @@ from lakesuperior.exceptions import (
     ServerManagedTermError, TombstoneError)
 from lakesuperior.model.rdf.graph import Graph
 from lakesuperior.store.ldp_rs.rsrc_centric_layout import VERS_CONT_LABEL
-from lakesuperior.toolbox import Toolbox
+from lakesuperior.util.toolbox import replace_term_domain
 
 
 rdfly = env.app_globals.rdfly
@@ -165,7 +165,6 @@ class Ldpr(metaclass=ABCMeta):
         self.uri = nsc['fcres'][uid]
         # @FIXME Not ideal, should separate app-context dependent functions in
         # a different toolbox.
-        self.tbox = Toolbox()
 
         self.provided_imr = provided_imr
 
@@ -552,7 +551,7 @@ class Ldpr(metaclass=ABCMeta):
                 pass
             else:
                 ver_add_gr.add((
-                    self.tbox.replace_term_domain(t[0], self.uri, ver_uri),
+                    replace_term_domain(t[0], self.uri, ver_uri),
                     t[1], t[2]))
 
         rdfly.modify_rsrc(ver_uid, add_trp=ver_add_gr)

+ 2 - 2
lakesuperior/model/structures/keyset.pxd

@@ -22,9 +22,9 @@ cdef class Keyset:
         size_t size(self)
         size_t tell(self)
         bint get_next(self, TripleKey* item)
-        void add(self, const TripleKey* val, bint check_dup=*) except *
+        int add(self, const TripleKey* val, bint check_dup=*, bint check_cap=*) except -1
         void remove(self, const TripleKey* val) except *
-        bint contains(self, const TripleKey* val) nogil
+        bint contains(self, const TripleKey* val)
         Keyset copy(self)
         Keyset sparse_copy(self)
         void resize(self, size_t size=*) except *

+ 11 - 7
lakesuperior/model/structures/keyset.pyx

@@ -26,7 +26,7 @@ cdef class Keyset:
     data block, so e.g. bulk removal and intersection are much more efficient
     than individual record operations.
     """
-    def __cinit__(self, size_t capacity=0, expand_ratio=.5):
+    def __cinit__(self, size_t capacity=0, expand_ratio=.75):
         """
         Initialize and allocate memory for the data set.
 
@@ -98,15 +98,18 @@ cdef class Keyset:
         return True
 
 
-    cdef void add(self, const TripleKey* val, bint check_dup=False) except *:
+    cdef inline int add(
+            self, const TripleKey* val, bint check_dup=False,
+            bint check_cap=True
+    ) except -1:
         """
         Add a triple key to the array.
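 
         Return 0 on success, or 1 if ``check_dup`` is set and the key is a
         duplicate; a -1 return signals an exception to Cython.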
         """
         # Check for deleted triples and optionally duplicates.
-        if val[0] == NULL_TRP or (check_dup and self.contains(val)):
-            return
+        if check_dup and self.contains(val):
+            return 1
 
-        if self.free_i >= self.capacity:
+        if check_cap and self.free_i >= self.capacity:
             if self.expand_ratio > 0:
                 # In some edge cases, a very small ratio may round down to a
                 # zero increase, so the baseline increase is 1 element.
@@ -115,9 +118,10 @@ cdef class Keyset:
                 raise MemoryError('No space left in key set.')
 
         self.data[self.free_i] = val[0]
-
         self.free_i += 1
 
+        return 0
+
 
     cdef void remove(self, const TripleKey* val) except *:
         """
@@ -138,7 +142,7 @@ cdef class Keyset:
                 return
 
 
-    cdef bint contains(self, const TripleKey* val) nogil:
+    cdef bint contains(self, const TripleKey* val):
         """
         Whether a value exists in the set.
         """

+ 11 - 8
lakesuperior/store/base_lmdb_store.pxd

@@ -1,5 +1,7 @@
 from lakesuperior.cy_include cimport cylmdb as lmdb
 
+ctypedef char DbLabel[8]
+
 cdef:
     int rc
     size_t i
@@ -29,28 +31,29 @@ cdef class BaseLmdbStore:
         void _txn_commit(self) except *
         void _txn_abort(self) except *
         inline bint _key_exists(
-            self, unsigned char *key, unsigned char klen,
-            unsigned char *dblabel=*) except -1
+            self, unsigned char *key, unsigned char klen, DbLabel dblabel=*
+        ) except -1
 
         size_t _txn_id(self) except -1
         lmdb.MDB_cursor *_cur_open(
-                self, unsigned char *dblabel=*, lmdb.MDB_txn *txn=*) except *
+            self, DbLabel dblabel=*, lmdb.MDB_txn *txn=*
+        ) except *
 
         lmdb.MDB_dbi get_dbi(
-                self, unsigned char *dblabel=*, lmdb.MDB_txn *txn=*)
+                self, DbLabel dblabel=*, lmdb.MDB_txn *txn=*)
 
         void _put(
                 self, unsigned char *key, size_t key_size, unsigned char *data,
-                size_t data_size, unsigned char *dblabel=*,
+                size_t data_size, DbLabel dblabel=*,
                 lmdb.MDB_txn *txn=*, unsigned int flags=*) except *
 
         void _get_data(
                 self, unsigned char *key, size_t klen, lmdb.MDB_val *rv,
-                unsigned char *dblabel=*) except *
+                DbLabel dblabel=*) except *
 
         void _delete(
                 self, unsigned char *key, size_t klen,
-                unsigned char *dblabel=*) except *
+                DbLabel dblabel=*) except *
 
         dict _stats(self)
         #int _reader_list_callback(self, const unsigned char *msg, void *str_)
@@ -59,7 +62,7 @@ cdef class BaseLmdbStore:
     cpdef void destroy(self, _path=*) except *
     #cpdef get_dup_data(self, unsigned char *key, db=*)
     #cpdef get_all_pairs(self, db=*)
-    cpdef bytes get_data(self, key, dblabel=*)
+    cpdef bytes get_data(self, key, DbLabel dblabel=*)
     cpdef dict stats(self)
     cpdef int txn_id(self)
     #cpdef str reader_list(self)
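
The new ``DbLabel`` type is an 8-byte char array, so every database label in this commit is padded with underscores to exactly 7 characters plus the terminating NUL. A sketch of the convention (the helper is hypothetical; the codebase hard-codes the padded constants):

    # Hypothetical helper illustrating the 7-char + NUL padding convention.
    def pad_label(label: str, width: int = 7) -> bytes:
        assert len(label) <= width
        return label.ljust(width, '_').encode()

    assert pad_label('t:st') == b't:st___'
    assert pad_label('spo:c') == b'spo:c__'
    assert pad_label('pfx:ns') == b'pfx:ns_'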

+ 22 - 21
lakesuperior/store/base_lmdb_store.pyx

@@ -276,7 +276,7 @@ cdef class BaseLmdbStore:
                 for i, dblabel in enumerate(self.dbi_labels):
                     flags = self.dbi_flags.get(dblabel, 0) | create_flag
                     _check(lmdb.mdb_dbi_open(
-                            txn, dblabel.encode(), flags, self.dbis + i))
+                            txn, dblabel, flags, self.dbis + i))
                     dbi = self.dbis[i]
                     logger.debug(f'Created DB {dblabel}: {dbi}')
                     # Open and close cursor to initialize the memory slot.
@@ -450,15 +450,14 @@ cdef class BaseLmdbStore:
         """
         if new_txn is True:
             with self.txn_ctx():
-                return self._key_exists(
-                        key, len(key), dblabel=dblabel.encode())
+                return self._key_exists(key, len(key), dblabel=dblabel)
         else:
-            return self._key_exists(key, len(key), dblabel=dblabel.encode())
+            return self._key_exists(key, len(key), dblabel=dblabel)
 
 
     cdef inline bint _key_exists(
             self, unsigned char *key, unsigned char klen,
-            unsigned char *dblabel=b'') except -1:
+            DbLabel dblabel=b'') except -1:
         """
         Return whether a key exists in a database.
 
@@ -485,13 +484,14 @@ cdef class BaseLmdbStore:
         Put one key/value pair (Python-facing method).
         """
         self._put(
-                key, len(key), data, len(data), dblabel=dblabel.encode(),
-                txn=self.txn, flags=flags)
+            key, len(key), data, len(data), dblabel=dblabel,
+            txn=self.txn, flags=flags
+        )
 
 
     cdef void _put(
             self, unsigned char *key, size_t key_size, unsigned char *data,
-            size_t data_size, unsigned char *dblabel='',
+            size_t data_size, DbLabel dblabel='',
             lmdb.MDB_txn *txn=NULL, unsigned int flags=0) except *:
         """
         Put one key/value pair.
@@ -511,13 +511,13 @@ cdef class BaseLmdbStore:
                 key[: key_size], data[: data_size]))
 
 
-    cpdef bytes get_data(self, key, dblabel=''):
+    cpdef bytes get_data(self, key, DbLabel dblabel=''):
         """
         Get a single value (non-dup) for a key (Python-facing method).
         """
         cdef lmdb.MDB_val rv
         try:
-            self._get_data(key, len(key), &rv, dblabel=dblabel.encode())
+            self._get_data(key, len(key), &rv, dblabel=dblabel)
 
             return (<unsigned char *>rv.mv_data)[: rv.mv_size]
         except KeyNotFoundError:
@@ -526,7 +526,7 @@ cdef class BaseLmdbStore:
 
     cdef void _get_data(
             self, unsigned char *key, size_t klen, lmdb.MDB_val *rv,
-            unsigned char *dblabel='') except *:
+            DbLabel dblabel='') except *:
         """
         Get a single value (non-dup) for a key.
         """
@@ -545,12 +545,12 @@ cdef class BaseLmdbStore:
         """
         Delete one single value by key. Python-facing method.
         """
-        self._delete(key, len(key), dblabel.encode())
+        self._delete(key, len(key), dblabel)
 
 
     cdef void _delete(
             self, unsigned char *key, size_t klen,
-            unsigned char *dblabel=b'') except *:
+            DbLabel dblabel=b'') except *:
         """
         Delete one single value by key from a non-dup database.
 
@@ -588,13 +588,13 @@ cdef class BaseLmdbStore:
                 lmdb.mdb_stat(self.txn, self.dbis[i], &stat),
                'Error getting database stats: {}')
             entries = stat.ms_entries
-            db_stats[dblabel.encode()] = <dict>stat
+            db_stats[dblabel] = <dict>stat
 
         return {
             'env_stats': env_stats,
             'env_size': os.stat(self.env_path).st_size,
             'db_stats': {
-                db_label: db_stats[db_label.encode()]
+                db_label: db_stats[db_label]
                 for db_label in self.dbi_labels
             },
         }
@@ -700,7 +700,7 @@ cdef class BaseLmdbStore:
 
 
     cdef lmdb.MDB_dbi get_dbi(
-            self, unsigned char *dblabel=NULL, lmdb.MDB_txn *txn=NULL):
+            self, DbLabel dblabel=NULL, lmdb.MDB_txn *txn=NULL):
         """
         Return a DB handle by database name.
         """
@@ -712,8 +712,9 @@ cdef class BaseLmdbStore:
         if dblabel is NULL:
             logger.debug('Getting DBI without label.')
         dbidx = (
-                0 if dblabel is NULL
-                else self.dbi_labels.index(dblabel.decode()))
+            0 if dblabel is NULL
+            else self.dbi_labels.index(dblabel)
+        )
         #logger.debug(
         #        f'Got DBI {self.dbis[dbidx]} with label {dblabel} '
         #        f'and index #{dbidx}')
@@ -722,7 +723,7 @@ cdef class BaseLmdbStore:
 
 
     cdef lmdb.MDB_cursor *_cur_open(
-            self, unsigned char *dblabel=NULL, lmdb.MDB_txn *txn=NULL) except *:
+            self, DbLabel dblabel=NULL, lmdb.MDB_txn *txn=NULL) except *:
         cdef:
             lmdb.MDB_dbi dbi
 
@@ -731,7 +732,7 @@ cdef class BaseLmdbStore:
 
         dbi = self.get_dbi(dblabel, txn=txn)
 
-        logger.debug(f'Opening cursor for DB {dblabel} (DBI {dbi})...')
+        #logger.debug(f'Opening cursor for DB {dblabel} (DBI {dbi})...')
         #try:
         #    # FIXME Either reuse the cursor, if it works, or remove this code.
         #    _check(lmdb.mdb_cursor_renew(txn, self.curs[dbi]))
@@ -744,7 +745,7 @@ cdef class BaseLmdbStore:
         _check(
                 lmdb.mdb_cursor_open(txn, dbi, self.curs + dbi),
                 f'Error opening cursor: {dblabel}')
-        logger.debug('...opened @ {:x}.'.format(<unsigned long>self.curs[dbi]))
+        #logger.debug('...opened @ {:x}.'.format(<unsigned long>self.curs[dbi]))
 
         return self.curs[dbi]
 

+ 13 - 0
lakesuperior/store/ldp_nr/base_non_rdf_layout.py

@@ -1,7 +1,9 @@
 import logging
+import os
 
 from abc import ABCMeta, abstractmethod
 
+from lakesuperior.util.toolbox import get_tree_size
 
 logger = logging.getLogger(__name__)
 
@@ -23,6 +25,17 @@ class BaseNonRdfLayout(metaclass=ABCMeta):
         self.root = config['location']
 
 
+    @property
+    def store_size(self):
+        """Calculated the store size on disk."""
+        return get_tree_size(self.root)
+
+
+    @property
+    def file_ct(self):
+        """Calculated the store size on disk."""
+        return sum([len(files) for r, d, files in os.walk(self.root)])
+
     ## INTERFACE METHODS ##
 
     @abstractmethod

+ 8 - 6
lakesuperior/store/ldp_rs/lmdb_store.py

@@ -143,13 +143,15 @@ class LmdbStore(LmdbTriplestore, Store):
         prefix = prefix.encode()
         namespace = namespace.encode()
         if self.is_txn_rw:
-            self.put(prefix, namespace, 'pfx:ns')
-            self.put(namespace, prefix, 'ns:pfx')
+            # FIXME DB labels should be constants but there are problems
+            # importing them from the Cython module.
+            self.put(prefix, namespace, b'pfx:ns_')
+            self.put(namespace, prefix, b'ns:pfx_')
         else:
             #logger.debug('Opening RW transaction.')
             with self.txn_ctx(write=True) as wtxn:
-                self.put(prefix, namespace, 'pfx:ns')
-                self.put(namespace, prefix, 'ns:pfx')
+                self.put(prefix, namespace, b'pfx:ns_')
+                self.put(namespace, prefix, b'ns:pfx_')
 
 
     def namespace(self, prefix):
@@ -157,7 +159,7 @@ class LmdbStore(LmdbTriplestore, Store):
         Get the namespace for a prefix.
         :param str prefix: Namespace prefix.
         """
-        ns = self.get_data(prefix.encode(), 'pfx:ns')
+        ns = self.get_data(prefix.encode(), b'pfx:ns_')
 
         return Namespace(ns.decode()) if ns is not None else None
 
@@ -173,7 +175,7 @@ class LmdbStore(LmdbTriplestore, Store):
 
         :rtype: str or None
         """
-        prefix = self.get_data(str(namespace).encode(), 'ns:pfx')
+        prefix = self.get_data(str(namespace).encode(), b'ns:pfx_')
 
         return prefix.decode() if prefix is not None else None
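
A sketch of the prefix/namespace round trip these methods implement (illustrative only; assumes an ``LmdbStore`` instance ``store``, and the lookup method names follow the rdflib ``Store`` API):

    # Illustrative only; store setup is elided.
    from rdflib import Namespace

    ns = Namespace('http://purl.org/dc/elements/1.1/')
    store.bind('dc', ns)  # writes both the pfx:ns_ and ns:pfx_ entries
    with store.txn_ctx():
        assert store.namespace('dc') == ns
        assert store.prefix(str(ns)) == 'dc'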
 

+ 4 - 2
lakesuperior/store/ldp_rs/lmdb_triplestore.pxd

@@ -4,7 +4,7 @@ cimport lakesuperior.cy_include.cylmdb as lmdb
 from lakesuperior.model.base cimport Key, DoubleKey, TripleKey, Buffer
 from lakesuperior.model.rdf.graph cimport Graph
 from lakesuperior.model.structures.keyset cimport Keyset
-from lakesuperior.store.base_lmdb_store cimport BaseLmdbStore
+from lakesuperior.store.base_lmdb_store cimport DbLabel, BaseLmdbStore
 
 cdef:
     enum:
@@ -14,10 +14,12 @@ cdef:
     unsigned char lookup_rank[3]
     unsigned char lookup_ordering[3][3]
     unsigned char lookup_ordering_2bound[3][3]
+    char lookup_indices[6][8] # Can't use DbLabel[6] here...
 
 
 
 cdef class LmdbTriplestore(BaseLmdbStore):
+
     cpdef dict stats(self)
     cpdef size_t _len(self, context=*) except -1
     cpdef void add(self, triple, context=*, quoted=*) except *
@@ -41,5 +43,5 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         void all_contexts(self, Key** ctx, size_t* sz, triple=*) except *
         Key _append(
                 self, Buffer *value,
-                unsigned char *dblabel=*, lmdb.MDB_txn *txn=*,
+                DbLabel dblabel=*, lmdb.MDB_txn *txn=*,
                 unsigned int flags=*) except? 0

+ 94 - 78
lakesuperior/store/ldp_rs/lmdb_triplestore.pyx

@@ -8,6 +8,7 @@ from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as RDFLIB_DEFAULT_GRAPH_URI
 
 from lakesuperior.store.base_lmdb_store import (
         KeyExistsError, KeyNotFoundError, LmdbError)
+from lakesuperior.util.toolbox import get_tree_size
 
 from libc.stdlib cimport malloc, free
 
@@ -47,6 +48,32 @@ INT_DUP_MASK = (
     | LSUP_REVERSEKEY | LSUP_REVERSEDUP
 )
 
+cdef:
+    DbLabel DB_T_ST = 't:st___',
+    # Joined triple keys to context key
+    DbLabel DB_SPO_C = 'spo:c__',
+    # This has empty values and is used to keep track of empty contexts.
+    DbLabel DB_C_ = 'c:_____',
+    # Prefix to namespace
+    DbLabel DB_PFX_NS = 'pfx:ns_',
+
+    # Indices
+    # Namespace to prefix
+    DbLabel DB_NS_PFX = 'ns:pfx_',
+    # Term hash to triple key
+    DbLabel DB_TH_T = 'th:t___',
+    # 1-bound lookups
+    DbLabel DB_S_PO = 's:po___',
+    DbLabel DB_P_SO = 'p:so___',
+    DbLabel DB_O_SP = 'o:sp___',
+    # 2-bound lookups
+    DbLabel DB_PO_S = 'po:s___',
+    DbLabel DB_SO_P = 'so:p___',
+    DbLabel DB_SP_O = 'sp:o___',
+    # Context lookup
+    DbLabel DB_C_SPO = 'c:spo__',
+
+
 lookup_rank = [0, 2, 1]
 """
 Order in which keys are looked up if two terms are bound.
@@ -56,10 +83,8 @@ looked up first.
 0 = s:po
 1 = p:so
 2 = o:sp
-
-If we want to get fancy, this can be rebalanced from time to time by
-looking up the number of keys in (s:po, p:so, o:sp).
 """
+
 lookup_ordering = [
     [0, 1, 2], # spo
     [1, 0, 2], # pso
@@ -71,6 +96,15 @@ lookup_ordering_2bound = [
     [0, 1, 2], # sp:o
 ]
 
+lookup_indices = [
+    DB_S_PO,
+    DB_P_SO,
+    DB_O_SP,
+    DB_PO_S,
+    DB_SO_P,
+    DB_SP_O,
+]
+
 
 logger = logging.getLogger(__name__)
 
@@ -88,51 +122,32 @@ cdef class LmdbTriplestore(BaseLmdbStore):
     """
 
     dbi_labels = [
-        # Main data
-        # Term key to serialized term content
-        't:st',
-        # Joined triple keys to context key
-        'spo:c',
-        # This has empty values and is used to keep track of empty contexts.
-        'c:',
-        # Prefix to namespace
-        'pfx:ns',
-
-        # Indices
-        # Namespace to prefix
-        'ns:pfx',
-        # Term hash to triple key
-        'th:t',
-        # Lookups
-        's:po',
-        'p:so',
-        'o:sp',
-        'po:s',
-        'so:p',
-        'sp:o',
-        'c:spo',
-    ]
-
-    lookup_indices = [
-        b's:po',
-        b'p:so',
-        b'o:sp',
-        b'po:s',
-        b'so:p',
-        b'sp:o',
+        DB_T_ST,
+        DB_SPO_C,
+        DB_C_,
+        DB_PFX_NS,
+        DB_NS_PFX,
+        DB_TH_T,
+        DB_S_PO,
+        DB_P_SO,
+        DB_O_SP,
+        DB_PO_S,
+        DB_SO_P,
+        DB_SP_O,
+        DB_C_SPO,
     ]
 
     dbi_flags = {
-        'c': INT_KEY_MASK,
-        't:st': INT_KEY_MASK,
-        's:po': INT_DUP_KEY_MASK,
-        'p:so': INT_DUP_KEY_MASK,
-        'o:sp': INT_DUP_KEY_MASK,
-        'po:s': INT_DUP_MASK,
-        'so:p': INT_DUP_MASK,
-        'sp:o': INT_DUP_MASK,
-        'c:spo': INT_DUP_KEY_MASK,
-        'spo:c': INT_DUP_MASK,
+        DB_C_: INT_KEY_MASK,
+        DB_T_ST: INT_KEY_MASK,
+        DB_S_PO: INT_DUP_KEY_MASK,
+        DB_P_SO: INT_DUP_KEY_MASK,
+        DB_O_SP: INT_DUP_KEY_MASK,
+        DB_PO_S: INT_DUP_MASK,
+        DB_SO_P: INT_DUP_MASK,
+        DB_SP_O: INT_DUP_MASK,
+        DB_C_SPO: INT_DUP_KEY_MASK,
+        DB_SPO_C: INT_DUP_MASK,
     }
     logger.debug(f'DBI flags: {dbi_flags}')
 
@@ -149,7 +164,8 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         """
         Gather statistics about the database."""
         st = self._stats()
-        st['num_triples'] = st['db_stats']['spo:c']['ms_entries']
+        st['num_triples'] = st['db_stats'][DB_SPO_C]['ms_entries']
+        st['store_size'] = get_tree_size(self.env_path)
 
         return st
 
@@ -171,7 +187,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             key_v.mv_data = &ck
             key_v.mv_size = KLEN
 
-            cur = self._cur_open('c:spo')
+            cur = self._cur_open(DB_C_SPO)
             try:
                 _check(lmdb.mdb_cursor_get(
                         cur, &key_v, NULL, lmdb.MDB_SET))
@@ -214,7 +230,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             c = RDFLIB_DEFAULT_GRAPH_URI
 
         s, p, o = triple
-        icur = self._cur_open('th:t')
+        icur = self._cur_open(DB_TH_T)
         try:
             for i, term_obj in enumerate((s, p, o, c)):
                 serialize_from_rdflib(term_obj, &pk_t)
@@ -223,13 +239,13 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                     key_v.mv_data = thash
                     key_v.mv_size = HLEN
                     _check(lmdb.mdb_get(
-                            self.txn, self.get_dbi('th:t'), &key_v, &data_v))
+                            self.txn, self.get_dbi(DB_TH_T), &key_v, &data_v))
                     spock[i] = (<Key*>data_v.mv_data)[0]
                 except KeyNotFoundError:
                     # If term_obj is not found, add it...
                     logger.debug('Hash {} not found. Adding to DB.'.format(
                             thash[: HLEN]))
-                    spock[i] = self._append(&pk_t, dblabel=b't:st')
+                    spock[i] = self._append(&pk_t, dblabel=DB_T_ST)
 
                     # ...and index it.
                     key_v.mv_data = thash
@@ -251,21 +267,21 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
         try:
             _check(lmdb.mdb_put(
-                self.txn, self.get_dbi('c:'), &c_v, &null_v,
+                self.txn, self.get_dbi(DB_C_), &c_v, &null_v,
                 lmdb.MDB_NOOVERWRITE))
         except KeyExistsError:
             pass
         try:
             # Add triple:context association.
             _check(lmdb.mdb_put(
-                self.txn, self.get_dbi('spo:c'), &spo_v, &c_v,
+                self.txn, self.get_dbi(DB_SPO_C), &spo_v, &c_v,
                 lmdb.MDB_NODUPDATA))
         except KeyExistsError:
             pass
         try:
             # Index context:triple association.
             _check(lmdb.mdb_put(
-                self.txn, self.get_dbi('c:spo'), &c_v, &spo_v,
+                self.txn, self.get_dbi(DB_C_SPO), &c_v, &spo_v,
                 lmdb.MDB_NODUPDATA))
         except KeyExistsError:
             pass
@@ -291,7 +307,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         c = self._normalize_context(c)
 
         ck = self.to_key(c)
-        if not self._key_exists(<unsigned char*>&ck, KLEN, b'c:'):
+        if not self._key_exists(<unsigned char*>&ck, KLEN, DB_C_):
             # Insert context term if not existing.
             if self.is_txn_rw:
                 #logger.debug('Working in existing RW transaction.')
@@ -308,7 +324,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                 data_v.mv_data = &ck # Whatever, length is zero anyways
                 data_v.mv_size = 0
                 _check(lmdb.mdb_put(
-                    _txn, self.get_dbi(b'c:'), &key_v, &data_v, 0
+                    _txn, self.get_dbi(DB_C_), &key_v, &data_v, 0
                 ))
                 if not self.is_txn_rw:
                     _check(lmdb.mdb_txn_commit(_txn))
@@ -338,8 +354,8 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         # Get the matching pattern.
         match_set = self.triple_keys(triple_pattern, context)
 
-        dcur = self._cur_open('spo:c')
-        icur = self._cur_open('c:spo')
+        dcur = self._cur_open(DB_SPO_C)
+        icur = self._cur_open(DB_C_SPO)
 
         try:
             spok_v.mv_size = TRP_KLEN
@@ -466,8 +482,8 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             logger.debug(f'Add {spok[0]} to indices.')
 
         while i < 3:
-            cur1 = self._cur_open(self.lookup_indices[i]) # s:po, p:so, o:sp
-            cur2 = self._cur_open(self.lookup_indices[i + 3])# po:s, so:p, sp:o
+            cur1 = self._cur_open(lookup_indices[i]) # s:po, p:so, o:sp
+            cur2 = self._cur_open(lookup_indices[i + 3])# po:s, so:p, sp:o
             try:
                 key_v.mv_data = spok + i
                 dbl_key_v.mv_data = dbl_keys[i]
@@ -546,11 +562,11 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         chash_v.mv_size = HLEN
         try:
             ck_v.mv_data = &ck
-            _check(lmdb.mdb_del(self.txn, self.get_dbi(b'c:'), &ck_v, NULL))
+            _check(lmdb.mdb_del(self.txn, self.get_dbi(DB_C_), &ck_v, NULL))
             ck_v.mv_data = &ck
-            _check(lmdb.mdb_del(self.txn, self.get_dbi(b't:st'), &ck_v, NULL))
+            _check(lmdb.mdb_del(self.txn, self.get_dbi(DB_T_ST), &ck_v, NULL))
             chash_v.mv_data = chash
-            _check(lmdb.mdb_del(self.txn, self.get_dbi(b'th:t'), &chash_v, NULL))
+            _check(lmdb.mdb_del(self.txn, self.get_dbi(DB_TH_T), &chash_v, NULL))
         except KeyNotFoundError:
             pass
 
@@ -611,7 +627,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
         #logger.debug('Triple keys found: {}'.format(rset.data[:rset.size]))
 
-        cur = self._cur_open('spo:c')
+        cur = self._cur_open(DB_SPO_C)
         try:
             key_v.mv_size = TRP_KLEN
             rset.keys.seek()
@@ -673,7 +689,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                 # Context not found.
                 return Graph(self, uri=uri)
 
-            icur = self._cur_open('c:spo')
+            icur = self._cur_open(DB_C_SPO)
 
             try:
                 key_v.mv_data = &ck
@@ -806,7 +822,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                     try:
                         spok = [tk1, tk2, tk3]
                         _check(lmdb.mdb_get(
-                            self.txn, self.get_dbi('spo:c'), &spok_v, &ck_v))
+                            self.txn, self.get_dbi(DB_SPO_C), &spok_v, &ck_v))
                     except KeyNotFoundError:
                         return Graph(self)
 
@@ -839,7 +855,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
         # ? ? ?
         # Get all triples in the database.
-        dcur = self._cur_open('spo:c')
+        dcur = self._cur_open(DB_SPO_C)
 
         try:
             _check(
@@ -892,8 +908,8 @@ cdef class LmdbTriplestore(BaseLmdbStore):
         logger.debug(f'lookup 1bound: {idx}, {luk}')
 
         term_order = lookup_ordering[idx]
-        icur = self._cur_open(self.lookup_indices[idx])
-        logging.debug(f'DB label: {self.lookup_indices[idx]}')
+        icur = self._cur_open(lookup_indices[idx])
+        logging.debug(f'DB label: {lookup_indices[idx]}')
         logging.debug('term order: {}'.format(term_order[: 3]))
 
         try:
@@ -963,7 +979,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                 else:
                     luk1_offset = 1
                     luk2_offset = 0
-                dblabel = self.lookup_indices[i + 3] # skip 1bound index labels
+                dblabel = lookup_indices[i + 3] # skip 1bound index labels
                 break
 
             if i == 2:
@@ -1017,7 +1033,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             lmdb.MDB_stat stat
             cc.HashSetConf tkeys_conf
 
-        idx_label = self.lookup_indices['spo'.index(term_type)]
+        idx_label = lookup_indices['spo'.index(term_type)]
         icur = self._cur_open(idx_label)
         try:
             _check(lmdb.mdb_stat(self.txn, lmdb.mdb_cursor_dbi(icur), &stat))
@@ -1083,7 +1099,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             lmdb.MDB_stat stat
 
         ret = []
-        dcur = self._cur_open('pfx:ns')
+        dcur = self._cur_open(DB_PFX_NS)
         try:
             try:
                 _check(lmdb.mdb_cursor_get(
@@ -1119,8 +1135,8 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             TripleKey spok
 
         cur = (
-                self._cur_open('spo:c') if triple and all(triple)
-                else self._cur_open('c:'))
+                self._cur_open(DB_SPO_C) if triple and all(triple)
+                else self._cur_open(DB_C_))
         try:
             if triple and all(triple):
                 _check(lmdb.mdb_stat(
@@ -1196,7 +1212,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
         _check(
             lmdb.mdb_get(
-                self.txn, self.get_dbi('t:st'), &key_v, &data_v
+                self.txn, self.get_dbi(DB_T_ST), &key_v, &data_v
             ),
             f'Error getting data for key \'{tk}\'.'
         )
@@ -1247,7 +1263,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
             #    f'{(<unsigned char*>thash)[:HLEN]} in store before adding.'
             #)
             _check(lmdb.mdb_get(
-                self.txn, self.get_dbi(b'th:t'), &key_v, &data_v)
+                self.txn, self.get_dbi(DB_TH_T), &key_v, &data_v)
             )
 
             return (<Key*>data_v.mv_data)[0]
@@ -1266,13 +1282,13 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
             try:
                 # Main entry.
-                tk = self._append(&pk_t, b't:st', txn=_txn)
+                tk = self._append(&pk_t, DB_T_ST, txn=_txn)
 
                 # Index.
                 data_v.mv_data = &tk
                 data_v.mv_size = KLEN
                 _check(lmdb.mdb_put(
-                    _txn, self.get_dbi(b'th:t'), &key_v, &data_v, 0
+                    _txn, self.get_dbi(DB_TH_T), &key_v, &data_v, 0
                 ))
                 if not self.is_txn_rw:
                     _check(lmdb.mdb_txn_commit(_txn))
@@ -1289,7 +1305,7 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
     cdef Key _append(
         self, Buffer *value,
-        unsigned char *dblabel=b'', lmdb.MDB_txn *txn=NULL,
+        DbLabel dblabel=b'', lmdb.MDB_txn *txn=NULL,
         unsigned int flags=0
         ) except? 0:
         """

+ 1 - 1
lakesuperior/store/ldp_rs/rsrc_centric_layout.py

@@ -25,6 +25,7 @@ from lakesuperior.globals import ROOT_RSRC_URI
 from lakesuperior.exceptions import (InvalidResourceError,
         ResourceNotExistsError, TombstoneError, PathSegmentError)
 from lakesuperior.model.rdf.graph import Graph
+from lakesuperior.util.toolbox import get_tree_size
 
 
 META_GR_URI = nsc['fcsystem']['meta']
@@ -197,7 +198,6 @@ class RsrcCentricLayout:
         return self._attr_routes
 
 
-
     def bootstrap(self):
         """
         Delete all graphs and insert the basic triples.

+ 37 - 8
lakesuperior/util/benchmark.py

@@ -26,11 +26,13 @@ def_endpoint = 'http://localhost:8000/ldp'
 def_ct = 10000
 def_parent = '/pomegranate'
 def_gr_size = 200
+def_img_size = 1024
 
 logging.disable(logging.WARN)
 
 
 @click.command()
+
 @click.option(
     '--mode', '-m', default=def_mode,
     help=(
@@ -40,6 +42,7 @@ logging.disable(logging.WARN)
         f'Default: {def_mode}'
     )
 )
+
 @click.option(
     '--endpoint', '-e', default=def_endpoint,
     help=(
@@ -47,18 +50,22 @@ logging.disable(logging.WARN)
         f'Default: {def_endpoint}'
     )
 )
+
 @click.option(
     '--count', '-c', default=def_ct,
     help=f'Number of resources to ingest. Default: {def_ct}')
+
 @click.option(
     '--parent', '-p', default=def_parent,
     help='Path to the container resource under which the new resources will be '
         'created. It must begin with a slash (`/`) character. '
         f'Default: {def_parent}')
+
 @click.option(
     '--delete-container', '-d', is_flag=True,
     help='Delete container resource and its children if already existing. By '
     'default, the container is not deleted and new resources are added to it.')
+
 @click.option(
     '--method', '-X', default='put',
     help=(
@@ -66,14 +73,29 @@ logging.disable(logging.WARN)
         'Default: PUT'
     )
 )
+
 @click.option(
     '--graph-size', '-s', default=def_gr_size,
-    help=f'Number of triples in each graph. Default: {def_gr_size}')
+    help=(
+        'Number of triples in each random graph, rounded down to a multiple '
+        f'of 8. Default: {def_gr_size}'
+    )
+)
+
+@click.option(
+    '--image-size', '-S', default=def_img_size,
+    help=(
+        'Size of random square image, in pixels for each dimension, rounded '
+        f'down to a multiple of 8. Default: {def_img_size}'
+    )
+)
+
 @click.option(
     '--resource-type', '-t', default='r',
     help='Type of resources to ingest. One of `r` (only LDP-RS, i.e. RDF), '
     '`n` (only  LDP-NR, i.e. binaries), or `b` (50/50% of both). '
     'Default: r')
+
 @click.option(
     '--plot', '-P', is_flag=True, help='Plot a graph of ingest timings. '
     'The graph figure is displayed on screen with basic manipulation and save '
@@ -81,7 +103,7 @@ logging.disable(logging.WARN)
 
 def run(
     mode, endpoint, count, parent, method, delete_container,
-    graph_size, resource_type, plot
+    graph_size, image_size, resource_type, plot
 ):
     """
     Run the benchmark.
@@ -115,6 +137,10 @@ def run(
     else:
         raise ValueError(f'Mode not supported: {mode}')
 
+    if resource_type != 'r':
+        # Set image parameters.
+        ims = max(image_size - image_size % 8, 128)
+        tn = ims // 32
 
     # URI used to establish an in-repo relationship. This is set to
     # the most recently created resource in each loop.
@@ -149,7 +175,7 @@ def run(
                 data = random_graph(graph_size, ref)
                 headers = {'content-type': 'text/turtle'}
             else:
-                img = random_image(name=uuid4(), ts=16, ims=512)
+                img = random_image(tn=tn, ims=ims)
                 data = img['content']
                 data.seek(0)
                 headers = {
@@ -163,12 +189,13 @@ def run(
                 tcounter = ckpt - ckpt
                 prev_tcounter = tcounter
 
             ref = (
-                _ingest_graph_ldp(
-                    method, dest, data.serialize(format='ttl'), headers, ref
+                _ingest_ldp(
+                    method, dest, data, headers, ref
                 )
                 if mode == 'ldp'
-                else _ingest_graph_py(method, dest, data, ref)
+                else _ingest_py(method, dest, data, ref)
             )
             tcounter += (arrow.utcnow() - ckpt)
 
@@ -207,16 +234,18 @@ def run(
         plt.show()
 
 
-def _ingest_graph_ldp(method, uri, data, headers, ref):
+def _ingest_ldp(method, uri, data, headers, ref):
     """
     Ingest the graph via HTTP/LDP.
     """
+    if isinstance(data, rdflib.Graph):
+        data = data.serialize(format='ttl')
     rsp = requests.request(method, uri, data=data, headers=headers)
     rsp.raise_for_status()
     return rsp.headers['location']
 
 
-def _ingest_graph_py(method, dest, data, ref):
+def _ingest_py(method, dest, data, ref):
     from lakesuperior.api import resource as rsrc_api
 
     kwargs = {}
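
A hypothetical way to exercise the benchmark entry point programmatically, using Click's test runner (the option flags are the ones defined above; assumes an LDP endpoint running at the default location):

    # Sketch only; equivalent to running the script from the command line.
    from click.testing import CliRunner
    from lakesuperior.util.benchmark import run

    result = CliRunner().invoke(run, [
        '-m', 'ldp',
        '-e', 'http://localhost:8000/ldp',
        '-c', '10000',
        '-p', '/pomegranate',
        '-X', 'put',
        '-t', 'r',
    ])
    print(result.output)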

+ 8 - 2
lakesuperior/util/generators.py

@@ -37,8 +37,14 @@ def random_utf8_string(length):
     return ''.join(random.choice(alphabet) for i in range(length))
 
 
-def random_image(name, ts=8, ims=256):
-    imarray = numpy.random.rand(ts, ts, 3) * 255
+def random_image(tn=8, ims=256):
+    """
+    Generate a random square image with pretty color tiles.
+
+    :param int tn: Number of tiles in each dimension of the image.
+    :param int ims: Size in pixels of each dimension of the image.
+    """
+    imarray = numpy.random.rand(tn, tn, 3) * 255
     im = Image.fromarray(imarray.astype('uint8')).convert('RGBA')
     im = im.resize((ims, ims), Image.NEAREST)
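
A usage sketch matching the call site in benchmark.py above; the ``content`` key of the returned dict (a file-like object) is inferred from that call site:

    # Illustrative; see the benchmark.py hunk above for the real call site.
    from lakesuperior.util.generators import random_image

    img = random_image(tn=8, ims=256)
    data = img['content']
    data.seek(0)  # rewind before using as a request body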
 

+ 43 - 0
lakesuperior/util/locustfile.py

@@ -0,0 +1,43 @@
+import random
+
+from os import environ
+from uuid import uuid4
+
+import requests
+
+from locust import HttpLocust, TaskSet, task
+from rdflib import Graph, URIRef
+
+from lakesuperior.util.generators import random_graph, random_image
+
+ldp_root = environ.get(
+    'FCREPO_BENCHMARK_ROOT', 'http://localhost:8000/ldp/pomegranate'
+)
+print('Retrieving LDP graphs. Be patient, this may take a while...')
+rsp = requests.request('GET', ldp_root)
+root_gr = Graph().parse(data=rsp.text, format='ttl')
+subjects = {*root_gr.objects(
+    None, URIRef('http://www.w3.org/ns/ldp#contains')
+)}
+
+class GraphTasks(TaskSet):
+
+    @task(1)
+    def ingest_graph(self):
+        uri = f'{ldp_root}/{uuid4()}'
+        data = random_graph(200, ldp_root).serialize(format='ttl')
+        headers = {'content-type': 'text/turtle'}
+        rsp = self.client.request('PUT', uri, data=data, name='random_ingest', headers=headers)
+
+
+    @task(50)
+    def request_graph(self):
+        uri = str(random.sample(subjects, 1)[0])
+        self.client.request('get', uri, name='random_get')
+
+
+class LsupSwarmer(HttpLocust):
+    task_set = GraphTasks
+    min_wait = 50
+    max_wait = 500
+
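
The new locustfile can presumably be driven with Locust's standard runner, e.g. ``locust -f lakesuperior/util/locustfile.py --host http://localhost:8000``, with ``FCREPO_BENCHMARK_ROOT`` pointing at a populated container such as the one created by the benchmark script above.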

+ 123 - 75
lakesuperior/toolbox.py → lakesuperior/util/toolbox.py

@@ -1,11 +1,10 @@
 import logging
-import pickle
+import os
 import re
 
 from collections import defaultdict
 from hashlib import sha1
 
-from flask import g
 from rdflib import Graph
 from rdflib.term import URIRef, Variable
 
@@ -15,26 +14,124 @@ from lakesuperior.globals import ROOT_RSRC_URI
 
 logger = logging.getLogger(__name__)
 
+__doc__ = '''Utilities to translate and generate strings and other objects.'''
 
-class Toolbox:
+
+def fsize_fmt(num, suffix='b'):
+    """
+    Format an integer into 1024-block file size format.
+
+    Adapted from Python 2 code on
+    https://stackoverflow.com/a/1094933/3758232
+
+    :param int num: Size value in bytes.
+    :param str suffix: Suffix label (defaults to ``b``).
+
+    :rtype: str
+    :return: Formatted size to largest fitting unit.
+    """
+    for unit in ['','K','M','G','T','P','E','Z']:
+        if abs(num) < 1024.0:
+            return f'{num:3.1f} {unit}{suffix}'
+        num /= 1024.0
+    return f'{num:.1f} Y{suffix}'
+
+
+def get_tree_size(path, follow_symlinks=True):
+    """
+    Return total size of files in given path and subdirs.
+
+    Ripped from https://www.python.org/dev/peps/pep-0471/
+    """
+    total = 0
+    for entry in os.scandir(path):
+        if entry.is_dir(follow_symlinks=follow_symlinks):
+            total += get_tree_size(entry.path)
+        else:
+            total += entry.stat(
+                follow_symlinks=follow_symlinks
+            ).st_size
+
+    return total
+
+
+def replace_term_domain(term, search, replace):
+    '''
+    Replace the domain of a term.
+
+    :param rdflib.URIRef term: The term (URI) to change.
+    :param str search: Domain string to replace.
+    :param str replace: Domain string to use for replacement.
+
+    :rtype: rdflib.URIRef
     '''
-    Utility class to translate and generate strings and other objects.
+    s = str(term)
+    if s.startswith(search):
+        s = s.replace(search, replace)
+
+    return URIRef(s)
+
+
+def parse_rfc7240(h_str):
     '''
-    def replace_term_domain(self, term, search, replace):
-        '''
-        Replace the domain of a term.
+    Parse ``Prefer`` header as per https://tools.ietf.org/html/rfc7240
 
-        :param rdflib.URIRef term: The term (URI) to change.
-        :param str search: Domain string to replace.
-        :param str replace: Domain string to use for replacement.
+    The ``cgi.parse_header`` standard method does not work with all
+    possible use cases for this header.
+
+    :param str h_str: The header(s) as a comma-separated list of Prefer
+        statements, excluding the ``Prefer:`` token.
+    '''
+    parsed_hdr = defaultdict(dict)
+
+    # Split up headers by comma
+    hdr_list = [ x.strip() for x in h_str.split(',') ]
+    for hdr in hdr_list:
+        parsed_pref = defaultdict(dict)
+        # Split up tokens by semicolon
+        token_list = [ token.strip() for token in hdr.split(';') ]
+        prefer_token = token_list.pop(0).split('=')
+        prefer_name = prefer_token[0]
+        # If preference has a '=', it has a value, else none.
+        if len(prefer_token)>1:
+            parsed_pref['value'] = prefer_token[1].strip('"')
+
+        for param_token in token_list:
+            # If the token list had a ';' the preference has a parameter.
+            param_parts = [ prm.strip().strip('"') \
+                    for prm in param_token.split('=') ]
+            param_value = param_parts[1] if len(param_parts) > 1 else None
+            parsed_pref['parameters'][param_parts[0]] = param_value
+
+        parsed_hdr[prefer_name] = parsed_pref
+
+    return parsed_hdr
+
+
+def split_uuid(uuid):
+    '''
+    Split a UID into pairtree segments. This mimics FCREPO4 behavior.
+
+    :param str uuid: UUID to split.
+
+    :rtype: str
+    '''
+    path = '{}/{}/{}/{}/{}'.format(uuid[:2], uuid[2:4],
+            uuid[4:6], uuid[6:8], uuid)
+
+    return path
 
-        :rtype: rdflib.URIRef
-        '''
-        s = str(term)
-        if s.startswith(search):
-            s = s.replace(search, replace)
 
-        return URIRef(s)
+
+class RequestUtils:
+    """
+    Utilities that require access to an HTTP request context.
+
+    Initialize this within a Flask request context.
+    """
+    def __init__(self):
+        from flask import g
+        self.webroot = g.webroot
 
 
     def uid_to_uri(self, uid):
@@ -42,7 +139,7 @@ class Toolbox:
 
         :rtype: rdflib.URIRef
         '''
-        return URIRef(g.webroot + uid)
+        return URIRef(self.webroot + uid)
 
 
     def uri_to_uid(self, uri):
@@ -53,7 +150,7 @@ class Toolbox:
         if uri.startswith(nsc['fcres']):
             return str(uri).replace(nsc['fcres'], '')
         else:
-            return '/' + str(uri).replace(g.webroot, '').strip('/')
+            return '/' + str(uri).replace(self.webroot, '').strip('/')
 
 
     def localize_uri_string(self, s):
@@ -63,11 +160,11 @@ class Toolbox:
 
         :rtype: str
         '''
-        if s.strip('/') == g.webroot:
+        if s.strip('/') == self.webroot:
             return str(ROOT_RSRC_URI)
         else:
             return s.rstrip('/').replace(
-                    g.webroot, str(nsc['fcres']))
+                    self.webroot, str(nsc['fcres']))
 
 
     def localize_term(self, uri):
@@ -90,9 +187,9 @@ class Toolbox:
         :rtype: tuple(rdflib.URIRef)
         '''
         s, p, o = trp
-        if s.startswith(g.webroot):
+        if s.startswith(self.webroot):
             s = self.localize_term(s)
-        if o.startswith(g.webroot):
+        if o.startswith(self.webroot):
             o = self.localize_term(o)
 
         return s, p, o
@@ -119,10 +216,10 @@ class Toolbox:
         :rtype: bytes
         '''
         return data.replace(
-            (g.webroot + '/').encode('utf-8'),
+            (self.webroot + '/').encode('utf-8'),
             (nsc['fcres'] + '/').encode('utf-8')
         ).replace(
-            g.webroot.encode('utf-8'),
+            self.webroot.encode('utf-8'),
             (nsc['fcres'] + '/').encode('utf-8')
         )
 
@@ -139,7 +236,7 @@ class Toolbox:
         URIs are converted to absolute using the internal URI as the base;
         finally, the root node is appropriately addressed.
         '''
-        esc_webroot = g.webroot.replace('/', '\\/')
+        esc_webroot = self.webroot.replace('/', '\\/')
         #loc_ptn = r'<({}\/?)?(.*?)?(\?.*?)?(#.*?)?>'.format(esc_webroot)
         loc_ptn1 = r'<{}\/?(.*?)>'.format(esc_webroot)
         loc_sub1 = '<{}/\\1>'.format(nsc['fcres'])
@@ -163,7 +260,7 @@ class Toolbox:
 
         :rtype: string
         '''
-        return s.replace(str(nsc['fcres']), g.webroot)
+        return s.replace(str(nsc['fcres']), self.webroot)
 
 
     def globalize_term(self, urn):
@@ -231,52 +328,3 @@ class Toolbox:
 
         return global_gr.resource(global_uri)
 
-
-    def parse_rfc7240(self, h_str):
-        '''
-        Parse ``Prefer`` header as per https://tools.ietf.org/html/rfc7240
-
-        The ``cgi.parse_header`` standard method does not work with all
-        possible use cases for this header.
-
-        :param str h_str: The header(s) as a comma-separated list of Prefer
-            statements, excluding the ``Prefer:`` token.
-        '''
-        parsed_hdr = defaultdict(dict)
-
-        # Split up headers by comma
-        hdr_list = [ x.strip() for x in h_str.split(',') ]
-        for hdr in hdr_list:
-            parsed_pref = defaultdict(dict)
-            # Split up tokens by semicolon
-            token_list = [ token.strip() for token in hdr.split(';') ]
-            prefer_token = token_list.pop(0).split('=')
-            prefer_name = prefer_token[0]
-            # If preference has a '=', it has a value, else none.
-            if len(prefer_token)>1:
-                parsed_pref['value'] = prefer_token[1].strip('"')
-
-            for param_token in token_list:
-                # If the token list had a ';' the preference has a parameter.
-                param_parts = [ prm.strip().strip('"') \
-                        for prm in param_token.split('=') ]
-                param_value = param_parts[1] if len(param_parts) > 1 else None
-                parsed_pref['parameters'][param_parts[0]] = param_value
-
-            parsed_hdr[prefer_name] = parsed_pref
-
-        return parsed_hdr
-
-
-    def split_uuid(self, uuid):
-        '''
-        Split a UID into pairtree segments. This mimics FCREPO4 behavior.
-
-        :param str uuid: UUID to split.
-
-        :rtype: str
-        '''
-        path = '{}/{}/{}/{}/{}'.format(uuid[:2], uuid[2:4],
-                uuid[4:6], uuid[6:8], uuid)
-
-        return path
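
Since ``parse_rfc7240`` is now a module-level function, a minimal usage sketch (the header value is illustrative):

    from lakesuperior.util.toolbox import parse_rfc7240

    prefs = parse_rfc7240(
        'return=representation; '
        'include="http://www.w3.org/ns/ldp#PreferMinimalContainer"'
    )
    assert prefs['return']['value'] == 'representation'
    assert prefs['return']['parameters']['include'] == (
        'http://www.w3.org/ns/ldp#PreferMinimalContainer')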

+ 1 - 1
requirements_rtd.txt

@@ -1,5 +1,5 @@
 CoilMQ>=1.0.1
-Cython==0.29
+Cython==0.29.6
 Flask>=0.12.2
 HiYaPyCo>=0.4.11
 PyYAML>=3.13

+ 15 - 15
tests/1_store/test_1_0_lmdb_store.py

@@ -374,14 +374,14 @@ class TestEntryCount:
         List of index labels.
         """
         return [
-            's:po',
-            'p:so',
-            'o:sp',
-            'po:s',
-            'so:p',
-            'sp:o',
-            'spo:c',
-            'c:spo',
+            b's:po___',
+            b'p:so___',
+            b'o:sp___',
+            b'po:s___',
+            b'so:p___',
+            b'sp:o___',
+            b'spo:c__',
+            b'c:spo__',
         ]
 
 
@@ -408,8 +408,8 @@ class TestEntryCount:
             assert stat['db_stats'][idxlabel]['ms_entries'] == 1000
 
         # 1 subject, 100 predicates, 1000 objects, 1 context
-        assert stat['db_stats']['t:st']['ms_entries'] == 1102
-        assert stat['db_stats']['th:t']['ms_entries'] == 1102
+        assert stat['db_stats'][b't:st___']['ms_entries'] == 1102
+        assert stat['db_stats'][b'th:t___']['ms_entries'] == 1102
 
 
     def test_entries_partial(self, store, indices):
@@ -422,8 +422,8 @@ class TestEntryCount:
         with store.txn_ctx():
             stat = store.stats()
 
-        assert stat['db_stats']['t:st']['ms_entries'] == 1102
-        assert stat['db_stats']['th:t']['ms_entries'] == 1102
+        assert stat['db_stats'][b't:st___']['ms_entries'] == 1102
+        assert stat['db_stats'][b'th:t___']['ms_entries'] == 1102
 
 
     def test_entries_empty(self, store, indices):
@@ -439,8 +439,8 @@ class TestEntryCount:
         for idxlabel in indices:
             assert stat['db_stats'][idxlabel]['ms_entries'] == 0
 
-        assert stat['db_stats']['t:st']['ms_entries'] == 1102
-        assert stat['db_stats']['th:t']['ms_entries'] == 1102
+        assert stat['db_stats'][b't:st___']['ms_entries'] == 1102
+        assert stat['db_stats'][b'th:t___']['ms_entries'] == 1102
 
 
 
@@ -619,7 +619,7 @@ class TestCleanup:
 
     def _is_empty(self, store):
         stats = store.stats()['db_stats']
-        for dblabel in ('spo:c', 'c:spo', 's:po', 'p:so', 'o:sp',):
+        for dblabel in (b'spo:c__', b'c:spo__', b's:po___', b'p:so___', b'o:sp___',):
             if stats[dblabel]['ms_entries'] > 0:
                 return False