Browse Source

Trim some queries for PUT and POST; benchmark tools.

Stefano Cossu 6 years ago
parent
commit
3c728c2fbd

+ 7 - 6
README.md

@@ -13,17 +13,17 @@ Key features:
   [Delta document](doc/notes/fcrepo4_deltas.md))—currently being tested with
   Hyrax 2
 - Stores metadata in a graph store, binaries in filesystem
-- Strives to be faster than Modeshape implementation (benchmarks TBD)
 - Simple search and SPARQL Query API via back-end triplestore (alpha 2)
 - No performance issues storing many resources under the same container; no
   [kudzu](https://www.nature.org/ourinitiatives/urgentissues/land-conservation/forests/kudzu.xml)
-  pairtree segmentation [1]
+  pairtree segmentation <sup id="a1">[1](#f1)</sup>
 - Mitigates "many member" issue: constant performance writing to a resource with
   many children or members; option to omit children in retrieval
 - Flexible back-end layouts: options to organize information in back end
-- Migration tool (in alpha2)
+- Migration tool (in alpha3)
 
-Implementation of the official Fedora API specs (Fedora 5.x and beyond) is not
+Implementation of the official [Fedora API specs](https://fedora.info/spec/)
+(Fedora 5.x and beyond) is not
 foreseen in the short term, however it would be a natural evolution of this
 project if it gains support.
 
@@ -78,5 +78,6 @@ for a rudimentary road map and status.
 
 The design documents are in the [doc/pdf](doc/pdf) folder. *@TODO needs update*
 
-[1]: However if your client splits pairtrees upstream, such as Hyrax does, that
-obviously needs to change to get rid of the path segments.
+<b id="f1">1</b> However if your client splits pairtrees upstream, such as
+Hyrax does, that obviously needs to change to get rid of the path
+segments. [↩](#a1)

+ 5 - 0
etc.skeleton/application.yml

@@ -37,6 +37,11 @@ store:
                 #password: <set me>
                 #ssl_verify: false
 
+            # Connector for BerkeleyDB embedded store.
+            #module: bdb_connector
+            #options:
+            #    location: /data/fcrepo/ldprs_store/bdb
+
         # store layout. this corresponds to a sub-class of the
         # `lakesuperior.store_layouts.rdf.base_rdf_layout/baserdflayout`.
         layout: default_layout

+ 1 - 1
lakesuperior/model/ldp_factory.py

@@ -56,7 +56,7 @@ class LdpFactory:
             raise ResourceNotExistsError(uuid)
 
         # Sneak in the already extracted IMR to save a query.
-        rsrc._imr = imr
+        rsrc.imr = imr
 
         return rsrc
 

+ 41 - 23
lakesuperior/model/ldpr.py

@@ -182,6 +182,29 @@ class Ldpr(metaclass=ABCMeta):
         return self._imr
 
 
+    @imr.setter
+    def imr(self, v):
+        '''
+        Replace in-memory buffered resource.
+
+        @param v (set | rdflib.Graph) New set of triples to populate the IMR
+        with.
+        '''
+        if isinstance(v, Resource):
+            v = v.graph
+        self._imr = Resource(Graph(), self.urn)
+        gr = self._imr.graph
+        gr += v
+
+
+    @imr.deleter
+    def imr(self):
+        '''
+        Delete in-memory buffered resource.
+        '''
+        delattr(self, '_imr')
+
+
     @property
     def stored_or_new_imr(self):
         '''
@@ -209,14 +232,6 @@ class Ldpr(metaclass=ABCMeta):
         return self._imr
 
 
-    @imr.deleter
-    def imr(self):
-        '''
-        Delete in-memory buffered resource.
-        '''
-        delattr(self, '_imr')
-
-
     @property
     def out_graph(self):
         '''
@@ -283,10 +298,13 @@ class Ldpr(metaclass=ABCMeta):
 
     @property
     def is_stored(self):
-        if hasattr(self, '_imr'):
-            return len(self.imr.graph) > 0
-        else:
-            return self.rdfly.ask_rsrc_exists(self.urn)
+        if not hasattr(self, '_is_stored'):
+            if hasattr(self, '_imr'):
+                self._is_stored = len(self.imr.graph) > 0
+            else:
+                self._is_stored = self.rdfly.ask_rsrc_exists(self.urn)
+
+        return self._is_stored
 
 
     @property
@@ -556,6 +574,9 @@ class Ldpr(metaclass=ABCMeta):
         '''
         self._modify_rsrc(self.RES_CREATED, add_trp=self.provided_imr.graph)
 
+        # Set the IMR contents to the "add" triples.
+        self.imr = self.provided_imr.graph
+
         return self.RES_CREATED
 
 
@@ -573,8 +594,8 @@ class Ldpr(metaclass=ABCMeta):
         delta = self._dedup_deltas(self.imr.graph, self.provided_imr.graph)
         self._modify_rsrc(self.RES_UPDATED, *delta)
 
-        # Reset the IMR because it has changed.
-        delattr(self, 'imr')
+        # Set the IMR contents to the "add" triples.
+        self.imr = delta[1]
 
         return self.RES_UPDATED
 
@@ -812,8 +833,8 @@ class Ldpr(metaclass=ABCMeta):
                     self._logger.info('Removing offending type: {}'.format(t))
                     gr.remove((None, RDF.type, t))
 
-        self._logger.debug('Sanitized graph: {}'.format(gr.serialize(
-            format='turtle').decode('utf-8')))
+        #self._logger.debug('Sanitized graph: {}'.format(gr.serialize(
+        #    format='turtle').decode('utf-8')))
         return gr
 
 
@@ -866,7 +887,7 @@ class Ldpr(metaclass=ABCMeta):
         parent_rsrc._modify_rsrc(self.RES_UPDATED, add_trp=add_gr)
 
         # Direct or indirect container relationship.
-        self._add_ldp_dc_ic_rel(parent_uri)
+        self._add_ldp_dc_ic_rel(parent_rsrc)
 
 
     def _find_parent_or_create_pairtree(self):
@@ -952,22 +973,19 @@ class Ldpr(metaclass=ABCMeta):
         self.rdfly.modify_dataset(add_trp=imr.graph)
 
 
-    def _add_ldp_dc_ic_rel(self, cont_uri):
+    def _add_ldp_dc_ic_rel(self, cont_rsrc):
         '''
         Add relationship triples from a parent direct or indirect container.
 
-        @param cont_uri (rdflib.term.URIRef)  The container URI.
 +        @param cont_rsrc (rdflib.resource.Resource)  The container resource.
         '''
-        cont_uuid = g.tbox.uri_to_uuid(cont_uri)
-        cont_rsrc = LdpFactory.from_stored(cont_uuid,
-                repr_opts={'incl_children' : False})
         cont_p = set(cont_rsrc.imr.graph.predicates())
         add_gr = Graph()
 
         self._logger.info('Checking direct or indirect containment.')
         #self._logger.debug('Parent predicates: {}'.format(cont_p))
 
-        add_gr.add((self.urn, nsc['fcrepo'].hasParent, cont_uri))
+        add_gr.add((self.urn, nsc['fcrepo'].hasParent, cont_rsrc.urn))
         if self.MBR_RSRC_URI in cont_p and self.MBR_REL_URI in cont_p:
             s = g.tbox.localize_term(
                     cont_rsrc.imr.value(self.MBR_RSRC_URI).identifier)

+ 12 - 4
lakesuperior/store_layouts/ldp_rs/base_connector.py

@@ -1,4 +1,5 @@
 import logging
+import traceback
 
 from abc import ABCMeta, abstractmethod
 
@@ -19,17 +20,17 @@ class BaseConnector(metaclass=ABCMeta):
 
     _logger = logging.getLogger(__name__)
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, location, *args, **kwargs):
         '''
         Initialize the connection to the SPARQL endpoint.
 
         If `update_ep` is not specified, the store is initialized as read-only.
         '''
-        self._init_connection(*args, **kwargs)
+        self._init_connection(location, *args, **kwargs)
 
 
     @abstractmethod
-    def _init_connection(self, *args, **kwargs):
+    def _init_connection(self, location, *args, **kwargs):
         '''
         Interface method. Connection steps go here.
         '''
@@ -46,6 +47,10 @@ class BaseConnector(metaclass=ABCMeta):
 
         @return rdflib.query.Result
         '''
+        #self._logger.debug('Sending SPARQL Query: {}\nBindings: {}'.format(
+        #    q, initBindings))
+        #self._logger.debug('From:\n{}'.format(
+        #    (''.join(traceback.format_stack(limit=5)))))
         return self.ds.query(q, initBindings=initBindings, initNs=nsc)
 
 
@@ -61,7 +66,10 @@ class BaseConnector(metaclass=ABCMeta):
 
         @return None
         '''
-        self._logger.debug('Sending SPARQL update: {}'.format(q))
+        #self._logger.debug('Sending SPARQL Update: {}\nBindings: {}'.format(
+        #    q, initBindings))
+        #self._logger.debug('From:\n{}'.format(
+        #    (''.join(traceback.format_stack(limit=5)))))
         return self.ds.query(q, initBindings=initBindings, initNs=nsc)
 
 

+ 12 - 8
lakesuperior/store_layouts/ldp_rs/bdb_connector.py

@@ -20,17 +20,21 @@ class BdbConnector(BaseConnector):
 
     _logger = logging.getLogger(__name__)
 
-    def _init_connection(self, path):
+    def _init_connection(self, location):
         '''
-        Initialize the connection to the SPARQL endpoint.
+        Initialize the connection to the BerkeleyDB (Sleepycat) store.
 
-        If `update_ep` is not specified, the store is initialized as read-only.
+        Also open the store, which must be closed by the __del__ method.
         '''
-        self.store = plugin.get('Sleepycat', Store)(
-                identifier=URIRef('urn:fcsystem:lsup'))
-        self.store.open(path, create=True)
-        self.ds = Dataset(self.store, default_union=True)
+        #self.store = plugin.get('Sleepycat', Store)(
+        #        identifier=URIRef('urn:fcsystem:lsup'))
+        self.ds = Dataset('Sleepycat', default_union=True)
+        self.store = self.ds.store
+        self.ds.open(location, create=True)
 
 
     def __del__(self):
-        self.store.close()
+        '''
+        Close store connection.
+        '''
+        self.ds.close(commit_pending_transaction=False)

+ 6 - 7
lakesuperior/store_layouts/ldp_rs/default_layout.py

@@ -76,14 +76,13 @@ class DefaultLayout(BaseRdfLayout):
             ?s fcrepo:writable true .
             ?f ?fp ?fo .
         }}
-        FROM fcg:main
-        FROM fcg:historic
-        FROM fcg:metadata
         WHERE {{
-          ?s ?p ?o .{inb_qry}{incl_chld}{embed_chld}
-          OPTIONAL {{
-            ?f fcsystem:fragmentOf ?s ;
-              ?fp ?fo .
+          GRAPH ?g {{
+            ?s ?p ?o .{inb_qry}{incl_chld}{embed_chld}
+            OPTIONAL {{
+              ?f fcsystem:fragmentOf ?s ;
+                ?fp ?fo .
+            }}
           }}
         }}
         '''.format(inb_cnst=inbound_construct,

+ 11 - 3
tests/10K_children.py

@@ -1,22 +1,29 @@
 #!/usr/bin/env python
 import sys
 
+from uuid import uuid4
+
 import arrow
 import requests
 
 default_n = 10000
 webroot = 'http://localhost:8000/ldp'
-#webroot = 'http://localhost:8080/fcrepo/rest'
+#webroot = 'http://lake.devbox.local/fcrepo/rest'
 container = webroot + '/pomegranate'
 datafile = 'tests/data/marcel_duchamp_single_subject.ttl'
 
 sys.stdout.write('How many children? [{}] >'.format(default_n))
 choice = input().lower()
-
 n = int(choice) if choice else default_n
 
+sys.stdout.write('Delete container? [n] >')
+choice = input().lower()
+del_cont = choice or 'n'
+
 # Generate 10,000 children of root node.
 
+if del_cont  == 'y':
+    requests.delete(container, headers={'prefer': 'no-tombstone'})
 requests.put(container)
 
 start = arrow.utcnow()
@@ -26,7 +33,8 @@ print('Inserting {} children.'.format(n))
 
 data = open(datafile, 'rb').read()
 for i in range(1, n):
-    requests.post(container, data=data, headers={'content-type' : 'text/turtle'})
+    requests.put('{}/{}'.format(container, uuid4()), data=data, headers={
+        'content-type': 'text/turtle'})
     if i % 100 == 0:
         now = arrow.utcnow()
         tdelta = now - ckpt

+ 40 - 0
tests/bdb.py

@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+import sys
+
+from uuid import uuid4
+
+import arrow
+
+from rdflib import Dataset
+from rdflib.term import URIRef
+
+default_n = 100000
+sys.stdout.write('How many resources? [{}] >'.format(default_n))
+choice = input().lower()
+n = int(choice) if choice else default_n
+
+ds = Dataset('Sleepycat')
+ds.open('/tmp/lsup_bdb.db')
+gr = ds.graph('http://ex.org/graph#g1')
+
+start = arrow.utcnow()
+ckpt = start
+
+for i in range(1, n):
+    if i % 100 == 0:
+        print('inserted {} resources.'.format(i))
+    subj = URIRef('http://ex.org/rdf/{}'.format(uuid4()))
+    gr.add((subj, URIRef('http://ex.org/p1'), URIRef('http://ex.org/o1')))
+    gr.add((URIRef('http://ex.org/s1'), URIRef('http://ex.org/p2'), subj))
+
+    now = arrow.utcnow()
+    tdelta = now - ckpt
+    ckpt = now
+    print('Record: {}\tTime elapsed: {}'.format(i, tdelta))
+
+tdelta = arrow.utcnow() - start
+print('Total elapsed time: {}'.format(tdelta))
+print('Average time per resource: {}'.format(tdelta.total_seconds()/n))
+print('Graph size: {}'.format(len(gr)))
+
+ds.close()

+ 2 - 0
util/bootstrap.py

@@ -35,6 +35,7 @@ def bootstrap_db(app):
             source='data/bootstrap/default_layout.nq', format='nquads')
 
     app.rdfly.ds.store.commit()
+    app.rdfly.ds.close()
 
     return app.rdfly
 
@@ -51,6 +52,7 @@ def bootstrap_binary_store(app):
         pass
     print('Recreating binary store path: {}'.format(root_path))
     os.makedirs(root_path + '/tmp')
+    print('Binary store initialized.')
 
 
 if __name__=='__main__':