Browse Source

Remove pairtrees; only use full LDPC for intermediate paths.

Stefano Cossu 6 years ago
parent
commit
42ab1a3efb

+ 1 - 0
README.md

@@ -36,6 +36,7 @@ repository administration and monitoring are shipped with the standard release.
 - Python API (*planned*): Authors of Python clients can use LAKEsuperior as an
   embedded repository with no HTTP traffic or interim RDF serialization &
   de-serialization involved.
+- Fits in a pocket: you can carry over 50M triples on an 8 GB memory stick.
 
 Implementation of the official [Fedora API specs](https://fedora.info/spec/)
 (Fedora 5.x and beyond) is not

+ 4 - 4
doc/notes/TODO

@@ -59,7 +59,7 @@
 
 # Alpha 4
 
-- [W] Reorganize pairtrees
+- [D] Reorganize pairtrees
 - [D] Fix basic Hyrax workflows
   - [D] Simple ingest
   - [D] Batch ingest
@@ -77,12 +77,12 @@
 - [D] Optimize lookups (round 1)
 - [D] Optimize store
 - [D] Stats page
-- [W] Refactor for Python API
 - [D] Basic admin tools
   - [D] Stats page
   - [D] Stub admin tools page
-- [ ] Better management of path segments
-- [ ] Optimize SPARQL
+- [D] Make default graph a merge graph
+- [D] Revamp containment
+- [ ] Refactor for Python API
 
 # Alpha 7
 

+ 1 - 50
lakesuperior/endpoints/ldp.py

@@ -63,7 +63,6 @@ std_headers = {
 
 '''Predicates excluded by view.'''
 vw_blacklist = {
-    nsc['fcsystem'].contains,
 }
 
 
@@ -237,7 +236,7 @@ def post_resource(parent):
 
     try:
         with TxnManager(g.store, True):
-            uid = uuid_for_post(parent, slug)
+            uid = LdpFactory.mint_uid(parent, slug)
             logger.debug('Generated UID for POST: {}'.format(uid))
             rsrc = LdpFactory.from_provided(
                     uid, content_length=request.content_length,
@@ -521,54 +520,6 @@ def negotiate_content(rsp, headers=None):
         return (rsp.serialize(format='turtle'), headers)
 
 
-def uuid_for_post(parent_uid, slug=None):
-    '''
-    Validate conditions to perform a POST and return an LDP resource
-    UID for using with the `post` method.
-
-    This may raise an exception resulting in a 404 if the parent is not
-    found or a 409 if the parent is not a valid container.
-    '''
-    def split_if_legacy(uid):
-        if current_app.config['store']['ldp_rs']['legacy_ptree_split']:
-            uid = g.tbox.split_uuid(uid)
-        return uid
-
-    # Shortcut!
-    if not slug and parent_uid == '':
-        uid = split_if_legacy(str(uuid4()))
-        return uid
-
-    parent = LdpFactory.from_stored(parent_uid,
-            repr_opts={'incl_children' : False})
-
-    #if isintance(parent, PathSegment):
-    #    raise InvalidResourceError(parent.uid,
-    #            'Resource {} cannot be created under a pairtree.')
-
-    # Set prefix.
-    if parent_uid:
-        if (not isinstance(parent, PathSegment)
-                and nsc['ldp'].Container not in parent.types):
-            raise InvalidResourceError(parent_uid,
-                    'Parent {} is not a container.')
-        pfx = parent_uid + '/'
-    else:
-        pfx = ''
-
-    # Create candidate UID and validate.
-    if slug:
-        cnd_uid = pfx + slug
-        if current_app.rdfly.ask_rsrc_exists(cnd_uid):
-            uid = pfx + split_if_legacy(str(uuid4()))
-        else:
-            uid = cnd_uid
-    else:
-        uid = pfx + split_if_legacy(str(uuid4()))
-
-    return uid
-
-
 def bitstream_from_req():
     '''
     Find how a binary file and its MIMEtype were uploaded in the request.

+ 71 - 4
lakesuperior/model/ldp_factory.py

@@ -1,6 +1,7 @@
 import logging
 
 from pprint import pformat
+from uuid import uuid4
 
 import rdflib
 
@@ -12,8 +13,9 @@ from rdflib.namespace import RDF
 from lakesuperior import model
 from lakesuperior.model.generic_resource import PathSegment
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
-from lakesuperior.exceptions import (IncompatibleLdpTypeError,
-        InvalidResourceError, ResourceNotExistsError)
+from lakesuperior.exceptions import (
+        IncompatibleLdpTypeError, InvalidResourceError, ResourceExistsError,
+        ResourceNotExistsError)
 
 class LdpFactory:
     '''
@@ -25,6 +27,19 @@ class LdpFactory:
 
     _logger = logging.getLogger(__name__)
 
+
+    @staticmethod
+    def new_container(uid):
+        '''
+        Build a new, empty LDP container instance for a given UID.
+
+        The resource is instantiated from an empty graph. No store write is
+        issued by this method itself; the caller decides when to persist the
+        returned object.
+
+        @param uid (string) UID of the container to create. Must be a
+        non-empty value.
+
+        @return model.ldp_rs.Ldpc The newly built container resource.
+
+        @raise InvalidResourceError if `uid` is empty.
+        @raise ResourceExistsError if the store layout reports that a resource
+        with this UID already exists.
+        '''
+        if not uid:
+            raise InvalidResourceError(uid)
+        if current_app.rdfly.ask_rsrc_exists(uid):
+            raise ResourceExistsError(uid)
+        rsrc = model.ldp_rs.Ldpc(
+                uid, provided_imr=Resource(Graph(), nsc['fcres'][uid]))
+
+        return rsrc
+
+
     @staticmethod
     def from_stored(uid, repr_opts={}, **kwargs):
         '''
@@ -134,8 +149,6 @@ class LdpFactory:
             types = inst.types
         except:
             types = set()
-        if nsc['fcrepo'].Pairtree in types:
-            raise InvalidResourceError(inst.uid, 'Resource {} is a Pairtree.')
 
         return inst
 
@@ -169,3 +182,57 @@ class LdpFactory:
         else:
             return True
 
+
+    @staticmethod
+    def mint_uid(parent_uid, path=None):
+        '''
+        Mint a new resource UID based on client directives.
+
+        This method takes a parent ID and a tentative path and returns an LDP
+        resource UID. If the requested path is already taken under the parent,
+        a random UUID is used in its place.
+
+        This may raise an exception resulting in a 404 if the parent is not
+        found or a 409 if the parent is not a valid container.
+
+        @param parent_uid (string) UID of the parent resource. It must be an
+        existing LDPC. An empty string means that no parent prefix is added.
+        @param path (string) path to the resource, relative to the parent.
+
+        @return string The confirmed resource UID. This may be different from
+        what has been indicated.
+
+        @raise InvalidResourceError if `parent_uid` is set and the stored
+        parent does not have the `ldp:Container` type.
+        '''
+        # NOTE(review): `current_app` and `g` are used below but their import
+        # is not visible in this diff; confirm they are in module scope.
+        def split_if_legacy(uid):
+            # Only applied when the `legacy_ptree_split` option is enabled;
+            # presumably breaks the UUID into path segments (verify against
+            # `split_uuid`).
+            if current_app.config['store']['ldp_rs']['legacy_ptree_split']:
+                uid = g.tbox.split_uuid(uid)
+            return uid
+
+        # Shortcut!
+        # No path and no parent: nothing to look up, just mint a random UID.
+        if not path and parent_uid == '':
+            uid = split_if_legacy(str(uuid4()))
+            return uid
+
+        # Children are not needed to validate the parent, so they are left out
+        # of the representation.
+        parent = LdpFactory.from_stored(parent_uid,
+                repr_opts={'incl_children' : False})
+
+        # Set prefix.
+        if parent_uid:
+            if nsc['ldp'].Container not in parent.types:
+                raise InvalidResourceError(parent_uid,
+                        'Parent {} is not a container.')
+            pfx = parent_uid + '/'
+        else:
+            pfx = ''
+
+        # Create candidate UID and validate.
+        if path:
+            cnd_uid = pfx + path
+            # Fall back to a random UID if the requested one is already taken.
+            if current_app.rdfly.ask_rsrc_exists(cnd_uid):
+                uid = pfx + split_if_legacy(str(uuid4()))
+            else:
+                uid = cnd_uid
+        else:
+            uid = pfx + split_if_legacy(str(uuid4()))
+
+        return uid
+
+

+ 24 - 80
lakesuperior/model/ldpr.py

@@ -836,70 +836,48 @@ class Ldpr(metaclass=ABCMeta):
         '''Find the closest parent in the path indicated by the uid and
         establish a containment triple.
 
+        Check the path-wise parent of the new resource. If it exists, add the
+        containment relationship with this UID. Otherwise, create a container
+        resource as the parent.
+        This function may recurse up the path tree until an existing container
+        is found.
+
         E.g. if only urn:fcres:a (short: a) exists:
         - If a/b/c/d is being created, a becomes container of a/b/c/d. Also,
-          pairtree nodes are created for a/b and a/b/c.
+          containers are created for a/b and a/b/c.
         - If e is being created, the root node becomes container of e.
         '''
         if '/' in self.uid:
             # Traverse up the hierarchy to find the parent.
-            parent_uid = self._find_parent_or_create_pairtree()
+            path_components = self.uid.split('/')
+            cnd_parent_uid = '/'.join(path_components[:-1])
+            if self.rdfly.ask_rsrc_exists(cnd_parent_uid):
+                parent_rsrc = LdpFactory.from_stored(cnd_parent_uid)
+                if nsc['ldp'].Container not in parent_rsrc.types:
+                    # `parent_uid` is not bound yet in this branch: report the
+                    # candidate UID that failed the container check.
+                    raise InvalidResourceError(cnd_parent_uid,
+                            'Parent {} is not a container.')
+
+                parent_uid = cnd_parent_uid
+            else:
+                parent_rsrc = LdpFactory.new_container(cnd_parent_uid)
+                # This will trigger this method again and recurse until an
+                # existing container or the root node is reached.
+                parent_rsrc.put()
+                parent_uid = parent_rsrc.uid
         else:
             parent_uid = ROOT_UID
 
         add_gr = Graph()
         add_gr.add((nsc['fcres'][parent_uid], nsc['ldp'].contains, self.urn))
         parent_rsrc = LdpFactory.from_stored(
-                parent_uid, repr_opts={
-                'incl_children' : False}, handling='none')
+                parent_uid, repr_opts={'incl_children' : False},
+                handling='none')
         parent_rsrc._modify_rsrc(self.RES_UPDATED, add_trp=add_gr)
 
         # Direct or indirect container relationship.
         self._add_ldp_dc_ic_rel(parent_rsrc)
 
 
-    def _find_parent_or_create_pairtree(self):
-        '''
-        Check the path-wise parent of the new resource. If it exists, return
-        its UID. Otherwise, create pairtree resources up the path until an
-        actual resource or the root node is found.
-
-        @return string Resource UID.
-        '''
-        path_components = self.uid.split('/')
-
-         # If there is only one element, the parent is the root node.
-        if len(path_components) < 2:
-            return ROOT_UID
-
-        # Build search list, e.g. for a/b/c/d/e would be a/b/c/d, a/b/c, a/b, a
-        self._logger.info('Path components: {}'.format(path_components))
-        fwd_search_order = accumulate(
-            list(path_components)[:-1],
-            func=lambda x,y : x + '/' + y
-        )
-        rev_search_order = reversed(list(fwd_search_order))
-
-        cur_child_uid = self.uid
-        parent_uid = ROOT_UID # Defaults to root
-        segments = []
-        for cparent_uid in rev_search_order:
-            if self.rdfly.ask_rsrc_exists(cparent_uid):
-                # If a real parent is found, set that and break the loop.
-                parent_uid = cparent_uid
-                break
-            else:
-                # Otherwise, add to the list of segments to be built.
-                segments.append((cparent_uid, cur_child_uid))
-                cur_child_uid = cparent_uid
-
-        for segm_uid, next_uid in segments:
-            self.rdfly.add_path_segment(uid=segm_uid, next_uid=next_uid,
-                    child_uid=self.uid, parent_uid=parent_uid)
-
-        return parent_uid
-
-
     def _dedup_deltas(self, remove_gr, add_gr):
         '''
         Remove duplicate triples from add and remove delta graphs, which would
@@ -911,40 +889,6 @@ class Ldpr(metaclass=ABCMeta):
         )
 
 
-    #def _create_path_segment(self, uid, child_uid, parent_uid):
-    #    '''
-    #    Create a path segment with a non-LDP containment statement.
-
-    #    If a resource such as `fcres:a/b/c` is created, and neither fcres:a or
-    #    fcres:a/b exists, we have to create two "hidden" containment statements
-    #    between a and a/b and between a/b and a/b/c in order to maintain the
-    #    containment chain.
-
-    #    These triples are stored separately and are not versioned.
-    #    '''
-    #    rsrc_uri = nsc['fcres'][uid]
-
-    #    add_trp = {
-    #        (rsrc_uri, nsc['fcsystem'].contains, nsc['fcres'][child_uid]),
-    #        (rsrc_uri, nsc['ldp'].contains, self.urn),
-    #        (rsrc_uri, RDF.type, nsc['ldp'].Container),
-    #        (rsrc_uri, RDF.type, nsc['ldp'].BasicContainer),
-    #        (rsrc_uri, RDF.type, nsc['ldp'].RDFSource),
-    #        (rsrc_uri, RDF.type, nsc['fcrepo'].Pairtree),
-    #        (rsrc_uri, nsc['fcrepo'].hasParent, nsc['fcres'][real_parent_uid]),
-    #    }
-
-    #    self.rdfly.add_segment(nsc['fcres'][uid], next=self.urn,
-    #            child=nsc['fcres'][child_uid],
-    #            parent=nsc['fcres'][parent_uid])
-
-    #    # If the path segment is just below root
-    #    if '/' not in uid:
-    #        self.rdfly.modify_rsrc(ROOT_UID, add_trp={
-    #            (ROOT_RSRC_URI, nsc['fcsystem'].contains, nsc['fcres'][uid])
-    #        })
-
-
     def _add_ldp_dc_ic_rel(self, cont_rsrc):
         '''
         Add relationship triples from a parent direct or indirect container.

+ 3 - 46
lakesuperior/store_layouts/ldp_rs/rsrc_centric_layout.py

@@ -94,7 +94,6 @@ class RsrcCentricLayout:
         nsc['fcstruct']: {
             # These are placed in a separate graph for optimization purposes.
             'p': {
-                nsc['fcsystem'].contains,
                 nsc['ldp'].contains,
                 nsc['pcdm'].hasMember,
             }
@@ -344,12 +343,11 @@ class RsrcCentricLayout:
         )
 
 
-    def get_descendants(self, uid, recurse=True, path_segments=False):
+    def get_descendants(self, uid, recurse=True):
         '''
         Get descendants (recursive children) of a resource.
 
         @param uid (string) Resource UID.
-        @param path_segments (bool) Whether to add path segments to the
         result set.
 
         @return iterator(rdflib.URIRef) Subjects of descendant resources.
@@ -366,18 +364,10 @@ class RsrcCentricLayout:
                     _recurse(dset, ss, p, cc)
             return dset
 
-        children = (
+        return (
             _recurse(set(), subj_uri, nsc['ldp'].contains, ctx_uri)
             if recurse
             else ds.graph(ctx_uri)[subj_uri : nsc['ldp'].contains : ])
-        if path_segments:
-            psegs = (
-                _recurse(set(), subj_uri, nsc['fcsystem'].contains, ctx_uri)
-                if recurse
-                else ds.graph(ctx_uri)[subj_uri : nsc['fcsystem'].contains : ])
-            return chain(children, psegs)
-        else:
-            return children
 
 
     def patch_rsrc(self, uid, qry):
@@ -417,7 +407,7 @@ class RsrcCentricLayout:
         # remove children.
         if children:
             self._logger.debug('Purging children for /{}'.format(uid))
-            for rsrc_uri in self.get_descendants(uid, False, True):
+            for rsrc_uri in self.get_descendants(uid, False):
                 self.purge_rsrc(uid_fn(rsrc_uri), inbound, False)
             # Remove structure graph.
             self.ds.remove_graph(nsc['fcstruct'][uid])
@@ -523,39 +513,6 @@ class RsrcCentricLayout:
         return '{}/{}/{}'.format(uid, VERS_CONT_LABEL, ver_uid)
 
 
-    def add_path_segment(self, uid, next_uid, parent_uid, child_uid):
-        '''
-        Add a pairtree segment.
-
-        @param uid (string) The UID of the subject.
-        @param next_uid (string) UID of the next step down. This may be an LDP
-        resource or another segment.
-        @param parent_uid (string) UID of the actual resource(s) that contains
-        the segment.
-        @param child_uid (string) UID of the LDP resource contained by the
-        segment.
-        '''
-        props = (
-            (RDF.type, nsc['fcsystem'].PathSegment),
-            (nsc['fcsystem'].contains, nsc['fcres'][next_uid]),
-            (nsc['ldp'].contains, nsc['fcres'][child_uid]),
-            #(RDF.type, nsc['ldp'].Container),
-            #(RDF.type, nsc['ldp'].BasicContainer),
-            #(RDF.type, nsc['ldp'].RDFSource),
-            #(RDF.type, nsc['fcrepo'].Pairtree),
-            (nsc['fcrepo'].hasParent, nsc['fcres'][parent_uid]),
-        )
-        for p, o in props:
-            self.ds.graph(PTREE_GR_URI).add((nsc['fcres'][uid], p, o))
-
-
-    def delete_path_segment(self, uid):
-        '''
-        Delete a pairtree segment.
-        '''
-        self.ds.graph(PTREE_GR_URI).delete((nsc['fcres'][uid], None, None))
-
-
     def clear_smt(self, uid):
         '''
         This is an ugly way to deal with lenient SPARQL update statements

+ 39 - 5
tests/endpoints/test_ldp.py

@@ -72,16 +72,20 @@ class TestLdp:
         '''
         PUT a resource with several path segments.
 
-        The test should create intermediate path segments that are not
+        The test should create intermediate path segments that are LDPCs,
         accessible to PUT or POST.
         '''
         path = '/ldp/test_tree/a/b/c/d/e/f/g'
         self.client.put(path)
 
         assert self.client.get(path).status_code == 200
+        assert self.client.get('/ldp/test_tree/a/b/c').status_code == 200
 
-        assert self.client.put('/ldp/test_tree/a').status_code == 201
-        assert self.client.post('/ldp/test_tree/a').status_code == 201
+        assert self.client.post('/ldp/test_tree/a/b').status_code == 201
+        with open('tests/data/marcel_duchamp_single_subject.ttl', 'rb') as f:
+            put_int_resp = self.client.put(
+                    'ldp/test_tree/a', data=f, content_type='text/turtle')
+        assert put_int_resp.status_code == 204
         # @TODO More thorough testing of contents
 
 
@@ -103,7 +107,7 @@ class TestLdp:
         cont1_data = self.client.get('/ldp').data
         gr1 = Graph().parse(data=cont1_data, format='turtle')
         assert gr1[ URIRef(g.webroot + '/') : nsc['ldp'].contains : \
-                URIRef(g.webroot + '/' + uuid1) ]
+                URIRef(g.webroot + '/test_nested_tree') ]
 
         self.client.put(path2)
 
@@ -111,7 +115,7 @@ class TestLdp:
         gr2 = Graph().parse(data=cont2_data, format='turtle')
         assert gr2[ URIRef(g.webroot + '/' + uuid1) : \
                 nsc['ldp'].contains : \
-                URIRef(g.webroot + '/' + uuid2) ]
+                URIRef(g.webroot + '/' + uuid1 + '/e') ]
 
 
     def test_put_ldp_rs(self, client):
@@ -192,6 +196,36 @@ class TestLdp:
         assert ldp_nr_resp.status_code == 415
 
 
+    def test_missing_reference(self, client):
+        '''
+        PUT a resource with RDF payload referencing a non-existing in-repo
+        resource.
+
+        The payload points to three kinds of objects: an external URI, the
+        repository root, and an in-repo URI that was never created. The PUT is
+        expected to succeed and all three references are expected to be
+        returned by a subsequent GET.
+        '''
+        # NOTE(review): presumably issued so that `g.webroot` is populated
+        # before it is used to build the payload below; confirm.
+        self.client.get('/ldp')
+        data = '''
+        PREFIX ns: <http://example.org#>
+        PREFIX res: <http://example-source.org/res/>
+        <> ns:p1 res:bogus ;
+          ns:p2 <{0}/> ;
+          ns:p3 <{0}/nonexistent> .
+        '''.format(g.webroot)
+        put_rsp = self.client.put('/ldp/test_missing_ref', data=data, headers={
+            'content-type': 'text/turtle'})
+        assert put_rsp.status_code == 201
+
+        resp = self.client.get('/ldp/test_missing_ref',
+                headers={'accept' : 'text/turtle'})
+        assert resp.status_code == 200
+
+        gr = Graph().parse(data=resp.data, format='text/turtle')
+        # External reference.
+        assert URIRef('http://example-source.org/res/bogus') in \
+                gr.objects(None, URIRef('http://example.org#p1'))
+        # Reference to the repository root.
+        assert URIRef(g.webroot + '/') in \
+                gr.objects(None, URIRef('http://example.org#p2'))
+        # Reference to an in-repo resource that does not exist.
+        assert URIRef(g.webroot + '/nonexistent') in \
+                gr.objects(None, URIRef('http://example.org#p3'))
+
+
     def test_post_resource(self, client):
         '''
         Check response headers for a POST operation with empty payload.

+ 2 - 2
tests/store/test_lmdb_store.py

@@ -276,13 +276,13 @@ class TestContext:
 
             assert len(set(store.triples((None, None, None)))) == 3
             assert len(set(store.triples((None, None, None),
-                RDFLIB_DEFAULT_GRAPH_URI))) == 2
+                RDFLIB_DEFAULT_GRAPH_URI))) == 3
             assert len(set(store.triples((None, None, None), gr_uri))) == 2
             assert len(set(store.triples((None, None, None), gr2_uri))) == 1
 
             assert gr2_uri in {gr.identifier for gr in store.contexts()}
             assert trp1 in _clean(store.triples((None, None, None)))
-            assert trp1 not in _clean(store.triples((None, None, None),
+            assert trp1 in _clean(store.triples((None, None, None),
                     RDFLIB_DEFAULT_GRAPH_URI))
             assert trp2 in _clean(store.triples((None, None, None), gr_uri))
             assert trp2 in _clean(store.triples((None, None, None)))