Browse Source

WIP dump method: store to LMDB (no localization).

Stefano Cossu 6 years ago
parent
commit
f9837d4308
2 changed files with 117 additions and 13 deletions
  1. 109 10
      lakesuperior/api/admin.py
  2. 8 3
      lsup-admin

+ 109 - 10
lakesuperior/api/admin.py

@@ -1,7 +1,14 @@
 import logging
 import logging
 
 
+from contextlib import ExitStack
+from shutil import rmtree
+
 import lmdb
 import lmdb
+import requests
+
+from rdflib import Graph, URIRef
 
 
+from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.env import env
 from lakesuperior.env import env
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 
 
@@ -28,7 +35,9 @@ def stats():
     return repo_stats
     return repo_stats
 
 
 
 
def dump(
        src, dest, start=('/',), binary_handling='include',
        compact_uris=False):
    '''
    Dump a whole LDP repository or parts of it to disk.

    WIP: currently only the resource-gathering phase is implemented; data
    are stored in a temporary LMDB store and not yet written to `dest`.

    @param src (string) Base URI (webroot) of the source repository, used
    to determine whether encountered URIs are repository-managed.
    @param dest (string) Dump destination. NOT YET USED by this WIP
    implementation.
    @param start (tuple|list) List of starting points to retrieve resources
    from. It would typically be the repository root in case of a full dump
    or one or more resources in the repository for a partial one. A single
    string is also accepted and is wrapped in a tuple.
    @param binary_handling (string) One of 'include', 'truncate' or 'split'.
    NOT YET USED.
    @param compact_uris (bool) NOT IMPLEMENTED. Whether the process should
    attempt to compact URIs generated with broken up path segments. If the UID
    matches a pattern such as `/12/34/56/123456...` it is converted to
    `/123456...`. This would remove a lot of cruft caused by the pairtree
    segments. Note that this will change the publicly exposed URIs. If
    durability is a concern, a rewrite directive can be added to the HTTP
    server that proxies the WSGI endpoint.
    '''
    # 1. Retrieve list of resources.
    # Normalize a bare string starting point into a one-element tuple.
    if not isinstance(start, (list, tuple)):
        start = (start,)
    _gather_resources(src, start)
+
+
def _gather_resources(
        webroot, start_pts, dbpath='/var/tmp/fcrepo_migration_data'):
    '''
    Gather all resources recursively and save them to a temporary store.

    Resource URIs are saved as unique keys in a temporary LMDB store, with
    their serialized RDF as values (see `_gather_refs`).

    @param webroot (string) Base URI of the repository.
    @param start_pts (tuple|list) Starting points to gather. Each must be a
    path beginning with a slash.
    @param dbpath (string) Filesystem path of the temporary LMDB
    environment. The directory is wiped before use.

    @raise ValueError If a starting point does not begin with a slash.
    '''
    # Validate all starting points *before* touching the filesystem, so a
    # bad argument does not needlessly wipe a previous gathering run.
    for start in start_pts:
        if not start.startswith('/'):
            raise ValueError(
                    'Starting point {} does not begin with a slash.'
                    .format(start))

    rmtree(dbpath, ignore_errors=True)
    # 1 TiB map size; LMDB only allocates what is actually used.
    with lmdb.open(
            dbpath, 1024 ** 4, metasync=False, readahead=False,
            meminit=False) as db:
        for start in start_pts:
            _gather_refs(db, webroot, start)
+
+
def _gather_refs(db, base, path):
    '''
    Get the URI of a resource and its relationships recursively.

    This method recurses into itself each time a reference to a resource
    managed by the repository is encountered.

    @param db (lmdb.Environment) Open LMDB environment used both to track
    visited resources and to store their serialized RDF.
    @param base (string) Base URL of repository. This is used to determine
    whether encountered URI terms are repository-managed.
    @param path (string) Path, relative to base URL, of the resource to
    gather.
    '''
    pfx = base.rstrip('/')
    # Public URI of source repo.
    uri = pfx + path
    # Internal URI of destination.
    iuri = uri.replace(pfx, nsc['fcres'])

    rsp = requests.head(uri)
    rsp.raise_for_status()

    # Determine LDP type. Default to LDP-NR unless an explicit
    # ldp:RDFSource type link is advertised. Guard against a missing Link
    # header, which would crash parse_header_links.
    ldp_type = 'ldp_nr'
    link_hdr = rsp.headers.get('link')
    if link_hdr:
        for link in requests.utils.parse_header_links(link_hdr):
            if (
                    link.get('rel') == 'type'
                    and link.get('url') == str(nsc['ldp'].RDFSource)):
                ldp_type = 'ldp_rs'
                break

    if ldp_type == 'ldp_rs':
        # Get the whole RDF document now because we have to know all outbound
        # links.
        get_uri = uri
    else:
        # For LDP-NR, only the description resource (fcr:metadata) is RDF.
        get_uri = uri + '/fcr:metadata'

    get_req = requests.get(get_uri)
    get_req.raise_for_status()
    data = get_req.content
    gr = Graph(identifier=iuri).parse(data=data, format='turtle')

    # First store the resource, so when we recurse, a resource referring back
    # to this resource will skip it as already existing and avoid an infinite
    # loop.
    #
    # The RDF data stream inserted is the turtle-serialized bytestring as it
    # comes from the request.
    #
    # Keys are the *public* URIs: the recursion guard below compares
    # outbound link targets (also public URIs) against these keys, so both
    # checks must use the same key space. (The original WIP checked `iuri`
    # but stored under `uri`, so the pre-insert check could never match.)
    key = uri.encode('utf-8')
    with db.begin(write=True) as txn:
        with txn.cursor() as cur:
            if not cur.set_key(key):
                cur.put(key, data)

    # Now, crawl through outbound links.
    # LDP-NR fcr:metadata must be checked too.
    for pred, obj in gr.predicate_objects():
        if (
                isinstance(obj, URIRef)
                and obj.startswith(uri)
                and pred != nsc['fcrepo'].hasParent):
            with db.begin() as txn:
                with txn.cursor() as cur:
                    # Skip already-gathered resources to avoid infinite
                    # recursion.
                    if cur.set_key(obj.encode('utf-8')):
                        continue
            # Derive the relative path with the slash-stripped `pfx` (not
            # `base`) so a trailing slash in `base` cannot corrupt it;
            # replace only the first occurrence.
            _gather_refs(db, base, obj.replace(pfx, '', 1))

+ 8 - 3
lsup-admin

@@ -119,19 +119,24 @@ def copy():
 @click.argument('src')
 @click.argument('src')
 @click.argument('dest')
 @click.argument('dest')
 @click.option(
 @click.option(
-    '--binaries', '-b', show_default=True,
+    '--start', '-s', default='/', show_default=True,
+    help='Starting point for looking for resources in the repository.\n'
+    'The default `/` value starts at the root, i.e. dumps the whole '
+    'repository.')
+@click.option(
+    '--binaries', '-b', default='include', show_default=True,
     help='If set to `include`, full binaries are included in the dump. If '
     help='If set to `include`, full binaries are included in the dump. If '
     'set to `truncate`, binaries are created as zero-byte files in the proper '
     'set to `truncate`, binaries are created as zero-byte files in the proper '
     'folder structure. If set to `skip`, binaries are not exported. Data '
     'folder structure. If set to `skip`, binaries are not exported. Data '
     'folders are not created.')
     'folders are not created.')
-def dump(src, dest, binaries='include'):
+def dump(src, dest, start, binaries):
     '''
     '''
     Dump a repository or parts of it to disk.
     Dump a repository or parts of it to disk.
 
 
     Dump an LDP repository to disk. The source repo can be LAKEsuperior or
     Dump an LDP repository to disk. The source repo can be LAKEsuperior or
     another LDP-compatible implementation.
     another LDP-compatible implementation.
     '''
     '''
-    pass
+    return admin_api.dump(src, dest, start, binaries)
 
 
 
 
 @click.command()
 @click.command()