Browse Source

WIP dump method: store to LMDB (no localization).

Stefano Cossu 6 years ago
parent
commit
f9837d4308
2 changed files with 117 additions and 13 deletions
  1. 109 10
      lakesuperior/api/admin.py
  2. 8 3
      lsup-admin

+ 109 - 10
lakesuperior/api/admin.py

@@ -1,7 +1,14 @@
 import logging
 
+from contextlib import ExitStack
+from shutil import rmtree
+
 import lmdb
+import requests
+
+from rdflib import Graph, URIRef
 
+from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.env import env
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 
@@ -28,7 +35,9 @@ def stats():
     return repo_stats
 
 
def dump(
        src, dest, start=('/',), binary_handling='include',
        compact_uris=False):
    '''
    Dump a whole LDP repository or parts of it to disk.

    @param src (string) Base URI (webroot) of the source repository.
    @param dest (string) Destination of the dump. NOTE(review): currently
    unused — this WIP implementation only gathers resources into a
    temporary store; nothing is written to `dest` yet.
    @param start (tuple|list|string) List of starting points to retrieve
    resources from. It would typically be the repository root in case of a
    full dump or one or more resources in the repository for a partial one.
    A single string is accepted and treated as a one-element tuple.
    @param binary_handling (string) One of 'include', 'truncate' or 'split'.
    NOTE(review): currently unused by this WIP implementation.
    @param compact_uris (bool) NOT IMPLEMENTED. Whether the process should
    attempt to compact URIs generated with broken up path segments. If the UID
    matches a pattern such as `/12/34/56/123456...` it is converted to
    `/123456...`. This would remove a lot of cruft caused by the pairtree
    segments. Note that this will change the publicly exposed URIs. If
    durability is a concern, a rewrite directive can be added to the HTTP
    server that proxies the WSGI endpoint.
    '''
    # 1. Retrieve list of resources.
    # Normalize a single starting point into a tuple of starting points.
    if not isinstance(start, (list, tuple)):
        start = (start,)
    # WIP: only the gathering phase is implemented so far; `dest`,
    # `binary_handling` and `compact_uris` are not acted upon yet.
    _gather_resources(src, start)
+
+
def _gather_resources(webroot, start_pts):
    '''
    Gather all resources recursively and save them to temporary store.

    Resource UIDs (without the repository webroot) are saved as unique keys
    in a temporary store.

    @param webroot (string) Base URI of the repository.
    @param start_pts (tuple|list) Starting points to gather.

    @raise ValueError If any starting point does not begin with a slash.
    '''
    # Validate all starting points BEFORE wiping and re-creating the
    # temporary store, so that a bad argument does not needlessly destroy
    # previously gathered data.
    for start in start_pts:
        if not start.startswith('/'):
            raise ValueError(
                    'Starting point {} does not begin with a slash.'
                    .format(start))

    dbpath = '/var/tmp/fcrepo_migration_data'
    # Start from a clean slate on every run.
    rmtree(dbpath, ignore_errors=True)
    # 1 TiB map size; sync/readahead/meminit disabled since this is a
    # disposable scratch store.
    with lmdb.open(
            dbpath, 1024 ** 4, metasync=False, readahead=False,
            meminit=False) as db:
        for start in start_pts:
            _gather_refs(db, webroot, start)
+
+
def _gather_refs(db, base, path):
    '''
    Get the UID of a resource and its relationships recursively.

    This method recurses into itself each time a reference to a resource
    managed by the repository is encountered.

    @param db (lmdb.Environment) Open LMDB environment used as the
    temporary resource store.
    @param base (string) Base URL of repository. This is used to determine
    whether encountered URI terms are repository-managed.
    @param path (string) Path, relative to base URL, of the resource to
    gather.
    '''
    pfx = base.rstrip('/')
    # Public URI of source repo.
    uri = pfx + path
    # Internal URI of destination.
    iuri = uri.replace(pfx, nsc['fcres'])

    rsp = requests.head(uri)
    rsp.raise_for_status()

    # Determine LDP type from the `Link` response headers. Default to
    # LDP-NR unless an explicit LDP-RS type link is found. The header may
    # be absent, in which case there is nothing to parse.
    ldp_type = 'ldp_nr'
    link_hdr = rsp.headers.get('link')
    if link_hdr:
        for link in requests.utils.parse_header_links(link_hdr):
            if (
                    link.get('rel') == 'type'
                    and link.get('url') == str(nsc['ldp'].RDFSource)):
                ldp_type = 'ldp_rs'
                break

    if ldp_type == 'ldp_rs':
        # Get the whole RDF document now because we have to know all
        # outbound links.
        get_uri = uri
    else:
        # For a non-RDF source, only its description (fcr:metadata) is RDF.
        get_uri = uri + '/fcr:metadata'

    get_rsp = requests.get(get_uri)
    get_rsp.raise_for_status()
    data = get_rsp.content
    gr = Graph(identifier=iuri).parse(data=data, format='turtle')

    # First store the resource, so when we recurse, a resource referring back
    # to this resource will skip it as already existing and avoid an infinite
    # loop.
    #
    # The RDF data stream inserted is the turtle-serialized bytestring as it
    # comes from the request.
    with db.begin(write=True) as txn:
        with txn.cursor() as cur:
            # The existence check and the stored key must use the SAME key.
            # The public URI is used because the recursion guard below
            # looks up outbound (public) URIs. (The original checked `iuri`
            # but stored `uri`, so the check could never match.)
            if not cur.set_key(uri.encode('utf-8')):
                cur.put(uri.encode('utf-8'), data)

    # Now, crawl through outbound links.
    # LDP-NR fcr:metadata must be checked too.
    for pred, obj in gr.predicate_objects():
        if (
                isinstance(obj, URIRef)
                and obj.startswith(uri)
                and pred != nsc['fcrepo'].hasParent):
            with db.begin() as txn:
                with txn.cursor() as cur:
                    # Skip references already gathered, to avoid infinite
                    # recursion on circular relationships.
                    if cur.set_key(obj.encode('utf-8')):
                        continue
            # Strip the normalized prefix (not `base`, which may carry a
            # trailing slash) so the recursive call always receives a
            # path beginning with a slash.
            _gather_refs(db, base, obj.replace(pfx, ''))

+ 8 - 3
lsup-admin

@@ -119,19 +119,24 @@ def copy():
 @click.argument('src')
 @click.argument('dest')
 @click.option(
-    '--binaries', '-b', show_default=True,
+    '--start', '-s', default='/', show_default=True,
+    help='Starting point for looking for resources in the repository.\n'
+    'The default `/` value starts at the root, i.e. dumps the whole '
+    'repository.')
+@click.option(
+    '--binaries', '-b', default='include', show_default=True,
     help='If set to `include`, full binaries are included in the dump. If '
     'set to `truncate`, binaries are created as zero-byte files in the proper '
     'folder structure. If set to `skip`, binaries are not exported. Data '
     'folders are not created.')
-def dump(src, dest, binaries='include'):
+def dump(src, dest, start, binaries):
     '''
     Dump a repository or parts of it to disk.
 
     Dump an LDP repository to disk. The source repo can be LAKEsuperior or
     another LDP-compatible implementation.
     '''
-    pass
+    return admin_api.dump(src, dest, start, binaries)
 
 
 @click.command()