@@ -1,4 +1,5 @@
 import logging
+import os

 import click_log
 from contextlib import ExitStack
@@ -9,9 +10,12 @@ import requests

 from rdflib import Graph, URIRef

+import lakesuperior.env_setup
+
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.env import env
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
+from lakesuperior.store.ldp_nr.default_layout import DefaultLayout as FileLayout

 __doc__ = '''
 Admin API.
@@ -22,6 +26,11 @@ This module contains maintenance utilities and stats.
 logger = logging.getLogger(__name__)
 app_globals = env.app_globals

+_ignore_list = (
+    nsc['fcrepo'].hasParent,
+    nsc['fcrepo'].hasTransactionProvider,
+)
+

 def stats():
     '''
@@ -36,7 +45,6 @@ def stats():
     return repo_stats


-@click_log.simple_verbosity_option(logger)
 def dump(
         src, dest, start=('/',), binary_handling='include',
         compact_uris=False):
@@ -47,9 +55,9 @@ def dump(
     correspond to the LDP root node (for Fedora it can be e.g.
     `http://localhost:8080/fcrepo/rest/`) and is used to determine if URIs
     retrieved are managed by this repository.
-    @param dest (rdflib.URIRef) Base URI of the destination. This can be any
-    container in a LAKEsuperior server. If the resource exists, it must be an
-    LDP container. If it does not exist, it will be created.
+    @param dest (str) Local path of the destination. If the location exists, it
+    must be a writable directory. It will be deleted and recreated. If it does
+    not exist, it will be created along with its parents if missing.
     @param start (tuple|list) List of starting points to retrieve resources
     from. It would typically be the repository root in case of a full dump
     or one or more resources in the repository for a partial one.
@@ -63,38 +71,35 @@ def dump(
     server that proxies the WSGI endpoint.
     '''
    # 1. Retrieve list of resources.
-    if not isinstance(start, list) and not isinstance(start, tuple):
-        start = (start,)
-    _gather_resources(src, start)
-
-
-def _gather_resources(webroot, start_pts):
-    '''
-    Gather all resources recursively and save them to temporary store.
-
-    Resource UIDs (without the repository webroot) are saved as unique keys
-    in a temporary store.
-
-    @param webroot (string) Base URI of the repository.
-    @param start_pts (tuple|list) Starting points to gather.
-    '''
-    dbpath = '/var/tmp/fcrepo_migration_data'
+    start_pts = (
+            (start,)
+            if not isinstance(start, list) and not isinstance(start, tuple)
+            else start)
+
+    dbpath = '{}/ldprs_store'.format(dest)
     rmtree(dbpath, ignore_errors=True)
+    os.makedirs(dbpath)
+    fpath = '{}/ldpnr_store'.format(dest)
+    rmtree(fpath, ignore_errors=True)
+    os.makedirs(fpath)
+
     with lmdb.open(
             dbpath, 1024 ** 4, metasync=False, readahead=False,
             meminit=False) as db:
-        #import pdb; pdb.set_trace()
         for start in start_pts:
             if not start.startswith('/'):
                 raise ValueError(
                         'Starting point {} does not begin with a slash.'
                         .format(start))

-            _gather_refs(db, webroot, start)
+            _gather_refs(db, src, start, dest)
+        entries = db.stat()['entries']
+        logger.info('Dumped {} resources.'.format(entries))
+
+    return entries


-@click_log.simple_verbosity_option(logger)
-def _gather_refs(db, base, path):
+def _gather_refs(db, base, path, dest):
     '''
     Get the UID of a resource and its relationships recursively.

@@ -103,13 +108,14 @@ def _gather_refs(db, base, path):

     @param base (string) Base URL of repository. This is used to determine
     whether encountered URI terms are repository-managed.
-    @param base (string) Path, relative to base URL, of the resource to gather.
+    @param path (string) Path, relative to base URL, of the resource to gather.
+    @param dest (string) Local path for RDF database and non-RDF files.
     '''
     pfx = base.rstrip('/')
     # Public URI of source repo.
     uri = pfx + path
     # Internal URI of destination.
-    iuri = uri.replace(pfx, nsc['fcres'])
+    iuri = URIRef(uri.replace(pfx, nsc['fcres']))
     ibase = base.replace(pfx, nsc['fcres'])

     rsp = requests.head(uri)
@@ -124,15 +130,12 @@ def _gather_refs(db, base, path):
             ldp_type = 'ldp_rs'
             break

-    if ldp_type == 'ldp_rs':
-        # Get the whole RDF document now because we have to know all outbound
-        # links.
-        get_uri = uri
-    else:
-        get_uri = uri + '/fcr:metadata'
-
+    # Get the whole RDF document now because we have to know all outbound
+    # links.
+    get_uri = uri if ldp_type == 'ldp_rs' else '{}/fcr:metadata'.format(uri)
     get_req = requests.get(get_uri)
     get_req.raise_for_status()
+
     data = get_req.content.replace(base.encode('utf-8'), ibase.encode('utf-8'))
     logger.debug('Localized data: {}'.format(data.decode('utf-8')))
     gr = Graph(identifier=iuri).parse(data=data, format='turtle')
@@ -148,16 +151,30 @@ def _gather_refs(db, base, path):
         if not cur.set_key(iuri.encode('utf-8')):
             cur.put(uri.encode('utf-8'), data)

+    # Grab binary.
+    if ldp_type == 'ldp_nr':
+        bin_resp = requests.get('{}/fcr:content'.format(uri))
+        bin_resp.raise_for_status()
+
+        # @FIXME Use a more robust checking mechanism. Maybe offer the option
+        # to verify the content checksum.
+        cnt_hash = gr.value(iuri, nsc['premis'].hasMessageDigest).replace(
+                'urn:sha1:', '')
+        fpath = FileLayout.local_path('{}/ldpnr_store'.format(dest), cnt_hash)
+        os.makedirs(os.path.dirname(fpath), exist_ok=True)
+        with open(fpath, 'wb') as f:
+            f.write(bin_resp.content)
+
     # Now, crawl through outbound links.
     # LDP-NR fcr:metadata must be checked too.
     for pred, obj in gr.predicate_objects():
         if (
                 isinstance(obj, URIRef)
                 and obj.startswith(iuri)
-                and pred != nsc['fcrepo'].hasParent):
+                and pred not in _ignore_list):
             with db.begin() as txn:
                 with txn.cursor() as cur:
                     # Avoid ∞
                     if cur.set_key(obj.encode('utf-8')):
                         continue
-                    _gather_refs(db, base, obj.replace(ibase, ''))
+                    _gather_refs(db, base, obj.replace(ibase, ''), dest)
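
A minimal usage sketch of the revised dump() follows (not part of the patch).
The import path, source URL, and destination directory are assumptions for
illustration only.

    # Hypothetical invocation of dump() after this change. The module path
    # `lakesuperior.api.admin` and all URLs/paths below are assumed, not
    # taken from the patch.
    from lakesuperior.api import admin

    # Mirror the repository rooted at `src` into the local directory `dest`.
    # After this patch, `dest` receives an `ldprs_store` LMDB environment for
    # RDF data and an `ldpnr_store` tree for binaries; both are wiped and
    # recreated on each run.
    entries = admin.dump(
            src='http://localhost:8080/fcrepo/rest',
            dest='/var/tmp/fcrepo_dump',
            start='/')  # a bare string is normalized to a 1-tuple internally
    print('Dumped {} resources.'.format(entries))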