@@ -1,4 +1,5 @@
 import logging
+import os

 import click_log
 from contextlib import ExitStack
@@ -9,9 +10,12 @@ import requests

 from rdflib import Graph, URIRef

+import lakesuperior.env_setup
+
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.env import env
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
+from lakesuperior.store.ldp_nr.default_layout import DefaultLayout as FileLayout

 __doc__ = '''
 Admin API.
@@ -22,6 +26,11 @@ This module contains maintenance utilities and stats.
 logger = logging.getLogger(__name__)
 app_globals = env.app_globals

+_ignore_list = (
+    nsc['fcrepo'].hasParent,
+    nsc['fcrepo'].hasTransactionProvider,
+)
+

 def stats():
     '''
@@ -36,7 +45,6 @@ def stats():
     return repo_stats


-@click_log.simple_verbosity_option(logger)
 def dump(
         src, dest, start=('/',), binary_handling='include',
         compact_uris=False):
@@ -47,9 +55,9 @@ def dump(
     correspond to the LDP root node (for Fedora it can be e.g.
     `http://localhost:8080/fcrepo/rest/`) and is used to determine if URIs
     retrieved are managed by this repository.
-    @param dest (rdflib.URIRef) Base URI of the destination. This can be any
-    container in a LAKEsuperior server. If the resource exists, it must be an
-    LDP container. If it does not exist, it will be created.
+    @param dest (str) Local path of the destination. If the location exists, it
+    must be a writable directory. It will be deleted and recreated. If it does
+    not exist, it will be created along with its parents if missing.
     @param start (tuple|list) List of starting points to retrieve resources
     from. It would typically be the repository root in case of a full dump
     or one or more resources in the repository for a partial one.
@@ -63,38 +71,35 @@ def dump(
     server that proxies the WSGI endpoint.
     '''
    # 1. Retrieve list of resources.
-    if not isinstance(start, list) and not isinstance(start, tuple):
-        start = (start,)
-    _gather_resources(src, start)
-
-
-def _gather_resources(webroot, start_pts):
-    '''
-    Gather all resources recursively and save them to temporary store.
-
-    Resource UIDs (without the repository webroot) are saved as unique keys
-    in a temporary store.
-
-    @param webroot (string) Base URI of the repository.
-    @param start_pts (tuple|list) Starting points to gather.
-    '''
-    dbpath = '/var/tmp/fcrepo_migration_data'
+    start_pts = (
+            (start,)
+            if not isinstance(start, list) and not isinstance(start, tuple)
+            else start)
+
+    dbpath = '{}/ldprs_store'.format(dest)
     rmtree(dbpath, ignore_errors=True)
+    os.makedirs(dbpath)
+    fpath = '{}/ldpnr_store'.format(dest)
+    rmtree(fpath, ignore_errors=True)
+    os.makedirs(fpath)
+
     with lmdb.open(
             dbpath, 1024 ** 4, metasync=False, readahead=False,
             meminit=False) as db:
-        #import pdb; pdb.set_trace()
         for start in start_pts:
             if not start.startswith('/'):
                 raise ValueError(
                         'Starting point {} does not begin with a slash.'
                         .format(start))

-            _gather_refs(db, webroot, start)
+            _gather_refs(db, src, start, dest)
+        entries = db.stat()['entries']
+        logger.info('Dumped {} resources.'.format(entries))
+
+    return entries


-@click_log.simple_verbosity_option(logger)
-def _gather_refs(db, base, path):
+def _gather_refs(db, base, path, dest):
     '''
     Get the UID of a resource and its relationships recursively.

@@ -103,13 +108,14 @@ def _gather_refs(db, base, path):

     @param base (string) Base URL of repository. This is used to determine
     whether encountered URI terms are repository-managed.
-    @param base (string) Path, relative to base URL, of the resource to gather.
+    @param path (string) Path, relative to base URL, of the resource to gather.
+    @param dest (string) Local path for RDF database and non-RDF files.
     '''
     pfx = base.rstrip('/')
     # Public URI of source repo.
     uri = pfx + path
     # Internal URI of destination.
-    iuri = uri.replace(pfx, nsc['fcres'])
+    iuri = URIRef(uri.replace(pfx, nsc['fcres']))
     ibase = base.replace(pfx, nsc['fcres'])

     rsp = requests.head(uri)
@@ -124,15 +130,12 @@ def _gather_refs(db, base, path):
             ldp_type = 'ldp_rs'
             break

-    if ldp_type == 'ldp_rs':
-        # Get the whole RDF document now because we have to know all outbound
-        # links.
-        get_uri = uri
-    else:
-        get_uri = uri + '/fcr:metadata'
-
+    # Get the whole RDF document now because we have to know all outbound
+    # links.
+    get_uri = uri if ldp_type == 'ldp_rs' else '{}/fcr:metadata'.format(uri)
     get_req = requests.get(get_uri)
     get_req.raise_for_status()
+
     data = get_req.content.replace(base.encode('utf-8'), ibase.encode('utf-8'))
     logger.debug('Localized data: {}'.format(data.decode('utf-8')))
     gr = Graph(identifier=iuri).parse(data=data, format='turtle')
@@ -148,16 +151,30 @@ def _gather_refs(db, base, path):
         if not cur.set_key(iuri.encode('utf-8')):
             cur.put(uri.encode('utf-8'), data)

+    # Grab binary.
+    if ldp_type == 'ldp_nr':
+        bin_resp = requests.get('{}/fcr:content'.format(uri))
+        bin_resp.raise_for_status()
+
+        # @FIXME Use a more robust checking mechanism. Maybe offer the option
+        # to verify the content checksum.
+        cnt_hash = gr.value(iuri, nsc['premis'].hasMessageDigest).replace(
+                'urn:sha1:', '')
+        fpath = FileLayout.local_path('{}/ldpnr_store'.format(dest), cnt_hash)
+        os.makedirs(os.path.dirname(fpath), exist_ok=True)
+        with open(fpath, 'wb') as f:
+            f.write(bin_resp.content)
+
     # Now, crawl through outbound links.
     # LDP-NR fcr:metadata must be checked too.
     for pred, obj in gr.predicate_objects():
         if (
                 isinstance(obj, URIRef)
                 and obj.startswith(iuri)
-                and pred != nsc['fcrepo'].hasParent):
+                and pred not in _ignore_list):
             with db.begin() as txn:
                 with txn.cursor() as cur:
                     # Avoid ∞
                     if cur.set_key(obj.encode('utf-8')):
                         continue
-                    _gather_refs(db, base, obj.replace(ibase, ''))
+                    _gather_refs(db, base, obj.replace(ibase, ''), dest)
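
A minimal usage sketch of the revised dump() follows (not part of the patch).
The import path, source URL, and destination directory are assumptions for
illustration only.

    # Hypothetical invocation of dump() after this change. The module path
    # `lakesuperior.api.admin` and all URLs/paths below are assumed, not
    # taken from the patch.
    from lakesuperior.api import admin

    # Mirror the repository rooted at `src` into the local directory `dest`.
    # After this patch, `dest` receives an `ldprs_store` LMDB environment for
    # RDF data and an `ldpnr_store` tree for binaries; both are wiped and
    # recreated on each run.
    entries = admin.dump(
            src='http://localhost:8080/fcrepo/rest',
            dest='/var/tmp/fcrepo_dump',
            start='/')  # a bare string is normalized to a 1-tuple internally
    print('Dumped {} resources.'.format(entries))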