|
@@ -1,7 +1,14 @@
|
|
|
import logging
|
|
|
|
|
|
+from contextlib import ExitStack
|
|
|
+from shutil import rmtree
|
|
|
+
|
|
|
import lmdb
|
|
|
+import requests
|
|
|
+
|
|
|
+from rdflib import Graph, URIRef
|
|
|
|
|
|
+from lakesuperior.dictionaries.namespaces import ns_collection as nsc
|
|
|
from lakesuperior.env import env
|
|
|
from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
|
|
|
|
|
@@ -28,7 +35,9 @@ def stats():
|
|
|
return repo_stats
|
|
|
|
|
|
|
|
|
-def dump(src, dest, start='/', binary_handling='include'):
|
|
|
+def dump(
|
|
|
+ src, dest, start=('/',), binary_handling='include',
|
|
|
+ compact_uris=False):
|
|
|
'''
|
|
|
Dump a whole LDP repository or parts of it to disk.
|
|
|
|
|
@@ -42,19 +51,109 @@ def dump(src, dest, start='/', binary_handling='include'):
|
|
|
@param start (tuple|list) List of starting points to retrieve resources
|
|
|
from. It would typically be the repository root in case of a full dump
|
|
|
or one or more resources in the repository for a partial one.
|
|
|
- @binary_handling (string) One of 'include', 'truncate' or 'split'.
|
|
|
+ @param binary_handling (string) One of 'include', 'truncate' or 'split'.
|
|
|
+ @param compact_uris (bool) NOT IMPLEMENTED. Whether the process should
|
|
|
+ attempt to compact URIs generated with broken up path segments. If the UID
|
|
|
+ matches a pattern such as `/12/34/56/123456...` it is converted to
|
|
|
+ `/123456...`. This would remove a lot of cruft caused by the pairtree
|
|
|
+ segments. Note that this will change the publicly exposed URIs. If
|
|
|
+ durability is a concern, a rewrite directive can be added to the HTTP
|
|
|
+ server that proxies the WSGI endpoint.
|
|
|
'''
|
|
|
|
|
|
if not isinstance(start, list) and not isinstance(start, tuple):
|
|
|
start = (start,)
|
|
|
- subjects = _gather_subjects(src, start)
|
|
|
+ _gather_resources(src, start)
|
|
|
+
|
|
|
+
|
|
|
+def _gather_resources(webroot, start_pts):
|
|
|
+ '''
|
|
|
+ Gather all resources recursively and save them to temporary store.
|
|
|
+
|
|
|
+ Resource UIDs (without the repository webroot) are saved as unique keys
|
|
|
+ in a temporary store.
|
|
|
+
|
|
|
+ @param webroot (string) Base URI of the repository.
|
|
|
+ @param start_pts (tuple|list) Starting points to gather.
|
|
|
+ '''
|
|
|
+ dbpath = '/var/tmp/fcrepo_migration_data'
|
|
|
+ rmtree(dbpath, ignore_errors=True)
|
|
|
+ with lmdb.open(
|
|
|
+ dbpath, 1024 ** 4, metasync=False, readahead=False,
|
|
|
+ meminit=False) as db:
|
|
|
+
|
|
|
+ for start in start_pts:
|
|
|
+ if not start.startswith('/'):
|
|
|
+ raise ValueError(
|
|
|
+ 'Starting point {} does not begin with a slash.'
|
|
|
+ .format(start))
|
|
|
+
|
|
|
+ _gather_refs(db, webroot, start)
|
|
|
+
|
|
|
+
|
|
|
def _gather_refs(db, base, path):
    '''
    Get the UID of a resource and its relationships recursively.

    This method recurses into itself each time a reference to a resource
    managed by the repository is encountered.

    The resource is probed with a HEAD request to find out its LDP type, then
    its RDF representation (the resource itself for an LDP-RS, its
    `fcr:metadata` for anything else) is fetched, stored in the temporary
    store keyed by the resource URI, and scanned for further references.

    @param db (lmdb.Environment) Open temporary store in which gathered
    resources are recorded.
    @param base (string) Base URL of repository. This is used to determine
    whether encountered URI terms are repository-managed.
    @param path (string) Path, relative to base URL, of the resource to
    gather.

    @raise requests.HTTPError If the HEAD or GET request returns an error
    status.
    '''
    pfx = base.rstrip('/')
    uri = pfx + path
    # Internal URI, used as the identifier of the parsed graph.
    iuri = uri.replace(pfx, nsc['fcres'])
    # One key for both the write below and the "already seen" look-ups made
    # by callers further up the recursion, which test the full resource URI.
    key = uri.encode('utf-8')

    rsp = requests.head(uri)
    rsp.raise_for_status()

    # Determine the LDP type from the Link headers. A response without any
    # Link header is treated as a non-RDF source instead of raising.
    ldp_type = 'ldp_nr'
    links = requests.utils.parse_header_links(rsp.headers.get('link', ''))
    for link in links:
        if (
                link.get('rel') == 'type'
                and link.get('url') == str(nsc['ldp'].RDFSource)):
            ldp_type = 'ldp_rs'
            break

    if ldp_type == 'ldp_rs':
        get_uri = uri
    else:
        # Non-RDF sources expose their RDF description separately.
        get_uri = uri + '/fcr:metadata'

    get_req = requests.get(get_uri)
    get_req.raise_for_status()
    data = get_req.content
    gr = Graph(identifier=iuri).parse(data=data, format='turtle')

    # Record the resource only if it has not been stored yet.
    with db.begin(write=True) as txn:
        with txn.cursor() as cur:
            if not cur.set_key(key):
                cur.put(key, data)

    # NOTE(review): only objects whose URI starts with this resource's own
    # URI are followed, as in the original code; confirm whether references
    # to other repository-managed resources should be followed as well.
    for pred, obj in gr.predicate_objects():
        if not (
                isinstance(obj, URIRef)
                and obj.startswith(uri)
                and pred != nsc['fcrepo'].hasParent):
            continue

        # Avoid infinite loops: skip anything already in the store. The read
        # transaction is closed before recursing so that no transaction is
        # held open across the nested calls.
        with db.begin() as txn:
            with txn.cursor() as cur:
                seen = cur.set_key(obj.encode('utf-8'))
        if seen:
            continue

        # Strip the normalized prefix rather than `base`, so that the
        # relative path keeps its leading slash even when `base` ends
        # with one.
        _gather_refs(db, base, str(obj)[len(pfx):])
|