
Merge branch 'development' of https://github.com/scossu/lakesuperior into development

Stefano Cossu 7 years ago
parent
commit
ed317e3d01

+ 2 - 0
README.md

@@ -160,6 +160,8 @@ meant to live as a community project.
 
 [Messaging](doc/notes/messaging.md)
 
+[Migration Guide](doc/notes/migration.md)
+
 [Command-Line Reference](doc/notes/cli.md)
 
 [Storage Implementation](doc/notes/storage.md)

+ 2 - 0
data/bootstrap/rsrc_centric_layout.sparql

@@ -12,6 +12,8 @@ INSERT DATA {
     <info:fcres/> a
       fcrepo:RepositoryRoot , fcrepo:Resource , fcrepo:Container ,
       ldp:Container , ldp:BasicContainer , ldp:RDFSource ;
+      fcrepo:created "$timestamp"^^xsd:dateTime ;
+      fcrepo:lastModified "$timestamp"^^xsd:dateTime ;
     .
   }
 

+ 13 - 4
doc/notes/fcrepo4_deltas.md

@@ -167,13 +167,22 @@ specs should not notice any difference.
 The following are improvements in performance or usability that can only be taken
 advantage of if client code is adjusted.
 
-### LDP-NR metadata by content negotiation
+### LDP-NR content and metadata
 
 FCREPO4 relies on the `/fcr:metadata` identifier to retrieve RDF metadata about
 an LDP-NR. LAKEsuperior supports this as a legacy option, but encourages the
-use of content negotiation to do the same. Any request to an LDP-NR with an
-`Accept` header set to one of the supported RDF serialization formats will
-yield the RDF metadata of the resource instead of the binary contents.
+use of content negotiation to do the same while offering explicit endpoints
+for RDF and non-RDF content retrieval.
+
+Any request to an LDP-NR with an `Accept` header set to one of the supported
+RDF serialization formats will yield the RDF metadata of the resource instead
+of the binary contents.
+
+The `fcr:metadata` URI returns the RDF metadata of an LDP-NR.
+
+The `fcr:content` URI returns the non-RDF content.
+
+The two options above return an HTTP error if requested for an LDP-RS.
 
 ### "Include" and "Omit" options for children
 

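As an illustration of the negotiation described above, a client can reach both representations of an LDP-NR; a minimal sketch using `requests`, assuming a server at `http://localhost:8000/ldp` and a hypothetical LDP-NR at `/my-image`:

```
import requests

base = 'http://localhost:8000/ldp'  # assumed local endpoint
uid = '/my-image'                   # hypothetical LDP-NR

# RDF metadata via content negotiation on the resource URI.
meta = requests.get(base + uid, headers={'Accept': 'text/turtle'})

# The same metadata via the explicit endpoint.
meta_alt = requests.get(base + uid + '/fcr:metadata')

# The binary content via the explicit endpoint.
binary = requests.get(base + uid + '/fcr:content')
```
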
+ 44 - 0
doc/notes/migration.md

@@ -0,0 +1,44 @@
+# Migration Guide
+
+A command-line utility is available as part of the `lsup-admin` suite
+to migrate contents from a Fedora implementation to LAKEsuperior.
+
+A repository can be migrated with a one-line command such as:
+
+```
+./lsup-admin migrate http://source-repo.edu/rest /local/dest/folder
+```
+
+For more options, enter
+
+```
+./lsup-admin migrate --help
+```
+
+The script will crawl through the resources and follow outbound links
+within them. In order to do this, resources are added as raw triples (i.e.
+no consistency checks are made).
+
+**Note:** the consistency check tool has not been implemented yet, but its
+release should follow shortly. It will ensure that all links between
+resources are consistent with regard to referential integrity.
+
+This script will create a full dataset in the specified destination folder,
+complete with a default configuration that allows the LAKEsuperior server to
+be started immediately after the migration is complete.
+
+Two approaches to migration are possible:
+
+1. By providing a starting point on the source repository. E.g. if the
+   repository you want to migrate is at `http://repo.edu/rest/prod` you can add
+   the `-s /prod` option to the script to avoid migrating irrelevant branches.
+   Note that the script will still reach outside of the starting point if
+   resources are referencing other resources outside of it.
+2. By providing a file containing a list of resources to migrate. This is
+   useful if a source repository cannot produce a full list (e.g. the root node
+   has more children than the server can handle) but a list of individual
+   resources is available via an external index (Solr, triplestore, etc.).
+   The resources can be indicated by their fully qualified URIs or paths
+   relative to the repository root. (*TODO latter option needs testing*)
+
+

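The migration can also be driven from Python through the new admin API (see `lakesuperior/api/admin.py` below); a minimal sketch, reusing the source and destination from the examples above:

```
from lakesuperior.api import admin

# Equivalent to:
# ./lsup-admin migrate http://source-repo.edu/rest /local/dest/folder -s /prod
ct = admin.migrate(
        'http://source-repo.edu/rest', '/local/dest/folder',
        start_pts='/prod')  # a bare string is normalized to a tuple
print('Migrated {} resources.'.format(ct))
```
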
+ 22 - 5
lakesuperior/api/admin.py

@@ -1,27 +1,44 @@
 import logging
 
 from lakesuperior.env import env
+from lakesuperior.migrator import Migrator
+from lakesuperior.store.ldp_nr.default_layout import DefaultLayout as FileLayout
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 
-__doc__ = '''
+__doc__ = """
 Admin API.
 
 This module contains maintenance utilities and stats.
-'''
+"""
 
 logger = logging.getLogger(__name__)
-app_globals = env.app_globals
 
 
 def stats():
-    '''
+    """
     Get repository statistics.
 
     @return dict Store statistics, resource statistics.
-    '''
+    """
+    import lakesuperior.env_setup
     repo_stats = {'rsrc_stats': env.app_globals.rdfly.count_rsrc()}
     with TxnManager(env.app_globals.rdf_store) as txn:
         repo_stats['store_stats'] = env.app_globals.rdf_store.stats()
 
     return repo_stats
 
+
+def migrate(src, dest, start_pts=None, list_file=None, **kwargs):
+    """
+    Migrate an LDP repository to a new LAKEsuperior instance.
+
+    See :py:meth:`Migrator.__init__`.
+    """
+    if start_pts:
+        if not isinstance(
+                start_pts, list) and not isinstance(start_pts, tuple):
+            start_pts = (start_pts,)
+    elif not list_file:
+        start_pts = ('/',)
+
+    return Migrator(src, dest, **kwargs).migrate(start_pts, list_file)

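For reference, a short sketch of how `stats()` can be used from a Python shell; it loads the default environment on its own:

```
from lakesuperior.api import admin

stats = admin.stats()
print(stats['rsrc_stats'])   # Resource counts from the RDF layout.
print(stats['store_stats'])  # LMDB store statistics.
```
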
+ 39 - 20
lakesuperior/api/resource.py

@@ -20,7 +20,6 @@ from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 
 
 logger = logging.getLogger(__name__)
-app_globals = env.app_globals
 
 __doc__ = '''
 Primary API for resource manipulation.
@@ -75,12 +74,11 @@ def transaction(write=False):
             # update timestamps on resources.
             env.timestamp = arrow.utcnow()
             env.timestamp_term = Literal(env.timestamp, datatype=XSD.dateTime)
-            with TxnManager(app_globals.rdf_store, write=write) as txn:
+            with TxnManager(env.app_globals.rdf_store, write=write) as txn:
                 ret = fn(*args, **kwargs)
-            if len(app_globals.changelog):
+            if len(env.app_globals.changelog):
                 job = Thread(target=process_queue)
                 job.start()
-            logger.debug('Deleting timestamp: {}'.format(getattr(env, 'timestamp')))
             delattr(env, 'timestamp')
             delattr(env, 'timestamp_term')
             return ret
@@ -94,8 +92,8 @@ def process_queue():
     '''
     lock = Lock()
     lock.acquire()
-    while len(app_globals.changelog):
-        send_event_msg(*app_globals.changelog.popleft())
+    while len(env.app_globals.changelog):
+        send_event_msg(*env.app_globals.changelog.popleft())
     lock.release()
 
 
@@ -118,11 +116,35 @@ def send_event_msg(remove_trp, add_trp, metadata):
     subjects = set(remove_dict.keys()) | set(add_dict.keys())
     for rsrc_uri in subjects:
         logger.debug('Processing event for subject: {}'.format(rsrc_uri))
-        app_globals.messenger.send(rsrc_uri, **metadata)
+        env.app_globals.messenger.send(rsrc_uri, **metadata)
 
 
 ### API METHODS ###
 
+@transaction()
+def exists(uid):
+    '''
+    Return whether a resource exists (is stored) in the repository.
+
+    @param uid (string) Resource UID.
+    '''
+    try:
+        exists = LdpFactory.from_stored(uid).is_stored
+    except ResourceNotExistsError:
+        exists = False
+    return exists
+
+
+@transaction()
+def get_metadata(uid):
+    '''
+    Get metadata (admin triples) of an LDPR resource.
+
+    @param uid (string) Resource UID.
+    '''
+    return LdpFactory.from_stored(uid).metadata
+
+
 @transaction()
 def get(uid, repr_options={}):
     '''
@@ -229,13 +251,10 @@ def update(uid, update_str, is_metadata=False):
     If False, and the resource being updated is a LDP-NR, an error is raised.
     '''
     rsrc = LdpFactory.from_stored(uid)
-    if LDP_NR_TYPE in rsrc.ldp_types:
-        if is_metadata:
-            rsrc.patch_metadata(update_str)
-        else:
-            raise InvalidResourceError(uid)
-    else:
-        rsrc.patch(update_str)
+    if LDP_NR_TYPE in rsrc.ldp_types and not is_metadata:
+        raise InvalidResourceError(uid)
+
+    rsrc.sparql_update(update_str)
 
     return rsrc
 
@@ -266,11 +285,11 @@ def delete(uid, soft=True):
     '''
     # If referential integrity is enforced, grab all inbound relationships
     # to break them.
-    refint = app_globals.rdfly.config['referential_integrity']
+    refint = env.app_globals.rdfly.config['referential_integrity']
     inbound = True if refint else inbound
     repr_opts = {'incl_inbound' : True} if refint else {}
 
-    children = app_globals.rdfly.get_descendants(uid)
+    children = env.app_globals.rdfly.get_descendants(uid)
 
     if soft:
         rsrc = LdpFactory.from_stored(uid, repr_opts)
@@ -279,16 +298,16 @@ def delete(uid, soft=True):
         for child_uri in children:
             try:
                 child_rsrc = LdpFactory.from_stored(
-                    app_globals.rdfly.uri_to_uid(child_uri),
+                    env.app_globals.rdfly.uri_to_uid(child_uri),
                     repr_opts={'incl_children' : False})
             except (TombstoneError, ResourceNotExistsError):
                 continue
             child_rsrc.bury_rsrc(inbound, tstone_pointer=rsrc.uri)
     else:
-        ret = app_globals.rdfly.forget_rsrc(uid, inbound)
+        ret = env.app_globals.rdfly.forget_rsrc(uid, inbound)
         for child_uri in children:
-            child_uid = app_globals.rdfly.uri_to_uid(child_uri)
-            ret = app_globals.rdfly.forget_rsrc(child_uid, inbound)
+            child_uid = env.app_globals.rdfly.uri_to_uid(child_uri)
+            ret = env.app_globals.rdfly.forget_rsrc(child_uid, inbound)
 
     return ret
 

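The new `exists()` and `get_metadata()` calls can be combined like the other API methods; a short sketch with a hypothetical UID:

```
from lakesuperior.api import resource as rsrc_api

if rsrc_api.exists('/my-resource'):  # hypothetical UID
    md = rsrc_api.get_metadata('/my-resource')
    print(md.graph.serialize(format='turtle').decode('utf-8'))
```
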
+ 69 - 46
lakesuperior/config_parser.py

@@ -5,49 +5,72 @@ from os import path, environ
 import hiyapyco
 import yaml
 
-configs = (
-    'application',
-    'logging',
-    'namespaces',
-    'flask',
-)
-
-# This will hold a dict of all configuration values.
-config = {}
-
-# Parse configuration
-CONFIG_DIR = environ.get(
-        'FCREPO_CONFIG_DIR',
-        path.dirname(path.dirname(path.abspath(__file__))) + '/etc.defaults')
-
-print('Reading configuration at {}'.format(CONFIG_DIR))
-
-for cname in configs:
-    file = '{}/{}.yml'.format(CONFIG_DIR , cname)
-    with open(file, 'r') as stream:
-        config[cname] = yaml.load(stream, yaml.SafeLoader)
-
-# Merge default and test configurations.
-error_msg = '''
-**************
-** WARNING! **
-**************
-
-Your test {} store location is set to be the same as the production location.
-This means that if you run a test suite, your live data may be wiped clean!
-
-Please review your configuration before starting.
-'''
-
-test_config = {'application': hiyapyco.load(CONFIG_DIR + '/application.yml',
-        CONFIG_DIR + '/test.yml', method=hiyapyco.METHOD_MERGE)}
-
-if config['application']['store']['ldp_rs']['location'] \
-        == test_config['application']['store']['ldp_rs']['location']:
-            raise RuntimeError(error_msg.format('RDF'))
-            sys.exit()
-
-if config['application']['store']['ldp_nr']['path'] \
-        == test_config['application']['store']['ldp_nr']['path']:
-            raise RuntimeError(error_msg.format('binary'))
-            sys.exit()
+
+def parse_config(config_dir=None):
+    """
+    Parse configuration from a directory.
+
+    This is normally called by the standard endpoints (``lsup_admin``, web
+    server, etc.) or by a Python client by importing
+    :py:mod:`lakesuperior.env_setup` but an application using a non-default
+    configuration may specify an alternative configuration directory.
+
+    The directory must have the same structure as the one provided in
+    ``etc.defaults``.
+
+    :param config_dir: Location on the filesystem of the configuration
+    directory. The default is set by the ``FCREPO_CONFIG_DIR`` environment
+    variable or, if this is not set, the ``etc.defaults`` stock directory.
+    """
+    configs = (
+        'application',
+        'logging',
+        'namespaces',
+        'flask',
+    )
+
+    if not config_dir:
+        config_dir = environ.get('FCREPO_CONFIG_DIR', path.dirname(
+                path.dirname(path.abspath(__file__))) + '/etc.defaults')
+
+    # This will hold a dict of all configuration values.
+    _config = {}
+
+    print('Reading configuration at {}'.format(config_dir))
+
+    for cname in configs:
+        file = '{}/{}.yml'.format(config_dir, cname)
+        with open(file, 'r') as stream:
+            _config[cname] = yaml.load(stream, yaml.SafeLoader)
+
+    error_msg = '''
+    **************
+    ** WARNING! **
+    **************
+
+    Your test {} store location is set to be the same as the production
+    location. This means that if you run a test suite, your live data may be
+    wiped clean!
+
+    Please review your configuration before starting.
+    '''
+
+    # Merge default and test configurations.
+    _test_config = {'application': hiyapyco.load(
+            config_dir + '/application.yml',
+            config_dir + '/test.yml', method=hiyapyco.METHOD_MERGE)}
+
+    if _config['application']['store']['ldp_rs']['location'] \
+            == _test_config['application']['store']['ldp_rs']['location']:
+                raise RuntimeError(error_msg.format('RDF'))
+
+    if _config['application']['store']['ldp_nr']['path'] \
+            == _test_config['application']['store']['ldp_nr']['path']:
+                raise RuntimeError(error_msg.format('binary'))
+    return _config, _test_config
+
+
+# Load default configuration.
+config, test_config = parse_config()

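With this refactoring, an embedding application can load a custom configuration before setting up its globals; a sketch, assuming a hypothetical `/opt/myapp/etc` directory that mirrors `etc.defaults`:

```
from lakesuperior.config_parser import parse_config
from lakesuperior.env import env
from lakesuperior.globals import AppGlobals

# /opt/myapp/etc is hypothetical; it must contain the same YAML files
# as etc.defaults (application.yml, logging.yml, namespaces.yml, etc.).
env.config, test_config = parse_config('/opt/myapp/etc')
env.app_globals = AppGlobals(env.config)
```
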
+ 16 - 17
lakesuperior/endpoints/ldp.py

@@ -107,9 +107,11 @@ def log_request_end(rsp):
 
 @ldp.route('/<path:uid>', methods=['GET'], strict_slashes=False)
 @ldp.route('/', defaults={'uid': '/'}, methods=['GET'], strict_slashes=False)
-@ldp.route('/<path:uid>/fcr:metadata', defaults={'force_rdf' : True},
+@ldp.route('/<path:uid>/fcr:metadata', defaults={'out_fmt' : 'rdf'},
+        methods=['GET'])
+@ldp.route('/<path:uid>/fcr:content', defaults={'out_fmt' : 'non_rdf'},
         methods=['GET'])
-def get_resource(uid, force_rdf=False):
+def get_resource(uid, out_fmt=None):
     '''
     https://www.w3.org/TR/ldp/#ldpr-HTTP_GET
 
@@ -117,9 +119,9 @@ def get_resource(uid, force_rdf=False):
 
     @param uid (string) UID of resource to retrieve. The repository root has
     an empty string for UID.
-    @param force_rdf (boolean) Whether to retrieve RDF even if the resource is
+    @param out_fmt (string) Force output to RDF or non-RDF if the resource is
     a LDP-NR. This is not available in the API but is used e.g. by the
-    `*/fcr:metadata` endpoint. The default is False.
+    `*/fcr:metadata` and `*/fcr:content` endpoints. The default is None, which
+    lets content negotiation determine the output format.
     '''
     logger.info('UID: {}'.format(uid))
     out_headers = std_headers
@@ -137,17 +139,22 @@ def get_resource(uid, force_rdf=False):
     except TombstoneError as e:
         return _tombstone_response(e, uid)
     else:
+        if out_fmt is None:
+            out_fmt = (
+                    'rdf'
+                    if isinstance(rsrc, LdpRs) or is_accept_hdr_rdf_parsable()
+                    else 'non_rdf')
         out_headers.update(_headers_from_metadata(rsrc))
         uri = g.tbox.uid_to_uri(uid)
-        if (
-                isinstance(rsrc, LdpRs)
-                or is_accept_hdr_rdf_parsable()
-                or force_rdf):
+        if out_fmt == 'rdf':
             ggr = g.tbox.globalize_graph(rsrc.out_graph)
             ggr.namespace_manager = nsm
             return _negotiate_content(ggr, out_headers, uid=uid, uri=uri)
         else:
-            logger.info('Streaming out binary content.')
+            if not getattr(rsrc, 'local_path', False):
+                return ('{} has no binary content.'.format(rsrc.uid), 404)
+
+            logger.debug('Streaming out binary content.')
             rsp = make_response(send_file(
                     rsrc.local_path, as_attachment=True,
                     attachment_filename=rsrc.filename,
@@ -265,7 +272,6 @@ def put_resource(uid):
     rsp_headers = {'Content-Type' : 'text/plain; charset=utf-8'}
 
     handling, disposition = set_post_put_params()
-    #import pdb; pdb.set_trace()
     stream, mimetype = _bistream_from_req()
 
     if LdpFactory.is_rdf_parsable(mimetype):
@@ -494,13 +500,6 @@ def _bistream_from_req():
     return stream, mimetype
 
 
-def _get_bitstream(rsrc):
-    # @TODO This may change in favor of more low-level handling if the file
-    # system is not local.
-    return send_file(rsrc.local_path, as_attachment=True,
-            attachment_filename=rsrc.filename)
-
-
 def _tombstone_response(e, uid):
     headers = {
         'Link': '<{}/fcr:tombstone>; rel="hasTombstone"'.format(request.url),

+ 11 - 6
lakesuperior/endpoints/templates/resource.html

@@ -23,15 +23,20 @@
 </nav>
 {% endblock %}
 {% block content %}
+{% if gr[gr.identifier : nsc['rdf'].type : nsc['ldp'].NonRDFSource] %}
+<div class="pull-right">
+    <a href="{{ gr.identifier }}/fcr:content" class="btn btn-success btn-lg">
+        <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
+        Download Content</a>
+</div>
+{% endif %}
 {% set created_ts = arrow.get(
-    gr.value(gr.identifier, nsc['fcrepo'].created)).replace(
-    tzinfo='local') %}
+    gr.value(gr.identifier, nsc['fcrepo'].created)).to('local') %}
 {% set updated_ts = arrow.get(
-    gr.value(gr.identifier, nsc['fcrepo'].lastModified)).replace(
-    tzinfo='local') %}
-<p><strong>Created on:</strong>&nbsp;{{ created_ts }}&nbsp;
+    gr.value(gr.identifier, nsc['fcrepo'].lastModified)).to('local') %}
+<p><strong>Created on:</strong>&nbsp;{{ created_ts.format('YYYY-MM-DD HH:mm:ss ZZ') }}&nbsp;
 ({{created_ts.humanize() }})</p>
-<p><strong>Last updated on:</strong>&nbsp;{{ updated_ts }}&nbsp;
+<p><strong>Last updated on:</strong>&nbsp;{{ updated_ts.format('YYYY-MM-DD HH:mm:ss ZZ') }}&nbsp;
 ({{updated_ts.humanize() }})</p>
 <p><strong>Types:</strong>
 {% for t in gr[gr.identifier : nsc['rdf'].type :] | sort %}

+ 278 - 0
lakesuperior/migrator.py

@@ -0,0 +1,278 @@
+import logging
+import shutil
+
+from io import BytesIO
+from contextlib import ContextDecorator
+from os import makedirs, path
+from urllib.parse import urldefrag
+
+import lmdb
+import requests
+import yaml
+
+from rdflib import Graph, URIRef
+
+from lakesuperior.dictionaries.namespaces import ns_collection as nsc
+from lakesuperior.env import env
+from lakesuperior.globals import AppGlobals, ROOT_UID
+from lakesuperior.config_parser import parse_config
+from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
+
+
+logger = logging.getLogger(__name__)
+
+
+class StoreWrapper(ContextDecorator):
+    '''
+    Open and close a store.
+    '''
+    def __init__(self, store):
+        self.store = store
+
+    def __enter__(self):
+        self.store.open(
+                env.config['application']['store']['ldp_rs'])
+
+    def __exit__(self, *exc):
+        self.store.close()
+
+
+class Migrator:
+    """
+    Class to handle a database migration.
+
+    This class holds state of progress and shared variables as it crawls
+    through linked resources in an LDP server.
+
+    Since a repository migration can be a very long operation, and it is
+    impossible to know the number of resources to gather through LDP interaction
+    alone, a progress ticker outputs the number of processed resources at
+    regular intervals.
+    """
+
+    """
+    LMDB database parameters.
+
+    See :meth:`lmdb.Environment.__init__`
+    """
+    db_params = {
+        'map_size': 1024 ** 4,
+        'metasync': False,
+        'readahead': False,
+        'meminit': False,
+    }
+
+    """List of predicates to ignore when looking for links."""
+    ignored_preds = (
+        nsc['fcrepo'].hasParent,
+        nsc['fcrepo'].hasTransactionProvider,
+        nsc['fcrepo'].hasFixityService,
+    )
+
+
+    def __init__(
+            self, src, dest, zero_binaries=False, compact_uris=False,
+            skip_errors=False):
+        """
+        Set up base paths and clean up existing directories.
+
+        :param src: (URIRef) Webroot of source repository. This must
+        correspond to the LDP root node (for Fedora it can be e.g.
+        ``http://localhost:8080/fcrepo/rest/``) and is used to determine if URIs
+        retrieved are managed by this repository.
+        :param dest: (str) Destination repository path. If the location exists
+        it must be a writable directory. It will be deleted and recreated. If
+        it does not exist, it will be created along with its parents if
+        missing.
+        :param zero_binaries: (bool) Whether binaries are created as zero-byte
+        files in the proper folder structure rather than having their full
+        content copied.
+        :param skip_errors: (bool) Whether to log errors raised while
+        retrieving resources from the source repository rather than quitting.
+        :param compact_uris: (bool) NOT IMPLEMENTED. Whether the process should
+        attempt to compact URIs generated with broken up path segments. If the
+        UID matches a pattern such as `/12/34/56/123456...` it is converted to
+        `/123456...`. This would remove a lot of cruft caused by the pairtree
+        segments. Note that this will change the publicly exposed URIs. If
+        durability is a concern, a rewrite directive can be added to the HTTP
+        server that proxies the WSGI endpoint.
+        """
+        # Set up repo folder structure and copy default configuration to
+        # destination file.
+        cur_dir = path.dirname(path.dirname(path.abspath(__file__)))
+        self.dbpath = '{}/data/ldprs_store'.format(dest)
+        self.fpath = '{}/data/ldpnr_store'.format(dest)
+        self.config_dir = '{}/etc'.format(dest)
+
+        shutil.rmtree(dest, ignore_errors=True)
+        shutil.copytree(
+                '{}/etc.defaults'.format(cur_dir), self.config_dir)
+
+        # Modify and overwrite destination configuration.
+        orig_config, _ = parse_config(self.config_dir)
+        orig_config['application']['store']['ldp_rs']['location'] = self.dbpath
+        orig_config['application']['store']['ldp_nr']['path'] = self.fpath
+
+        with open('{}/application.yml'.format(self.config_dir), 'w') \
+                as config_file:
+            config_file.write(yaml.dump(orig_config['application']))
+
+        env.config, _ = parse_config(self.config_dir)
+        env.app_globals = AppGlobals(env.config)
+
+        self.rdfly = env.app_globals.rdfly
+        self.nonrdfly = env.app_globals.nonrdfly
+
+        with TxnManager(env.app_globals.rdf_store, write=True) as txn:
+            self.rdfly.bootstrap()
+            self.rdfly.store.close()
+        env.app_globals.nonrdfly.bootstrap()
+
+        self.src = src.rstrip('/')
+        self.zero_binaries = zero_binaries
+        self.skip_errors = skip_errors
+
+
+
+    def migrate(self, start_pts=None, list_file=None):
+        """
+        Migrate the database.
+
+        This method creates a fully functional and configured LAKEsuperior
+        data set contained in a folder from an LDP repository.
+
+        :param tuple|list start_pts: List of starting points to retrieve
+        resources from. It would typically be the repository root in case of a
+        full dump or one or more resources in the repository for a partial one.
+        :param str list_file: Path to a local file containing a list of URIs,
+        one per line.
+        """
+        from lakesuperior.api import resource as rsrc_api
+        self._ct = 0
+        with StoreWrapper(self.rdfly.store):
+            if start_pts:
+                for start in start_pts:
+                    if not start.startswith('/'):
+                        raise ValueError(
+                            'Starting point {} does not begin with a slash.'
+                            .format(start))
+
+                    if start != ROOT_UID:
+                        # Create the full hierarchy with link to the parents.
+                        rsrc_api.create_or_replace(start)
+                    # Then populate the new resource and crawl for more
+                    # relationships.
+                    self._crawl(start)
+            elif list_file:
+                with open(list_file, 'r') as fp:
+                    for uri in fp:
+                        uid = uri.strip().replace(self.src, '')
+                        if uid != ROOT_UID:
+                            rsrc_api.create_or_replace(uid)
+                        self._crawl(uid)
+        logger.info('Dumped {} resources.'.format(self._ct))
+
+        return self._ct
+
+
+    def _crawl(self, uid):
+        """
+        Get the contents of a resource and its relationships recursively.
+
+        This method recurses into itself each time a reference to a resource
+        managed by the repository is encountered.
+
+        @param uid (string) The path relative to the source server webroot
+        pointing to the resource to crawl, effectively the resource UID.
+        """
+        ibase = str(nsc['fcres'])
+        # Public URI of source repo.
+        uri = self.src + uid
+        # Internal URI of destination.
+        iuri = ibase + uid
+
+        rsp = requests.head(uri)
+        if not self.skip_errors:
+            rsp.raise_for_status()
+        elif rsp.status_code > 399:
+            print('Error retrieving resource {} headers: {} {}'.format(
+                uri, rsp.status_code, rsp.text))
+
+        # Determine LDP type.
+        ldp_type = 'ldp_nr'
+        try:
+            for link in requests.utils.parse_header_links(
+                    rsp.headers.get('link')):
+                if (
+                        link.get('rel') == 'type'
+                        and (
+                            link.get('url') == str(nsc['ldp'].RDFSource)
+                            or link.get('url') == str(nsc['ldp'].Container))
+                ):
+                    # Resource is an LDP-RS.
+                    ldp_type = 'ldp_rs'
+                    break
+        except TypeError:
+            ldp_type = 'ldp_rs'
+            #raise ValueError('URI {} is not an LDP resource.'.format(uri))
+
+        # Get the whole RDF document now because we have to know all outbound
+        # links.
+        get_uri = (
+                uri if ldp_type == 'ldp_rs' else '{}/fcr:metadata'.format(uri))
+        get_rsp = requests.get(get_uri)
+        if not self.skip_errors:
+            get_rsp.raise_for_status()
+        elif get_rsp.status_code > 399:
+            print('Error retrieving resource {} body: {} {}'.format(
+                uri, get_rsp.status_code, get_rsp.text))
+
+        data = get_rsp.content.replace(
+                self.src.encode('utf-8'), ibase.encode('utf-8'))
+        gr = Graph(identifier=iuri).parse(data=data, format='turtle')
+
+        # Store raw graph data. No checks.
+        with TxnManager(self.rdfly.store, True):
+            self.rdfly.modify_rsrc(uid, add_trp=set(gr))
+
+        # Grab binary and set new resource parameters.
+        if ldp_type == 'ldp_nr':
+            provided_imr = gr.resource(URIRef(iuri))
+            if self.zero_binaries:
+                data = b''
+            else:
+                bin_rsp = requests.get(uri)
+                if not self.skip_errors:
+                    bin_rsp.raise_for_status()
+                elif bin_rsp.status_code > 399:
+                    print('Error retrieving resource {} body: {} {}'.format(
+                        uri, bin_rsp.status_code, bin_rsp.text))
+                data = bin_rsp.content
+            uuid = str(gr.value(
+                URIRef(iuri), nsc['premis'].hasMessageDigest)).split(':')[-1]
+            fpath = self.nonrdfly.local_path(
+                    self.nonrdfly.config['path'], uuid)
+            makedirs(path.dirname(fpath), exist_ok=True)
+            with open(fpath, 'wb') as fh:
+                fh.write(data)
+
+        self._ct += 1
+        if self._ct % 10 == 0:
+            print('{} resources processed so far.'.format(self._ct))
+
+        # Now, crawl through outbound links.
+        # LDP-NR fcr:metadata must be checked too.
+        for pred, obj in gr.predicate_objects():
+            obj_uid = obj.replace(ibase, '')
+            with TxnManager(self.rdfly.store, True):
+                conditions = bool(
+                    isinstance(obj, URIRef)
+                    and obj.startswith(iuri)
+                    # Avoid ∞ loop with fragment URIs.
+                    and str(urldefrag(obj).url) != str(iuri)
+                    # Avoid ∞ loop with circular references.
+                    and not self.rdfly.ask_rsrc_exists(obj_uid)
+                    and pred not in self.ignored_preds
+                )
+            if conditions:
+                print('Object {} will be crawled.'.format(obj_uid))
+                self._crawl(urldefrag(obj_uid).url)

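The LDP type detection in `_crawl()` hinges on the `Link` response headers; the same check in isolation, as a sketch against a hypothetical source resource:

```
import requests

uri = 'http://source-repo.edu/rest/prod/item1'  # hypothetical resource
rsp = requests.head(uri)

# An LDP-RS or container advertises itself with a `Link: <...>; rel="type"`
# header; anything else is treated as an LDP-NR.
ldp_type = 'ldp_nr'
for link in requests.utils.parse_header_links(rsp.headers.get('link', '')):
    if link.get('rel') == 'type' and link.get('url') in (
            'http://www.w3.org/ns/ldp#RDFSource',
            'http://www.w3.org/ns/ldp#Container'):
        ldp_type = 'ldp_rs'
        break
print(ldp_type)
```
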
+ 7 - 4
lakesuperior/model/ldp_factory.py

@@ -78,7 +78,8 @@ class LdpFactory:
 
 
     @staticmethod
-    def from_provided(uid, mimetype, stream=None, **kwargs):
+    def from_provided(
+            uid, mimetype=None, stream=None, provided_imr=None, **kwargs):
         '''
         Determine LDP type from request content.
 
@@ -87,15 +88,15 @@ class LdpFactory:
         @param stream (IOStream | None) The provided data stream. This can be
         RDF or non-RDF content, or None. In the latter case, an empty container
         is created.
+        @param provided_imr (rdflib.resource.Resource | None) Pre-built IMR;
+        if provided for an LDP-NR, it supplies additional metadata.
+        @param **kwargs Arguments passed to the LDP class constructor.
         '''
         uri = nsc['fcres'][uid]
 
-        if not stream:
+        if not stream and not mimetype:
             # Create empty LDPC.
             logger.info('No data received in request. '
                     'Creating empty container.')
             inst = Ldpc(uid, provided_imr=Resource(Graph(), uri), **kwargs)
-
         elif __class__.is_rdf_parsable(mimetype):
             # Create container and populate it with provided RDF data.
             input_rdf = stream.read()
@@ -125,7 +126,9 @@ class LdpFactory:
 
         else:
             # Create a LDP-NR and equip it with the binary file provided.
-            provided_imr = Resource(Graph(), uri)
+            # The IMR can also be provided for additional metadata.
+            if not provided_imr:
+                provided_imr = Resource(Graph(), uri)
             inst = LdpNr(uid, stream=stream, mimetype=mimetype,
                     provided_imr=provided_imr, **kwargs)
 

+ 2 - 12
lakesuperior/model/ldp_nr.py

@@ -63,7 +63,8 @@ class LdpNr(Ldpr):
     def local_path(self):
         cksum_term = self.imr.value(nsc['premis'].hasMessageDigest)
         cksum = str(cksum_term.identifier.replace('urn:sha1:',''))
-        return nonrdfly.local_path(cksum)
+        return nonrdfly.__class__.local_path(
+                nonrdfly.root, cksum, nonrdfly.bl, nonrdfly.bc)
 
 
     def create_or_replace(self, create_only=False):
@@ -87,17 +88,6 @@ class LdpNr(Ldpr):
             return ev_type
 
 
-    def patch_metadata(self, update_str):
-        '''
-        Update resource metadata by applying a SPARQL-UPDATE query.
-
-        @param update_str (string) SPARQL-Update staements.
-        '''
-        self.handling = 'lenient' # FCREPO does that and Hyrax requires it.
-
-        return self._sparql_update(update_str)
-
-
     ## PROTECTED METHODS ##
 
     def _add_srv_mgd_triples(self, create=False):

+ 27 - 58
lakesuperior/model/ldpr.py

@@ -97,6 +97,16 @@ class Ldpr(metaclass=ABCMeta):
     }
 
 
+    # Predicates to remove when a resource is replaced.
+    delete_preds_on_replace = {
+        nsc['ebucore'].hasMimeType, 
+        nsc['fcrepo'].lastModified,
+        nsc['fcrepo'].lastModifiedBy,
+        nsc['premis'].hasSize, 
+        nsc['premis'].hasMessageDigest,
+    }
+
+
     ## MAGIC METHODS ##
 
     def __init__(self, uid, repr_opts={}, provided_imr=None, **kwargs):
@@ -121,6 +131,8 @@ class Ldpr(metaclass=ABCMeta):
 
         self.provided_imr = provided_imr
 
+        # Disable all internal checks e.g. for raw I/O.
+
 
     @property
     def rsrc(self):
@@ -211,33 +223,6 @@ class Ldpr(metaclass=ABCMeta):
         self._metadata = rsrc
 
 
-    @property
-    def stored_or_new_imr(self):
-        '''
-        Extract an in-memory resource for harmless manipulation and output.
-
-        If the resource is not stored (yet), initialize a new IMR with basic
-        triples.
-
-        @return rdflib.resource.Resource
-        '''
-        if not hasattr(self, '_imr'):
-            if hasattr(self, '_imr_options'):
-                #logger.debug('IMR options:{}'.format(self._imr_options))
-                imr_options = self._imr_options
-            else:
-                imr_options = {}
-            options = dict(imr_options, strict=True)
-            try:
-                self._imr = rdfly.extract_imr(self.uid, **options)
-            except ResourceNotExistsError:
-                self._imr = Resource(Graph(), self.uri)
-                for t in self.base_types:
-                    self.imr.add(RDF.type, t)
-
-        return self._imr
-
-
     @property
     def out_graph(self):
         '''
@@ -249,7 +234,7 @@ class Ldpr(metaclass=ABCMeta):
             if (
                 # Exclude digest hash and version information.
                 t[1] not in {
-                    nsc['premis'].hasMessageDigest,
+                    #nsc['premis'].hasMessageDigest,
                     nsc['fcrepo'].hasVersion,
                 }
             ) and (
@@ -372,8 +357,8 @@ class Ldpr(metaclass=ABCMeta):
         @param create_only (boolean) Whether this is a create-only operation.
         '''
         create = create_only or not self.is_stored
-        ev_type = RES_CREATED if create else RES_UPDATED
 
+        ev_type = RES_CREATED if create else RES_UPDATED
         self._add_srv_mgd_triples(create)
         ref_int = rdfly.config['referential_integrity']
         if ref_int:
@@ -384,10 +369,10 @@ class Ldpr(metaclass=ABCMeta):
             rdfly.truncate_rsrc(self.uid)
 
         remove_trp = {
-            (self.uri, nsc['fcrepo'].lastModified, None),
-            (self.uri, nsc['fcrepo'].lastModifiedBy, None),
-        }
-        add_trp = set(self.provided_imr.graph) | self._containment_rel(create)
+            (self.uri, pred, None) for pred in self.delete_preds_on_replace}
+        add_trp = (
+                set(self.provided_imr.graph)
+                | self._containment_rel(create))
 
         self._modify_rsrc(ev_type, remove_trp, add_trp)
         new_gr = Graph()
@@ -399,24 +384,6 @@ class Ldpr(metaclass=ABCMeta):
         return ev_type
 
 
-    def put(self):
-        '''
-        https://www.w3.org/TR/ldp/#ldpr-HTTP_PUT
-        '''
-        return self.create_or_replace()
-
-
-    def patch(self, update_str):
-        '''
-        Update an existing resource by applying a SPARQL-UPDATE query.
-
-        @param update_str (string) SPARQL-Update staements.
-        '''
-        self.handling = 'lenient' # FCREPO does that and Hyrax requires it.
-
-        return self._sparql_update(update_str)
-
-
     def bury_rsrc(self, inbound, tstone_pointer=None):
         '''
         Delete a single resource and create a tombstone.
@@ -510,7 +477,7 @@ class Ldpr(metaclass=ABCMeta):
             (self.uri, nsc['fcrepo'].hasVersion, ver_uri),
             (self.uri, nsc['fcrepo'].hasVersions, nsc['fcres'][vers_uid]),
         }
-        self._modify_rsrc(RES_UPDATED, add_trp=rsrc_add_gr, notify=False)
+        self._modify_rsrc(None, add_trp=rsrc_add_gr)
 
         return ver_uid
 
@@ -608,7 +575,7 @@ class Ldpr(metaclass=ABCMeta):
 
 
     def _modify_rsrc(
-            self, ev_type, remove_trp=set(), add_trp=set(), notify=True):
+            self, ev_type, remove_trp=set(), add_trp=set()):
         '''
         Low-level method to modify a graph for a single resource.
 
@@ -616,14 +583,16 @@ class Ldpr(metaclass=ABCMeta):
         store that needs to be notified should be performed by invoking this
         method.
 
-        @param ev_type (string) The type of event (create, update, delete).
+        @param ev_type (string|None) The type of event (create, update,
+        delete) or None. In the latter case, no notification is sent.
         @param remove_trp (set) Triples to be removed.
         @param add_trp (set) Triples to be added.
-        @param notify (boolean) Whether to send a message about the change.
         '''
         rdfly.modify_rsrc(self.uid, remove_trp, add_trp)
 
-        if notify and env.config['application'].get('messaging'):
+        if (
+                ev_type is not None
+                and env.config['application'].get('messaging')):
             logger.debug('Enqueuing message for {}'.format(self.uid))
             self._enqueue_msg(ev_type, remove_trp, add_trp)
 
@@ -857,7 +826,7 @@ class Ldpr(metaclass=ABCMeta):
         return add_trp
 
 
-    def _sparql_update(self, update_str, notify=True):
+    def sparql_update(self, update_str):
         '''
         Apply a SPARQL update to a resource.
 
@@ -868,7 +837,7 @@ class Ldpr(metaclass=ABCMeta):
         self.handling = 'lenient' # FCREPO does that and Hyrax requires it.
         delta = self._sparql_delta(update_str)
 
-        return self._modify_rsrc(RES_UPDATED, *delta, notify=notify)
+        return self._modify_rsrc(RES_UPDATED, *delta)
 
 
     def _sparql_delta(self, q):

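With `_sparql_update()` now public as `sparql_update()`, both LDP-RS updates and LDP-NR metadata updates funnel through the single code path shown in `lakesuperior/api/resource.py` above; a usage sketch with a hypothetical UID:

```
from lakesuperior.api import resource as rsrc_api

# is_metadata=True is required when the target is an LDP-NR.
rsrc = rsrc_api.update('/my-resource', '''
DELETE {} INSERT {
  <> <http://purl.org/dc/terms/title> "New title" .
} WHERE {}
''', is_metadata=True)
```
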
+ 33 - 27
lakesuperior/store/ldp_nr/default_layout.py

@@ -15,6 +15,36 @@ class DefaultLayout(BaseNonRdfLayout):
     '''
     Default file layout.
     '''
+    @staticmethod
+    def local_path(root, uuid, bl=4, bc=4):
+        '''
+        Generate the resource path splitting the resource checksum according to
+        configuration parameters.
+
+        @param root (string) Filesystem root of the binary store.
+        @param uuid (string) The resource UUID. This corresponds to the content
+        checksum.
+        @param bl (int) Pairtree branch length.
+        @param bc (int) Number of pairtree branches.
+        '''
+        logger.debug('Generating path from uuid: {}'.format(uuid))
+        term = len(uuid) if bc == 0 else min(bc * bl, len(uuid))
+
+        path = [uuid[i : i + bl] for i in range(0, term, bl)]
+
+        if bc > 0:
+            path.append(uuid[term :])
+        path.insert(0, root)
+
+        return '/'.join(path)
+
+
+    def __init__(self, *args, **kwargs):
+        '''
+        Set up path segmentation parameters.
+        '''
+        super().__init__(*args, **kwargs)
+
+        self.bl = self.config['pairtree_branch_length']
+        self.bc = self.config['pairtree_branches']
+
 
     ## INTERFACE METHODS ##
 
@@ -60,11 +90,11 @@ class DefaultLayout(BaseNonRdfLayout):
             os.unlink(tmp_file)
             raise
         if size == 0:
-            logger.warn('Zero-file size received.')
+            logger.warn('Zero-length file received.')
 
         # Move temp file to final destination.
         uuid = hash.hexdigest()
-        dst = self.local_path(uuid)
+        dst = __class__.local_path(self.root, uuid, self.bl, self.bc)
         logger.debug('Saving file to disk: {}'.format(dst))
         if not os.access(os.path.dirname(dst), os.X_OK):
             os.makedirs(os.path.dirname(dst))
@@ -84,28 +114,4 @@ class DefaultLayout(BaseNonRdfLayout):
         '''
         See BaseNonRdfLayout.delete.
         '''
-        os.unlink(self.local_path(uuid))
-
-
-    ## PROTECTED METHODS ##
-
-    def local_path(self, uuid):
-        '''
-        Generate the resource path splitting the resource checksum according to
-        configuration parameters.
-
-        @param uuid (string) The resource UUID. This corresponds to the content
-        checksum.
-        '''
-        logger.debug('Generating path from uuid: {}'.format(uuid))
-        bl = self.config['pairtree_branch_length']
-        bc = self.config['pairtree_branches']
-        term = len(uuid) if bc==0 else min(bc*bl, len(uuid))
-
-        path = [ uuid[i:i+bl] for i in range(0, term, bl) ]
-
-        if bc > 0:
-            path.append(uuid[term:])
-        path.insert(0, self.root)
-
-        return '/'.join(path)
+        os.unlink(__class__.local_path(self.root, uuid, self.bl, self.bc))

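Since `local_path` is now a static method, the pairtree splitting can be verified in isolation; with the default branch parameters (`bl=4`, `bc=4`) and the SHA-1 checksum of `foobar`:

```
from lakesuperior.store.ldp_nr.default_layout import DefaultLayout

cksum = '8843d7f92416211de9ebb963ff4ce28125932878'  # sha1('foobar')
print(DefaultLayout.local_path('/data/ldpnr_store', cksum))
# /data/ldpnr_store/8843/d7f9/2416/211d/e9ebb963ff4ce28125932878
```
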
+ 7 - 2
lakesuperior/store/ldp_rs/rsrc_centric_layout.py

@@ -2,6 +2,9 @@ import logging
 
 from collections import defaultdict
 from itertools import chain
+from string import Template
+
+import arrow
 
 from rdflib import Dataset, Graph, Literal, URIRef, plugin
 from rdflib.namespace import RDF
@@ -178,7 +181,8 @@ class RsrcCentricLayout:
         store.open()
         with TxnManager(store, True):
             with open('data/bootstrap/rsrc_centric_layout.sparql', 'r') as f:
-                self.ds.update(f.read())
+                data = Template(f.read())
+                self.ds.update(data.substitute(timestamp=arrow.utcnow()))
 
 
     def get_raw(self, uri, ctx=None):
@@ -491,7 +495,8 @@ class RsrcCentricLayout:
             # Add metadata.
             meta_gr.set(
                     (gr_uri, nsc['foaf'].primaryTopic, nsc['fcres'][uid]))
-            meta_gr.set((gr_uri, nsc['fcrepo'].created, env.timestamp_term))
+            ts = getattr(env, 'timestamp_term', Literal(arrow.utcnow()))
+            meta_gr.set((gr_uri, nsc['fcrepo'].created, ts))
             if historic:
                 # @FIXME Ugly reverse engineering.
                 ver_uid = uid.split(VERS_CONT_LABEL)[1].lstrip('/')

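The bootstrap file is now read as a `string.Template`, which is how the `$timestamp` placeholders added to `data/bootstrap/rsrc_centric_layout.sparql` above get expanded; a minimal sketch of the mechanism:

```
from string import Template

import arrow

q = Template(
        'INSERT DATA { <info:fcres/> '
        'fcrepo:created "$timestamp"^^xsd:dateTime . }')
print(q.substitute(timestamp=arrow.utcnow()))
```
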
+ 54 - 16
lsup-admin

@@ -1,19 +1,18 @@
 #!/usr/bin/env python
 import click
+import click_log
 import json
+import logging
 import os
 import sys
 
-import lakesuperior.env_setup
-
 from lakesuperior.api import admin as admin_api
 from lakesuperior.config_parser import config
-from lakesuperior.globals import AppGlobals
 from lakesuperior.env import env
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
 
-rdfly = env.app_globals.rdfly
-nonrdfly = env.app_globals.nonrdfly
+logger = logging.getLogger(__name__)
+click_log.basic_config(logger)
 
 
 @click.group()
@@ -31,6 +30,11 @@ def bootstrap():
 
     Additional scaffolding files may be parsed to create initial contents.
     '''
+    import lakesuperior.env_setup
+
+    rdfly = env.app_globals.rdfly
+    nonrdfly = env.app_globals.nonrdfly
+
     click.echo(
             click.style(
                 'WARNING: This operation will WIPE ALL YOUR DATA.\n',
@@ -119,19 +123,53 @@ def copy():
 @click.argument('src')
 @click.argument('dest')
 @click.option(
-    '--binaries', '-b', show_default=True,
-    help='If set to `include`, full binaries are included in the dump. If '
-    'set to `truncate`, binaries are created as zero-byte files in the proper '
-    'folder structure. If set to `skip`, binaries are not exported. Data '
-    'folders are not created.')
-def dump(src, dest, binaries='include'):
+    '--start', '-s', show_default=True,
+    help='Starting point for looking for resources in the repository.\n'
+    'The default `/` value starts at the root, i.e. migrates the whole '
+    'repository.')
+@click.option(
+    '--list-file', '-l', help='Path to a local file containing URIs to be '
+    'used as starting points, one per line. Use this alternatively to `-s`. '
+    'The URIs can be relative to the repository root (e.g. `/a/b/c`) or fully '
+    'qualified (e.g. `https://example.edu/fcrepo/rest/a/b/c`).')
+@click.option(
+    '--zero-binaries', '-z', is_flag=True,
+    help='If set, binaries are created as zero-byte files in the proper '
+    'folder structure rather than having their full content copied.')
+@click.option(
+    '--skip-errors', '-e', is_flag=True,
+    help='If set, when the application encounters an error while retrieving '
+    'a resource from the source repository, it will log the error rather than '
+    'quitting. Other exceptions caused by the application will terminate the '
+    'process as usual.')
+@click_log.simple_verbosity_option(logger)
+def migrate(src, dest, start, list_file, zero_binaries, skip_errors):
     '''
-    [STUB] Dump repository to disk.
+    Migrate an LDP repository to LAKEsuperior.
+
+    This utility creates a fully functional LAKEsuperior repository from an
+    existing repository. The source repo can be LAKEsuperior or
+    another LDP-compatible implementation.
 
-    Dump a Fedora 4 repository to disk. The Fedora repo can be
-    LAKEsuperior or another compatible implementation.
+    A folder will be created in the location indicated by ``dest``. If the
+    folder exists already, it will be deleted and recreated. The folder will be
+    populated with the RDF and binary data directories and a default
+    configuration directory. The new repository can be immediately started
+    from this location.
     '''
-    pass
+    logger.info('Migrating {} into a new repository on {}.'.format(
+            src, dest))
+    entries = admin_api.migrate(
+            src, dest, start_pts=start, list_file=list_file,
+            zero_binaries=zero_binaries, skip_errors=skip_errors)
+    logger.info('Migrated {} resources.'.format(entries))
+    logger.info('''Migration complete. To start the new repository, from the
+    directory you launched this script run:
+
+    FCREPO_CONFIG_DIR="{}/etc" ./fcrepo
+
+    Make sure that the default port is not being used by another repository.
+    '''.format(dest))
 
 
 @click.command()
@@ -152,8 +190,8 @@ admin.add_command(check_fixity)
 admin.add_command(check_refint)
 admin.add_command(cleanup)
 admin.add_command(copy)
-admin.add_command(dump)
 admin.add_command(load)
+admin.add_command(migrate)
 admin.add_command(stats)
 
 if __name__ == '__main__':

+ 1 - 0
requirements.txt

@@ -5,6 +5,7 @@ Pillow==4.3.0
 PyYAML==3.12
 arrow==0.10.0
 click==6.7
+click-log==0.2.1
 gevent==1.2.2
 gunicorn==19.7.1
 lmdb==0.93

+ 26 - 38
util/benchmark.py

@@ -7,10 +7,11 @@ from uuid import uuid4
 import arrow
 import requests
 
-from rdflib import Graph, URIRef, Literal
-
-from util.generators import random_utf8_string
+from util.generators import random_image, random_graph, random_utf8_string
 
+__doc__ = '''
+Benchmark script to measure write performance.
+'''
 
 default_n = 10000
 webroot = 'http://localhost:8000/ldp'
@@ -31,7 +32,9 @@ if choice and choice.lower() not in ('post', 'put'):
     raise ValueError('Not a valid verb.')
 method = choice.lower() or 'put'
 
-# Generate 10,000 children of root node.
+sys.stdout.write('RDF Sources (r), Non-RDF (n), or Both 50/50 (b)? [b] >')
+choice = input().lower()
+res_type = choice or 'b'
 
 if del_cont  == 'y':
     requests.delete(container_uri, headers={'prefer': 'no-tombstone'})
@@ -44,52 +47,37 @@ ckpt = start
 print('Inserting {} children.'.format(n))
 
 # URI used to establish an in-repo relationship.
-prev_uri = container_uri
-size = 50 # Size of graph to be multiplied by 4.
+ref = container_uri
+size = 200 # Size of graph.
 
 try:
-    for i in range(1, n):
+    for i in range(1, n + 1):
         url = '{}/{}'.format(container_uri, uuid4()) if method == 'put' \
                 else container_uri
 
-        # Generate synthetic graph.
-        #print('generating graph: {}'.format(i))
-        g = Graph()
-        for ii in range(size):
-            g.add((
-                URIRef(''),
-                URIRef('urn:inturi_p:{}'.format(ii % size)),
-                URIRef(prev_uri)
-            ))
-            g.add((
-                URIRef(''),
-                URIRef('urn:lit_p:{}'.format(ii % size)),
-                Literal(random_utf8_string(64))
-            ))
-            g.add((
-                URIRef(''),
-                URIRef('urn:lit_p:{}'.format(ii % size)),
-                Literal(random_utf8_string(64))
-            ))
-            g.add((
-                URIRef(''),
-                URIRef('urn:exturi_p:{}'.format(ii % size)),
-                URIRef('http://exmple.edu/res/{}'.format(ii // 10))
-            ))
-
-        # Send request.
-        rsp = requests.request(
-                method, url, data=g.serialize(format='ttl'),
-                headers={ 'content-type': 'text/turtle'})
+        if res_type == 'r' or (res_type == 'b' and i % 2 == 0):
+            data = random_graph(size, ref).serialize(format='ttl')
+            headers = {'content-type': 'text/turtle'}
+        else:
+            img = random_image(name=uuid4(), ts=16, ims=512)
+            data = img['content']
+            data.seek(0)
+            headers = {
+                    'content-type': 'image/png',
+                    'content-disposition': 'attachment; filename="{}"'
+                        .format(uuid4())}
+
+        rsp = requests.request(method, url, data=data, headers=headers)
         rsp.raise_for_status()
-        prev_uri = rsp.headers['location']
+        ref = rsp.headers['location']
         if i % 10 == 0:
             now = arrow.utcnow()
             tdelta = now - ckpt
             ckpt = now
             print('Record: {}\tTime elapsed: {}'.format(i, tdelta))
 except KeyboardInterrupt:
-    print('Interruped after {} iterations.'.format(i))
+    print('Interrupted after {} iterations.'.format(i))
 
 tdelta = arrow.utcnow() - start
 print('Total elapsed time: {}'.format(tdelta))

+ 45 - 0
util/generators.py

@@ -2,11 +2,14 @@ import io
 import random
 
 from hashlib import sha1
+from math import floor
 
 import requests
 import numpy
 
 from PIL import Image
+from rdflib import Graph, URIRef, Literal
+from rdflib.namespace import Namespace, NamespaceManager
 
 
 # @TODO Update this to include code point ranges to be sampled
@@ -51,3 +54,45 @@ def random_image(name, ts=8, ims=256):
     }
 
 
+nsm = NamespaceManager(Graph())
+nsc = {
+    'extp': Namespace('http://ex.org/exturi_p#'),
+    'intp': Namespace('http://ex.org/inturi_p#'),
+    'litp': Namespace('http://ex.org/lit_p#'),
+}
+for pfx, ns in nsc.items():
+    nsm.bind(pfx, ns)
+
+def random_graph(size, ref):
+    '''
+    Generate a synthetic graph.
+
+    @param size (int) Size of the graph. It will be rounded down to a
+    multiple of 4.
+    @param ref (string) URI of an in-repository resource, used as the object
+    of internal reference triples.
+    '''
+    gr = Graph()
+    gr.namespace_manager = nsm
+    for ii in range(floor(size / 4)):
+        gr.add((
+            URIRef(''),
+            nsc['intp'][str(ii % size)],
+            URIRef(ref)
+        ))
+        gr.add((
+            URIRef(''),
+            nsc['litp'][str(ii % size)],
+            Literal(random_utf8_string(64))
+        ))
+        gr.add((
+            URIRef(''),
+            nsc['litp'][str(ii % size)],
+            Literal(random_utf8_string(64))
+        ))
+        gr.add((
+            URIRef(''),
+            nsc['extp'][str(ii % size)],
+            URIRef('http://example.edu/res/{}'.format(ii // 10))
+        ))
+
+    return gr
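
As wired into `util/benchmark.py` above, the generator can also be exercised on its own; a short sketch:

```
from util.generators import random_graph

gr = random_graph(200, 'http://localhost:8000/ldp')
print(len(gr))  # 200 triples: 4 per iteration over floor(size / 4) iterations
print(gr.serialize(format='turtle').decode('utf-8'))
```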