
Add option to use URI list file for migration.

Stefano Cossu, 7 years ago
commit 8e3b811f80
3 changed files with 38 additions and 27 deletions:

  1. lakesuperior/api/admin.py (+9 -10)
  2. lakesuperior/migrator.py (+19 -13)
  3. lsup-admin (+10 -4)

lakesuperior/api/admin.py (+9 -10)

@@ -28,18 +28,17 @@ def stats():
     return repo_stats


-def migrate(src, dest, start=('/',), **kwargs):
+def migrate(src, dest, start_pts=None, list_file=None, **kwargs):
     """
     Migrate an LDP repository to a new LAKEsuperior instance.

     See :py:meth:`Migrator.__init__`.
     """
-    start_pts = (
-            (start,)
-            if not isinstance(start, list) and not isinstance(start, tuple)
-            else start)
-
-    return Migrator(src, dest, start_pts, **kwargs).migrate()
-
-
-
+    if start_pts:
+        if not isinstance(
+                start_pts, list) and not isinstance(start_pts, tuple):
+            start_pts = (start_pts,)
+    elif not list_file:
+        start_pts = ('/',)
+
+    return Migrator(src, dest, **kwargs).migrate(start_pts, list_file)
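
With the revised signature above, callers either pass one or more starting points or point to a URI list file. A minimal sketch of both call styles, assuming a local LDP endpoint, destination path and file name that are purely placeholders:

    from lakesuperior.api import admin as admin_api

    # Partial migration from explicit starting points (placeholder values).
    admin_api.migrate(
            'http://localhost:8080/fcrepo/rest', '/data/new_repo',
            start_pts=['/collection_a', '/item_1'])

    # The same call driven by a local file listing one URI per line
    # (hypothetical file name); start_pts is left unset in this case.
    admin_api.migrate(
            'http://localhost:8080/fcrepo/rest', '/data/new_repo',
            list_file='uris.txt')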

lakesuperior/migrator.py (+19 -13)

@@ -71,8 +71,7 @@ class Migrator:


     def __init__(
-            self, src, dest, start_pts, zero_binaries=False,
-            compact_uris=False):
+            self, src, dest, zero_binaries=False, compact_uris=False):
         """
         Set up base paths and clean up existing directories.

@@ -84,9 +83,6 @@ class Migrator:
         it must be a writable directory. It will be deleted and recreated. If
         it does not exist, it will be created along with its parents if
         missing.
-        :param start_pts: (tuple|list) List of starting points to retrieve
-        resources from. It would typically be the repository root in case of a
-        full dump or one or more resources in the repository for a partial one.
         :param binary_handling: (string) One of ``include``, ``truncate`` or
         ``split``.
         :param compact_uris: (bool) NOT IMPLEMENTED. Whether the process should
@@ -130,7 +126,6 @@ class Migrator:
         env.app_globals.nonrdfly.bootstrap()

         self.src = src.rstrip('/')
-        self.start_pts = start_pts
         self.zero_binaries = zero_binaries

         from lakesuperior.api import resource as rsrc_api
@@ -140,22 +135,33 @@ class Migrator:



-    def migrate(self):
+    def migrate(self, start_pts=None, list_file=None):
         """
         Migrate the database.

         This method creates a fully functional and configured LAKEsuperior
-        environment contained in a folder from an LDP repository.
+        data set contained in a folder from an LDP repository.
+
+        :param tuple|list start_pts: List of starting points to retrieve
+        resources from. It would typically be the repository root in case of a
+        full dump or one or more resources in the repository for a partial one.
+        :param str list_file: path to a local file containing a list of URIs,
+        one per line.
         """
         self._ct = 0
         with StoreWrapper(env.app_globals.rdfly.store):
-            for start in self.start_pts:
-                if not start.startswith('/'):
-                    raise ValueError(
+            if start_pts:
+                for start in start_pts:
+                    if not start.startswith('/'):
+                        raise ValueError(
                             'Starting point {} does not begin with a slash.'
                             .format(start))

-                self._crawl(start)
+                    self._crawl(start)
+            elif list_file:
+                with open(list_file, 'r') as fp:
+                    for uri in fp:
+                        self._crawl(uri.strip().replace(self.src, ''))
         self._remove_temp_options()
         logger.info('Dumped {} resources.'.format(self._ct))
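
The list-file branch above normalizes each line with `uri.strip().replace(self.src, '')`, so the file may mix repository-relative and fully qualified URIs. A tiny illustration of that normalization, with placeholder values:

    src = 'http://localhost:8080/fcrepo/rest'  # hypothetical source base URL
    line = 'http://localhost:8080/fcrepo/rest/collection_a/item_1\n'

    # Mirrors the handling in Migrator.migrate(): strip whitespace, drop the
    # source prefix, then crawl the resulting repository-relative UID.
    uid = line.strip().replace(src, '')
    assert uid == '/collection_a/item_1'  # relative entries pass through as-is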
 
 
@@ -249,6 +255,7 @@ class Migrator:
         # Now, crawl through outbound links.
         # LDP-NR fcr:metadata must be checked too.
         for pred, obj in gr.predicate_objects():
+            #import pdb; pdb.set_trace()
             obj_uid = obj.replace(ibase, '')
             if (
                     isinstance(obj, URIRef)
@@ -258,7 +265,6 @@ class Migrator:
                     and pred not in self.ignored_preds
             ):
                 print('Object {} will be crawled.'.format(obj_uid))
-                #import pdb; pdb.set_trace()
                 self._crawl(urldefrag(obj_uid).url)



lsup-admin (+10 -4)

@@ -123,16 +123,21 @@ def copy():
 @click.argument('src')
 @click.argument('dest')
 @click.option(
-    '--start', '-s', default='/', show_default=True,
+    '--start', '-s', show_default=True,
     help='Starting point for looking for resources in the repository.\n'
     'The default `/` value starts at the root, i.e. migrates the whole '
     'repository.')
+@click.option(
+    '--list-file', '-l', help='Path to a local file containing URIs to be '
+    'used as starting points, one per line. Use this as an alternative to '
+    '`-s`. The URIs can be relative to the repository root (e.g. `/a/b/c`) or '
+    'fully qualified (e.g. `https://example.edu/fcrepo/rest/a/b/c`).')
 @click.option(
     '--zero-binaries', '-z', is_flag=True,
     help='If set, binaries are created as zero-byte files in the proper '
     'folder structure rather than having their full content copied.')
 @click_log.simple_verbosity_option(logger)
-def migrate(src, dest, start, zero_binaries):
+def migrate(src, dest, start, list_file, zero_binaries):
     '''
     Migrate an LDP repository to LAKEsuperior.

@@ -147,9 +152,10 @@ def migrate(src, dest, start, zero_binaries):
     from this location.
     '''
     logger.info('Migrating {} into a new repository on {}.'.format(
-        src, dest))
+            src, dest))
     entries = admin_api.migrate(
-            src, dest, start=start, zero_binaries=zero_binaries)
+            src, dest, start_pts=start, list_file=list_file,
+            zero_binaries=zero_binaries)
     logger.info('Migrated {} resources.'.format(entries))
     logger.info('''Migration complete. To start the new repository, from the
     directory you launched this script run:
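
On the command line, the new `-l`/`--list-file` option points at a plain-text file, e.g. `lsup-admin migrate https://example.edu/fcrepo/rest /data/new_repo -l uris.txt` (endpoint, destination and file name are placeholders). Per the help text above, a hypothetical `uris.txt` may mix relative and fully qualified URIs, one per line:

    /collection_a/item_1
    https://example.edu/fcrepo/rest/collection_b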