浏览代码

Referential integrity check.

Stefano Cossu 7 年之前
父节点
当前提交
16560083d6
共有 4 个文件被更改,包括 88 次插入和 7 次删除
  1. 20 1
      lakesuperior/api/admin.py
  2. 24 0
      lakesuperior/store/ldp_rs/lmdb_store.py
  3. 22 0
      lakesuperior/store/ldp_rs/rsrc_centric_layout.py
  4. 22 6
      lsup-admin

+ 20 - 1
lakesuperior/api/admin.py

@@ -1,6 +1,8 @@
 import logging
 
+from lakesuperior.config_parser import parse_config
 from lakesuperior.env import env
+from lakesuperior.globals import AppGlobals
 from lakesuperior.migrator import Migrator
 from lakesuperior.store.ldp_nr.default_layout import DefaultLayout as FileLayout
 from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
@@ -18,7 +20,8 @@ def stats():
     """
     Get repository statistics.
 
-    @return dict Store statistics, resource statistics.
+    :rtype: dict
+    :return: Store statistics, resource statistics.
     """
     import lakesuperior.env_setup
     repo_stats = {'rsrc_stats': env.app_globals.rdfly.count_rsrc()}
@@ -42,3 +45,19 @@ def migrate(src, dest, start_pts=None, list_file=None, **kwargs):
         start_pts = ('/',)
 
     return Migrator(src, dest, **kwargs).migrate(start_pts, list_file)
+
+
+def integrity_check(config_dir=None):
+    """
+    Check integrity of the data set.
+
+    At the moment this is limited to referential integrity. Other checks can
+    be added and triggered by different argument flags.
+    """
+    if config_dir:
+        env.config = parse_config(config_dir)[0]
+        env.app_globals = AppGlobals(env.config)
+    else:
+        import lakesuperior.env_setup
+    with TxnManager(env.app_globals.rdfly.store):
+        return { t for t in env.app_globals.rdfly.find_refint_violations()}

+ 24 - 0
lakesuperior/store/ldp_rs/lmdb_store.py

@@ -595,6 +595,30 @@ class LmdbStore(Store):
                 yield self._from_key(spok), contexts
 
 
+    def all_terms(self, term_type):
+        """
+        Return all terms of a type (``s``, ``p``, or ``o``) in the store.
+
+        :param str term_type: one of ``s``, ``p`` or ``o``.
+
+        :rtype: Iterator(rdflib.term.Identifier)
+        :return: Iterator of all terms.
+        :raise ValueError: if the term type is not one of the expected values.
+        """
+        if term_type == 's':
+            idx_label = 's:po'
+        elif term_type == 'p':
+            idx_label = 'p:so'
+        elif term_type == 'o':
+            idx_label = 'o:sp'
+        else:
+            raise ValueError('Term type must be \'s\', \'p\' or \'o\'.')
+
+        with self.cur(idx_label) as cur:
+            for key in cur.iternext_nodup():
+                yield self._from_key(key)[0]
+
+
     def bind(self, prefix, namespace):
         '''
         Bind a prefix to a namespace.

+ 22 - 0
lakesuperior/store/ldp_rs/rsrc_centric_layout.py

@@ -556,6 +556,28 @@ class RsrcCentricLayout:
         return str(uri).replace(nsc['fcres'], '')
 
 
+    def find_refint_violations(self):
+        """
+        Find all referential integrity violations.
+
+        This method looks for dangling relationships within a repository by
+        checking the objects of each triple; if the object is an in-repo
+        resource reference, and no resource with that URI exists in the
+        repo, that triple is reported.
+
+        :rtype: Iterator(tuple)
+        :return: Triples referencing a repository URI that is not a resource.
+        """
+        for obj in self.store.all_terms('o'):
+            if (
+                    isinstance(obj, URIRef)
+                    and str(obj).startswith(nsc['fcres'])
+                    and not self.ask_rsrc_exists(self.uri_to_uid(obj))):
+                print('Object not found: {}'.format(obj))
+                for trp in self.store.triples((None, None, obj)):
+                    yield trp
+
+
     ## PROTECTED MEMBERS ##
 
     def _check_rsrc_status(self, rsrc):

+ 22 - 6
lsup-admin

@@ -85,17 +85,33 @@ def check_fixity(uid):
     pass
 
 
+@click.option(
+    '--config-folder', '-c', default=None, help='Alternative configuration '
+    'folder to look up. If not set, the location set in the environment or '
+    'the default configuration is used.')
 @click.command()
-def check_refint():
-    '''
-    [STUB] Check referential integrity.
+def check_refint(config_folder=None):
+    """
+    Check referential integrity.
 
     This command scans the graph store to verify that all references to
     resources within the repository are effectively pointing to existing
-    resources. For repositories set up with the `referencial_integrity` option
+    resources. For repositories set up with the `referential_integrity` option
     (the default), this is a pre-condition for a consistent data set.
-    '''
-    pass
+
+    Note: this check is run regardless of whether the repository enforces
+    referential integrity.
+    """
+    check_results = admin_api.integrity_check(config_folder)
+    click.echo('Integrity check results:')
+    if len(check_results):
+        click.echo(click.style('Inconsistencies found!', fg='red', bold=True))
+        click.echo('Missing object in the following triples:')
+        for trp in check_results:
+            click.echo(' '.join([str(t) for t in trp[0]]))
+    else:
+        click.echo(click.style('Clean. ', fg='green', bold=True)
+                + 'No inconsistency found.')
 
 
 @click.command()