
Implement Digest header in POST and PUT.

Stefano Cossu, 5 years ago
commit bd6f5000fd

+ 15 - 4
docs/fcrepo4_deltas.rst

@@ -8,10 +8,11 @@ clients will use it.
 Not yet implemented (but in the plans)
 --------------------------------------
 
--  Various headers handling (partial)
--  AuthN/Z
--  Fixity check
--  Blank nodes
+- Various headers handling (partial)
+- AuthN and WebAC-based authZ
+- Fixity check
+- Blank nodes (at least partly working, but untested)
+- Multiple byte ranges for the ``Range`` request header
 
 Potentially breaking changes
 ----------------------------
@@ -62,6 +63,16 @@ regardless of whether the tombstone exists or not.
 Lakesuperior will return ``405`` only if the tombstone actually exists,
 ``404`` otherwise.
 
+``Limit`` Header
+~~~~~~~~~~~~~~~~
+
+Lakesuperior does not support the ``Limit`` header, which in FCREPO can be
+used to limit the number of "child" resources displayed for a container
+graph. This header seems to serve a mostly cosmetic function in FCREPO,
+compensating for performance limitations (displaying a page with many
+thousands of children in the UI can take minutes); since Lakesuperior
+already offers options in the ``Prefer`` header to not return any children,
+this option is not implemented (see the sketch after this file's diff).
+
 Web UI
 ~~~~~~
 

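For reference, a minimal sketch of the ``Prefer`` alternative mentioned in the
hunk above, using the standard LDP ``PreferContainment`` URI. The base URL and
container path are hypothetical.

    import requests

    # Ask for the container representation without its containment triples,
    # i.e. without the potentially huge child listing.
    rsp = requests.get(
        'http://localhost:8000/ldp/big_container',
        headers={
            'Prefer': 'return=representation; '
                'omit="http://www.w3.org/ns/ldp#PreferContainment"',
        })
    print(rsp.status_code)
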
+ 1 - 1
docs/release_notes.rst

@@ -7,7 +7,7 @@ Release Notes
 
 *October 10, 2018*
 
-A hotfix relase was necessary to adjust settings for the source to build
+A hotfix release was necessary to adjust settings for the source to build
 correctly on Read The Docs and Docker Hub, and to package correctly on PyPI.
 
 1.0 Alpha 18

+ 31 - 24
lakesuperior/endpoints/ldp.py

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import pdb
 
@@ -19,7 +20,8 @@ from lakesuperior import env
 from lakesuperior.api import resource as rsrc_api
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
-from lakesuperior.exceptions import (ResourceNotExistsError, TombstoneError,
+from lakesuperior.exceptions import (
+        ChecksumValidationError, ResourceNotExistsError, TombstoneError,
         ServerManagedTermError, InvalidResourceError, SingleSubjectError,
         ResourceExistsError, IncompatibleLdpTypeError)
 from lakesuperior.globals import RES_CREATED
@@ -258,28 +260,29 @@ def post_resource(parent_uid):
     """
     rsp_headers = std_headers.copy()
     slug = request.headers.get('Slug')
-    logger.debug('Slug: {}'.format(slug))
 
-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()
 
     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=', 1)
 
     try:
-        rsrc = rsrc_api.create(
-            parent_uid, slug, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
+        rsrc = rsrc_api.create(parent_uid, slug, **kwargs)
     except ResourceNotExistsError as e:
         return str(e), 404
-    except InvalidResourceError as e:
+    except (InvalidResourceError, ChecksumValidationError) as e:
         return str(e), 409
     except TombstoneError as e:
         return _tombstone_response(e, uid)
@@ -290,7 +293,7 @@ def post_resource(parent_uid):
     rsp_headers.update(_headers_from_metadata(rsrc))
     rsp_headers['Location'] = uri
 
-    if mimetype and rdf_fmt is None:
+    if mimetype and kwargs.get('rdf_fmt') is None:
         rsp_headers['Link'] = (f'<{uri}/fcr:metadata>; rel="describedby"; '
                                f'anchor="{uri}"')
 
@@ -313,24 +316,28 @@ def put_resource(uid):
     if cond_ret:
         return cond_ret
 
-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()
 
     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=', 1)
 
     try:
-        evt, rsrc = rsrc_api.create_or_replace(
-            uid, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
-    except (InvalidResourceError, ResourceExistsError) as e:
+        evt, rsrc = rsrc_api.create_or_replace(uid, **kwargs)
+    except (
+            InvalidResourceError, ChecksumValidationError,
+            ResourceExistsError) as e:
         return str(e), 409
     except (ServerManagedTermError, SingleSubjectError) as e:
         return str(e), 412
@@ -346,7 +353,7 @@ def put_resource(uid):
     if evt == RES_CREATED:
         rsp_code = 201
         rsp_headers['Location'] = rsp_body = uri
-        if mimetype and not rdf_data:
+        if mimetype and not kwargs.get('rdf_data'):
             rsp_headers['Link'] = f'<{uri}/fcr:metadata>; rel="describedby"'
     else:
         rsp_code = 204

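A standalone illustration of the ``Digest`` parsing added above, with the
Flask ``request.headers`` object stood in by a plain dict; names are local to
this sketch.

    import hashlib

    content = b'hello'
    headers = {'digest': f'sha256={hashlib.sha256(content).hexdigest()}'}

    kwargs = {}
    if 'digest' in headers:
        # 'sha256=<hex>' -> ('sha256', '<hex>'). The maxsplit argument guards
        # against '=' characters in the checksum value itself.
        kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
                headers['digest'].split('=', 1)

    print(kwargs['prov_cksum_algo'])  # sha256
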
+ 9 - 2
lakesuperior/etc.defaults/application.yml

@@ -23,9 +23,16 @@ data_dir:
 # checksum of the contents of the file.
 uuid:
     # Algorithm used to calculate the hash that generates the content path.
-    # One of: sha1, sha224, sha256, sha384, or sha512, corresponding to the
-    # omonymous hashlib function:
+    # This can be any one of the Python hashlib functions:
     # https://docs.python.org/3/library/hashlib.html
+    #
+    # This needs to be ``sha1`` if compatibility with the Fedora4 file layout
+    # is required; however, in security-sensitive environments it is strongly
+    # advised to use a stronger algorithm, since SHA1 is known to be
+    # vulnerable to collision attacks: see https://shattered.io/
+    #
+    # ``blake2b`` is a strong, fast cryptographic alternative to SHA2/3:
+    # https://blake2.net/
     algo: sha1
 
 # Data store configuration.

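A small sketch of how a configured ``algo`` value maps onto ``hashlib``: any
name accepted by ``hashlib.new()`` works, which is what makes the setting
algorithm-agnostic.

    import hashlib

    algo = 'blake2b'  # stand-in for the configured ``uuid: algo`` value
    assert algo in hashlib.algorithms_available

    # The same call works unchanged for 'sha1', 'sha256', 'blake2b', etc.
    cksum = hashlib.new(algo, b'file contents').hexdigest()
    print(cksum)
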
+ 10 - 4
lakesuperior/model/ldp_nr.py

@@ -15,7 +15,6 @@ from lakesuperior.model.ldp_rs import LdpRs
 nonrdfly = env.app_globals.nonrdfly
 logger = logging.getLogger(__name__)
 
-
 class LdpNr(Ldpr):
     """LDP-NR (Non-RDF Source).
 
@@ -30,7 +29,8 @@ class LdpNr(Ldpr):
     }
 
     def __init__(self, uuid, stream=None, mimetype=None,
-            disposition=None, **kwargs):
+            disposition=None, prov_cksum_algo=None, prov_cksum=None,
+            **kwargs):
         """
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         """
@@ -51,6 +51,8 @@ class LdpNr(Ldpr):
                     if self.is_stored
                     else 'application/octet-stream')
 
+        self.prov_cksum_algo = prov_cksum_algo
+        self.prov_cksum = prov_cksum
         self.disposition = disposition
 
 
@@ -92,8 +94,10 @@ class LdpNr(Ldpr):
 
         :rtype: str
         """
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
         cksum_term = self.metadata.value(nsc['premis'].hasMessageDigest)
-        cksum = str(cksum_term).replace('urn:sha1:','')
+        cksum = str(cksum_term).replace(f'urn:{default_hash_algo}:','')
         return nonrdfly.__class__.local_path(
                 nonrdfly.root, cksum, nonrdfly.bl, nonrdfly.bc)
 
@@ -106,7 +110,9 @@ class LdpNr(Ldpr):
             updated.
         """
         # Persist the stream.
-        self.digest, self.size = nonrdfly.persist(self.stream)
+        self.digest, self.size = nonrdfly.persist(
+                self.uid, self.stream, prov_cksum_algo=self.prov_cksum_algo,
+                prov_cksum=self.prov_cksum)
 
         # Try to persist metadata. If it fails, delete the file.
         logger.debug('Persisting LDP-NR triples in {}'.format(self.uri))

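The ``local_path()`` call above resolves a checksum to a pairtree-style
location on disk. Below is a hypothetical re-implementation for illustration
only (the actual method may differ), assuming ``bc`` branches of ``bl``
characters each.

    def pairtree_path(root, cksum, bl=4, bc=4):
        # Split the first bl * bc characters of the checksum into bc
        # directory segments of bl characters each.
        segments = [cksum[i:i + bl] for i in range(0, bl * bc, bl)]
        return '/'.join([root] + segments + [cksum])

    print(pairtree_path('/data/ldpnr_store', 'abcd1234ef5678901234'))
    # /data/ldpnr_store/abcd/1234/ef56/7890/abcd1234ef5678901234
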
+ 50 - 22
lakesuperior/store/ldp_nr/default_layout.py

@@ -1,11 +1,13 @@
+import hashlib
 import logging
 import os
 import shutil
 
-from hashlib import sha1
 from uuid import uuid4
 
+from lakesuperior import env
 from lakesuperior.store.ldp_nr.base_non_rdf_layout import BaseNonRdfLayout
+from lakesuperior.exceptions import ChecksumValidationError
 
 
 logger = logging.getLogger(__name__)
@@ -16,7 +18,7 @@ class DefaultLayout(BaseNonRdfLayout):
     Default file layout.
 
     This is a simple filesystem layout that stores binaries in pairtree folders
-    in a local filesystem. Parameters can be specified for the 
+    in a local filesystem. Parameters can be specified for the
     """
     @staticmethod
     def local_path(root, uuid, bl=4, bc=4):
@@ -58,7 +60,9 @@ class DefaultLayout(BaseNonRdfLayout):
         os.makedirs(self.root + '/tmp')
 
 
-    def persist(self, stream, bufsize=8192):
+    def persist(
+            self, uid, stream, bufsize=8192, prov_cksum=None,
+            prov_cksum_algo=None):
         r"""
         Store the stream in the file system.
 
@@ -67,46 +71,70 @@ class DefaultLayout(BaseNonRdfLayout):
         to disk and hashed, the temp file is moved to its final location which
         is determined by the hash value.
 
+        :param str uid: UID of the resource.
         :param IOstream stream: file-like object to persist.
         :param int bufsize: Chunk size. 2\*\*12 to 2\*\*15 is a good range.
+        :param str prov_cksum: Checksum provided by the client to verify
+            that the content received matches what has been sent. If None
+            (the default), no verification will take place.
+        :param str prov_cksum_algo: Verification algorithm to validate the
+            integrity of the user-provided data. If this is different from
+            the default hash algorithm set in the application configuration,
+            which is used to calculate the checksum of the file for storage
+            purposes, a separate hash is calculated specifically for
+            validation. It is clearly more efficient to use the same
+            algorithm and avoid a second checksum calculation.
         """
-        tmp_file = '{}/tmp/{}'.format(self.root, uuid4())
+        # The temp file is created on the destination file system to minimize
+        # the time and risk involved in moving it to its final destination.
+        tmp_fname = f'{self.root}/tmp/{uuid4()}'
+
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
+        if prov_cksum_algo is None:
+            prov_cksum_algo = default_hash_algo
         try:
-            with open(tmp_file, 'wb') as f:
-                logger.debug('Writing temp file to {}.'.format(tmp_file))
+            with open(tmp_fname, 'wb') as f:
+                logger.debug(f'Writing temp file to {tmp_fname}.')
 
-                hash = sha1()
+                store_hash = hashlib.new(default_hash_algo)
+                verify_hash = (
+                        store_hash if prov_cksum_algo == default_hash_algo
+                        else hashlib.new(prov_cksum_algo))
                 size = 0
                 while True:
                     buf = stream.read(bufsize)
                     if not buf:
                         break
-                    hash.update(buf)
+                    store_hash.update(buf)
+                    if verify_hash is not store_hash:
+                        verify_hash.update(buf)
                     f.write(buf)
                     size += len(buf)
+
+                if prov_cksum and verify_hash.hexdigest() != prov_cksum:
+                    raise ChecksumValidationError(
+                        uid, prov_cksum, verify_hash.hexdigest())
         except:
-            logger.exception('File write failed on {}.'.format(tmp_file))
-            os.unlink(tmp_file)
+            logger.exception(f'File write failed on {tmp_fname}.')
+            os.unlink(tmp_fname)
             raise
         if size == 0:
             logger.warn('Zero-length file received.')
 
+        # If the file exists already, don't bother rewriting it: the
+        # destination path is derived from the content hash, so the stored
+        # content is identical.
+        dst = __class__.local_path(
+                self.root, store_hash.hexdigest(), self.bl, self.bc)
+        if os.path.exists(dst):
+            logger.info(f'File exists on {dst}. Not overwriting.')
+            os.unlink(tmp_fname)
+            return store_hash.hexdigest(), size
+
         # Move temp file to final destination.
-        uuid = hash.hexdigest()
-        dst = __class__.local_path(self.root, uuid, self.bl, self.bc)
-        logger.debug('Saving file to disk: {}'.format(dst))
+        logger.debug(f'Saving file to disk: {dst}')
         if not os.access(os.path.dirname(dst), os.X_OK):
             os.makedirs(os.path.dirname(dst))
+        os.rename(tmp_fname, dst)
 
-        # If the file exists already, don't bother rewriting it.
-        if os.path.exists(dst):
-            logger.info(
-                    'File exists on {}. Not overwriting.'.format(dst))
-            os.unlink(tmp_file)
-        else:
-            os.rename(tmp_file, dst)
-
-        return uuid, size
+        return store_hash.hexdigest(), size
 
 
     def delete(self, uuid):

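A self-contained sketch of the dual-hash strategy in ``persist()`` above: the
storage digest addresses the file on disk, while a second digest is computed
only when the client-provided checksum uses a different algorithm. Names are
local to this sketch.

    import hashlib
    import io

    def checked_copy(stream, out, store_algo='sha1', prov_algo=None,
                     prov_cksum=None, bufsize=8192):
        store_hash = hashlib.new(store_algo)
        # Reuse the storage hash when the algorithms match; otherwise keep a
        # second hash just for verification.
        verify_hash = (
                store_hash if prov_algo in (None, store_algo)
                else hashlib.new(prov_algo))
        while True:
            buf = stream.read(bufsize)
            if not buf:
                break
            store_hash.update(buf)
            if verify_hash is not store_hash:
                verify_hash.update(buf)
            out.write(buf)
        if prov_cksum and verify_hash.hexdigest() != prov_cksum:
            raise ValueError('Provided checksum does not match content.')
        return store_hash.hexdigest()

    data = b'hello world'
    digest = checked_copy(
            io.BytesIO(data), io.BytesIO(), store_algo='sha1',
            prov_algo='sha256', prov_cksum=hashlib.sha256(data).hexdigest())
    print(digest)  # sha1 digest; determines the storage path
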
+ 56 - 2
tests/2_endpoints/test_ldp.py

@@ -3,7 +3,7 @@ import pytest
 
 from base64 import b64encode
 from datetime import timedelta
-from hashlib import sha1
+from hashlib import sha1, sha256, blake2b
 from uuid import uuid4
 from werkzeug.http import http_date
 
@@ -808,7 +808,7 @@ class TestDigestHeaders:
         TODO This is by design for now; when a reliable hashing method
         for a graph is devised, this test should change.
         """
-        path = '/ldp/test_etag_rdf1'
+        path = f'/ldp/{uuid4()}'
 
         put_rsp = self.client.put(path)
         assert not put_rsp.headers.get('etag')
@@ -819,6 +819,60 @@ class TestDigestHeaders:
         assert not get_rsp.headers.get('digest')
 
 
+    def test_digest_put(self):
+        """
+        Test the ``Digest`` header with PUT to verify content integrity.
+        """
+        path1 = f'/ldp/{uuid4()}'
+        path2 = f'/ldp/{uuid4()}'
+        path3 = f'/ldp/{uuid4()}'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path2, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 204
+
+
+    def test_digest_post(self):
+        """
+        Test the ``Digest`` header with POST to verify content integrity.
+        """
+        path = '/ldp'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.post(path, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 201
+
 
 @pytest.mark.usefixtures('client_class')
 class TestETagCondHeaders: