
Implement Digest header in POST and PUT.

Stefano Cossu 5 years ago
commit bd6f5000fd

+ 15 - 4
docs/fcrepo4_deltas.rst

@@ -8,10 +8,11 @@ clients will use it.
 Not yet implemented (but in the plans)
 --------------------------------------

--  Various headers handling (partial)
--  AuthN/Z
--  Fixity check
--  Blank nodes
+- Various headers handling (partial)
+- AuthN and WebAC-based authZ
+- Fixity check
+- Blank nodes (at least partly working, but untested)
+- Multiple byte ranges for the ``Range`` request header

 Potentially breaking changes
 ----------------------------
@@ -62,6 +63,16 @@ regardless of whether the tombstone exists or not.
 Lakesuperior will return ``405`` only if the tombstone actually exists,
 ``404`` otherwise.

+``Limit`` Header
+~~~~~~~~~~~~~~~~
+
+Lakesuperior does not support the ``Limit`` header, which in FCREPO can be
+used to limit the number of "child" resources displayed for a container
+graph. This seems to serve a mostly cosmetic function in FCREPO,
+compensating for performance limitations (displaying a page with many
+thousands of children in the UI can take minutes). Since Lakesuperior
+already offers options in the ``Prefer`` header to not return any children,
+this header is not implemented.
+
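+For example, a client can skip children entirely through the standard LDP
+containment preference (a sketch using the ``requests`` library against a
+hypothetical local instance; the preference URI is the one defined by LDP)::
+
+    import requests
+
+    # Ask the server to omit ldp:contains triples for this container.
+    rsp = requests.get(
+        'http://localhost:8000/ldp/my_container',
+        headers={
+            'Prefer': 'return=representation; '
+                'omit="http://www.w3.org/ns/ldp#PreferContainment"'})
+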
 Web UI
 ~~~~~~

+ 1 - 1
docs/release_notes.rst

@@ -7,7 +7,7 @@ Release Notes

 *October 10, 2018*

-A hotfix relase was necessary to adjust settings for the source to build
+A hotfix release was necessary to adjust settings for the source to build
 correctly on Read The Docs and Docker Hub, and to package correctly on PyPI.

 1.0 Alpha 18

+ 31 - 24
lakesuperior/endpoints/ldp.py

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import pdb

@@ -19,7 +20,8 @@ from lakesuperior import env
 from lakesuperior.api import resource as rsrc_api
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
-from lakesuperior.exceptions import (ResourceNotExistsError, TombstoneError,
+from lakesuperior.exceptions import (
+        ChecksumValidationError, ResourceNotExistsError, TombstoneError,
         ServerManagedTermError, InvalidResourceError, SingleSubjectError,
         ResourceExistsError, IncompatibleLdpTypeError)
 from lakesuperior.globals import RES_CREATED
@@ -258,28 +260,29 @@ def post_resource(parent_uid):
     """
     """
     rsp_headers = std_headers.copy()
     rsp_headers = std_headers.copy()
     slug = request.headers.get('Slug')
     slug = request.headers.get('Slug')
-    logger.debug('Slug: {}'.format(slug))

-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()

     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=')

     try:
-        rsrc = rsrc_api.create(
-            parent_uid, slug, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
+        rsrc = rsrc_api.create(parent_uid, slug, **kwargs)
     except ResourceNotExistsError as e:
         return str(e), 404
-    except InvalidResourceError as e:
+    except (InvalidResourceError, ChecksumValidationError) as e:
         return str(e), 409
     except TombstoneError as e:
         return _tombstone_response(e, uid)
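
The new ``Digest`` branch splits the header value on ``=``, so clients are
expected to send a single ``algorithm=hexdigest`` pair (a hex digest, matching
the tests added in this commit). A sketch of a conforming client call, assuming
a local instance at a hypothetical URL and the ``requests`` library:

    import hashlib

    import requests  # assumed HTTP client; any client library works

    content = b'Hello, binary world.'
    # The endpoint splits the header value on '=', so the expected shape
    # is '<algorithm>=<hex digest>'.
    digest = 'sha256=' + hashlib.sha256(content).hexdigest()

    rsp = requests.post(
        'http://localhost:8000/ldp',  # hypothetical local instance
        data=content,
        headers={'Content-Type': 'application/octet-stream',
                 'Digest': digest})
    assert rsp.status_code == 201  # 409 signals a checksum mismatch
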
@@ -290,7 +293,7 @@ def post_resource(parent_uid):
     rsp_headers.update(_headers_from_metadata(rsrc))
     rsp_headers['Location'] = uri

-    if mimetype and rdf_fmt is None:
+    if mimetype and kwargs.get('rdf_fmt') is None:
         rsp_headers['Link'] = (f'<{uri}/fcr:metadata>; rel="describedby"; '
                                f'anchor="{uri}"')

@@ -313,24 +316,28 @@ def put_resource(uid):
     if cond_ret:
         return cond_ret

-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()

     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=')

     try:
-        evt, rsrc = rsrc_api.create_or_replace(
-            uid, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
-    except (InvalidResourceError, ResourceExistsError) as e:
+        evt, rsrc = rsrc_api.create_or_replace(uid, **kwargs)
+    except (
+            InvalidResourceError, ChecksumValidationError,
+            ResourceExistsError) as e:
         return str(e), 409
     except (ServerManagedTermError, SingleSubjectError) as e:
         return str(e), 412
@@ -346,7 +353,7 @@ def put_resource(uid):
     if evt == RES_CREATED:
         rsp_code = 201
         rsp_headers['Location'] = rsp_body = uri
-        if mimetype and not rdf_data:
+        if mimetype and not kwargs.get('rdf_data'):
             rsp_headers['Link'] = f'<{uri}/fcr:metadata>; rel="describedby"'
     else:
         rsp_code = 204

+ 9 - 2
lakesuperior/etc.defaults/application.yml

@@ -23,9 +23,16 @@ data_dir:
 # checksum of the contents of the file.
 uuid:
     # Algorithm used to calculate the hash that generates the content path.
-    # One of: sha1, sha224, sha256, sha384, or sha512, corresponding to the
-    # omonymous hashlib function:
+    # This can be any one of the Python hashlib functions:
     # https://docs.python.org/3/library/hashlib.html
+    #
+    # This needs to be ``sha1`` if compatibility with the Fedora4 file layout
+    # is required; however, in security-sensitive environments it is strongly
+    # advised to use a stronger algorithm, since SHA1 is known to be
+    # vulnerable to collision attacks: see https://shattered.io/
+    #
+    # `blake2b` is a strong, fast cryptographic alternative to SHA2/3:
+    # https://blake2.net/
     algo: sha1

 # Data store configuration.
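
With the hard-coded ``sha1`` gone (see ``default_layout.py`` below), any
algorithm name accepted by ``hashlib.new()`` is a valid ``algo`` value. A
quick way to check what the running interpreter supports (a standalone sketch,
not part of the commit):

    import hashlib

    # Names guaranteed on every Python build; OpenSSL may provide more,
    # listed in hashlib.algorithms_available.
    print(sorted(hashlib.algorithms_guaranteed))

    # The configured value ends up in calls equivalent to this:
    h = hashlib.new('blake2b')
    h.update(b'example content')
    print(h.hexdigest())
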

+ 10 - 4
lakesuperior/model/ldp_nr.py

@@ -15,7 +15,6 @@ from lakesuperior.model.ldp_rs import LdpRs
 nonrdfly = env.app_globals.nonrdfly
 logger = logging.getLogger(__name__)

-
 class LdpNr(Ldpr):
     """LDP-NR (Non-RDF Source).

@@ -30,7 +29,8 @@ class LdpNr(Ldpr):
     }

     def __init__(self, uuid, stream=None, mimetype=None,
-            disposition=None, **kwargs):
+            disposition=None, prov_cksum_algo=None, prov_cksum=None,
+            **kwargs):
         """
         """
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         """
         """
@@ -51,6 +51,8 @@ class LdpNr(Ldpr):
                     if self.is_stored
                     else 'application/octet-stream')

+        self.prov_cksum_algo = prov_cksum_algo
+        self.prov_cksum = prov_cksum
         self.disposition = disposition


@@ -92,8 +94,10 @@

         :rtype: str
         """
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
         cksum_term = self.metadata.value(nsc['premis'].hasMessageDigest)
-        cksum = str(cksum_term).replace('urn:sha1:','')
+        cksum = str(cksum_term).replace(f'urn:{default_hash_algo}:','')
         return nonrdfly.__class__.local_path(
                 nonrdfly.root, cksum, nonrdfly.bl, nonrdfly.bc)
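
``local_path()`` itself is unchanged by this commit, so its body is not shown
here. For orientation only, a hypothetical equivalent of the pairtree scheme
its signature suggests (reading ``bl`` as branch length and ``bc`` as branch
count; the real implementation may differ):

    import os

    def pairtree_path(root, cksum, bl=4, bc=4):
        # Hypothetical sketch: carve `bc` branches of `bl` characters each
        # off the front of the checksum, then store the file under its full
        # checksum inside the resulting directory tree.
        branches = [cksum[i * bl:(i + 1) * bl] for i in range(bc)]
        return os.path.join(root, *branches, cksum)

    # pairtree_path('/data', '356a192b7913b04c54574d18c28d46e6395428ab')
    # -> '/data/356a/192b/7913/b04c/356a192b7913b04c54574d18c28d46e6395428ab'
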
 
 
@@ -106,7 +110,9 @@ class LdpNr(Ldpr):
             updated.
         """
         # Persist the stream.
-        self.digest, self.size = nonrdfly.persist(self.stream)
+        self.digest, self.size = nonrdfly.persist(
+                self.uid, self.stream, prov_cksum_algo=self.prov_cksum_algo,
+                prov_cksum=self.prov_cksum)

         # Try to persist metadata. If it fails, delete the file.
         logger.debug('Persisting LDP-NR triples in {}'.format(self.uri))

+ 50 - 22
lakesuperior/store/ldp_nr/default_layout.py

@@ -1,11 +1,13 @@
+import hashlib
 import logging
 import os
 import shutil

-from hashlib import sha1
 from uuid import uuid4

+from lakesuperior import env
 from lakesuperior.store.ldp_nr.base_non_rdf_layout import BaseNonRdfLayout
+from lakesuperior.exceptions import ChecksumValidationError


 logger = logging.getLogger(__name__)
@@ -16,7 +18,7 @@ class DefaultLayout(BaseNonRdfLayout):
     Default file layout.

     This is a simple filesystem layout that stores binaries in pairtree folders
-    in a local filesystem. Parameters can be specified for the 
+    in a local filesystem. Parameters can be specified for the
     """
     """
     @staticmethod
     @staticmethod
     def local_path(root, uuid, bl=4, bc=4):
     def local_path(root, uuid, bl=4, bc=4):
@@ -58,7 +60,9 @@ class DefaultLayout(BaseNonRdfLayout):
         os.makedirs(self.root + '/tmp')


-    def persist(self, stream, bufsize=8192):
+    def persist(
+            self, uid, stream, bufsize=8192, prov_cksum=None,
+            prov_cksum_algo=None):
         r"""
         r"""
         Store the stream in the file system.
         Store the stream in the file system.
 
 
@@ -67,46 +71,70 @@ class DefaultLayout(BaseNonRdfLayout):
         to disk and hashed, the temp file is moved to its final location which
         is determined by the hash value.

+        :param str uid: UID of the resource.
         :param IOstream stream: file-like object to persist.
         :param int bufsize: Chunk size. 2\*\*12 to 2\*\*15 is a good range.
+        :param str prov_cksum: Checksum provided by the client to verify
+            that the content received matches what was sent. If None (the
+            default), no verification takes place.
+        :param str prov_cksum_algo: Verification algorithm used to validate
+            the integrity of the user-provided data. If this differs from
+            the default hash algorithm set in the application configuration,
+            which is used to calculate the checksum of the file for storage
+            purposes, a separate hash is calculated specifically for
+            validation. It is more efficient to use the same algorithm and
+            avoid a second checksum calculation.
         """
-        tmp_file = '{}/tmp/{}'.format(self.root, uuid4())
+        # The temp file is created on the destination filesystem to minimize
+        # time and risk of moving it to its final destination.
+        tmp_fname = f'{self.root}/tmp/{uuid4()}'
+
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
+        if prov_cksum_algo is None:
+            prov_cksum_algo = default_hash_algo
         try:
-            with open(tmp_file, 'wb') as f:
-                logger.debug('Writing temp file to {}.'.format(tmp_file))
+            with open(tmp_fname, 'wb') as f:
+                logger.debug(f'Writing temp file to {tmp_fname}.')

-                hash = sha1()
+                store_hash = hashlib.new(default_hash_algo)
+                verify_hash = (
+                        store_hash if prov_cksum_algo == default_hash_algo
+                        else hashlib.new(prov_cksum_algo))
                 size = 0
                 while True:
                     buf = stream.read(bufsize)
                     if not buf:
                         break
-                    hash.update(buf)
+                    store_hash.update(buf)
+                    if verify_hash is not store_hash:
+                        verify_hash.update(buf)
                     f.write(buf)
                     size += len(buf)
+
+                if prov_cksum and verify_hash.hexdigest() != prov_cksum:
+                    raise ChecksumValidationError(
+                        uid, prov_cksum, verify_hash.hexdigest())
         except:
-            logger.exception('File write failed on {}.'.format(tmp_file))
-            os.unlink(tmp_file)
+            logger.exception(f'File write failed on {tmp_fname}.')
+            os.unlink(tmp_fname)
             raise
         if size == 0:
             logger.warn('Zero-length file received.')

+        # If the file exists already, don't bother rewriting it.
+        dst = __class__.local_path(
+                self.root, store_hash.hexdigest(), self.bl, self.bc)
+        if os.path.exists(dst):
+            logger.info(f'File exists on {dst}. Not overwriting.')
+
         # Move temp file to final destination.
-        uuid = hash.hexdigest()
-        dst = __class__.local_path(self.root, uuid, self.bl, self.bc)
-        logger.debug('Saving file to disk: {}'.format(dst))
+        logger.debug(f'Saving file to disk: {dst}')
         if not os.access(os.path.dirname(dst), os.X_OK):
             os.makedirs(os.path.dirname(dst))
+        os.rename(tmp_fname, dst)

-        # If the file exists already, don't bother rewriting it.
-        if os.path.exists(dst):
-            logger.info(
-                    'File exists on {}. Not overwriting.'.format(dst))
-            os.unlink(tmp_file)
-        else:
-            os.rename(tmp_file, dst)
-
-        return uuid, size
+        return store_hash.hexdigest(), size


     def delete(self, uuid):
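
The core of the new ``persist()`` is worth isolating: one hash determines the
storage path, a second verifies the client's claim, and a single hash object
serves both roles when the two algorithms coincide, avoiding a redundant
computation. The same pattern in a self-contained sketch:

    import hashlib
    from io import BytesIO

    def hash_stream(stream, store_algo='sha1', verify_algo=None, bufsize=8192):
        """Return (storage digest, verification digest) in one pass."""
        store_hash = hashlib.new(store_algo)
        # Reuse the storage hash for verification when the algorithms match.
        verify_hash = (
            store_hash if verify_algo in (None, store_algo)
            else hashlib.new(verify_algo))
        while True:
            buf = stream.read(bufsize)
            if not buf:
                break
            store_hash.update(buf)
            if verify_hash is not store_hash:
                verify_hash.update(buf)
        return store_hash.hexdigest(), verify_hash.hexdigest()

    print(hash_stream(BytesIO(b'data'), 'sha1', 'sha256'))
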

+ 56 - 2
tests/2_endpoints/test_ldp.py

@@ -3,7 +3,7 @@ import pytest

 from base64 import b64encode
 from datetime import timedelta
-from hashlib import sha1
+from hashlib import sha1, sha256, blake2b
 from uuid import uuid4
 from werkzeug.http import http_date

@@ -808,7 +808,7 @@ class TestDigestHeaders:
         TODO This is by design for now; when a reliable hashing method
         for a graph is devised, this test should change.
         """
-        path = '/ldp/test_etag_rdf1'
+        path = f'/ldp/{uuid4()}'

         put_rsp = self.client.put(path)
         assert not put_rsp.headers.get('etag')
@@ -819,6 +819,60 @@ class TestDigestHeaders:
         assert not get_rsp.headers.get('digest')


+    def test_digest_put(self):
+        """
+        Test the ``Digest`` header with PUT to verify content integrity.
+        """
+        path1 = f'/ldp/{uuid4()}'
+        path2 = f'/ldp/{uuid4()}'
+        path3 = f'/ldp/{uuid4()}'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path2, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 204
+
+
+    def test_digest_post(self):
+        """
+        Test the ``Digest`` header with POST to verify content integrity.
+        """
+        path = '/ldp'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.post(path, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 201
+
+

 @pytest.mark.usefixtures('client_class')
 class TestETagCondHeaders: