
Implement Digest header in POST and PUT.

Stefano Cossu 5 years ago
commit bd6f5000fd

+ 15 - 4
docs/fcrepo4_deltas.rst

@@ -8,10 +8,11 @@ clients will use it.
 Not yet implemented (but in the plans)
 --------------------------------------

--  Various headers handling (partial)
--  AuthN/Z
--  Fixity check
--  Blank nodes
+- Various headers handling (partial)
+- AuthN and WebAC-based authZ
+- Fixity check
+- Blank nodes (at least partly working, but untested)
+- Multiple byte ranges for the ``Range`` request header

 Potentially breaking changes
 ----------------------------
@@ -62,6 +63,16 @@ regardless of whether the tombstone exists or not.
 Lakesuperior will return ``405`` only if the tombstone actually exists,
 ``404`` otherwise.

+``Limit`` Header
+~~~~~~~~~~~~~~~~
+
+Lakesuperior does not support the ``Limit`` header, which in FCREPO can be
+used to limit the number of "child" resources displayed for a container
+graph. This seems to serve a mostly cosmetic function in FCREPO,
+compensating for performance limitations (displaying a page with many
+thousands of children in the UI can take minutes). Since Lakesuperior
+already offers options in the ``Prefer`` header to not return any children,
+this header is not implemented.
+
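+For example, a client can skip children entirely through the standard LDP
+containment preference (a sketch using the ``requests`` library against a
+hypothetical local instance; the preference URI is the one defined by LDP)::
+
+    import requests
+
+    # Ask the server to omit ldp:contains triples for this container.
+    rsp = requests.get(
+        'http://localhost:8000/ldp/my_container',
+        headers={
+            'Prefer': 'return=representation; '
+                'omit="http://www.w3.org/ns/ldp#PreferContainment"'})
+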
 Web UI
 ~~~~~~

+ 1 - 1
docs/release_notes.rst

@@ -7,7 +7,7 @@ Release Notes

 *October 10, 2018*

-A hotfix relase was necessary to adjust settings for the source to build
+A hotfix release was necessary to adjust settings for the source to build
 correctly on Read The Docs and Docker Hub, and to package correctly on PyPI.

 1.0 Alpha 18

+ 31 - 24
lakesuperior/endpoints/ldp.py

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import pdb

@@ -19,7 +20,8 @@ from lakesuperior import env
 from lakesuperior.api import resource as rsrc_api
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
-from lakesuperior.exceptions import (ResourceNotExistsError, TombstoneError,
+from lakesuperior.exceptions import (
+        ChecksumValidationError, ResourceNotExistsError, TombstoneError,
         ServerManagedTermError, InvalidResourceError, SingleSubjectError,
         ResourceExistsError, IncompatibleLdpTypeError)
 from lakesuperior.globals import RES_CREATED
@@ -258,28 +260,29 @@ def post_resource(parent_uid):
     """
     """
     rsp_headers = std_headers.copy()
     rsp_headers = std_headers.copy()
     slug = request.headers.get('Slug')
     slug = request.headers.get('Slug')
-    logger.debug('Slug: {}'.format(slug))

-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()

     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=')

     try:
-        rsrc = rsrc_api.create(
-            parent_uid, slug, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
+        rsrc = rsrc_api.create(parent_uid, slug, **kwargs)
     except ResourceNotExistsError as e:
         return str(e), 404
-    except InvalidResourceError as e:
+    except (InvalidResourceError, ChecksumValidationError) as e:
         return str(e), 409
     except TombstoneError as e:
         return _tombstone_response(e, uid)
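
The new ``Digest`` branch splits the header value on ``=``, so clients are
expected to send a single ``algorithm=hexdigest`` pair (a hex digest, matching
the tests added in this commit). A sketch of a conforming client call, assuming
a local instance at a hypothetical URL and the ``requests`` library:

    import hashlib

    import requests  # assumed HTTP client; any client library works

    content = b'Hello, binary world.'
    # The endpoint splits the header value on '=', so the expected shape
    # is '<algorithm>=<hex digest>'.
    digest = 'sha256=' + hashlib.sha256(content).hexdigest()

    rsp = requests.post(
        'http://localhost:8000/ldp',  # hypothetical local instance
        data=content,
        headers={'Content-Type': 'application/octet-stream',
                 'Digest': digest})
    assert rsp.status_code == 201  # 409 signals a checksum mismatch
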
@@ -290,7 +293,7 @@ def post_resource(parent_uid):
     rsp_headers.update(_headers_from_metadata(rsrc))
     rsp_headers['Location'] = uri

-    if mimetype and rdf_fmt is None:
+    if mimetype and kwargs.get('rdf_fmt') is None:
         rsp_headers['Link'] = (f'<{uri}/fcr:metadata>; rel="describedby"; '
                                f'anchor="{uri}"')

@@ -313,24 +316,28 @@ def put_resource(uid):
     if cond_ret:
         return cond_ret

-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()

     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=')

     try:
-        evt, rsrc = rsrc_api.create_or_replace(
-            uid, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
-    except (InvalidResourceError, ResourceExistsError) as e:
+        evt, rsrc = rsrc_api.create_or_replace(uid, **kwargs)
+    except (
+            InvalidResourceError, ChecksumValidationError,
+            ResourceExistsError) as e:
         return str(e), 409
     except (ServerManagedTermError, SingleSubjectError) as e:
         return str(e), 412
@@ -346,7 +353,7 @@ def put_resource(uid):
     if evt == RES_CREATED:
         rsp_code = 201
         rsp_headers['Location'] = rsp_body = uri
-        if mimetype and not rdf_data:
+        if mimetype and not kwargs.get('rdf_data'):
             rsp_headers['Link'] = f'<{uri}/fcr:metadata>; rel="describedby"'
     else:
         rsp_code = 204

+ 9 - 2
lakesuperior/etc.defaults/application.yml

@@ -23,9 +23,16 @@ data_dir:
 # checksum of the contents of the file.
 uuid:
     # Algorithm used to calculate the hash that generates the content path.
-    # One of: sha1, sha224, sha256, sha384, or sha512, corresponding to the
-    # omonymous hashlib function:
+    # This can be any one of the Python hashlib functions:
     # https://docs.python.org/3/library/hashlib.html
+    #
+    # This needs to be ``sha1`` if compatibility with the Fedora4 file layout
+    # is required; however, in security-sensitive environments it is strongly
+    # advised to use a stronger algorithm, since SHA1 is known to be
+    # vulnerable to collision attacks: see https://shattered.io/
+    #
+    # `blake2b` is a strong, fast cryptographic alternative to SHA2/3:
+    # https://blake2.net/
     algo: sha1

 # Data store configuration.
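
With the hard-coded ``sha1`` gone (see ``default_layout.py`` below), any
algorithm name accepted by ``hashlib.new()`` is a valid ``algo`` value. A
quick way to check what the running interpreter supports (a standalone sketch,
not part of the commit):

    import hashlib

    # Names guaranteed on every Python build; OpenSSL may provide more,
    # listed in hashlib.algorithms_available.
    print(sorted(hashlib.algorithms_guaranteed))

    # The configured value ends up in calls equivalent to this:
    h = hashlib.new('blake2b')
    h.update(b'example content')
    print(h.hexdigest())
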

+ 10 - 4
lakesuperior/model/ldp_nr.py

@@ -15,7 +15,6 @@ from lakesuperior.model.ldp_rs import LdpRs
 nonrdfly = env.app_globals.nonrdfly
 logger = logging.getLogger(__name__)

-
 class LdpNr(Ldpr):
     """LDP-NR (Non-RDF Source).

@@ -30,7 +29,8 @@ class LdpNr(Ldpr):
     }

     def __init__(self, uuid, stream=None, mimetype=None,
-            disposition=None, **kwargs):
+            disposition=None, prov_cksum_algo=None, prov_cksum=None,
+            **kwargs):
         """
         """
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         """
         """
@@ -51,6 +51,8 @@ class LdpNr(Ldpr):
                     if self.is_stored
                     else 'application/octet-stream')

+        self.prov_cksum_algo = prov_cksum_algo
+        self.prov_cksum = prov_cksum
         self.disposition = disposition


@@ -92,8 +94,10 @@

         :rtype: str
         """
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
         cksum_term = self.metadata.value(nsc['premis'].hasMessageDigest)
-        cksum = str(cksum_term).replace('urn:sha1:','')
+        cksum = str(cksum_term).replace(f'urn:{default_hash_algo}:','')
         return nonrdfly.__class__.local_path(
                 nonrdfly.root, cksum, nonrdfly.bl, nonrdfly.bc)
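
``local_path()`` itself is unchanged by this commit, so its body is not shown
here. For orientation only, a hypothetical equivalent of the pairtree scheme
its signature suggests (reading ``bl`` as branch length and ``bc`` as branch
count; the real implementation may differ):

    import os

    def pairtree_path(root, cksum, bl=4, bc=4):
        # Hypothetical sketch: carve `bc` branches of `bl` characters each
        # off the front of the checksum, then store the file under its full
        # checksum inside the resulting directory tree.
        branches = [cksum[i * bl:(i + 1) * bl] for i in range(bc)]
        return os.path.join(root, *branches, cksum)

    # pairtree_path('/data', '356a192b7913b04c54574d18c28d46e6395428ab')
    # -> '/data/356a/192b/7913/b04c/356a192b7913b04c54574d18c28d46e6395428ab'
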
 
 
@@ -106,7 +110,9 @@ class LdpNr(Ldpr):
             updated.
         """
         # Persist the stream.
-        self.digest, self.size = nonrdfly.persist(self.stream)
+        self.digest, self.size = nonrdfly.persist(
+                self.uid, self.stream, prov_cksum_algo=self.prov_cksum_algo,
+                prov_cksum=self.prov_cksum)

         # Try to persist metadata. If it fails, delete the file.
         logger.debug('Persisting LDP-NR triples in {}'.format(self.uri))

+ 50 - 22
lakesuperior/store/ldp_nr/default_layout.py

@@ -1,11 +1,13 @@
+import hashlib
 import logging
 import os
 import shutil

-from hashlib import sha1
 from uuid import uuid4

+from lakesuperior import env
 from lakesuperior.store.ldp_nr.base_non_rdf_layout import BaseNonRdfLayout
+from lakesuperior.exceptions import ChecksumValidationError


 logger = logging.getLogger(__name__)
@@ -16,7 +18,7 @@ class DefaultLayout(BaseNonRdfLayout):
     Default file layout.

     This is a simple filesystem layout that stores binaries in pairtree folders
-    in a local filesystem. Parameters can be specified for the 
+    in a local filesystem. Parameters can be specified for the
     """
     """
     @staticmethod
     @staticmethod
     def local_path(root, uuid, bl=4, bc=4):
     def local_path(root, uuid, bl=4, bc=4):
@@ -58,7 +60,9 @@ class DefaultLayout(BaseNonRdfLayout):
         os.makedirs(self.root + '/tmp')


-    def persist(self, stream, bufsize=8192):
+    def persist(
+            self, uid, stream, bufsize=8192, prov_cksum=None,
+            prov_cksum_algo=None):
         r"""
         r"""
         Store the stream in the file system.
         Store the stream in the file system.
 
 
@@ -67,46 +71,70 @@ class DefaultLayout(BaseNonRdfLayout):
         to disk and hashed, the temp file is moved to its final location which
         is determined by the hash value.

+        :param str uid: UID of the resource.
         :param IOstream stream: file-like object to persist.
         :param int bufsize: Chunk size. 2\*\*12 to 2\*\*15 is a good range.
+        :param str prov_cksum: Checksum provided by the client to verify
+            that the content received matches what was sent. If None (the
+            default), no verification takes place.
+        :param str prov_cksum_algo: Verification algorithm used to validate
+            the integrity of the user-provided data. If this differs from
+            the default hash algorithm set in the application configuration,
+            which is used to calculate the checksum of the file for storage
+            purposes, a separate hash is calculated specifically for
+            validation. It is more efficient to use the same algorithm and
+            avoid a second checksum calculation.
         """
-        tmp_file = '{}/tmp/{}'.format(self.root, uuid4())
+        # The temp file is created on the destination filesystem to minimize
+        # time and risk of moving it to its final destination.
+        tmp_fname = f'{self.root}/tmp/{uuid4()}'
+
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
+        if prov_cksum_algo is None:
+            prov_cksum_algo = default_hash_algo
         try:
-            with open(tmp_file, 'wb') as f:
-                logger.debug('Writing temp file to {}.'.format(tmp_file))
+            with open(tmp_fname, 'wb') as f:
+                logger.debug(f'Writing temp file to {tmp_fname}.')

-                hash = sha1()
+                store_hash = hashlib.new(default_hash_algo)
+                verify_hash = (
+                        store_hash if prov_cksum_algo == default_hash_algo
+                        else hashlib.new(prov_cksum_algo))
                 size = 0
                 while True:
                     buf = stream.read(bufsize)
                     if not buf:
                         break
-                    hash.update(buf)
+                    store_hash.update(buf)
+                    if verify_hash is not store_hash:
+                        verify_hash.update(buf)
                     f.write(buf)
                     size += len(buf)
+
+                if prov_cksum and verify_hash.hexdigest() != prov_cksum:
+                    raise ChecksumValidationError(
+                        uid, prov_cksum, verify_hash.hexdigest())
         except:
-            logger.exception('File write failed on {}.'.format(tmp_file))
-            os.unlink(tmp_file)
+            logger.exception(f'File write failed on {tmp_fname}.')
+            os.unlink(tmp_fname)
             raise
         if size == 0:
             logger.warn('Zero-length file received.')

+        # If the file exists already, don't bother rewriting it.
+        dst = __class__.local_path(
+                self.root, store_hash.hexdigest(), self.bl, self.bc)
+        if os.path.exists(dst):
+            logger.info(f'File exists on {dst}. Not overwriting.')
+
         # Move temp file to final destination.
-        uuid = hash.hexdigest()
-        dst = __class__.local_path(self.root, uuid, self.bl, self.bc)
-        logger.debug('Saving file to disk: {}'.format(dst))
+        logger.debug(f'Saving file to disk: {dst}')
         if not os.access(os.path.dirname(dst), os.X_OK):
             os.makedirs(os.path.dirname(dst))
+        os.rename(tmp_fname, dst)

-        # If the file exists already, don't bother rewriting it.
-        if os.path.exists(dst):
-            logger.info(
-                    'File exists on {}. Not overwriting.'.format(dst))
-            os.unlink(tmp_file)
-        else:
-            os.rename(tmp_file, dst)
-
-        return uuid, size
+        return store_hash.hexdigest(), size


     def delete(self, uuid):
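
The core of the new ``persist()`` is worth isolating: one hash determines the
storage path, a second verifies the client's claim, and a single hash object
serves both roles when the two algorithms coincide, avoiding a redundant
computation. The same pattern in a self-contained sketch:

    import hashlib
    from io import BytesIO

    def hash_stream(stream, store_algo='sha1', verify_algo=None, bufsize=8192):
        """Return (storage digest, verification digest) in one pass."""
        store_hash = hashlib.new(store_algo)
        # Reuse the storage hash for verification when the algorithms match.
        verify_hash = (
            store_hash if verify_algo in (None, store_algo)
            else hashlib.new(verify_algo))
        while True:
            buf = stream.read(bufsize)
            if not buf:
                break
            store_hash.update(buf)
            if verify_hash is not store_hash:
                verify_hash.update(buf)
        return store_hash.hexdigest(), verify_hash.hexdigest()

    print(hash_stream(BytesIO(b'data'), 'sha1', 'sha256'))
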

+ 56 - 2
tests/2_endpoints/test_ldp.py

@@ -3,7 +3,7 @@ import pytest

 from base64 import b64encode
 from datetime import timedelta
-from hashlib import sha1
+from hashlib import sha1, sha256, blake2b
 from uuid import uuid4
 from werkzeug.http import http_date

@@ -808,7 +808,7 @@ class TestDigestHeaders:
         TODO This is by design for now; when a reliable hashing method
         for a graph is devised, this test should change.
         """
-        path = '/ldp/test_etag_rdf1'
+        path = f'/ldp/{uuid4()}'

         put_rsp = self.client.put(path)
         assert not put_rsp.headers.get('etag')
@@ -819,6 +819,60 @@ class TestDigestHeaders:
         assert not get_rsp.headers.get('digest')


+    def test_digest_put(self):
+        """
+        Test the ``Digest`` header with PUT to verify content integrity.
+        """
+        path1 = f'/ldp/{uuid4()}'
+        path2 = f'/ldp/{uuid4()}'
+        path3 = f'/ldp/{uuid4()}'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path2, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 204
+
+
+    def test_digest_post(self):
+        """
+        Test the ``Digest`` header with POST to verify content integrity.
+        """
+        path = '/ldp'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.post(path, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 201
+
+

 @pytest.mark.usefixtures('client_class')
 class TestETagCondHeaders: