
Implement Digest header in POST and PUT.

Stefano Cossu, 5 years ago
commit bd6f5000fd

+ 15 - 4
docs/fcrepo4_deltas.rst

@@ -8,10 +8,11 @@ clients will use it.
 Not yet implemented (but in the plans)
 --------------------------------------
 
--  Various headers handling (partial)
--  AuthN/Z
--  Fixity check
--  Blank nodes
+- Various headers handling (partial)
+- AuthN and WebAC-based authZ
+- Fixity check
+- Blank nodes (at least partly working, but untested)
+- Multiple byte ranges for the ``Range`` request header
 
 Potentially breaking changes
 ----------------------------
@@ -62,6 +63,16 @@ regardless of whether the tombstone exists or not.
 Lakesuperior will return ``405`` only if the tombstone actually exists,
 ``404`` otherwise.
 
+``Limit`` Header
+~~~~~~~~~~~~~~~~
+
+Lakesuperior does not support the ``Limit`` header, which in FCREPO can be
+used to limit the number of "child" resources displayed for a container
+graph. This header seems to serve a mostly cosmetic function in FCREPO,
+compensating for performance limitations (displaying a page with many
+thousands of children in the UI can take minutes); since Lakesuperior
+already offers options in the ``Prefer`` header to not return any children,
+this option is not implemented (see the sketch after this file's diff).
+
 Web UI
 ~~~~~~
 

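For reference, a minimal sketch of the ``Prefer`` alternative mentioned in the
hunk above, using the standard LDP ``PreferContainment`` URI. The base URL and
container path are hypothetical.

    import requests

    # Ask for the container representation without its containment triples,
    # i.e. without the potentially huge child listing.
    rsp = requests.get(
        'http://localhost:8000/ldp/big_container',
        headers={
            'Prefer': 'return=representation; '
                'omit="http://www.w3.org/ns/ldp#PreferContainment"',
        })
    print(rsp.status_code)
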
+ 1 - 1
docs/release_notes.rst

@@ -7,7 +7,7 @@ Release Notes
 
 *October 10, 2018*
 
-A hotfix relase was necessary to adjust settings for the source to build
+A hotfix release was necessary to adjust settings for the source to build
 correctly on Read The Docs and Docker Hub, and to package correctly on PyPI.
 
 1.0 Alpha 18

+ 31 - 24
lakesuperior/endpoints/ldp.py

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import pdb
 
@@ -19,7 +20,8 @@ from lakesuperior import env
 from lakesuperior.api import resource as rsrc_api
 from lakesuperior.dictionaries.namespaces import ns_collection as nsc
 from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
-from lakesuperior.exceptions import (ResourceNotExistsError, TombstoneError,
+from lakesuperior.exceptions import (
+        ChecksumValidationError, ResourceNotExistsError, TombstoneError,
         ServerManagedTermError, InvalidResourceError, SingleSubjectError,
         ResourceExistsError, IncompatibleLdpTypeError)
 from lakesuperior.globals import RES_CREATED
@@ -258,28 +260,29 @@ def post_resource(parent_uid):
     """
     rsp_headers = std_headers.copy()
     slug = request.headers.get('Slug')
-    logger.debug('Slug: {}'.format(slug))
 
-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()
 
     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=', 1)
 
     try:
-        rsrc = rsrc_api.create(
-            parent_uid, slug, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
+        rsrc = rsrc_api.create(parent_uid, slug, **kwargs)
     except ResourceNotExistsError as e:
         return str(e), 404
-    except InvalidResourceError as e:
+    except (InvalidResourceError, ChecksumValidationError) as e:
         return str(e), 409
     except TombstoneError as e:
         return _tombstone_response(e, uid)
@@ -290,7 +293,7 @@ def post_resource(parent_uid):
     rsp_headers.update(_headers_from_metadata(rsrc))
     rsp_headers['Location'] = uri
 
-    if mimetype and rdf_fmt is None:
+    if mimetype and kwargs.get('rdf_fmt') is None:
         rsp_headers['Link'] = (f'<{uri}/fcr:metadata>; rel="describedby"; '
                                f'anchor="{uri}"')
 
@@ -313,24 +316,28 @@ def put_resource(uid):
     if cond_ret:
         return cond_ret
 
-    handling, disposition = set_post_put_params()
+    kwargs = {}
+    kwargs['handling'], kwargs['disposition'] = set_post_put_params()
     stream, mimetype = _bistream_from_req()
 
     if mimetype in rdf_parsable_mimetypes:
         # If the content is RDF, localize in-repo URIs.
         global_rdf = stream.read()
-        rdf_data = g.tbox.localize_payload(global_rdf)
-        rdf_fmt = mimetype
-        stream = mimetype = None
+        kwargs['rdf_data'] = g.tbox.localize_payload(global_rdf)
+        kwargs['rdf_fmt'] = mimetype
     else:
-        rdf_data = rdf_fmt = None
+        kwargs['stream'] = stream
+        kwargs['mimetype'] = mimetype
+        # Check digest if requested.
+        if 'digest' in request.headers:
+            kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
+                    request.headers['digest'].split('=', 1)
 
     try:
-        evt, rsrc = rsrc_api.create_or_replace(
-            uid, stream=stream, mimetype=mimetype,
-            rdf_data=rdf_data, rdf_fmt=rdf_fmt, handling=handling,
-            disposition=disposition)
-    except (InvalidResourceError, ResourceExistsError) as e:
+        evt, rsrc = rsrc_api.create_or_replace(uid, **kwargs)
+    except (
+            InvalidResourceError, ChecksumValidationError,
+            ResourceExistsError) as e:
         return str(e), 409
     except (ServerManagedTermError, SingleSubjectError) as e:
         return str(e), 412
@@ -346,7 +353,7 @@ def put_resource(uid):
     if evt == RES_CREATED:
         rsp_code = 201
         rsp_headers['Location'] = rsp_body = uri
-        if mimetype and not rdf_data:
+        if mimetype and not kwargs.get('rdf_data'):
             rsp_headers['Link'] = f'<{uri}/fcr:metadata>; rel="describedby"'
     else:
         rsp_code = 204

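A standalone illustration of the ``Digest`` parsing added above, with the
Flask ``request.headers`` object stood in by a plain dict; names are local to
this sketch.

    import hashlib

    content = b'hello'
    headers = {'digest': f'sha256={hashlib.sha256(content).hexdigest()}'}

    kwargs = {}
    if 'digest' in headers:
        # 'sha256=<hex>' -> ('sha256', '<hex>'). The maxsplit argument guards
        # against '=' characters in the checksum value itself.
        kwargs['prov_cksum_algo'], kwargs['prov_cksum'] = \
                headers['digest'].split('=', 1)

    print(kwargs['prov_cksum_algo'])  # sha256
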
+ 9 - 2
lakesuperior/etc.defaults/application.yml

@@ -23,9 +23,16 @@ data_dir:
 # checksum of the contents of the file.
 uuid:
     # Algorithm used to calculate the hash that generates the content path.
-    # One of: sha1, sha224, sha256, sha384, or sha512, corresponding to the
-    # omonymous hashlib function:
+    # This can be any one of the Python hashlib functions:
     # https://docs.python.org/3/library/hashlib.html
+    #
+    # This needs to be ``sha1`` if compatibility with the Fedora4 file layout
+    # is required; however, in security-sensitive environments it is strongly
+    # advised to use a stronger algorithm, since SHA1 is known to be
+    # vulnerable to collision attacks: see https://shattered.io/
+    #
+    # ``blake2b`` is a strong, fast cryptographic alternative to SHA2/3:
+    # https://blake2.net/
     algo: sha1
 
 # Data store configuration.

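A small sketch of how a configured ``algo`` value maps onto ``hashlib``: any
name accepted by ``hashlib.new()`` works, which is what makes the setting
algorithm-agnostic.

    import hashlib

    algo = 'blake2b'  # stand-in for the configured ``uuid: algo`` value
    assert algo in hashlib.algorithms_available

    # The same call works unchanged for 'sha1', 'sha256', 'blake2b', etc.
    cksum = hashlib.new(algo, b'file contents').hexdigest()
    print(cksum)
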
+ 10 - 4
lakesuperior/model/ldp_nr.py

@@ -15,7 +15,6 @@ from lakesuperior.model.ldp_rs import LdpRs
 nonrdfly = env.app_globals.nonrdfly
 logger = logging.getLogger(__name__)
 
-
 class LdpNr(Ldpr):
     """LDP-NR (Non-RDF Source).
 
@@ -30,7 +29,8 @@ class LdpNr(Ldpr):
     }
 
     def __init__(self, uuid, stream=None, mimetype=None,
-            disposition=None, **kwargs):
+            disposition=None, prov_cksum_algo=None, prov_cksum=None,
+            **kwargs):
         """
         Extends Ldpr.__init__ by adding LDP-NR specific parameters.
         """
@@ -51,6 +51,8 @@ class LdpNr(Ldpr):
                     if self.is_stored
                     else 'application/octet-stream')
 
+        self.prov_cksum_algo = prov_cksum_algo
+        self.prov_cksum = prov_cksum
         self.disposition = disposition
 
 
@@ -92,8 +94,10 @@ class LdpNr(Ldpr):
 
         :rtype: str
         """
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
         cksum_term = self.metadata.value(nsc['premis'].hasMessageDigest)
-        cksum = str(cksum_term).replace('urn:sha1:','')
+        cksum = str(cksum_term).replace(f'urn:{default_hash_algo}:','')
         return nonrdfly.__class__.local_path(
                 nonrdfly.root, cksum, nonrdfly.bl, nonrdfly.bc)
 
@@ -106,7 +110,9 @@ class LdpNr(Ldpr):
             updated.
         """
         # Persist the stream.
-        self.digest, self.size = nonrdfly.persist(self.stream)
+        self.digest, self.size = nonrdfly.persist(
+                self.uid, self.stream, prov_cksum_algo=self.prov_cksum_algo,
+                prov_cksum=self.prov_cksum)
 
         # Try to persist metadata. If it fails, delete the file.
         logger.debug('Persisting LDP-NR triples in {}'.format(self.uri))

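The ``local_path()`` call above resolves a checksum to a pairtree-style
location on disk. Below is a hypothetical re-implementation for illustration
only (the actual method may differ), assuming ``bc`` branches of ``bl``
characters each.

    def pairtree_path(root, cksum, bl=4, bc=4):
        # Split the first bl * bc characters of the checksum into bc
        # directory segments of bl characters each.
        segments = [cksum[i:i + bl] for i in range(0, bl * bc, bl)]
        return '/'.join([root] + segments + [cksum])

    print(pairtree_path('/data/ldpnr_store', 'abcd1234ef5678901234'))
    # /data/ldpnr_store/abcd/1234/ef56/7890/abcd1234ef5678901234
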
+ 50 - 22
lakesuperior/store/ldp_nr/default_layout.py

@@ -1,11 +1,13 @@
+import hashlib
 import logging
 import os
 import shutil
 
-from hashlib import sha1
 from uuid import uuid4
 
+from lakesuperior import env
 from lakesuperior.store.ldp_nr.base_non_rdf_layout import BaseNonRdfLayout
+from lakesuperior.exceptions import ChecksumValidationError
 
 
 logger = logging.getLogger(__name__)
@@ -16,7 +18,7 @@ class DefaultLayout(BaseNonRdfLayout):
     Default file layout.
 
     This is a simple filesystem layout that stores binaries in pairtree folders
-    in a local filesystem. Parameters can be specified for the 
+    in a local filesystem. Parameters can be specified for the
     """
     @staticmethod
     def local_path(root, uuid, bl=4, bc=4):
@@ -58,7 +60,9 @@ class DefaultLayout(BaseNonRdfLayout):
         os.makedirs(self.root + '/tmp')
 
 
-    def persist(self, stream, bufsize=8192):
+    def persist(
+            self, uid, stream, bufsize=8192, prov_cksum=None,
+            prov_cksum_algo=None):
         r"""
         Store the stream in the file system.
 
@@ -67,46 +71,70 @@ class DefaultLayout(BaseNonRdfLayout):
         to disk and hashed, the temp file is moved to its final location which
         is determined by the hash value.
 
+        :param str uid: UID of the resource.
         :param IOstream stream: file-like object to persist.
         :param int bufsize: Chunk size. 2\*\*12 to 2\*\*15 is a good range.
+        :param str prov_cksum: Checksum provided by the client to verify
+            that the content received matches what has been sent. If None
+            (the default), no verification will take place.
+        :param str prov_cksum_algo: Verification algorithm to validate the
+            integrity of the user-provided data. If this is different from
+            the default hash algorithm set in the application configuration,
+            which is used to calculate the checksum of the file for storage
+            purposes, a separate hash is calculated specifically for
+            validation. It is clearly more efficient to use the same
+            algorithm and avoid a second checksum calculation.
         """
-        tmp_file = '{}/tmp/{}'.format(self.root, uuid4())
+        # The temp file is created on the destination file system to minimize
+        # the time and risk involved in moving it to its final destination.
+        tmp_fname = f'{self.root}/tmp/{uuid4()}'
+
+        default_hash_algo = \
+                env.app_globals.config['application']['uuid']['algo']
+        if prov_cksum_algo is None:
+            prov_cksum_algo = default_hash_algo
         try:
-            with open(tmp_file, 'wb') as f:
-                logger.debug('Writing temp file to {}.'.format(tmp_file))
+            with open(tmp_fname, 'wb') as f:
+                logger.debug(f'Writing temp file to {tmp_fname}.')
 
-                hash = sha1()
+                store_hash = hashlib.new(default_hash_algo)
+                verify_hash = (
+                        store_hash if prov_cksum_algo == default_hash_algo
+                        else hashlib.new(prov_cksum_algo))
                 size = 0
                 while True:
                     buf = stream.read(bufsize)
                     if not buf:
                         break
-                    hash.update(buf)
+                    store_hash.update(buf)
+                    if verify_hash is not store_hash:
+                        verify_hash.update(buf)
                     f.write(buf)
                     size += len(buf)
+
+                if prov_cksum and verify_hash.hexdigest() != prov_cksum:
+                    raise ChecksumValidationError(
+                        uid, prov_cksum, verify_hash.hexdigest())
         except:
-            logger.exception('File write failed on {}.'.format(tmp_file))
-            os.unlink(tmp_file)
+            logger.exception(f'File write failed on {tmp_fname}.')
+            os.unlink(tmp_fname)
             raise
         if size == 0:
             logger.warn('Zero-length file received.')
 
+        # If the file exists already, don't bother rewriting it: the
+        # destination path is derived from the content hash, so the stored
+        # content is identical.
+        dst = __class__.local_path(
+                self.root, store_hash.hexdigest(), self.bl, self.bc)
+        if os.path.exists(dst):
+            logger.info(f'File exists on {dst}. Not overwriting.')
+            os.unlink(tmp_fname)
+            return store_hash.hexdigest(), size
+
         # Move temp file to final destination.
-        uuid = hash.hexdigest()
-        dst = __class__.local_path(self.root, uuid, self.bl, self.bc)
-        logger.debug('Saving file to disk: {}'.format(dst))
+        logger.debug(f'Saving file to disk: {dst}')
         if not os.access(os.path.dirname(dst), os.X_OK):
             os.makedirs(os.path.dirname(dst))
+        os.rename(tmp_fname, dst)
 
-        # If the file exists already, don't bother rewriting it.
-        if os.path.exists(dst):
-            logger.info(
-                    'File exists on {}. Not overwriting.'.format(dst))
-            os.unlink(tmp_file)
-        else:
-            os.rename(tmp_file, dst)
-
-        return uuid, size
+        return store_hash.hexdigest(), size
 
 
     def delete(self, uuid):

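A self-contained sketch of the dual-hash strategy in ``persist()`` above: the
storage digest addresses the file on disk, while a second digest is computed
only when the client-provided checksum uses a different algorithm. Names are
local to this sketch.

    import hashlib
    import io

    def checked_copy(stream, out, store_algo='sha1', prov_algo=None,
                     prov_cksum=None, bufsize=8192):
        store_hash = hashlib.new(store_algo)
        # Reuse the storage hash when the algorithms match; otherwise keep a
        # second hash just for verification.
        verify_hash = (
                store_hash if prov_algo in (None, store_algo)
                else hashlib.new(prov_algo))
        while True:
            buf = stream.read(bufsize)
            if not buf:
                break
            store_hash.update(buf)
            if verify_hash is not store_hash:
                verify_hash.update(buf)
            out.write(buf)
        if prov_cksum and verify_hash.hexdigest() != prov_cksum:
            raise ValueError('Provided checksum does not match content.')
        return store_hash.hexdigest()

    data = b'hello world'
    digest = checked_copy(
            io.BytesIO(data), io.BytesIO(), store_algo='sha1',
            prov_algo='sha256', prov_cksum=hashlib.sha256(data).hexdigest())
    print(digest)  # sha1 digest; determines the storage path
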
+ 56 - 2
tests/2_endpoints/test_ldp.py

@@ -3,7 +3,7 @@ import pytest
 
 from base64 import b64encode
 from datetime import timedelta
-from hashlib import sha1
+from hashlib import sha1, sha256, blake2b
 from uuid import uuid4
 from werkzeug.http import http_date
 
@@ -808,7 +808,7 @@ class TestDigestHeaders:
         TODO This is by design for now; when a reliable hashing method
         for a graph is devised, this test should change.
         """
-        path = '/ldp/test_etag_rdf1'
+        path = f'/ldp/{uuid4()}'
 
         put_rsp = self.client.put(path)
         assert not put_rsp.headers.get('etag')
@@ -819,6 +819,60 @@ class TestDigestHeaders:
         assert not get_rsp.headers.get('digest')
 
 
+    def test_digest_put(self):
+        """
+        Test the ``Digest`` header with PUT to verify content integrity.
+        """
+        path1 = f'/ldp/{uuid4()}'
+        path2 = f'/ldp/{uuid4()}'
+        path3 = f'/ldp/{uuid4()}'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.put(path1, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path2, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.put(path3, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 204
+
+
+    def test_digest_post(self):
+        """
+        Test the ``Digest`` header with POST to verify content integrity.
+        """
+        path = '/ldp'
+        content = uuid4().bytes
+        content_sha1 = sha1(content).hexdigest()
+        content_sha256 = sha256(content).hexdigest()
+        content_blake2b = blake2b(content).hexdigest()
+
+        assert self.client.post(path, data=content, headers={
+                'digest': 'sha1=abcd'}).status_code == 409
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'sha1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA1={content_sha1}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'SHA256={content_sha256}'}).status_code == 201
+
+        assert self.client.post(path, data=content, headers={
+                'digest': f'blake2b={content_blake2b}'}).status_code == 201
+
 
 @pytest.mark.usefixtures('client_class')
 class TestETagCondHeaders: