# digest.py
  1. import pickle
  2. from hashlib import sha1
  3. from rdflib.term import Literal, URIRef, Variable
  4. from lakesuperior.core.namespaces import ns_collection as nsc
  5. class Digest:
  6. '''
  7. Various digest functions. May be merged into something more generic later.
  8. '''
  9. @staticmethod
  10. def rdf_cksum(g):
  11. '''
  12. Generate a checksum for a graph.
  13. This is not straightforward because a graph is derived from an
  14. unordered data structure (RDF).
  15. What this method does is ordering the graph by subject, predicate,
  16. object, then creating a pickle string and a checksum of it.
  17. N.B. The context of the triples is ignored, so isomorphic graphs would
  18. have the same checksum regardless of the context(s) they are found in.
  19. @TODO This can be later reworked to use a custom hashing algorithm.
  20. @param rdflib.Graph g The graph to be hashed.
  21. @return string SHA1 checksum.
  22. '''
  23. # Remove the messageDigest property, which at this point is very likely
  24. # old.
  25. g.remove((Variable('s'), nsc['premis'].messageDigest, Variable('o')))
  26. ord_g = sorted(list(g), key=lambda x : (x[0], x[1], x[2]))
  27. hash = sha1(pickle.dumps(ord_g)).hexdigest()
  28. return hash
  29. @staticmethod
  30. def non_rdf_checksum(data):
  31. '''
  32. Generate a checksum of non-RDF content.
  33. @TODO This can be later reworked to use a custom hashing algorithm.
  34. '''
  35. return sha1(data).hexdigest()