toolbox.py

import logging
import pickle
import re

from collections import defaultdict
from hashlib import sha1

from flask import g
from rdflib.term import URIRef, Variable

from lakesuperior.dictionaries.namespaces import ns_collection as nsc


class Toolbox:
    '''
    Utility class to translate and generate strings and other objects.
    '''

    _logger = logging.getLogger(__name__)

    ROOT_NODE_URN = nsc['fcsystem'].root

    def replace_term_domain(self, term, search, replace):
        '''
        Replace the domain of a term.

        @param term (URIRef) The term (URI) to change.
        @param search (string) Domain string to replace.
        @param replace (string) Domain string to use for replacement.

        @return URIRef
        '''
        s = str(term)
        if s.startswith(search):
            s = s.replace(search, replace)

        return URIRef(s)
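
    # Illustrative sketch only; both domains below are made-up values, not
    # anything defined in this module:
    #
    #   >>> Toolbox().replace_term_domain(
    #   ...     URIRef('http://old.example.org/rsrc/1'),
    #   ...     'http://old.example.org', 'http://new.example.org')
    #   rdflib.term.URIRef('http://new.example.org/rsrc/1')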

    def uuid_to_uri(self, uuid):
        '''Convert a UUID to a URI.

        @return URIRef
        '''
        uri = '{}/{}'.format(g.webroot, uuid) if uuid else g.webroot

        return URIRef(uri)

    def uri_to_uuid(self, uri):
        '''Convert an absolute URI (internal or external) to a UUID.

        @return string
        '''
        if uri == self.ROOT_NODE_URN:
            return None
        elif uri.startswith(nsc['fcres']):
            return str(uri).replace(nsc['fcres'], '')
        else:
            return str(uri).replace(g.webroot, '').strip('/')
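
    # A hedged round-trip sketch for the two methods above, assuming a Flask
    # request context where ``g.webroot`` is 'http://localhost:8000/ldp'
    # (a hypothetical value, not one defined in this module):
    #
    #   >>> tb = Toolbox()
    #   >>> tb.uuid_to_uri('84c55af0')
    #   rdflib.term.URIRef('http://localhost:8000/ldp/84c55af0')
    #   >>> tb.uri_to_uuid(URIRef('http://localhost:8000/ldp/84c55af0'))
    #   '84c55af0'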

    def localize_string(self, s):
        '''Convert URIs into URNs in a string using the application base URI.

        @param string s Input string.

        @return string
        '''
        if s.strip('/') == g.webroot:
            return str(self.ROOT_NODE_URN)
        else:
            return s.strip('/').replace(g.webroot + '/', str(nsc['fcres']))
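
    # Sketch of the two cases, under the same hypothetical ``g.webroot``
    # ('http://localhost:8000/ldp'):
    #
    #   'http://localhost:8000/ldp/a/b'  ->  str(nsc['fcres']) + 'a/b'
    #   'http://localhost:8000/ldp'      ->  str(ROOT_NODE_URN)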

    def localize_term(self, uri):
        '''
        Convert a URI into a URN.

        @param rdflib.term.URIRef uri Input URI.

        @return rdflib.term.URIRef
        '''
        return URIRef(self.localize_string(str(uri)))

    def localize_graph(self, gr):
        '''
        Localize a graph.
        '''
        q = '''
            CONSTRUCT {{ ?s ?p ?o . }} WHERE {{
              {{
                ?s ?p ?o .
                FILTER (
                  STRSTARTS(str(?s), "{0}")
                  ||
                  STRSTARTS(str(?o), "{0}")
                  ||
                  STRSTARTS(str(?s), "{0}/")
                  ||
                  STRSTARTS(str(?o), "{0}/")
                ) .
              }}
            }}'''.format(g.webroot)
        flt_gr = gr.query(q)

        for t in flt_gr:
            local_s = self.localize_term(t[0])
            local_o = self.localize_term(t[2]) \
                    if isinstance(t[2], URIRef) \
                    else t[2]
            gr.remove(t)
            gr.add((local_s, t[1], local_o))

        return gr

    def localize_ext_str(self, s, urn):
        '''
        Convert global URIs to local ones in a SPARQL or RDF string.

        Also replace empty URIs (`<>`) with a fixed local URN and take care
        of fragments and relative URIs.

        This is a 3-pass replacement. First, global URIs whose webroot matches
        the application's are replaced with local URNs. Then, relative URIs
        are converted to absolute using the URN as the base; finally, the
        root node is appropriately addressed.
        '''
        esc_webroot = g.webroot.replace('/', '\\/')
        #loc_ptn = r'<({}\/?)?(.*?)?(\?.*?)?(#.*?)?>'.format(esc_webroot)
        loc_ptn1 = r'<{}\/?(.*?)>'.format(esc_webroot)
        loc_sub1 = '<{}\\1>'.format(nsc['fcres'])
        s1 = re.sub(loc_ptn1, loc_sub1, s)

        loc_ptn2 = r'<([#?].*?)?>'
        loc_sub2 = '<{}\\1>'.format(urn)
        s2 = re.sub(loc_ptn2, loc_sub2, s1)

        loc_ptn3 = r'<{}([#?].*?)?>'.format(nsc['fcres'])
        loc_sub3 = '<{}\\1>'.format(self.ROOT_NODE_URN)
        s3 = re.sub(loc_ptn3, loc_sub3, s2)

        return s3
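
    # A rough sketch of what each pass rewrites, assuming ``g.webroot`` is
    # 'http://localhost:8000/ldp' (hypothetical) and ``urn`` is the URN of
    # the resource being processed:
    #
    #   pass 1: '<http://localhost:8000/ldp/a/b>'  ->  '<' + nsc['fcres'] + 'a/b>'
    #   pass 2: '<>' or '<#frag>'                  ->  '<urn>' or '<urn#frag>'
    #   pass 3: '<' + nsc['fcres'] + '>'           ->  '<' + ROOT_NODE_URN + '>'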

    def globalize_string(self, s):
        '''Convert URNs into URIs in a string using the application base URI.

        @param string s Input string.

        @return string
        '''
        return s.replace(str(nsc['fcres']), g.webroot + '/')

    def globalize_term(self, urn):
        '''
        Convert a URN into a URI using the application base URI.

        @param rdflib.term.URIRef urn Input URN.

        @return rdflib.term.URIRef
        '''
        if urn == self.ROOT_NODE_URN:
            urn = nsc['fcres']

        return URIRef(self.globalize_string(str(urn)))

    def globalize_graph(self, gr):
        '''
        Globalize a graph.
        '''
        q = '''
            CONSTRUCT {{ ?s ?p ?o . }} WHERE {{
              {{
                ?s ?p ?o .
                FILTER (
                  STRSTARTS(str(?s), "{0}")
                  ||
                  STRSTARTS(str(?o), "{0}")
                  ||
                  STRSTARTS(str(?s), "{1}")
                  ||
                  STRSTARTS(str(?o), "{1}")
                ) .
              }}
            }}'''.format(nsc['fcres'], self.ROOT_NODE_URN)
        flt_gr = gr.query(q)

        for t in flt_gr:
            global_s = self.globalize_term(t[0])
            global_o = self.globalize_term(t[2]) \
                    if isinstance(t[2], URIRef) \
                    else t[2]
            gr.remove(t)
            gr.add((global_s, t[1], global_o))

        return gr

    def globalize_rsrc(self, rsrc):
        '''
        Globalize a resource.
        '''
        gr = rsrc.graph
        urn = rsrc.identifier

        global_gr = self.globalize_graph(gr)
        global_uri = self.globalize_term(urn)

        return global_gr.resource(global_uri)
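
    # A minimal usage sketch, assuming ``rsrc`` is an ``rdflib.resource.Resource``
    # whose identifier is an internal URN (e.g. one produced by ``localize_term``);
    # the names below are made up for illustration:
    #
    #   >>> global_rsrc = Toolbox().globalize_rsrc(rsrc)
    #   >>> global_rsrc.identifier   # now an HTTP URI under ``g.webroot``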

    def parse_rfc7240(self, h_str):
        '''
        Parse the `Prefer` header as per https://tools.ietf.org/html/rfc7240

        The `cgi.parse_header` standard method does not work with all possible
        use cases for this header.

        @param h_str (string) The header(s) as a comma-separated list of
        Prefer statements, excluding the `Prefer: ` token.
        '''
        parsed_hdr = defaultdict(dict)

        # Split up headers by comma.
        hdr_list = [ x.strip() for x in h_str.split(',') ]
        for hdr in hdr_list:
            parsed_pref = defaultdict(dict)
            # Split up tokens by semicolon.
            token_list = [ token.strip() for token in hdr.split(';') ]
            prefer_token = token_list.pop(0).split('=')
            prefer_name = prefer_token[0]
            # If the preference token has a '=', it carries a value.
            if len(prefer_token) > 1:
                parsed_pref['value'] = prefer_token[1].strip('"')

            for param_token in token_list:
                # If the token list had a ';', the preference has parameters.
                self._logger.debug('Param token: {}'.format(param_token))
                param_parts = [ prm.strip().strip('"')
                        for prm in param_token.split('=') ]
                param_value = param_parts[1] if len(param_parts) > 1 else None
                parsed_pref['parameters'][param_parts[0]] = param_value

            parsed_hdr[prefer_name] = parsed_pref

        return parsed_hdr
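
    # A hedged parsing sketch; the header value below is a typical LDP
    # `Prefer` header, shown only to illustrate the returned structure,
    # which yields roughly (the actual objects are nested ``defaultdict``s):
    #
    #   >>> Toolbox().parse_rfc7240(
    #   ...     'return=representation; '
    #   ...     'include="http://www.w3.org/ns/ldp#PreferMinimalContainer"')
    #   {'return': {'value': 'representation',
    #               'parameters': {'include':
    #                   'http://www.w3.org/ns/ldp#PreferMinimalContainer'}}}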

    def rdf_cksum(self, gr):
        '''
        Generate a checksum for a graph.

        This is not straightforward because a graph is derived from an
        unordered data structure (RDF).

        This method orders the graph by subject, predicate and object, then
        creates a pickle string and a checksum of it.

        N.B. The context of the triples is ignored, so isomorphic graphs would
        have the same checksum regardless of the context(s) they are found in.

        @TODO This can be later reworked to use a custom hashing algorithm.

        @param rdflib.Graph gr The graph to be hashed.

        @return string SHA1 checksum.
        '''
        # Remove the messageDigest property, which very likely reflects the
        # previous state of the resource.
        gr.remove((Variable('s'), nsc['premis'].messageDigest, Variable('o')))
        ord_gr = sorted(list(gr), key=lambda x: (x[0], x[1], x[2]))
        hash = sha1(pickle.dumps(ord_gr)).hexdigest()

        return hash
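
    # A hedged usage sketch: two graphs holding the same triples, added in a
    # different order, should produce the same checksum, since the triples
    # are sorted before pickling. The terms below are made up for
    # illustration:
    #
    #   >>> from rdflib import Graph
    #   >>> t1 = (nsc['fcres'].a, nsc['fcres'].p, nsc['fcres'].b)
    #   >>> t2 = (nsc['fcres'].c, nsc['fcres'].p, nsc['fcres'].d)
    #   >>> g1 = Graph(); g1.add(t1); g1.add(t2)
    #   >>> g2 = Graph(); g2.add(t2); g2.add(t1)
    #   >>> Toolbox().rdf_cksum(g1) == Toolbox().rdf_cksum(g2)
    #   True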

    def split_uuid(self, uuid):
        '''
        Split a UUID into pairtree segments. This mimics FCREPO4 behavior.
        '''
        path = '{}/{}/{}/{}/{}'.format(uuid[:2], uuid[2:4],
                uuid[4:6], uuid[6:8], uuid)

        return path
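
    # A minimal sketch; the UUID below is made up for illustration:
    #
    #   >>> Toolbox().split_uuid('84c55af0-1234-5678-9abc-def012345678')
    #   '84/c5/5a/f0/84c55af0-1234-5678-9abc-def012345678'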