rsrc_centric_layout.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. import logging
  2. from collections import defaultdict
  3. from flask import g
  4. from rdflib import Graph
  5. from rdflib.namespace import RDF
  6. from rdflib.query import ResultException
  7. from rdflib.resource import Resource
  8. from rdflib.term import Literal
  9. from lakesuperior.dictionaries.namespaces import ns_collection as nsc
  10. from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
  11. from lakesuperior.dictionaries.srv_mgd_terms import srv_mgd_subjects, \
  12. srv_mgd_predicates, srv_mgd_types
  13. from lakesuperior.exceptions import (InvalidResourceError,
  14. ResourceNotExistsError, TombstoneError, PathSegmentError)
  15. META_GR_URI = nsc['fcsystem']['meta']
  16. HIST_GR_URI = nsc['fcsystem']['histmeta']
  17. PTREE_GR_URI = nsc['fcsystem']['pairtree']
  18. VERS_CONT_LABEL = 'fcr:versions'
  19. class RsrcCentricLayout:
  20. '''
  21. This class exposes an interface to build graph store layouts. It also
  22. provides the basics of the triplestore connection.
  23. Some store layouts are provided. New ones aimed at specific uses
  24. and optimizations of the repository may be developed by extending this
  25. class and implementing all its abstract methods.
  26. A layout is implemented via application configuration. However, once
  27. contents are ingested in a repository, changing a layout will most likely
  28. require a migration.
  29. The custom layout must be in the lakesuperior.store_layouts.rdf
  30. package and the class implementing the layout must be called
  31. `StoreLayout`. The module name is the one defined in the app
  32. configuration.
  33. E.g. if the configuration indicates `simple_layout` the application will
  34. look for
  35. `lakesuperior.store_layouts.rdf.simple_layout.SimpleLayout`.
  36. Some method naming conventions:
  37. - Methods starting with `get_` return a resource.
  38. - Methods starting with `list_` return an iterable or generator of URIs.
  39. - Methods starting with `select_` return an iterable or generator with
  40. table-like data such as from a SELECT statement.
  41. - Methods starting with `ask_` return a boolean value.
  42. '''
  _logger = logging.getLogger(__name__)

  # Keys of the namespace collection identifying the per-resource graphs
  # that `delete_rsrc_data` drops.
  _graph_uids = ('fcadmin', 'fcmain', 'fcstruct')

  # Human-readable routing table:
  #   destination graph namespace -> term kind -> set of terms
  # where the term kind is 'p' (predicates) or 't' (RDF types). The
  # `attr_routes` property flattens it into a term -> namespace lookup.
  # @TODO Move to a config file?
  attr_map = {
      nsc['fcadmin']: {
          # Server-managed predicates: triples with one of these predicates
          # are stored in the admin (metadata) graph.
          'p': {
              nsc['fcrepo'].created,
              nsc['fcrepo'].createdBy,
              nsc['fcrepo'].hasParent,
              nsc['fcrepo'].hasVersion,
              nsc['fcrepo'].lastModified,
              nsc['fcrepo'].lastModifiedBy,
              nsc['fcsystem'].tombstone,
              # The next 3 are provided by the user; they are kept in this
              # group for convenience.
              nsc['ldp'].membershipResource,
              nsc['ldp'].hasMemberRelation,
              nsc['ldp'].insertedContentRelation,
              nsc['iana'].describedBy,
              nsc['premis'].hasMessageDigest,
              nsc['premis'].hasSize,
          },
          # Metadata RDF types: `rdf:type` triples with one of these objects
          # are stored in the admin (metadata) graph.
          't': {
              nsc['fcrepo'].Binary,
              nsc['fcrepo'].Container,
              nsc['fcrepo'].Pairtree,
              nsc['fcrepo'].Resource,
              nsc['fcsystem'].Tombstone,
              nsc['ldp'].BasicContainer,
              nsc['ldp'].Container,
              nsc['ldp'].DirectContainer,
              nsc['ldp'].IndirectContainer,
              nsc['ldp'].NonRDFSource,
              nsc['ldp'].RDFSource,
              nsc['ldp'].Resource,
          },
      },
      nsc['fcstruct']: {
          # Containment predicates live in their own graph for optimization
          # purposes.
          'p': {
              nsc['fcsystem'].contains,
              nsc['ldp'].contains,
              nsc['pcdm'].hasMember,
          },
      },
  }

  # RDF type assigned to a graph, keyed by the namespace of the graph URI.
  graph_ns_types = {
      nsc['fcadmin']: nsc['fcsystem'].AdminGraph,
      nsc['fcmain']: nsc['fcsystem'].UserProvidedGraph,
      nsc['fcstruct']: nsc['fcsystem'].StructureGraph,
  }
  99. ## MAGIC METHODS ##
  def __init__(self, conn, config):
      '''
      Set up the layout on top of an existing store connection.

      NOTE: `rdflib.Dataset` requires a RDF 1.1 compliant store with support
      for Graph Store HTTP protocol
      (https://www.w3.org/TR/sparql11-http-rdf-update/). Blazegraph supports
      this only in the (currently unreleased) 2.2 branch. It works with Jena,
      which is currently the reference implementation.

      @param conn Connection object; its `store` and `ds` attributes are
      exposed on the layout.
      @param config Layout configuration, stored unchanged.
      '''
      self._conn = conn
      self.config = config

      # Shortcuts to the connection's store and dataset.
      self.store = conn.store
      #self.UNION_GRAPH_URI = conn.UNION_GRAPH_URI
      self.ds = conn.ds

      # Make the application prefixes available to the dataset.
      self.ds.namespace_manager = nsm
  @property
  def attr_routes(self):
      '''
      Lookup table routing specific triples to specific graphs.

      This is the machine-friendly counterpart of the static `attr_map`
      attribute, which is laid out for human readability and to avoid
      repetition. Terms that are not mapped here (usually user-provided
      triples with no special meaning to the application) go to the
      `fcmain:` graph.

      The table is built on first access and cached on the instance. Its
      structure is similar to:

      {
          'p': {
              <Predicate P1>: <destination graph G1>,
              <Predicate P2>: <destination graph G1>,
              <Predicate P3>: <destination graph G1>,
              <Predicate P4>: <destination graph G2>,
              [...]
          },
          't': {
              <RDF Type T1>: <destination graph G1>,
              <RDF Type T2>: <destination graph G3>,
              [...]
          }
      }
      '''
      if hasattr(self, '_attr_routes'):
          return self._attr_routes

      self._attr_routes = {'p': {}, 't': {}}
      for dest, term_groups in self.attr_map.items():
          for term_k, terms in term_groups.items():
              # Every term of this group points to the same destination.
              self._attr_routes[term_k].update(dict.fromkeys(terms, dest))

      return self._attr_routes
  def bootstrap(self):
      '''
      Wipe the graph store and load the basic system triples.

      The system data are read from a SPARQL update file at a path relative
      to the current working directory.
      '''
      log = self._logger
      dataset = self.ds

      log.info('Deleting all data from the graph store.')
      dataset.update('DROP SILENT ALL')

      log.info('Initializing the graph store with system data.')
      #dataset.default_context.parse(
      #        source='data/bootstrap/rsrc_centric_layout.nq', format='nquads')
      with open('data/bootstrap/rsrc_centric_layout.sparql', 'r') as fh:
          bootstrap_qry = fh.read()
          dataset.update(bootstrap_qry)

      dataset.store.commit()
      dataset.store.close()
  def get_raw(self, uri, ctx=None):
      '''
      Get a raw graph of a non-LDP resource.

      The graph is queried across all contexts or within a specific one.

      @param uri (rdflib.term.URIRef) URI of the subject.
      @param ctx (rdflib.term.URIRef) URI of the optional context. If None
      (the default), all named graphs are queried.

      @return rdflib.Graph
      '''
      bindings = {'s': uri}
      # Leaving ?g unbound lets the query match any named graph.
      if ctx:
          bindings['g'] = ctx

      qry = '''
      CONSTRUCT { ?s ?p ?o . } {
        GRAPH ?g {
          ?s ?p ?o .
        }
      }'''

      return self._parse_construct(qry, init_bindings=bindings)
  def raw_query(self, qry_str):
      '''
      Run a query string against the dataset without any processing.

      @param qry_str (string) The query to execute.

      @return The result of `self.ds.query`, unchanged.
      '''
      result = self.ds.query(qry_str)
      return result
  def extract_imr(
          self, uid, ver_uid=None, strict=True, incl_inbound=False,
          incl_children=True, embed_children=False, **kwargs):
      '''
      See base_rdf_layout.extract_imr.

      @param uid (string) UID of the resource.
      @param ver_uid (string) Optional version UID. If set, the snapshot
      built by `snapshot_uid` is extracted instead of the current resource.
      @param strict (boolean) If True, the resource status is checked with
      `_check_rsrc_status`, which raises if the resource does not exist or
      is a tombstone.
      @param incl_inbound (boolean) If True and the resource has triples,
      inbound relationships are added to the result.
      @param incl_children (boolean) If False, graphs typed as
      `fcsystem:StructureGraph` are left out of the result.
      @param embed_children (boolean) Not implemented; currently ignored.
      @param kwargs Ignored.

      @return rdflib.resource.Resource
      '''
      if ver_uid:
          uid = self.snapshot_uid(uid, ver_uid)

      # Embedding children (`embed_children`) is not implemented and may
      # never be, so only the exclusion of structure graphs is handled.
      incl_child_qry = '' if incl_children else (
          '\n FILTER NOT EXISTS { ?g a fcsystem:StructureGraph .}')

      qry = '''
      CONSTRUCT {?s ?p ?o . }
      WHERE {
        GRAPH fcsystem:meta {
          ?g foaf:primaryTopic ?rsrc .
        }
        GRAPH ?g { ?s ?p ?o . }
      ''' + incl_child_qry + '\n}'

      rsrc_uri = nsc['fcres'][uid]
      # Only ?rsrc appears in the query; the other bindings are passed along
      # exactly as before.
      gr = self._parse_construct(qry, init_bindings={
          'rsrc': rsrc_uri,
          'ag': nsc['fcadmin'][uid],
          'mg': nsc['fcmain'][uid],
          'sg': nsc['fcstruct'][uid],
      })

      if incl_inbound and len(gr):
          gr += self.get_inbound_rel(rsrc_uri)

      # Serializing the graph is costly: do it only if the message is
      # actually going to be emitted.
      if self._logger.isEnabledFor(logging.DEBUG):
          self._logger.debug('Found resource: {}'.format(
              gr.serialize(format='turtle').decode('utf-8')))

      rsrc = Resource(gr, rsrc_uri)
      if strict:
          self._check_rsrc_status(rsrc)

      return rsrc
  def ask_rsrc_exists(self, uid):
      '''
      See base_rdf_layout.ask_rsrc_exists.

      The check looks in the admin graph of the resource for a triple typing
      it as `fcrepo:Resource`.

      @param uid (string) UID of the resource.

      @return boolean
      '''
      admin_gr = self.ds.graph(nsc['fcadmin'][uid])
      subject = nsc['fcres'][uid]
      found = admin_gr[subject : RDF.type : nsc['fcrepo'].Resource]

      return bool(found)
  def get_metadata(self, uid, ver_uid=None, strict=True):
      '''
      Optimized lookup returning only the administrative metadata.

      @param uid (string) UID of the resource.
      @param ver_uid (string) Optional version UID. If set, the snapshot
      built by `snapshot_uid` is looked up instead.
      @param strict (boolean) If True, the resource status is checked with
      `_check_rsrc_status`.

      @return rdflib.resource.Resource
      '''
      if ver_uid:
          uid = self.snapshot_uid(uid, ver_uid)

      rsrc_uri = nsc['fcres'][uid]
      admin_gr = self.ds.graph(nsc['fcadmin'][uid]) | Graph()

      if len(admin_gr):
          gr = admin_gr
      else:
          # Nothing in the admin graph: fall back to the pairtree graph.
          try:
              gr = self.ds.graph(PTREE_GR_URI).query(
                  'CONSTRUCT WHERE {?s ?p ?o}',
                  initBindings={'s': rsrc_uri}).graph
          except ResultException:
              gr = Graph()

      rsrc = Resource(gr, rsrc_uri)
      if strict:
          self._check_rsrc_status(rsrc)

      return rsrc
  def get_version_info(self, uid, strict=True):
      '''
      Retrieve all the metadata about the versions of a resource.

      @param uid (string) UID of the resource.
      @param strict (boolean) If True, the resource status is checked with
      `_check_rsrc_status`.

      @return rdflib.resource.Resource
      '''
      # @NOTE This pretty much bends the ontology: the graph URI is replaced
      # with the subject URI. But the concepts of data and metadata in Fedora
      # are quite fluid anyways...
      qry = '''
      CONSTRUCT {
        ?s fcrepo:hasVersion ?v .
        ?v ?p ?o .
      } {
        GRAPH ?ag {
          ?s fcrepo:hasVersion ?v .
        }
        GRAPH ?hg {
          ?vm foaf:primaryTopic ?v .
          ?vm ?p ?o .
          FILTER (?o != ?v)
        }
      }'''

      rsrc_uri = nsc['fcres'][uid]
      bindings = {
          'ag': nsc['fcadmin'][uid],
          'hg': HIST_GR_URI,
          's': rsrc_uri,
      }
      gr = self._parse_construct(qry, init_bindings=bindings)

      rsrc = Resource(gr, rsrc_uri)
      if strict:
          self._check_rsrc_status(rsrc)

      return rsrc
  def get_inbound_rel(self, uri):
      '''
      Query inbound relationships for a subject.

      Only triples stored in non-historic graphs are returned, i.e. graphs
      that the current metadata graph describes as having the triple's
      subject as their primary topic.

      @param uri (rdflib.term.URIRef) URI of the resource that is the object
      of the relationships.

      @return rdflib.Graph
      '''
      qry = '''
      CONSTRUCT { ?s1 ?p1 ?s }
      WHERE {
        GRAPH ?g {
          ?s1 ?p1 ?s .
        }
        GRAPH ?mg {
          ?g foaf:primaryTopic ?s1 .
        }
      }
      '''

      # ?mg must be bound to the current metadata graph: if left unbound it
      # also matches the historic metadata graph, bringing version snapshots
      # into the result.
      return self._parse_construct(
          qry, init_bindings={'s': uri, 'mg': META_GR_URI})
  def patch_rsrc(self, uid, qry):
      '''
      Apply SPARQL-Update statements to a resource.

      The update runs against the user-provided graph only, so that its
      scope stays limited to the resource.

      @param uid (string) UID of the resource to be patched.
      @param qry (dict) Parsed and translated query, or query string.

      @return The result of the graph `update` call.
      '''
      user_gr_uri = nsc['fcmain'][uid]
      user_gr = self.ds.graph(user_gr_uri)

      self._logger.debug('Updating graph {} with statements: {}'.format(
          user_gr_uri, qry))

      return user_gr.update(qry)
  def purge_rsrc(self, uid, inbound=True, backup_uid=None):
      '''
      Completely delete a resource and (optionally) its references.

      @param uid (string) UID of the resource to purge.
      @param inbound (boolean) If True, triples found in current
      (non-historic) graphs and pointing to any subject of the resource's
      user graph are deleted too.
      @param backup_uid Not used by this method.
      '''
      if inbound:
          # This has to run BEFORE the resource graphs are dropped: the
          # subjects are read from the user graph, which the purge below
          # deletes. Run afterwards, this query could never match.
          #
          # Gather ALL subjects in the user graph. There may be fragments.
          # Do not delete inbound references from historic graphs.
          inbound_qry = '''
          DELETE {{ GRAPH ?ibg {{ ?ibs ?ibp ?s . }} }}
          WHERE {{
            GRAPH {ug} {{ ?s ?p ?o . }}
            GRAPH ?ibg {{ ?ibs ?ibp ?s . }}
            GRAPH {mg} {{ ?ibg foaf:primaryTopic ?ibs . }}
          }}'''.format(
              mg=META_GR_URI.n3(),
              ug=nsc['fcmain'][uid].n3())

          self.ds.update(inbound_qry)

      # First statement: version snapshots and their historic metadata.
      # Second statement: current graphs and their metadata.
      qry = '''
      DELETE WHERE {{
        GRAPH ?g {{ {rsrc} fcrepo:hasVersion ?v . }}
        GRAPH {histmeta} {{
          ?vg foaf:primaryTopic ?v ;
            ?gp ?go .
        }}
        GRAPH ?vg {{ ?vs ?vp ?vo }}
      }}
      ;
      DELETE WHERE {{
        GRAPH {meta} {{
          ?g foaf:primaryTopic {rsrc} ;
            ?gp ?go .
        }}
        GRAPH ?g {{ ?s ?p ?o . }}
      }}
      '''.format(rsrc=nsc['fcres'][uid].n3(),
          meta=META_GR_URI.n3(), histmeta=HIST_GR_URI.n3())

      self.ds.update(qry)
  def create_or_replace_rsrc(self, uid, trp):
      '''
      Create a resource, overwriting any existing one with the same UID.

      The existing graphs of the resource are dropped before the new triples
      are added.

      @param uid (string) UID of the resource.
      @param trp Triples making up the new resource.

      @return The result of `modify_rsrc`.
      '''
      # Start from a clean slate.
      self.delete_rsrc_data(uid)

      result = self.modify_rsrc(uid, add_trp=trp)
      return result
  def modify_rsrc(self, uid, remove_trp=None, add_trp=None):
      '''
      Modify triples about a subject.

      This method adds and removes triple sets from specific graphs,
      indicated by the term router. It also adds metadata about the changed
      graphs.

      @param uid (string) UID of the resource. If it contains the version
      container label, metadata go to the historic metadata graph.
      @param remove_trp Iterable of triples to remove. None (the default)
      means no triples.
      @param add_trp Iterable of triples to add. None (the default) means
      no triples.
      '''
      # None is used as default in place of a shared mutable set.
      if remove_trp is None:
          remove_trp = ()
      if add_trp is None:
          add_trp = ()

      remove_routes = defaultdict(set)
      add_routes = defaultdict(set)
      historic = VERS_CONT_LABEL in uid
      graph_types = set() # Graphs that need RDF type metadata added.

      # Create add and remove sets for each graph.
      for trp_set, routes in (
              (remove_trp, remove_routes), (add_trp, add_routes)):
          for t in trp_set:
              map_graph = self._map_graph_uri(t, uid)
              routes[map_graph[0]].add(t)
              graph_types.add(map_graph)

      # Decide if metadata go into historic or current graph.
      meta_gr_uri = HIST_GR_URI if historic else META_GR_URI
      meta_gr = self.ds.graph(meta_gr_uri)

      rsrc_uri = nsc['fcres'][uid]
      if historic:
          # The version label is the same for all graphs: compute it once.
          # @FIXME Ugly reverse engineering.
          ver_label = Literal(uid.split(VERS_CONT_LABEL)[1].lstrip('/'))

      # Remove and add triple sets from each graph.
      for gr_uri, trp in remove_routes.items():
          gr = self.ds.graph(gr_uri)
          gr -= trp
      for gr_uri, trp in add_routes.items():
          gr = self.ds.graph(gr_uri)
          gr += trp
          # Add metadata.
          meta_gr.set((gr_uri, nsc['foaf'].primaryTopic, rsrc_uri))
          meta_gr.set((gr_uri, nsc['fcrepo'].created, g.timestamp_term))
          if historic:
              meta_gr.set((gr_uri, nsc['fcrepo'].hasVersionLabel, ver_label))
          # @TODO More provenance metadata can be added here.

      # Add graph RDF types.
      for gr_uri, gr_type in graph_types:
          meta_gr.add((gr_uri, RDF.type, gr_type))
  def delete_rsrc_data(self, uid):
      '''
      Drop the graphs of a resource, one for each prefix in `_graph_uids`.

      @param uid (string) UID of the resource.
      '''
      for pfx in self._graph_uids:
          rsrc_gr = self.ds.graph(nsc[pfx][uid])
          self.ds.remove_graph(rsrc_gr)
  def snapshot_uid(self, uid, ver_uid):
      '''
      Build a versioned UID string out of a main UID and a version UID.

      @param uid (string) UID of the main resource.
      @param ver_uid (string) UID of the version.

      @return string in the form `<uid>/fcr:versions/<ver_uid>`.

      @raise InvalidResourceError if `uid` already contains the version
      container label.
      '''
      if VERS_CONT_LABEL not in uid:
          return '{}/{}/{}'.format(uid, VERS_CONT_LABEL, ver_uid)

      raise InvalidResourceError(
          uid, 'Resource \'{}\' is already a version.')
  def add_path_segment(self, uid, next_uid, parent_uid, child_uid):
      '''
      Store a pairtree segment in the pairtree graph.

      @param uid (string) The UID of the subject.
      @param next_uid (string) UID of the next step down. This may be an LDP
      resource or another segment.
      @param parent_uid (string) UID of the actual resource(s) that contains
      the segment.
      @param child_uid (string) UID of the LDP resource contained by the
      segment.
      '''
      ptree_gr = self.ds.graph(PTREE_GR_URI)
      segment_uri = nsc['fcres'][uid]

      ptree_gr.add((segment_uri, RDF.type, nsc['fcsystem'].PathSegment))
      ptree_gr.add(
          (segment_uri, nsc['fcsystem'].contains, nsc['fcres'][next_uid]))
      ptree_gr.add(
          (segment_uri, nsc['ldp'].contains, nsc['fcres'][child_uid]))
      #ptree_gr.add((segment_uri, RDF.type, nsc['ldp'].Container))
      #ptree_gr.add((segment_uri, RDF.type, nsc['ldp'].BasicContainer))
      #ptree_gr.add((segment_uri, RDF.type, nsc['ldp'].RDFSource))
      #ptree_gr.add((segment_uri, RDF.type, nsc['fcrepo'].Pairtree))
      ptree_gr.add(
          (segment_uri, nsc['fcrepo'].hasParent, nsc['fcres'][parent_uid]))
  def delete_path_segment(self, uid):
      '''
      Delete a pairtree segment.

      All the triples having the segment as their subject are removed from
      the pairtree graph.

      @param uid (string) UID of the segment.
      '''
      # rdflib graphs expose `remove`, not `delete`; None is a wildcard.
      self.ds.graph(PTREE_GR_URI).remove((nsc['fcres'][uid], None, None))
  def clear_smt(self, uid):
      '''
      Strip server-managed triples from the user graph of a resource.

      This is an ugly way to deal with lenient SPARQL update statements
      that may insert server-managed triples into a user graph.

      @TODO Deprecate when a solution to provide a sanitized SPARQL update
      string is found.

      @param uid (string) UID of the resource.
      '''
      user_gr = self.ds.graph(nsc['fcmain'][uid])

      # Server-managed predicates first, then server-managed RDF types.
      for pred in srv_mgd_predicates:
          user_gr.remove((None, pred, None))
      for rdf_type in srv_mgd_types:
          user_gr.remove((None, RDF.type, rdf_type))
  442. ## PROTECTED MEMBERS ##
  def _check_rsrc_status(self, rsrc):
      '''
      Raise if a resource does not exist or is a tombstone.

      @param rsrc (rdflib.resource.Resource) The resource to verify.

      @raise ResourceNotExistsError if the resource graph is empty.
      @raise TombstoneError if the resource is typed as a tombstone or
      points to one via `fcsystem:tombstone`.
      '''
      uid = g.tbox.uri_to_uuid(rsrc.identifier)

      if not len(rsrc.graph):
          raise ResourceNotExistsError(uid)

      # The resource is a tombstone itself.
      if rsrc[RDF.type : nsc['fcsystem'].Tombstone]:
          raise TombstoneError(
              uid, rsrc.value(nsc['fcrepo'].created))

      # The resource points to a tombstone.
      tstone_ptr = rsrc.value(nsc['fcsystem'].tombstone)
      if tstone_ptr:
          raise TombstoneError(
              g.tbox.uri_to_uuid(tstone_ptr.identifier),
              rsrc.value(nsc['fcrepo'].created))
  def _parse_construct(self, qry, init_bindings=None):
      '''
      Run a CONSTRUCT query and return a Graph.

      @param qry (string) The CONSTRUCT query.
      @param init_bindings (dict) Initial variable bindings passed to the
      query. None (the default) means no bindings.

      @return rdflib.Graph An empty graph is returned if the query raises
      a `ResultException`.
      '''
      # None is used as default in place of a shared mutable dict.
      if init_bindings is None:
          init_bindings = {}

      try:
          qres = self.ds.query(qry, initBindings=init_bindings)
      except ResultException:
          # RDFlib bug: https://github.com/RDFLib/rdflib/issues/775
          return Graph()
      else:
          return qres.graph
  def _map_graph_uri(self, t, uid):
      '''
      Find the graph a triple belongs to.

      The predicate is looked up first; for `rdf:type` triples the object is
      looked up next. Unmapped triples go to the `fcmain` graph.

      @param t (tuple) The triple to route.
      @param uid (string) UID of the resource the triple belongs to.

      @return Tuple with a graph URI and an associated RDF type.
      '''
      routes = self.attr_routes
      pred = t[1]

      if pred in routes['p']:
          pfx = routes['p'][pred]
      elif pred == RDF.type and t[2] in routes['t']:
          pfx = routes['t'][t[2]]
      else:
          pfx = nsc['fcmain']

      return (pfx[uid], self.graph_ns_types[pfx])