# rsrc_centric_layout.py
  1. import logging
  2. from collections import defaultdict
  3. from itertools import chain
  4. from rdflib import Dataset, Graph, Literal, URIRef, plugin
  5. from rdflib.namespace import RDF
  6. from rdflib.query import ResultException
  7. from rdflib.resource import Resource
  8. from rdflib.store import Store
  9. from lakesuperior.dictionaries.namespaces import ns_collection as nsc
  10. from lakesuperior.dictionaries.namespaces import ns_mgr as nsm
  11. from lakesuperior.dictionaries.srv_mgd_terms import srv_mgd_subjects, \
  12. srv_mgd_predicates, srv_mgd_types
  13. from lakesuperior.exceptions import (InvalidResourceError,
  14. ResourceNotExistsError, TombstoneError, PathSegmentError)
  15. from lakesuperior.env import env
  16. from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
  17. META_GR_URI = nsc['fcsystem']['meta']
  18. HIST_GR_URI = nsc['fcsystem']['histmeta']
  19. PTREE_GR_URI = nsc['fcsystem']['pairtree']
  20. VERS_CONT_LABEL = 'fcr:versions'
  21. Lmdb = plugin.register('Lmdb', Store,
  22. 'lakesuperior.store.ldp_rs.lmdb_store', 'LmdbStore')
  23. logger = logging.getLogger(__name__)
  24. class RsrcCentricLayout:
  25. '''
  26. This class exposes an interface to build graph store layouts. It also
  27. provides the basics of the triplestore connection.
  28. Some store layouts are provided. New ones aimed at specific uses
  29. and optimizations of the repository may be developed by extending this
  30. class and implementing all its abstract methods.
  31. A layout is implemented via application configuration. However, once
  32. contents are ingested in a repository, changing a layout will most likely
  33. require a migration.
  34. The custom layout must be in the lakesuperior.store.rdf
  35. package and the class implementing the layout must be called
  36. `StoreLayout`. The module name is the one defined in the app
  37. configuration.
  38. E.g. if the configuration indicates `simple_layout` the application will
  39. look for
  40. `lakesuperior.store.rdf.simple_layout.SimpleLayout`.
  41. '''
  42. _graph_uids = ('fcadmin', 'fcmain', 'fcstruct')
  43. # @TODO Move to a config file?
  44. attr_map = {
  45. nsc['fcadmin']: {
  46. # List of server-managed predicates. Triples bearing one of these
  47. # predicates will go in the metadata graph.
  48. 'p': {
  49. nsc['ebucore'].hasMimeType,
  50. nsc['fcrepo'].created,
  51. nsc['fcrepo'].createdBy,
  52. nsc['fcrepo'].hasParent,
  53. nsc['fcrepo'].hasVersion,
  54. nsc['fcrepo'].lastModified,
  55. nsc['fcrepo'].lastModifiedBy,
  56. nsc['fcsystem'].tombstone,
  57. # The following 3 are set by the user but still in this group
  58. # for convenience.
  59. nsc['ldp'].membershipResource,
  60. nsc['ldp'].hasMemberRelation,
  61. nsc['ldp'].insertedContentRelation,
  62. nsc['iana'].describedBy,
  63. nsc['premis'].hasMessageDigest,
  64. nsc['premis'].hasSize,
  65. },
  66. # List of metadata RDF types. Triples bearing one of these types in
  67. # the object will go in the metadata graph.
  68. 't': {
  69. nsc['fcrepo'].Binary,
  70. nsc['fcrepo'].Container,
  71. nsc['fcrepo'].Pairtree,
  72. nsc['fcrepo'].Resource,
  73. nsc['fcsystem'].Tombstone,
  74. nsc['ldp'].BasicContainer,
  75. nsc['ldp'].Container,
  76. nsc['ldp'].DirectContainer,
  77. nsc['ldp'].IndirectContainer,
  78. nsc['ldp'].NonRDFSource,
  79. nsc['ldp'].RDFSource,
  80. nsc['ldp'].Resource,
  81. },
  82. },
  83. nsc['fcstruct']: {
  84. # These are placed in a separate graph for optimization purposes.
  85. 'p': {
  86. nsc['ldp'].contains,
  87. nsc['pcdm'].hasMember,
  88. }
  89. },
  90. }
  91. # RDF types of graphs by prefix.
  92. graph_ns_types = {
  93. nsc['fcadmin']: nsc['fcsystem'].AdminGraph,
  94. nsc['fcmain']: nsc['fcsystem'].UserProvidedGraph,
  95. nsc['fcstruct']: nsc['fcsystem'].StructureGraph,
  96. }
  97. ## MAGIC METHODS ##
  98. def __init__(self, config):
  99. '''Initialize the graph store and a layout.
  100. NOTE: `rdflib.Dataset` requires a RDF 1.1 compliant store with support
  101. for Graph Store HTTP protocol
  102. (https://www.w3.org/TR/sparql11-http-rdf-update/). Blazegraph supports
  103. this only in the (currently unreleased) 2.2 branch. It works with Jena,
  104. which is currently the reference implementation.
  105. '''
  106. self.config = config
  107. self.store = plugin.get('Lmdb', Store)(config['location'])
  108. self.ds = Dataset(self.store, default_union=True)
  109. self.ds.namespace_manager = nsm
  110. @property
  111. def attr_routes(self):
  112. '''
  113. This is a map that allows specific triples to go to certain graphs.
  114. It is a machine-friendly version of the static attribute `attr_map`
  115. which is formatted for human readability and to avoid repetition.
  116. The attributes not mapped here (usually user-provided triples with no
  117. special meaning to the application) go to the `fcmain:` graph.
  118. The output of this is a dict with a similar structure:
  119. {
  120. 'p': {
  121. <Predicate P1>: <destination graph G1>,
  122. <Predicate P2>: <destination graph G1>,
  123. <Predicate P3>: <destination graph G1>,
  124. <Predicate P4>: <destination graph G2>,
  125. [...]
  126. },
  127. 't': {
  128. <RDF Type T1>: <destination graph G1>,
  129. <RDF Type T2>: <destination graph G3>,
  130. [...]
  131. }
  132. }
  133. '''
  134. if not hasattr(self, '_attr_routes'):
  135. self._attr_routes = {'p': {}, 't': {}}
  136. for dest in self.attr_map.keys():
  137. for term_k, terms in self.attr_map[dest].items():
  138. self._attr_routes[term_k].update(
  139. {term: dest for term in terms})
  140. return self._attr_routes
  141. def bootstrap(self):
  142. '''
  143. Delete all graphs and insert the basic triples.
  144. '''
  145. logger.info('Deleting all data from the graph store.')
  146. store = self.ds.store
  147. if getattr(store, 'is_txn_open', False):
  148. store.rollback()
  149. store.destroy(store.path)
  150. logger.info('Initializing the graph store with system data.')
  151. store.open()
  152. with TxnManager(store, True):
  153. with open('data/bootstrap/rsrc_centric_layout.sparql', 'r') as f:
  154. self.ds.update(f.read())
  155. def get_raw(self, uri, ctx=None):
  156. '''
  157. Get a raw graph of a non-LDP resource.
  158. The graph is queried across all contexts or within a specific one.
  159. @param s(rdflib.term.URIRef) URI of the subject.
  160. @param ctx (rdflib.term.URIRef) URI of the optional context. If None,
  161. all named graphs are queried.
  162. return rdflib.Graph
  163. '''
  164. return self.store.triples((nsc['fcres'][uid], None, None), ctx)
  165. def count_rsrc(self):
  166. '''
  167. Return a count of first-class resources, subdivided in "live" and
  168. historic snapshots.
  169. '''
  170. with TxnManager(self.ds.store) as txn:
  171. main = set(
  172. self.ds.graph(META_GR_URI)[ : nsc['foaf'].primaryTopic : ])
  173. hist = set(
  174. self.ds.graph(HIST_GR_URI)[ : nsc['foaf'].primaryTopic : ])
  175. return {'main': len(main), 'hist': len(hist)}
  176. def raw_query(self, qry_str):
  177. '''
  178. Perform a straight query to the graph store.
  179. '''
  180. return self.ds.query(qry_str)
  181. def extract_imr(
  182. self, uid, ver_uid=None, strict=True, incl_inbound=False,
  183. incl_children=True, embed_children=False, **kwargs):
  184. '''
  185. See base_rdf_layout.extract_imr.
  186. '''
  187. if ver_uid:
  188. uid = self.snapshot_uid(uid, ver_uid)
  189. graphs = {pfx[uid] for pfx in self.graph_ns_types.keys()}
  190. # Exclude children: remove containment graphs.
  191. if not incl_children:
  192. graphs.remove(nsc['fcstruct'][uid])
  193. rsrc_graphs = [
  194. self.ds.graph(gr)
  195. for gr in graphs]
  196. resultset = set(chain.from_iterable(rsrc_graphs))
  197. gr = Graph()
  198. gr += resultset
  199. # Include inbound relationships.
  200. if incl_inbound and len(gr):
  201. gr += self.get_inbound_rel(nsc['fcres'][uid])
  202. #logger.debug('Found resource: {}'.format(
  203. # gr.serialize(format='turtle').decode('utf-8')))
  204. rsrc = Resource(gr, nsc['fcres'][uid])
  205. if strict:
  206. self._check_rsrc_status(rsrc)
  207. return rsrc
  208. def ask_rsrc_exists(self, uid):
  209. '''
  210. See base_rdf_layout.ask_rsrc_exists.
  211. '''
  212. logger.debug('Checking if resource exists: {}'.format(uid))
  213. meta_gr = self.ds.graph(nsc['fcadmin'][uid])
  214. return bool(
  215. meta_gr[nsc['fcres'][uid] : RDF.type : nsc['fcrepo'].Resource])
  216. def get_metadata(self, uid, ver_uid=None, strict=True):
  217. '''
  218. This is an optimized query to get only the administrative metadata.
  219. '''
  220. logger.debug('Getting metadata for: {}'.format(uid))
  221. if ver_uid:
  222. uid = self.snapshot_uid(uid, ver_uid)
  223. gr = self.ds.graph(nsc['fcadmin'][uid]) | Graph()
  224. uri = nsc['fcres'][uid]
  225. rsrc = Resource(gr, uri)
  226. if strict:
  227. self._check_rsrc_status(rsrc)
  228. return rsrc
  229. def get_user_data(self, uid):
  230. '''
  231. Get all the user-provided data.
  232. @param uid (string) Resource UID.
  233. '''
  234. # @TODO This only works as long as there is only one user-provided
  235. # graph. If multiple user-provided graphs will be supported, this
  236. # should use another query to get all of them.
  237. userdata_gr = self.ds.graph(nsc['fcmain'][uid])
  238. return userdata_gr | Graph()
  239. def get_version_info(self, uid, strict=True):
  240. '''
  241. Get all metadata about a resource's versions.
  242. '''
  243. # @NOTE This pretty much bends the ontology—it replaces the graph URI
  244. # with the subject URI. But the concepts of data and metadata in Fedora
  245. # are quite fluid anyways...
  246. # WIP—Is it worth to replace SPARQL here?
  247. #versions = self.ds.graph(nsc['fcadmin'][uid]).triples(
  248. # (nsc['fcres'][uid], nsc['fcrepo'].hasVersion, None))
  249. #for version in versions:
  250. # version_meta = self.ds.graph(HIST_GRAPH_URI).triples(
  251. qry = '''
  252. CONSTRUCT {
  253. ?s fcrepo:hasVersion ?v .
  254. ?v ?p ?o .
  255. } {
  256. GRAPH ?ag {
  257. ?s fcrepo:hasVersion ?v .
  258. }
  259. GRAPH ?hg {
  260. ?vm foaf:primaryTopic ?v .
  261. ?vm ?p ?o .
  262. FILTER (?o != ?v)
  263. }
  264. }'''
  265. gr = self._parse_construct(qry, init_bindings={
  266. 'ag': nsc['fcadmin'][uid],
  267. 'hg': HIST_GR_URI,
  268. 's': nsc['fcres'][uid]})
  269. rsrc = Resource(gr, nsc['fcres'][uid])
  270. # @TODO Should return a graph.
  271. if strict:
  272. self._check_rsrc_status(rsrc)
  273. return rsrc
  274. def get_inbound_rel(self, subj_uri, full_triple=True):
  275. '''
  276. Query inbound relationships for a subject.
  277. This can be a list of either complete triples, or of subjects referring
  278. to the given URI. It excludes historic version snapshots.
  279. @param subj_uri (rdflib.URIRef) Subject URI.
  280. @param full_triple (boolean) Whether to return the full triples found
  281. or only the subjects. By default, full triples are returned.
  282. @return iterator(tuple(rdflib.term.Identifier) | rdflib.URIRef)
  283. Inbound triples or subjects.
  284. '''
  285. # Only return non-historic graphs.
  286. meta_gr = self.ds.graph(META_GR_URI)
  287. ptopic_uri = nsc['foaf'].primaryTopic
  288. yield from (
  289. (match[:3] if full_triple else match[0])
  290. for match in self.ds.quads((None, None, subj_uri, None))
  291. if set(meta_gr[ : ptopic_uri : match[0]])
  292. )
  293. def get_descendants(self, uid, recurse=True):
  294. '''
  295. Get descendants (recursive children) of a resource.
  296. @param uid (string) Resource UID.
  297. result set.
  298. @return iterator(rdflib.URIRef) Subjects of descendant resources.
  299. '''
  300. ds = self.ds
  301. subj_uri = nsc['fcres'][uid]
  302. ctx_uri = nsc['fcstruct'][uid]
  303. def _recurse(dset, s, p, c):
  304. new_dset = set(ds.graph(c)[s : p])
  305. for ss in new_dset:
  306. dset.add(ss)
  307. cc = URIRef(ss.replace(nsc['fcres'], nsc['fcstruct']))
  308. if set(ds.graph(cc)[ss : p]):
  309. _recurse(dset, ss, p, cc)
  310. return dset
  311. return (
  312. _recurse(set(), subj_uri, nsc['ldp'].contains, ctx_uri)
  313. if recurse
  314. else ds.graph(ctx_uri)[subj_uri : nsc['ldp'].contains : ])
  315. def patch_rsrc(self, uid, qry):
  316. '''
  317. Patch a resource with SPARQL-Update statements.
  318. The statement(s) is/are executed on the user-provided graph only
  319. to ensure that the scope is limited to the resource.
  320. @param uid (string) UID of the resource to be patched.
  321. @param qry (dict) Parsed and translated query, or query string.
  322. '''
  323. # Add meta graph for user-defined triples. This may not be used but
  324. # it's simple and harmless to add here.
  325. self.ds.graph(META_GR_URI).add(
  326. (nsc['fcmain'][uid], nsc['foaf'].primaryTopic,
  327. nsc['fcres'][uid]))
  328. gr = self.ds.graph(nsc['fcmain'][uid])
  329. logger.debug('Updating graph {} with statements: {}'.format(
  330. nsc['fcmain'][uid], qry))
  331. return gr.update(qry)
  332. def forget_rsrc(self, uid, inbound=True, children=True):
  333. '''
  334. Completely delete a resource and (optionally) its children and inbound
  335. references.
  336. NOTE: inbound references in historic versions are not affected.
  337. '''
  338. # Localize variables to be used in loops.
  339. uri = nsc['fcres'][uid]
  340. topic_uri = nsc['foaf'].primaryTopic
  341. uid_fn = self.uri_to_uid
  342. # remove children.
  343. if children:
  344. logger.debug('Purging children for /{}'.format(uid))
  345. for rsrc_uri in self.get_descendants(uid, False):
  346. self.forget_rsrc(uid_fn(rsrc_uri), inbound, False)
  347. # Remove structure graph.
  348. self.ds.remove_graph(nsc['fcstruct'][uid])
  349. # Remove inbound references.
  350. if inbound:
  351. for ibs in self.get_inbound_rel(uri):
  352. self.ds.remove(ibs)
  353. # Remove versions.
  354. for ver_uri in self.ds.graph(nsc['fcadmin'][uid])[
  355. uri : nsc['fcrepo'].hasVersion : None]:
  356. self.delete_rsrc(uid_fn(ver_uri), True)
  357. # Remove resource itself.
  358. self.delete_rsrc(uid)
  359. def truncate_rsrc(self, uid):
  360. '''
  361. Remove all user-provided data from a resource and only leave admin and
  362. structure data.
  363. '''
  364. userdata = set(self.get_user_data(uid))
  365. return self.modify_rsrc(uid, remove_trp=userdata)
  366. def modify_rsrc(self, uid, remove_trp=set(), add_trp=set()):
  367. '''
  368. Modify triples about a subject.
  369. This method adds and removes triple sets from specific graphs,
  370. indicated by the term router. It also adds metadata about the changed
  371. graphs.
  372. '''
  373. remove_routes = defaultdict(set)
  374. add_routes = defaultdict(set)
  375. historic = VERS_CONT_LABEL in uid
  376. graph_types = set() # Graphs that need RDF type metadata added.
  377. # Create add and remove sets for each graph.
  378. for t in remove_trp:
  379. map_graph = self._map_graph_uri(t, uid)
  380. target_gr_uri = map_graph[0]
  381. remove_routes[target_gr_uri].add(t)
  382. graph_types.add(map_graph)
  383. for t in add_trp:
  384. map_graph = self._map_graph_uri(t, uid)
  385. target_gr_uri = map_graph[0]
  386. add_routes[target_gr_uri].add(t)
  387. graph_types.add(map_graph)
  388. # Decide if metadata go into historic or current graph.
  389. meta_gr_uri = HIST_GR_URI if historic else META_GR_URI
  390. meta_gr = self.ds.graph(meta_gr_uri)
  391. # Remove and add triple sets from each graph.
  392. for gr_uri, trp in remove_routes.items():
  393. gr = self.ds.graph(gr_uri)
  394. gr -= trp
  395. for gr_uri, trp in add_routes.items():
  396. gr = self.ds.graph(gr_uri)
  397. gr += trp
  398. # Add metadata.
  399. meta_gr.set(
  400. (gr_uri, nsc['foaf'].primaryTopic, nsc['fcres'][uid]))
  401. meta_gr.set((gr_uri, nsc['fcrepo'].created, env.timestamp_term))
  402. if historic:
  403. # @FIXME Ugly reverse engineering.
  404. ver_uid = uid.split(VERS_CONT_LABEL)[1].lstrip('/')
  405. meta_gr.set((
  406. gr_uri, nsc['fcrepo'].hasVersionLabel, Literal(ver_uid)))
  407. # @TODO More provenance metadata can be added here.
  408. # Add graph RDF types.
  409. for gr_uri, gr_type in graph_types:
  410. meta_gr.add((gr_uri, RDF.type, gr_type))
  411. def delete_rsrc(self, uid, historic=False):
  412. '''
  413. Delete all aspect graphs of an individual resource.
  414. @param uid Resource UID.
  415. @param historic (bool) Whether the UID is of a historic version.
  416. '''
  417. meta_gr_uri = HIST_GR_URI if historic else META_GR_URI
  418. for gr_uri in self.ds.graph(meta_gr_uri)[
  419. : nsc['foaf'].primaryTopic : nsc['fcres'][uid]]:
  420. self.ds.remove_context(gr_uri)
  421. self.ds.graph(meta_gr_uri).remove((gr_uri, None, None))
  422. def snapshot_uid(self, uid, ver_uid):
  423. '''
  424. Create a versioned UID string from a main UID and a version UID.
  425. '''
  426. if VERS_CONT_LABEL in uid:
  427. raise InvalidResourceError(uid,
  428. 'Resource \'{}\' is already a version.')
  429. return '{}/{}/{}'.format(uid, VERS_CONT_LABEL, ver_uid)
  430. def clear_smt(self, uid):
  431. '''
  432. This is an ugly way to deal with lenient SPARQL update statements
  433. that may insert server-managed triples into a user graph.
  434. @TODO Deprecate when a solution to provide a sanitized SPARQL update
  435. sring is found.
  436. '''
  437. gr = self.ds.graph(nsc['fcmain'][uid])
  438. for p in srv_mgd_predicates:
  439. gr.remove((None, p, None))
  440. for t in srv_mgd_types:
  441. gr.remove((None, RDF.type, t))
  442. def uri_to_uid(self, uri):
  443. '''
  444. Convert an internal URI to a UID.
  445. '''
  446. return str(uri).replace(nsc['fcres'], '')
  447. ## PROTECTED MEMBERS ##
  448. def _check_rsrc_status(self, rsrc):
  449. '''
  450. Check if a resource is not existing or if it is a tombstone.
  451. '''
  452. uid = self.uri_to_uid(rsrc.identifier)
  453. if not len(rsrc.graph):
  454. raise ResourceNotExistsError(uid)
  455. # Check if resource is a tombstone.
  456. if rsrc[RDF.type : nsc['fcsystem'].Tombstone]:
  457. raise TombstoneError(
  458. uid, rsrc.value(nsc['fcrepo'].created))
  459. elif rsrc.value(nsc['fcsystem'].tombstone):
  460. raise TombstoneError(
  461. self.uri_to_uid(
  462. rsrc.value(nsc['fcsystem'].tombstone).identifier),
  463. rsrc.value(nsc['fcrepo'].created))
  464. def _parse_construct(self, qry, init_bindings={}):
  465. '''
  466. Parse a CONSTRUCT query and return a Graph.
  467. '''
  468. try:
  469. qres = self.ds.query(qry, initBindings=init_bindings)
  470. except ResultException:
  471. # RDFlib bug: https://github.com/RDFLib/rdflib/issues/775
  472. return Graph()
  473. else:
  474. return qres.graph
  475. def _map_graph_uri(self, t, uid):
  476. '''
  477. Map a triple to a namespace prefix corresponding to a graph.
  478. @return Tuple with a graph URI and an associated RDF type.
  479. '''
  480. if t[1] in self.attr_routes['p'].keys():
  481. pfx = self.attr_routes['p'][t[1]]
  482. elif t[1] == RDF.type and t[2] in self.attr_routes['t'].keys():
  483. pfx = self.attr_routes['t'][t[2]]
  484. else:
  485. pfx = nsc['fcmain']
  486. return (pfx[uid], self.graph_ns_types[pfx])