admin.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. import logging
  2. from contextlib import ExitStack
  3. from shutil import rmtree
  4. import lmdb
  5. import requests
  6. from rdflib import Graph, URIRef
  7. from lakesuperior.dictionaries.namespaces import ns_collection as nsc
  8. from lakesuperior.env import env
  9. from lakesuperior.store.ldp_rs.lmdb_store import TxnManager
__doc__ = '''
Admin API.

This module contains maintenance utilities and stats.
'''

# Module-level logger, following the project-wide logging convention.
logger = logging.getLogger(__name__)

# Shortcut to the application-wide globals registered on the environment.
app_globals = env.app_globals
  16. def stats():
  17. '''
  18. Get repository statistics.
  19. @return dict Store statistics, resource statistics.
  20. '''
  21. repo_stats = {'rsrc_stats': env.app_globals.rdfly.count_rsrc()}
  22. with TxnManager(env.app_globals.rdf_store) as txn:
  23. repo_stats['store_stats'] = env.app_globals.rdf_store.stats()
  24. return repo_stats
  25. def dump(
  26. src, dest, start=('/',), binary_handling='include',
  27. compact_uris=False):
  28. '''
  29. Dump a whole LDP repository or parts of it to disk.
  30. @param src (rdflib.term.URIRef) Webroot of source repository. This must
  31. correspond to the LDP root node (for Fedora it can be e.g.
  32. `http://localhost:8080fcrepo/rest/`) and is used to determine if URIs
  33. retrieved are managed by this repository.
  34. @param dest (rdflib.URIRef) Base URI of the destination. This can be any
  35. container in a LAKEsuperior server. If the resource exists, it must be an
  36. LDP container. If it does not exist, it will be created.
  37. @param start (tuple|list) List of starting points to retrieve resources
  38. from. It would typically be the repository root in case of a full dump
  39. or one or more resources in the repository for a partial one.
  40. @param binary_handling (string) One of 'include', 'truncate' or 'split'.
  41. @param compact_uris (bool) NOT IMPLEMENTED. Whether the process should
  42. attempt to compact URIs generated with broken up path segments. If the UID
  43. matches a pattern such as `/12/34/56/123456...` it is converted to
  44. `/123456...`. This would remove a lot of cruft caused by the pairtree
  45. segments. Note that this will change the publicly exposed URIs. If
  46. durability is a concern, a rewrite directive can be added to the HTTP
  47. server that proxies the WSGI endpoint.
  48. '''
  49. # 1. Retrieve list of resources.
  50. if not isinstance(start, list) and not isinstance(start, tuple):
  51. start = (start,)
  52. _gather_resources(src, start)
  53. def _gather_resources(webroot, start_pts):
  54. '''
  55. Gather all resources recursively and save them to temporary store.
  56. Resource UIDs (without the repository webroot) are saved as unique keys
  57. in a temporary store.
  58. @param webroot (string) Base URI of the repository.
  59. @param start_pts (tuple|list) Starting points to gather.
  60. '''
  61. dbpath = '/var/tmp/fcrepo_migration_data'
  62. rmtree(dbpath, ignore_errors=True)
  63. with lmdb.open(
  64. dbpath, 1024 ** 4, metasync=False, readahead=False,
  65. meminit=False) as db:
  66. #import pdb; pdb.set_trace()
  67. for start in start_pts:
  68. if not start.startswith('/'):
  69. raise ValueError(
  70. 'Starting point {} does not begin with a slash.'
  71. .format(start))
  72. _gather_refs(db, webroot, start)
  73. def _gather_refs(db, base, path):
  74. '''
  75. Get the UID of a resource and its relationships recursively.
  76. This method recurses into itself each time a reference to a resource
  77. managed by the repository is encountered.
  78. @param base (string) Base URL of repository. This is used to determine
  79. whether encountered URI terms are repository-managed.
  80. @param base (string) Path, relative to base URL, of the resource to gather.
  81. '''
  82. pfx = base.rstrip('/')
  83. # Public URI of source repo.
  84. uri = pfx + path
  85. # Internal URI of destination.
  86. iuri = uri.replace(pfx, nsc['fcres'])
  87. rsp = requests.head(uri)
  88. rsp.raise_for_status()
  89. # Determine LDP type.
  90. ldp_type = 'ldp_nr'
  91. for link in requests.utils.parse_header_links(rsp.headers.get('link')):
  92. if (
  93. link.get('rel') == 'type'
  94. and link.get('url') == str(nsc['ldp'].RDFSource)):
  95. ldp_type = 'ldp_rs'
  96. break
  97. if ldp_type == 'ldp_rs':
  98. # Get the whole RDF document now because we have to know all outbound
  99. # links.
  100. get_uri = uri
  101. else:
  102. get_uri = uri + '/fcr:metadata'
  103. get_req = requests.get(get_uri)
  104. get_req.raise_for_status()
  105. data = get_req.content
  106. gr = Graph(identifier=iuri).parse(data=data, format='turtle')
  107. # First store the resource, so when we recurse, a resource referring back
  108. # to this resource will skip it as already existing and avoid an infinite
  109. # loop.
  110. #
  111. # The RDF data stream inserted is the turtle-serialized bytestring as it
  112. # comes from the request.
  113. with db.begin(write=True) as txn:
  114. with txn.cursor() as cur:
  115. if not cur.set_key(iuri.encode('utf-8')):
  116. cur.put(uri.encode('utf-8'), data)
  117. # Now, crawl through outbound links.
  118. # LDP-NR fcr:metadata must be checked too.
  119. for pred, obj in gr.predicate_objects():
  120. if (
  121. isinstance(obj, URIRef)
  122. and obj.startswith(uri)
  123. and pred != nsc['fcrepo'].hasParent):
  124. with db.begin() as txn:
  125. with txn.cursor() as cur:
  126. # Avoid ∞
  127. if cur.set_key(obj.encode('utf-8')):
  128. #import pdb; pdb.set_trace()
  129. continue
  130. _gather_refs(db, base, obj.replace(base, ''))