submission.lua 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. local io = io
  2. local csv = require "csv"
  3. local dir = require "pl.dir"
  4. local libmagic = require "libmagic"
  5. local plpath = require "pl.path"
  6. local pp = require "pl.pretty"
  7. local term = require "volksdata.term"
  8. local triple = require "volksdata.triple"
  9. local graph = require "volksdata.graph"
  10. local pkar = require "pocket_archive"
  11. local model = require "pocket_archive.model"
  12. local mc = require "pocket_archive.monocypher"
  13. local transformers = require "pocket_archive.transformers"
  14. local validator = require "pocket_archive.validator"
  15. local logger = pkar.logger
  16. local dbg = require "debugger"
  -- Empty sentinel table for chained lookups: `(t or NT).key` evaluates to nil
  -- instead of raising when `t` itself is missing.
  -- M is the Submission module table; it is populated below and returned at the
  -- end of the file.
  local NT, M = {}, {}
  -- Adapted from lua-núcleo.
  --- Escape a string so it can be embedded literally in a Lua pattern.
  -- Every pattern magic character is prefixed with "%", and a NUL byte is
  -- rewritten as the "%z" class. All other characters are left unchanged.
  -- @param s (string) text to escape.
  -- @return (string) the escaped text (gsub's substitution count is dropped).
  local function escape_pattern(s)
    local magic_chars = "^$()%.[]*+-?"
    local replacements = {}
    for idx = 1, #magic_chars do
      local ch = magic_chars:sub(idx, idx)
      replacements[ch] = "%" .. ch
    end
    replacements["\0"] = "%z"
    -- gsub keeps the original character whenever the table has no entry for it.
    local escaped = s:gsub(".", replacements)
    return escaped
  end
  39. --[[
  40. Only generate a thumbnail for pas:File types.
  41. Non-file resources may be assigned a thumbnail from a contained file
  42. or from a stock type icon in the metadata population phase.
  43. --]]
  44. --[=[
  45. local function generate_thumbnail(rsrc, sip_root, tn_dir)
  46. local mconf = model.models[rsrc["pas:contentType"]]
  47. if not mconf.types["pas:File"] then return end
  48. local txconf = (mconf.transformers or NT).thumbnail or {fn = "type_icon"}
  49. local src = plpath.join(sip_root, rsrc["pas:sourcePath"])
  50. local dest_fname = rsrc.id:gsub("^par:", "")
  51. local ext = txconf.ext or plpath.extension(src)
  52. local dest = plpath.join(tn_dir, dest_fname .. ext)
  53. assert(transformers[txconf.fn](
  54. src, dest, table.unpack(txconf or NT)))
  55. local deliverable = dest:gsub(pkar.config.htmlgen.out_dir, "..")
  56. logger:debug("thumbnail: ", dest)
  57. return dest
  58. end
  59. --]=]
  -- Open a libmagic handle with the MIME_TYPE and NO_CHECK_COMPRESS flags and
  -- load its database. Loading the module fails right here if the database
  -- cannot be loaded.
  local magic = libmagic.open(libmagic.MIME_TYPE, libmagic.NO_CHECK_COMPRESS)
  assert(magic:load())
  -- For idgen(). Builds a pool of 62 byte codes (0-9, A-Z, a-z), i.e. about
  -- 5.95 bits of entropy per character.
  local chpool = {}
  for _, range in ipairs({ {48, 57}, {65, 90}, {97, 122} }) do
    for code = range[1], range[2] do
      chpool[#chpool + 1] = code
    end
  end
  --[[
  Generate a random, reader-friendly ID.

  Characters are drawn uniformly from chpool (62 symbols: 0-9, A-Z, a-z).
  A 16-character ID therefore carries about 95.3 bits of entropy, which should
  be plenty for a small repository.

  @param len (number, optional) ID length; defaults to pkar.config.id.len.
  @return (string) the generated ID.

  NOTE(review): math.random is not seeded in this file; on Lua versions that do
  not seed it automatically the sequence repeats across runs - confirm that the
  host application seeds it.
  ]]
  M.idgen = function(len)
    local wanted = len or pkar.config.id.len
    local pool_size = #chpool
    local chars = {}
    for pos = 1, wanted do
      chars[pos] = string.char(chpool[math.random(1, pool_size)])
    end
    return table.concat(chars)
  end
  --[[ Build a SIP table from a CSV file.

  The CSV is read with its first row as header, so each row is a table keyed by
  column name. A row with a non-empty "pas:sourcePath" starts a new resource; a
  row with an empty one adds further values to the resource started before it.

  Fields listed in pkar.config.md.single_values are stored as plain values; all
  other fields are stored as sets ({[value] = true}). Empty cells are skipped.

  After all rows are read, "pas:next" and "pas:first" are inferred from the
  source paths and the row order, for content models that declare them.

  As a side effect, the directory "proc/tn" is created under the CSV's folder.

  @param path (string) path of the CSV file.
  @return (table) the SIP: `root_path` (folder of the CSV, with trailing
      slash) plus a sequence of resource tables, each carrying an `id`.
  @raise if the CSV cannot be opened, if the first data row has no path, or if
      a continuation row carries a value for a single-valued field.
  --]]
  M.generate_sip = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    -- A bare file name has no directory part; fall back to the current folder
    -- rather than leaving root_path nil.
    local sip = {root_path = path:match("(.*/)") or "./"}
    local tn_dir = plpath.join(sip.root_path, "proc", "tn")
    dir.makepath(tn_dir)

    local prev_path
    local i = 0
    local row_n = 2 -- Skipping header row.
    for row in sub_data:lines() do
      logger:debug("Row path: ", row["pas:sourcePath"])
      logger:debug("Parsing row:", pp.write(row))
      if row["pas:sourcePath"] ~= "" then
        -- New resource.
        i = i + 1
        logger:info(
            ("Processing LL resource #%d at row #%d.")
            :format(i, row_n))
        prev_path = row["pas:sourcePath"]

        -- An empty id cell is "" (truthy in Lua): treat it as "no id given".
        local id
        if row.id and row.id ~= "" then
          id = "par:" .. row.id
        else
          id = "par:" .. M.idgen()
        end
        -- The id is stored separately; keep it out of the attribute loop.
        row.id = nil

        local rsrc = {id = id}
        for k, v in pairs(row) do
          if v ~= "" then -- skip empty strings.
            if pkar.config.md.single_values[k] then
              rsrc[k] = v
            else
              rsrc[k] = {[v] = true} -- Multi-values are a set.
            end
          end
        end
        sip[i] = rsrc

        --[[
        -- Generate thumbnail for files.
        local rsrc_path = plpath.join(
            sip.root_path, sip[i]["pas:sourcePath"])
        if plpath.isfile(rsrc_path) then
          --require "debugger"()
          sip[i]["pas:thumbnail"] = generate_thumbnail(
              sip[i], sip.root_path, tn_dir)
        end
        --]]
      else
        -- Continuation of values from a previous row.
        if i < 1 then
          error("First row MUST have a path value.", 2)
        elseif not prev_path then
          error(("No path information at row %d"):format(row_n), 2)
        end
        local rsrc = sip[i]
        for k, v in pairs(row) do
          if v ~= "" then -- skip empty strings.
            if pkar.config.md.single_values[k] then
              -- It doesn't make much sense to overwrite, so refuse.
              error(
                  ("On CSV row #%d: field %s is single-valued.")
                  :format(row_n, k))
            end
            logger:debug("Value: ", v)
            logger:debug("Inserting at row ", i - 1)
            -- The first row may have left this column empty, in which case
            -- no set exists yet for the field.
            local values = rsrc[k]
            if values == nil then
              values = {}
              rsrc[k] = values
            end
            values[v] = true
          end
        end
      end
      row_n = row_n + 1
    end

    -- Infer structure from paths and row ordering.
    for idx, rsrc in ipairs(sip) do
      local rmod = model.parse_model(rsrc["pas:contentType"])

      -- pas:next: the first later resource located in the same folder.
      if rmod.properties["pas:next"] and not rsrc["pas:next"] then
        local folder = rsrc["pas:sourcePath"]:match("(.*/)")
        for j = idx + 1, #sip do
          if sip[j]["pas:sourcePath"]:match("(.*/)") == folder then
            rsrc["pas:next"] = sip[j].id
            break
          end
        end
      end

      -- pas:first: the first later resource whose path starts with this
      -- resource's path.
      if rmod.properties["pas:first"] and not rsrc["pas:first"] then
        local prefix = "^" .. escape_pattern(rsrc["pas:sourcePath"])
        for j = idx + 1, #sip do
          if sip[j]["pas:sourcePath"]:match(prefix) then
            rsrc["pas:first"] = sip[j].id
            break
          end
        end
      end
    end
    --require "debugger"()
    return sip
  end
  --[[ Convert a SIP resource table to an in-memory Volksdata graph.

  Every key of `rsrc` except `id` becomes a predicate; `id` becomes the subject.
  Values may be scalars or sets ({[value] = true}); scalars are treated as
  one-element sets. The object term depends on the property type declared in
  the resource's content model:
    - "resource":     namespaced IRI reference;
    - "ext_resource": IRI reference;
    - anything else:  literal, typed via pkar.config.md.datatypes when a
                      mapping exists for the property type.
  Values of "pas:contentType" are prefixed with "pas:". One rdf:type triple is
  added for each entry of the model lineage.

  @param rsrc (table) resource table with at least `id` and "pas:contentType".
  @return the new graph, and the subject term of the resource.
  --]]
  M.rsrc_to_graph = function(rsrc)
    local rmod = model.parse_model(rsrc["pas:contentType"])
    logger:info("Updating resource md: ", pp.write(rsrc))
    local s = term.new_iriref_ns(rsrc.id)
    -- Keep the list local: a bare assignment would create a global that
    -- outlives the call.
    local triples = {}

    for k, v in pairs(rsrc) do
      -- id is the subject, it won't be an attribute.
      if k ~= "id" then
        logger:debug(("Adding attribute: %s = %s"):format(k, pp.write(v)))
        local p = term.new_iriref_ns(k)
        local datatype = ((rmod.properties or NT)[k] or NT).type
        local rdf_type_str = pkar.config.md.datatypes[datatype]
        local rdf_type
        if rdf_type_str then
          rdf_type = term.new_iriref_ns(rdf_type_str).data
        end

        -- Force all fields to be multi-valued.
        local values = v
        if type(values) ~= "table" then values = {[values] = true} end

        for raw in pairs(values) do
          local val = raw
          if k == "pas:contentType" then
            val = "pas:" .. raw
          end
          local o
          if datatype == "resource" then
            o = term.new_iriref_ns(val)
          elseif datatype == "ext_resource" then
            o = term.new_iriref(val)
          else
            o = term.new_lit(val, rdf_type)
          end
          triples[#triples + 1] = triple.new(s, p, o)
        end
      end
    end

    for _, m in ipairs(rmod.lineage) do
      triples[#triples + 1] = triple.new(
          s, pkar.RDF_TYPE, term.new_iriref_ns(m))
    end

    local gr = graph.new(nil)
    -- This is a full replacement.
    --require "debugger"()
    logger:info("Removing triples.")
    gr:remove(s)
    logger:info("Adding triples.")
    -- TODO implement volksdata_lua fn to add a single triple and add triples
    -- in the previous loop.
    gr:add(triples)
    return gr, s
  end
  --[[ Validate a temporary resource graph and copy it into the store.

  The graph is validated for subject `s`. A report whose max_level is "ERROR"
  aborts with an error; "WARN" and "NOTICE" are both logged at warning level.
  The graph is then copied into a graph bound to pkar.store in the default
  context.

  @param tmp_gr graph holding the resource's triples.
  @param s subject term of the resource.
  @return whatever tmp_gr:copy() returns.
  --]]
  M.store_updates = function(tmp_gr, s)
    -- TODO use a transaction when volksdata_lua supports it.
    logger:debug("Graph: ", tmp_gr:encode("ttl"))

    local report = validator.validate(tmp_gr, s)
    local level = report.max_level
    if level == "ERROR" then
      error("Validation raised errors: " .. pp.write(report))
    end
    if level == "WARN" then
      logger:warn("Validation raised warnings: " .. pp.write(report))
    elseif level == "NOTICE" then
      logger:warn("Validation raised notices: " .. pp.write(report))
    end

    local stored_gr = graph.new(pkar.store, term.DEFAULT_CTX)
    return tmp_gr:copy(stored_gr)
  end
  --[[ Stream one source file into the object store and record its metadata.

  The file is copied in chunks to a temporary file under
  <ores_path>/tmp/ while its BLAKE2b digest is computed, then moved to
  <ores_path>/<digest 1-2>/<digest 3-4>/<digest 1-32><extension>.
  Sets "dc:format", "premis:hasMessageDigest", "dc:extent" and "pas:path" on
  `rsrc`; an existing "pas:thumbnail" file is moved next to the stored file.

  @param rsrc (table) resource table; modified in place.
  @param in_path (string) path of the source file.
  @raise on any open, read, write, close or move failure.
  --]]
  local function store_file(rsrc, in_path)
    local fext = plpath.extension(in_path)
    local tmp_dir = plpath.join(pkar.config.fs.ores_path, "tmp/")
    local tmp_path = tmp_dir .. rsrc.id .. fext
    dir.makepath(tmp_dir)

    -- Binary mode: the content is hashed and copied byte for byte.
    local ifh = assert(io.open(in_path, "rb"))
    rsrc["dc:format"] = {[magic:filehandle(ifh)] = true}
    local hash_it = mc.new_blake2b()
    local fsize = 0
    logger:debug("Hashing ", in_path)
    local ofh = assert(io.open(tmp_path, "wb"))
    while true do
      local chunk, read_err = ifh:read(pkar.config.fs.stream_chunk_size)
      if not chunk then
        -- nil without a message is end of file; with a message it is a
        -- read failure that must not be mistaken for a complete copy.
        if read_err then error(read_err, 0) end
        break
      end
      hash_it:update(chunk)
      assert(ofh:write(chunk))
      fsize = fsize + #chunk
    end
    local checksum = hash_it:final(true)
    rsrc["premis:hasMessageDigest"] = {
      ["urn:blake2:" .. checksum] = true}
    rsrc["dc:extent"] = fsize
    -- Buffered data is flushed on close, so a failure here is a write failure.
    assert(ofh:close())
    ifh:close()

    -- Move the copy to its checksum-derived location.
    local out_dir = plpath.join(
        pkar.config.fs.ores_path,
        checksum:sub(1, 2),
        checksum:sub(3, 4))
    local out_path = plpath.join(out_dir, checksum:sub(1, 32) .. fext)
    dir.makepath(out_dir)
    logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
    assert(dir.movefile(tmp_path, out_path))
    rsrc["pas:path"] = out_path

    -- Move thumbnail if existing.
    if rsrc["pas:thumbnail"] then
      local tn_src = rsrc["pas:thumbnail"]
      local tn_dest = plpath.join(out_dir, plpath.basename(tn_src))
      logger:debug(("Moving file %s to %s"):format(tn_src, tn_dest))
      assert(dir.movefile(tn_src, tn_dest))
      rsrc["pas:thumbnail"] = tn_dest
    end
  end

  --[[ Deposit every resource of a SIP.

  For each resource: if its source path (relative to sip.root_path) is a
  regular file, the file is stored and its technical metadata recorded;
  otherwise file processing is skipped. "dc:created" and "dc:modified" are set
  to the current UTC time, and the resource is converted to a graph and stored.
  Finally the "proc" directory under sip.root_path is removed.

  @param sip (table) SIP as produced by M.generate_sip.
  --]]
  M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
      -- TODO Wrap this chunk into a txn. Each row is atomic.
      logger:debug(("Processing resource #%d of %d: %s"):format(
          i, #sip, rsrc.id))
      local in_path = sip.root_path .. rsrc["pas:sourcePath"]

      -- If it's a directory, skip file processing.
      if plpath.isfile(in_path) then
        store_file(rsrc, in_path)
      end

      -- %H:%M:%S yields the same text as %T but is also valid in C89.
      local tstamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
      rsrc["dc:created"] = tstamp
      rsrc["dc:modified"] = tstamp
      M.store_updates(M.rsrc_to_graph(rsrc))
    end

    -- Remove processing directory.
    dir.rmtree(plpath.join(sip.root_path, "proc"))
  end
  298. return M