-- submission.lua (10 KB)
  1. local io = io
  2. local csv = require "ftcsv"
  3. local dir = require "pl.dir"
  4. local libmagic = require "libmagic"
  5. local plpath = require "pl.path"
  6. local pp = require "pl.pretty"
  7. local term = require "volksdata.term"
  8. local triple = require "volksdata.triple"
  9. local graph = require "volksdata.graph"
  10. local pkar = require "pocket_archive"
  11. local model = require "pocket_archive.model"
  12. local mc = require "pocket_archive.monocypher"
  13. local repo = require "pocket_archive.repo"
  14. local transformers = require "pocket_archive.transformers"
  15. local validator = require "pocket_archive.validator"
  16. local logger = pkar.logger
  17. local dbg = require "debugger"
  -- Shared empty table used as a stand-in for a missing table when chaining
  -- lookups, e.g. `(t.maybe_nil or NT).field`.
  local NT = {}

  -- Submission module table; returned at the end of the file.
  local M = {}
  -- Adapted from lua-núcleo.
  -- Escape a string so that it can be embedded in a Lua pattern and matched
  -- literally. Returns only the escaped string (the gsub count is dropped).
  local function escape_pattern(s)
    -- Replacement lookup: every magic character maps to itself prefixed by "%".
    local escapes = {}
    for magic_char in ("^$()%.[]*+-?"):gmatch(".") do
      escapes[magic_char] = "%" .. magic_char
    end
    -- A NUL byte is replaced by the "%z" class.
    escapes["\0"] = "%z"

    -- Characters without an entry in the lookup are kept as they are.
    local escaped = s:gsub(".", escapes)
    return escaped
  end
  40. --[[
  41. Only generate a thumbnail for pas:File types.
  42. Non-file resources may be assigned a thumbnail from a contained file
  43. or from a stock type icon in the metadata population phase.
  44. --]]
  45. --[=[
  46. local function generate_thumbnail(rsrc, sip_root, tn_dir)
  47. local mconf = model.models[rsrc["pas:contentType"]]
  48. if not mconf.types["pas:File"] then return end
  49. local txconf = (mconf.transformers or NT).thumbnail or {fn = "type_icon"}
  50. local src = plpath.join(sip_root, rsrc["pas:sourcePath"])
  51. local dest_fname = rsrc.id:gsub("^par:", "")
  52. local ext = txconf.ext or plpath.extension(src)
  53. local dest = plpath.join(tn_dir, dest_fname .. ext)
  54. assert(transformers[txconf.fn](
  55. src, dest, table.unpack(txconf or NT)))
  56. local deliverable = dest:gsub(pkar.config.htmlgen.out_dir, "..")
  57. logger:debug("thumbnail: ", dest)
  58. return dest
  59. end
  60. --]=]
  -- Open a libmagic handle configured with the MIME_TYPE and NO_CHECK_COMPRESS
  -- flags, then load its database; module loading fails if the load fails.
  local magic = libmagic.open(
    libmagic.MIME_TYPE,
    libmagic.NO_CHECK_COMPRESS
  )
  assert(magic:load())
  -- For idgen(). Builds a pool of 62 byte values (0-9, A-Z, a-z), which gives
  -- log2(62) ~= 5.95 bits of entropy per character.
  local chpool = {}
  for i = 48, 57 do chpool[#chpool + 1] = i end -- 0-9
  for i = 65, 90 do chpool[#chpool + 1] = i end -- A-Z
  for i = 97, 122 do chpool[#chpool + 1] = i end -- a-z

  --[[
  Generate a random, reader-friendly ID.

  A 16-character ID drawn from the 62 symbols in chpool has an entropy of
  about 95.3 bits, which should be plenty for a small repository.

  len: number of characters to generate; defaults to pkar.config.id.len.
  Returns the ID as a string.

  NOTE(review): the characters come from math.random. Before Lua 5.4 the
  generator is not seeded automatically, so the same ID sequence is produced
  on every run unless math.randomseed() is called somewhere else - confirm.
  ]]
  M.idgen = function(len)
    local charlist = {}
    for i = 1, (len or pkar.config.id.len) do
      charlist[i] = string.char(chpool[math.random(1, #chpool)])
    end

    return table.concat(charlist)
  end
  --[[ Build a SIP table from a CSV manifest.

  A row with a non-empty `pas:sourcePath` starts a new resource. A row with an
  empty (or missing) path is a continuation: its non-empty cells are added to
  the multi-valued fields of the resource started by the closest previous row
  that had a path.

  After parsing, `pas:next` and `pas:first` are inferred from paths and row
  order for resources whose model declares those properties and which do not
  already carry a value for them.

  path: location of the CSV file; everything up to its last "/" becomes
      sip.root_path ("./" when the path contains no "/").
  Returns a table with a `root_path` field and one resource table per integer
  index, in row order.
  Raises an error if the first row has no path, or if a continuation row
  carries a value for a single-valued field.
  --]]
  M.generate_sip = function(path)
    -- A bare file name has no directory part: fall back to the current dir
    -- instead of passing nil to plpath.join().
    local sip = {root_path = path:match("(.*/)") or "./"}
    local tn_dir = plpath.join(sip.root_path, "proc", "tn")
    dir.makepath(tn_dir)

    local single_values = pkar.config.md.single_values
    local i = 0 -- Index of the resource currently being populated.

    for row_n, row in csv.parseLine(path) do
      -- A missing column is treated like an empty cell.
      local src_path = row["pas:sourcePath"] or ""
      logger:debug("Row path: ", row["pas:sourcePath"])
      logger:debug("Parsing row:", pp.write(row))

      if #src_path > 0 then
        -- New resource.
        i = i + 1
        logger:info(
            ("Processing LL resource #%d at row #%d.")
            :format(i, row_n))

        local id
        if #(row.id or "") > 0 then
          id = "par:" .. row.id
          -- Remove it so that the loop below does not overwrite the
          -- prefixed ID with the raw cell value.
          row.id = nil
        else
          id = "par:" .. M.idgen()
        end

        local rsrc = {id = id}
        for k, v in pairs(row) do
          if v ~= "" then -- skip empty strings.
            if single_values[k] then rsrc[k] = v
            else rsrc[k] = {[v] = true} end -- Multi-values are a set.
          end
        end
        sip[i] = rsrc
        --[[
        -- Generate thumbnail for files.
        local rsrc_path = plpath.join(
            sip.root_path, sip[i]["pas:sourcePath"])
        if plpath.isfile(rsrc_path) then
          sip[i]["pas:thumbnail"] = generate_thumbnail(
              sip[i], sip.root_path, tn_dir)
        end
        --]]
      else
        -- Continuation of values from a previous row.
        if i < 1 then
          error("First row MUST have a path value.", 2)
        end

        local rsrc = sip[i]
        for k, v in pairs(row) do
          if v ~= "" then -- skip empty strings.
            if single_values[k] then
              -- Overwriting a single value makes no sense: reject it.
              error(
                  ("On CSV row #%d: field %s is single-valued.")
                  :format(row_n, k))
            end
            logger:debug("Value: ", v)
            logger:debug("Inserting into resource #", i)
            -- The field may have been empty in the first row of this
            -- resource, in which case the set does not exist yet.
            rsrc[k] = rsrc[k] or {}
            rsrc[k][v] = true
          end
        end
      end
    end

    -- Infer structure from paths and row ordering.
    for idx, rsrc in ipairs(sip) do
      local rmod = model.parse_model(rsrc["pas:contentType"])
      local props = rmod.properties or NT
      local rsrc_path = rsrc["pas:sourcePath"]

      -- Next sibling: first following resource in the same directory.
      if props["pas:next"] and not rsrc["pas:next"] then
        local parent_dir = rsrc_path:match("(.*/)")
        for j = idx + 1, #sip do
          if sip[j]["pas:sourcePath"]:match("(.*/)") == parent_dir then
            rsrc["pas:next"] = sip[j].id
            break
          end
        end
      end

      -- First child: first following resource whose path starts with this
      -- resource's path.
      -- NOTE(review): this is a plain prefix test, so "a/b" also matches
      -- "a/bc" - confirm whether a path separator should be required.
      if props["pas:first"] and not rsrc["pas:first"] then
        local prefix_pattern = "^" .. escape_pattern(rsrc_path)
        for j = idx + 1, #sip do
          if sip[j]["pas:sourcePath"]:match(prefix_pattern) then
            rsrc["pas:first"] = sip[j].id
            break
          end
        end
      end
    end

    return sip
  end
  --[[ Convert a SIP resource table to an in-memory Volksdata graph.

  Every key of the resource except `id` becomes a predicate, and every value
  (or every member of a value set) becomes an object of a triple whose subject
  is the resource ID. The object term depends on the property type declared in
  the resource model: "resource" gives a namespaced IRI, "ext_resource" gives a
  plain IRI, anything else gives a literal typed with the datatype configured
  in pkar.config.md.datatypes, if any. One rdf:type triple is added for each
  entry of the model lineage.

  rsrc: resource table as produced by generate_sip().
  Returns the new graph and the subject term.
  --]]
  M.rsrc_to_graph = function(rsrc)
    local rmod = model.parse_model(rsrc["pas:contentType"])
    logger:info("Updating resource md: ", pp.write(rsrc))

    local s = term.new_iriref_ns(rsrc.id)
    local gr = graph.new(nil)
    -- Keep the insert iterator local: it used to leak into the global
    -- environment.
    local it = gr:add_init()

    for k, v in pairs(rsrc) do
      -- id is the subject, it won't be an attribute.
      if k ~= "id" then
        logger:debug(("Adding attribute: %s = %s"):format(k, pp.write(v)))
        local p = term.new_iriref_ns(k)
        local datatype = ((rmod.properties or NT)[k] or NT).type
        local rdf_type_str = pkar.config.md.datatypes[datatype]
        local rdf_type
        if rdf_type_str then
          rdf_type = term.new_iriref_ns(rdf_type_str).data
        end

        -- Force all fields to be multi-valued.
        local values = v
        if type(values) ~= "table" then values = {[values] = true} end

        for vv in pairs(values) do
          -- Content types are stored without their namespace prefix.
          if k == "pas:contentType" then
            vv = "pas:" .. vv
          end

          local o
          if datatype == "resource" then
            o = term.new_iriref_ns(vv)
          elseif datatype == "ext_resource" then
            o = term.new_iriref(vv)
          else
            o = term.new_lit(vv, rdf_type)
          end
          it:add_iter(triple.new(s, p, o))
        end
      end
    end

    for _, m in ipairs(rmod.lineage) do
      it:add_iter(triple.new(s, pkar.RDF_TYPE, term.new_iriref_ns(m)))
    end
    it:add_done()

    return gr, s
  end
  --[[ Copy one source file into the object store and record its technical
  metadata on the resource.

  The file is streamed in chunks of pkar.config.fs.stream_chunk_size into a
  temporary file under <ores_path>/tmp/ while a BLAKE2b digest is computed,
  then moved to <ores_path>/<checksum 1-2>/<checksum 3-4>/<checksum 1-32><ext>.
  Sets dc:format, premis:hasMessageDigest, dc:extent (byte count) and pas:path
  on rsrc. If the resource has a pas:thumbnail, that file is moved next to the
  stored file and the field is updated.

  Raises an error if a file cannot be opened, written, closed or moved.
  --]]
  local function store_file(rsrc, in_path)
    local fext = plpath.extension(in_path)
    local tmp_dir = plpath.join(pkar.config.fs.ores_path, "tmp/")
    local tmp_path = tmp_dir .. rsrc.id .. fext
    dir.makepath(tmp_dir)

    -- Binary mode: the copy and the digest must be byte-exact on every
    -- platform.
    local ifh = assert(io.open(in_path, "rb"))
    rsrc["dc:format"] = {[magic:filehandle(ifh)] = true}
    -- NOTE(review): rewind in case type detection moved the read position of
    -- the handle; this is a no-op if it did not.
    ifh:seek("set", 0)

    local hash_it = mc.new_blake2b()
    local fsize = 0
    logger:debug("Hashing ", in_path)
    local ofh = assert(io.open(tmp_path, "wb"))
    while true do
      local chunk = ifh:read(pkar.config.fs.stream_chunk_size)
      if not chunk then break end
      hash_it:update(chunk)
      -- A failed write (e.g. disk full) must not pass unnoticed.
      assert(ofh:write(chunk))
      fsize = fsize + #chunk
    end
    assert(ofh:close())
    ifh:close()

    local checksum = hash_it:final(true)
    rsrc["premis:hasMessageDigest"] = {
        ["urn:blake2:" .. checksum] = true}
    rsrc["dc:extent"] = fsize

    -- Move the copy to its checksum-derived location.
    local out_dir = plpath.join(
        pkar.config.fs.ores_path,
        checksum:sub(1, 2),
        checksum:sub(3, 4))
    local out_path = plpath.join(out_dir, checksum:sub(1, 32) .. fext)
    dir.makepath(out_dir)
    logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
    assert(
        dir.movefile(tmp_path, out_path),
        ("Could not move %s to %s"):format(tmp_path, out_path))
    rsrc["pas:path"] = out_path

    -- Move thumbnail if existing.
    if rsrc["pas:thumbnail"] then
      local tn_src = rsrc["pas:thumbnail"]
      local tn_dest = plpath.join(out_dir, plpath.basename(tn_src))
      logger:debug(("Moving file %s to %s"):format(tn_src, tn_dest))
      assert(
          dir.movefile(tn_src, tn_dest),
          ("Could not move %s to %s"):format(tn_src, tn_dest))
      rsrc["pas:thumbnail"] = tn_dest
    end
  end

  --[[ Deposit all the resources of a SIP into the repository.

  For each resource, in order: if its source path is a regular file, the file
  is stored (see store_file); then dc:created and dc:modified are set to the
  current UTC time and the resource graph is passed to repo.store_updates().
  Finally the "proc" directory under sip.root_path is removed.

  sip: table as returned by generate_sip().
  --]]
  M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
      -- TODO Wrap this chunk into a txn. Each row is atomic.
      logger:debug(("Processing resource #%d of %d: %s"):format(
          i, #sip, rsrc.id))

      local in_path = sip.root_path .. rsrc["pas:sourcePath"]
      -- If it's a directory, skip file processing.
      if plpath.isfile(in_path) then
        store_file(rsrc, in_path)
      end

      -- %H:%M:%S is the portable spelling of %T.
      local tstamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
      rsrc["dc:created"] = tstamp
      rsrc["dc:modified"] = tstamp
      repo.store_updates(M.rsrc_to_graph(rsrc))
    end

    -- Remove processing directory.
    dir.rmtree(plpath.join(sip.root_path, "proc"))
  end
  -- Export the submission module.
  return M