submission.lua 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. local io = io
  2. local csv = require "csv"
  3. local dir = require "pl.dir"
  4. local plpath = require "pl.path"
  5. local term = require "lsup.term"
  6. local triple = require "lsup.triple"
  7. local graph = require "lsup.graph"
  8. local mc = require "pocket_archive.monocypher"
  9. local pkar = require "pocket_archive"
  10. local M = {} -- Submission module
  11. -- Adapted from lua-núcleo
  --- Escape Lua pattern magic characters so that a string matches literally.
  -- Each of ^ $ ( ) % . [ ] * + - ? is prefixed with "%", and a NUL byte is
  -- rewritten as "%z". Every other character is copied through unchanged.
  -- @tparam string s Text that should be matched verbatim.
  -- @treturn string The escaped copy of `s` (the gsub count is discarded).
  local function escape_pattern(s)
    -- NUL is the only entry whose replacement is not "%" plus the character.
    local replacements = { ["\0"] = "%z" }
    for ch in ("^$()%.[]*+-?"):gmatch(".") do
      replacements[ch] = "%" .. ch
    end

    -- Characters without an entry in the table are left as they are by gsub.
    local escaped = s:gsub(".", replacements)
    return escaped
  end
  -- Character pool for idgen(): the byte codes of the 62 ASCII alphanumerics,
  -- in the order 0-9, A-Z, a-z. A uniform pick from 62 symbols carries about
  -- 5.95 bits of entropy per character.
  local chpool = {}
  for _, range in ipairs({ {48, 57}, {65, 90}, {97, 122} }) do
    for code = range[1], range[2] do
      chpool[#chpool + 1] = code
    end
  end
  --[[
  Generate a random, reader-friendly ID.

  Every character is picked with math.random() from chpool, which holds the
  62 ASCII alphanumeric symbols. A 16-character ID therefore carries up to
  about 95.3 bits of entropy, which should be plenty for a small repository.

  len: optional number of characters; defaults to pkar.config.id.len.
  Returns: the ID as a string.

  NOTE(review): no math.randomseed() call is visible in this file. On Lua
  versions that do not seed the generator automatically, the ID sequence
  repeats across runs unless the caller seeds it -- confirm.
  ]]
  function M.idgen(len)
    local id_len = len or pkar.config.id.len
    local chars = {}
    for pos = 1, id_len do
      local code = chpool[math.random(1, #chpool)]
      chars[pos] = string.char(code)
    end
    return table.concat(chars)
  end
  47. --[=[
  48. M.generate_sip_v1 = function(path)
  49. --[[
  50. Version 1 CSV parsing.
  51. Each row has 3 cells: reference path (ref), metadata key (k), value (v).
  52. Each v cell contains one value.
  53. Properties for the same ref can be added in successive rows without
  54. repeating the ref cell.
  55. Multiple values can be added in successive rows without repeating the ref
  56. and v cells.
  57. --]]
  58. local sub_data = assert(csv.open(path))
  59. local md = {}
  60. local prev_ref, prev_k
  61. -- Collate metadata.
  62. local i = 1
  63. for row in sub_data:lines() do
  64. ref, k, v = table.unpack(row)
  65. -- nil-out empty cells (they come through as "")
  66. if ref == "" then ref = nil end
  67. if k == "" then k = nil end
  68. if v == "" then v = nil end
  69. print("Parsing row:", ref, k, v)
  70. -- v can be a legit false value.
  71. if ref and not k and v == nil then
  72. -- This can be a placeholder for ordering purposes.
  73. md[ref] = md_ref or {}
  74. goto continue
  75. elseif v == nil then
  76. goto continue
  77. else
  78. -- If ref or k are missing, reuse the previous one.
  79. if ref then prev_ref = ref
  80. else
  81. if not prev_ref then
  82. -- If column 1 is empty, it must have been set in a
  83. -- previous row.
  84. error(string.format(
  85. "Reference in column 1, row %d not found!", i), 2)
  86. end
  87. ref = prev_ref
  88. end
  89. if k then prev_k = k
  90. else
  91. if not prev_k then
  92. -- If column 2 is empty, it must have been set in a
  93. -- previous row.
  94. error(string.format(
  95. "Property key in column 2, row %d not found!", i), 2)
  96. end
  97. k = prev_k
  98. end
  99. end
  100. md[ref] = md[ref] or {id = M.idgen(), path = ref, _sort = i}
  101. md[ref][k] = md[ref][k] or {}
  102. if k == "type" then
  103. md[ref][k] = v
  104. else
  105. table.insert(md[ref][k], v)
  106. end
  107. ::continue::
  108. i = i + 1
  109. end
  110. -- Move md to an ordered list.
  111. mdlist = {root_path = path:match("(.*/)")}
  112. for _, v in pairs(md) do table.insert(mdlist, v) end
  113. table.sort(mdlist, function (a, b) return (a._sort < b._sort) end)
  114. -- Infer structure from paths and row ordering.
  115. for i, v in ipairs(mdlist) do
  116. for j = i + 1, #mdlist do
  117. --print(string.format("comparing %s : %s", v.path, mdlist[j].path))
  118. if not v["next"] and
  119. mdlist[j].path:match("(.*/)") == v.path:match("(.*/)") then
  120. --print("next match.")
  121. v["next"] = mdlist[j].path
  122. end
  123. if not v.firstChild and
  124. mdlist[j].path:match("^" .. escape_pattern(v.path)) then
  125. --print("First child match.")
  126. v.firstChild = mdlist[j].path
  127. end
  128. end
  129. v._sort = nil
  130. end
  131. return mdlist
  132. end
  133. --]=]
  --[[
  Build a SIP (submission information package) table from a version 2 CSV.

  The CSV is opened with a header row, so each row is accessed by column name.
  A row with a non-empty "path" cell starts a new resource and receives a
  fresh ID from M.idgen(). A row with an empty "path" cell adds its values to
  the resource started by the closest previous row. Empty cells are ignored.
  Columns listed in pkar.config.md.single_values hold a single value (a later
  row overwrites it); every other column is collected as a set of the form
  {[value] = true}.

  After parsing, "pas:next" (the next resource found in the same directory)
  and "pas:firstChild" (the first following resource located underneath this
  one) are inferred from the paths and the row order, unless the resource
  already carries a value for them.

  NOTE(review): the structure pass calls string methods on the `path` field,
  so "path" is presumably listed in single_values -- confirm against the
  configuration.

  path: location of the CSV file.
  Returns: a list of resource tables, plus a `root_path` field holding the
  directory part of `path` (nil when `path` contains no "/").
  Raises: an error if the CSV cannot be opened, or if a continuation row
  appears before any row with a path.
  ]]
  M.generate_sip_v2 = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    local sip = {root_path = path:match("(.*/)")}

    local prev_path
    local i = 1 -- Index of the next resource to be created.
    for row in sub_data:lines() do
      print("Processing row: " .. i)
      print("Row path: " .. row["path"])

      if row["path"] ~= "" then
        prev_path = row["path"]
        -- New resource.
        local rsrc = {id = M.idgen()}
        sip[i] = rsrc
        for k, v in pairs(row) do
          if v ~= "" then -- Skip empty strings.
            if pkar.config.md.single_values[k] then
              rsrc[k] = v
            else
              rsrc[k] = {[v] = true} -- Multi-values are a set.
            end
          end
        end
        i = i + 1
      else
        -- Continuation of values from a previous row.
        if i == 1 then
          error("First row MUST have a path value.", 2)
        elseif not prev_path then
          error(("No path information at row %d"):format(i), 2)
        end

        local rsrc = sip[i - 1]
        row.path = prev_path
        for k, v in pairs(row) do
          if v ~= "" then -- Skip empty strings.
            if pkar.config.md.single_values[k] then
              -- It doesn't make much sense to overwrite, maybe throw an
              -- error?
              rsrc[k] = v
            else
              print("Value: " .. v)
              print("Inserting at row " .. i - 1)
              -- The column may have been empty on the resource's first row,
              -- in which case no set exists yet.
              rsrc[k] = rsrc[k] or {}
              rsrc[k][v] = true
            end
          end
        end
      end
    end

    -- Infer structure from paths and row ordering.
    for idx, rsrc in ipairs(sip) do
      local parent_dir = rsrc.path:match("(.*/)")
      -- A child must sit below this path, not merely share its leading
      -- characters: "a/b" is a parent of "a/b/c" but not of "a/bc".
      local child_prefix = rsrc.path
      if child_prefix:sub(-1) ~= "/" then
        child_prefix = child_prefix .. "/"
      end

      for j = idx + 1, #sip do
        local other_path = sip[j].path
        if not rsrc["pas:next"]
            and other_path:match("(.*/)") == parent_dir then
          rsrc["pas:next"] = other_path
        end
        if not rsrc["pas:firstChild"]
            and other_path:sub(1, #child_prefix) == child_prefix then
          rsrc["pas:firstChild"] = other_path
        end
        -- Nothing left to infer for this resource.
        if rsrc["pas:next"] and rsrc["pas:firstChild"] then break end
      end

      rsrc._sort = nil
    end

    return sip
  end
  --- Validate a SIP.
  -- Not implemented yet: the function accepts the SIP, does nothing and
  -- returns nothing.
  -- @param sip The SIP to validate (currently unused).
  function M.validate(sip)
    -- TODO
  end
  --[[
  Replace the stored metadata of one resource.

  Builds one triple per attribute value of `rsrc`, with the subject
  "par:<rsrc.id>". Table attributes are treated as sets and contribute one
  triple per key; any other attribute contributes a single triple. The
  resource's graph is then emptied with gr:remove() and the new triples are
  added, so this is a full replacement rather than a merge.

  The `id` field only names the subject and is not stored as an attribute.
  It is skipped during the scan instead of being removed, so the caller's
  table is left untouched.

  rsrc: resource table; must have an `id` field.
  Returns: nothing.
  ]]
  M.update_rsrc_md = function(rsrc)
    -- TODO use a transaction when lsup_lua supports it.
    local s = term.new_iriref("par:" .. rsrc.id, pkar.nsm)
    local gr = graph.new(pkar.store, s.data, pkar.nsm)

    local triples = {}
    for k, v in pairs(rsrc) do
      if k ~= "id" then -- Exclude from metadata scan.
        print("Adding attribute:", k, v)
        local p = term.new_iriref(k, pkar.nsm)
        if type(v) == "table" then
          -- Sets keep their values in the keys.
          for vv in pairs(v) do
            triples[#triples + 1] = triple.new(s, p, term.new_lit(vv))
          end
        else
          triples[#triples + 1] = triple.new(s, p, term.new_lit(v))
        end
      end
    end

    -- This is a full replacement.
    print("Removing triples.")
    gr:remove()
    print("Adding triples.")
    -- TODO implement lsup_lua fn to add a single triple and add triples in
    -- the previous loop.
    gr:add(triples)
  end
  --- Update the metadata of a SIP.
  -- Placeholder: the body is empty, so calling it has no effect.
  -- @param sip The SIP whose metadata would be updated (currently unused).
  function M.update_md(sip)
  end
  --[[
  Deposit every resource of a SIP.

  For each resource whose source path (sip.root_path .. rsrc.path) is a
  regular file, the file is streamed in chunks into a temporary file under
  <ores_path>tmp/ while a BLAKE2b digest and the byte count are computed.
  The temporary file is then moved to a location derived from the digest,
  and the resource gets "premis:hasMessageDigest", "dc:extent" and an updated
  `path`. Anything that is not a regular file skips the file handling.

  Every resource, file or not, is stamped with "dc:created" and
  "dc:modified" (UTC) and handed to M.update_rsrc_md().

  Files are opened in binary mode so that the bytes hashed and stored are
  exactly the bytes on disk on every platform.

  sip: list of resource tables with an optional `root_path` field.
  Returns: nothing.
  Raises: an error if a file cannot be opened, written, closed or moved.
  ]]
  M.deposit = function(sip)
    -- root_path is nil when the CSV path had no directory part.
    local root_path = sip.root_path or ""

    for i, rsrc in ipairs(sip) do
      -- TODO Wrap this chunk into a txn. Each row is atomic.
      print(("Processing resource #%d of %d: %s"):format(i, #sip, rsrc.id))

      local in_path = root_path .. rsrc.path
      -- If it's a directory, skip file processing.
      if plpath.isfile(in_path) then
        local tmp_dir = pkar.config.fs.ores_path .. "tmp/"
        local tmp_path = tmp_dir .. rsrc.id
        dir.makepath(tmp_dir)

        local ifh = assert(io.open(in_path, "rb"))
        local ofh, open_err = io.open(tmp_path, "wb")
        if not ofh then
          -- Do not leak the input handle when the output cannot be created.
          ifh:close()
          error(open_err, 0)
        end

        local hash_it = mc.new_blake2b()
        local fsize = 0
        print(("Hashing %s"):format(in_path))
        while true do
          local chunk = ifh:read(pkar.config.fs.stream_chunk_size)
          if not chunk then break end
          hash_it:update(chunk)
          assert(ofh:write(chunk))
          fsize = fsize + #chunk
        end
        ifh:close()
        -- A failed close can mean that buffered data never reached the disk.
        assert(ofh:close())

        local checksum = hash_it:final(true)
        rsrc["premis:hasMessageDigest"] = {["blake2:" .. checksum] = true}
        rsrc["dc:extent"] = fsize

        -- NOTE(review): the first directory level uses 4 characters of the
        -- checksum and the second uses 5 (positions 5 to 9) -- confirm that
        -- this asymmetry is intended. It is kept so that the storage layout
        -- does not change.
        local out_dir = ("%s%s/%s/"):format(
            pkar.config.fs.ores_path,
            checksum:sub(1, 4),
            checksum:sub(5, 9))
        local out_path = out_dir .. checksum:sub(1, 32)
        rsrc.path = out_path
        dir.makepath(out_dir)
        print(("Moving file %s to %s"):format(tmp_path, rsrc.path))
        -- Fail loudly rather than record metadata for a file that is not
        -- where rsrc.path says it is.
        assert(dir.movefile(tmp_path, rsrc.path))
      end

      -- "%H:%M:%S" yields the same text as the C99-only "%T".
      local tstamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
      rsrc["dc:created"] = tstamp
      rsrc["dc:modified"] = tstamp
      M.update_rsrc_md(rsrc)
    end
  end
  268. return M