submission.lua 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. local io = io
  2. local csv = require "csv"
  3. local dir = require "pl.dir"
  4. local lfs = require "lfs"
  5. local uuid = require "uuid"
  6. local path = require "pl.path"
  7. local cksum = require "sha2"
  8. local config = require "config.app"
  -- Random number generator for uuid().
  -- The urandom-backed generator is tried first; if setting it up raises an
  -- error, the Windows FFI generator is installed instead.
  local posix_uuid = pcall(function()
      uuid.set_rng(uuid.rng.urandom())
  end)
  if not posix_uuid then
      -- The return value of set_rng is deliberately discarded so that no
      -- global variable is created as a side effect of loading this module.
      uuid.set_rng(uuid.rng.win_ffi())
  end

  local M = {} -- Submission module
  -- Adapted from lua-núcleo.
  -- Escape every Lua pattern magic character in a string so the result can be
  -- embedded in a pattern and match the input literally.
  -- @param s (string) Text to escape.
  -- @return (string) The escaped text; the substitution count is dropped.
  local function escape_pattern(s)
      -- A NUL byte is rewritten to the %z class; every other magic character
      -- is simply prefixed with the % escape.
      local replacements = {["\0"] = "%z"}
      local magic = "^$()%.[]*+-?"
      for pos = 1, #magic do
          local ch = magic:sub(pos, pos)
          replacements[ch] = "%" .. ch
      end
      -- Characters without an entry in the table are left untouched by gsub.
      local escaped = s:gsub(".", replacements)
      return escaped
  end
  --[[
  Version 1 CSV parsing.

  Each row has 3 cells: reference path (ref), metadata key (k), value (v).
  Each v cell contains one value.

  Properties for the same ref can be added in successive rows without
  repeating the ref cell.

  Multiple values can be added in successive rows without repeating the ref
  and k cells.

  A row holding only a ref is a placeholder: it registers the resource (so
  that it takes part in ordering) without adding any property. Rows without
  a value never update the "previous ref" / "previous key" carried forward to
  later rows.

  @param path (string) Location of the CSV file.
  @return (table) A list of resource records in row order. Each record has
      `id` (a new UUID), `path` (the ref), one entry per metadata key (a list
      of values, except `type`, which is a single value) and, when they can
      be inferred, `next` and `firstChild` holding the path of another
      record. The list also carries `root_path`: the directory part of
      `path`, or nil if `path` contains no slash.
  Raises an error (reported at the caller's position) when a row relies on a
  ref or key from a previous row and none was given yet.
  --]]
  M.generate_sip_v1 = function(path)
      local sub_data = assert(csv.open(path))
      local md = {}
      local prev_ref, prev_k

      -- Return the record for `ref`, creating it on first use. The row number
      -- of the first appearance is kept in `_sort` to restore row ordering.
      local function record_for(ref, row_no)
          local rec = md[ref]
          if not rec then
              rec = {id = uuid(), path = ref, _sort = row_no}
              md[ref] = rec
          end
          return rec
      end

      -- Collate metadata.
      local i = 0
      for row in sub_data:lines() do
          i = i + 1
          local ref, k, v = table.unpack(row)
          -- nil-out empty cells (they come through as "")
          if ref == "" then ref = nil end
          if k == "" then k = nil end
          if v == "" then v = nil end
          print("Parsing row:", ref, k, v)

          -- v can be a legit false value, hence the explicit nil test.
          if v == nil then
              -- Nothing to store. A ref on its own can be a placeholder for
              -- ordering purposes: register it with a complete record so the
              -- sorting and structure passes below can rely on `path` and
              -- `_sort` being present, and never clobber an existing record.
              if ref and not k then
                  record_for(ref, i)
              end
          else
              -- If ref or k are missing, reuse the previous one.
              if ref then
                  prev_ref = ref
              elseif prev_ref then
                  ref = prev_ref
              else
                  -- If column 1 is empty, it must have been set in a
                  -- previous row.
                  error(string.format(
                      "Reference in column 1, row %d not found!", i), 2)
              end
              if k then
                  prev_k = k
              elseif prev_k then
                  k = prev_k
              else
                  -- If column 2 is empty, it must have been set in a
                  -- previous row.
                  error(string.format(
                      "Property key in column 2, row %d not found!", i), 2)
              end

              local rec = record_for(ref, i)
              if k == "type" then
                  -- `type` is the only single-valued key: last one wins.
                  rec[k] = v
              else
                  rec[k] = rec[k] or {}
                  table.insert(rec[k], v)
              end
          end
      end

      -- Move md to an ordered list.
      local mdlist = {root_path = path:match("(.*/)")}
      for _, rec in pairs(md) do table.insert(mdlist, rec) end
      table.sort(mdlist, function (a, b) return (a._sort < b._sort) end)

      -- Infer structure from paths and row ordering.
      for idx, rec in ipairs(mdlist) do
          -- Both values only depend on the current record, so compute them
          -- once instead of once per comparison.
          local parent_dir = rec.path:match("(.*/)")
          local child_pattern = "^" .. escape_pattern(rec.path)
          for j = idx + 1, #mdlist do
              local other_path = mdlist[j].path
              -- First later record located in the same directory.
              if not rec["next"] and
                      other_path:match("(.*/)") == parent_dir then
                  rec["next"] = other_path
              end
              -- First later record whose path starts with this record's path.
              -- NOTE(review): this is a plain prefix test, so "a/b" also
              -- matches "a/bc" -- confirm whether a separator is expected.
              if not rec.firstChild and other_path:match(child_pattern) then
                  rec.firstChild = other_path
              end
              -- Nothing left to discover for this record.
              if rec["next"] and rec.firstChild then break end
          end
          rec._sort = nil
      end

      return mdlist
  end
  --[[
  Version 2 CSV parsing.

  The CSV file is opened with a header row; every following row is handled
  as a table keyed by column name. A row with a non-empty `path` cell starts
  a new resource. A row with an empty `path` cell continues the previous
  resource: its non-empty cells are added to that resource.

  Columns flagged in `config.md.single_values` are stored as a single value
  (a continuation row overwrites it); all other columns are stored as lists
  of values. Empty cells are ignored.

  @param path (string) Location of the CSV file.
  @return (table) A list of resources, each with a new UUID in `pas_id` plus
      one entry per non-empty column. The list also carries `root_path`: the
      directory part of `path`, or nil if `path` contains no slash.
  Raises an error (reported at the caller's position) if the first data row
  has no path value.
  --]]
  M.generate_sip_v2 = function(path)
      local sub_data = assert(csv.open(path, {header = true}))
      local sip = {root_path = path:match("(.*/)")}

      -- Index of the next resource to create; i - 1 is the current one.
      local i = 1
      for row in sub_data:lines() do
          -- A missing `path` column is treated like an empty cell so that it
          -- is reported through the error below instead of failing on a nil
          -- concatenation.
          local row_path = row["path"] or ""
          print("Processing row: " .. i)
          print("Row path: " .. row_path)

          if row_path ~= "" then
              -- New resource.
              local rsrc = {pas_id = uuid()}
              for k, v in pairs(row) do
                  if v ~= "" then -- skip empty strings.
                      if config.md.single_values[k] then
                          rsrc[k] = v
                      else
                          rsrc[k] = {v}
                      end
                  end
              end
              sip[i] = rsrc
              i = i + 1
          else
              -- Continuation of values from a previous row.
              local rsrc = sip[i - 1]
              if not rsrc then
                  error("First row MUST have a path value.", 2)
              end
              -- The path cell is empty here, so the loop below skips it and
              -- the resource keeps the path set by the row that created it.
              for k, v in pairs(row) do
                  if v ~= "" then -- skip empty strings.
                      if config.md.single_values[k] then
                          -- It doesn't make much sense to overwrite, maybe
                          -- throw an error?
                          rsrc[k] = v
                      else
                          print("Value: " .. v)
                          print("Inserting at row " .. i - 1)
                          -- The column may have been empty in every earlier
                          -- row of this resource, in which case no list
                          -- exists yet.
                          local values = rsrc[k]
                          if values == nil then
                              values = {}
                              rsrc[k] = values
                          end
                          table.insert(values, v)
                      end
                  end
              end
          end
      end

      return sip
  end
  -- Validate a SIP.
  -- Not implemented yet: the argument is ignored and nothing is returned.
  -- @param sip (table) SIP to validate.
  function M.validate(sip)
      -- TODO
  end
  -- Copy the file at `in_path` to `tmp_path` in chunks of
  -- `config.fs.stream_chunk_size`, feeding each chunk to a BLAKE2b digest.
  -- Both files are opened in binary mode so the payload is copied verbatim
  -- on every platform.
  -- @return The finalized digest, as returned by calling the digest function
  --     without arguments.
  local function copy_with_checksum(in_path, tmp_path)
      local ifh = assert(io.open(in_path, "rb"))
      local ofh, open_err = io.open(tmp_path, "wb")
      if not ofh then
          ifh:close()
          error(open_err)
      end

      local b2 = cksum.blake2b()
      while true do
          local chunk = ifh:read(config.fs.stream_chunk_size)
          if not chunk then break end
          b2(chunk)
          local written, write_err = ofh:write(chunk)
          if not written then
              ofh:close()
              ifh:close()
              error(write_err)
          end
      end
      ofh:close()
      ifh:close()

      return b2()
  end

  -- Store the files referenced by a SIP under `config.fs.ores_path`.
  --
  -- For every resource whose source path (`sip.root_path` .. `rsrc.path`) is
  -- a regular file, the file is copied to a temporary location while its
  -- BLAKE2b checksum is computed, then moved to
  -- <ores_path><checksum[1-4]>/<checksum[5-8]>/<checksum>. The resource gets
  -- the checksum in `b2checksum` and its `path` is replaced by the final
  -- location. Anything that is not a file (e.g. a directory) is skipped.
  --
  -- The resource identifier is read from `id` and, failing that, from
  -- `pas_id`.
  --
  -- @param sip (table) List of resources, with an optional `root_path`.
  -- Raises an error if a file resource has no identifier, if a file cannot
  -- be opened or written, or if the final move fails.
  M.deposit = function(sip)
      -- A SIP built from a CSV path without directory part has no root_path.
      local root_path = sip.root_path or ""

      for i, rsrc in ipairs(sip) do
          local rsrc_id = rsrc.id or rsrc.pas_id
          print(("Processing resource #%d of %d: %s"):format(
              i, #sip, tostring(rsrc_id)))

          local in_path = root_path .. rsrc.path
          -- If it's a directory, skip processing.
          if path.isfile(in_path) then
              if rsrc_id == nil then
                  error(("Resource #%d has no id."):format(i))
              end

              local tmp_dir = config.fs.ores_path .. "tmp/"
              local tmp_path = tmp_dir .. rsrc_id
              dir.makepath(tmp_dir)

              local checksum = copy_with_checksum(in_path, tmp_path)
              rsrc.b2checksum = checksum

              -- Two directory levels taken from the checksum keep the number
              -- of entries per directory down.
              local out_dir = ("%s%s/%s/"):format(
                  config.fs.ores_path,
                  checksum:sub(1,4),
                  checksum:sub(5,8))
              local out_path = out_dir .. checksum
              dir.makepath(out_dir)

              print(("Moving file %s"):format(rsrc_id))
              local moved, move_err = dir.movefile(tmp_path, out_path)
              if not moved then
                  error(("Could not move %s to %s: %s"):format(
                      tmp_path, out_path, tostring(move_err)))
              end
              -- Only point the resource at its new location once the file is
              -- really there.
              rsrc.path = out_path
          end
      end
  end
  202. return M