123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- local io = io
- local csv = require "csv"
- local dir = require "pl.dir"
- local lfs = require "lfs"
- local uuid = require "uuid"
- local path = require "pl.path"
- local cksum = require "sha2"
- local config = require "config.app"
-- Random number generator for uuid().
-- Try the urandom-backed generator first; if setting it up raises an error,
-- fall back to the Windows FFI generator.
local posix_uuid = pcall(function()
    uuid.set_rng(uuid.rng.urandom())
end)
if not posix_uuid then
    -- Called for its side effect only. The result is deliberately not stored,
    -- so no global variable is created at module load time.
    uuid.set_rng(uuid.rng.win_ffi())
end

local M = {} -- Submission module
-- Adapted from lua-núcleo.
-- Maps every Lua pattern "magic" character to its %-escaped form. Built once
-- at load time instead of on every call.
local pattern_escapes = {
    ["^"] = "%^",
    ["$"] = "%$",
    ["("] = "%(",
    [")"] = "%)",
    ["%"] = "%%",
    ["."] = "%.",
    ["["] = "%[",
    ["]"] = "%]",
    ["*"] = "%*",
    ["+"] = "%+",
    ["-"] = "%-",
    ["?"] = "%?",
}
-- Only Lua 5.1 needs the `%z` class to express an embedded NUL in a pattern.
-- From Lua 5.2 on a pattern may contain "\0" directly and `%z` is deprecated
-- (it is no longer a NUL class on 5.3+), so the byte is left as-is there.
if _VERSION == "Lua 5.1" then
    pattern_escapes["\0"] = "%z"
end

--- Escape a string so that it matches itself literally inside a Lua pattern.
-- @tparam string s  Text to escape.
-- @treturn string   `s` with every pattern magic character escaped.
local function escape_pattern(s)
    -- Characters without a table entry are kept unchanged by gsub. The outer
    -- parentheses drop gsub's second result (the substitution count).
    return (s:gsub(".", pattern_escapes))
end
--- Build a SIP (ordered list of resources) from a version 1 submission CSV.
--
-- Each row has 3 cells: reference path (ref), metadata key (k), value (v).
-- Each v cell contains one value.
-- Properties for the same ref can be added in successive rows without
-- repeating the ref cell.
-- Multiple values for the same key can be added in successive rows without
-- repeating the ref and k cells.
-- A row holding only a ref is a placeholder: it registers the resource (and
-- its position in the ordering) without adding any metadata.
--
-- @tparam string path  Path of the CSV file.
-- @treturn table  List of resource tables ordered by the row in which each
--   ref first appeared, plus a `root_path` field (the directory part of
--   `path`, or nil when `path` contains no "/"). Every resource has `id`,
--   `path`, its metadata (a list of values per key, except `type` which holds
--   a single value) and, when found, `next` and `firstChild` paths.
-- @raise If the CSV cannot be opened, or if a row with a value has an empty
--   ref or key cell and no earlier row provided one.
M.generate_sip_v1 = function(path)
    local sub_data = assert(csv.open(path))
    local md = {}
    local prev_ref, prev_k

    -- Collate metadata. `i` is the 1-based number of the row being parsed.
    local i = 0
    for row in sub_data:lines() do
        i = i + 1
        local ref, k, v = table.unpack(row)
        -- nil-out empty cells (they come through as "")
        if ref == "" then ref = nil end
        if k == "" then k = nil end
        if v == "" then v = nil end
        print("Parsing row:", ref, k, v)

        -- v can be a legit false value, hence the explicit nil comparison.
        if v == nil then
            if ref and not k then
                -- Placeholder for ordering purposes: create the entry if it
                -- does not exist yet, never overwrite collected metadata.
                md[ref] = md[ref] or {id = uuid(), path = ref, _sort = i}
            end
            -- Any other row without a value carries nothing to record.
        else
            -- If ref or k are missing, reuse the previous one.
            if ref then
                prev_ref = ref
            elseif prev_ref then
                ref = prev_ref
            else
                -- If column 1 is empty, it must have been set in a
                -- previous row.
                error(string.format(
                    "Reference in column 1, row %d not found!", i), 2)
            end
            if k then
                prev_k = k
            elseif prev_k then
                k = prev_k
            else
                -- If column 2 is empty, it must have been set in a
                -- previous row.
                error(string.format(
                    "Property key in column 2, row %d not found!", i), 2)
            end

            md[ref] = md[ref] or {id = uuid(), path = ref, _sort = i}
            if k == "type" then
                -- `type` is single-valued: the last value wins.
                md[ref][k] = v
            else
                md[ref][k] = md[ref][k] or {}
                table.insert(md[ref][k], v)
            end
        end
    end

    -- Move md to a list ordered by first appearance in the CSV.
    local mdlist = {root_path = path:match("(.*/)")}
    for _, rsrc in pairs(md) do table.insert(mdlist, rsrc) end
    table.sort(mdlist, function(a, b) return a._sort < b._sort end)

    -- Infer structure from paths and row ordering.
    for idx, rsrc in ipairs(mdlist) do
        local parent = rsrc.path:match("(.*/)")
        local prefix = "^" .. escape_pattern(rsrc.path)
        for j = idx + 1, #mdlist do
            local other = mdlist[j].path
            -- First later resource sharing the same directory part.
            if not rsrc.next and other:match("(.*/)") == parent then
                rsrc.next = other
            end
            -- First later resource whose path starts with this path.
            if not rsrc.firstChild and other:match(prefix) then
                rsrc.firstChild = other
            end
        end
        -- The sort key is internal and must not leak into the result.
        rsrc._sort = nil
    end

    return mdlist
end
--- Build a SIP (ordered list of resources) from a version 2 submission CSV.
--
-- The CSV is read with a header row, so every row is keyed by column name.
-- A row with a non-empty `path` cell starts a new resource; a row with an
-- empty `path` cell adds values to the resource started by the closest
-- preceding row that had one. Empty cells are ignored. Columns listed in
-- `config.md.single_values` hold one value (a later row overwrites it); all
-- other columns hold a list of values.
--
-- @tparam string path  Path of the CSV file.
-- @treturn table  List of resource tables, each with a generated `pas_id`,
--   plus a `root_path` field (the directory part of `path`, or nil when
--   `path` contains no "/").
-- @raise If the CSV cannot be opened or the first data row has no path.
M.generate_sip_v2 = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    local sip = {root_path = path:match("(.*/)")}
    local prev_path
    -- `i` is the index the next new resource will take, so the resource
    -- currently being filled is sip[i - 1].
    local i = 1

    for row in sub_data:lines() do
        print("Processing row: " .. i)
        print("Row path: " .. row["path"])

        if row["path"] ~= "" then
            -- New resource.
            prev_path = row["path"]
            local rsrc = {pas_id = uuid()}
            for k, v in pairs(row) do
                if v ~= "" then -- skip empty strings.
                    if config.md.single_values[k] then
                        rsrc[k] = v
                    else
                        rsrc[k] = {v}
                    end
                end
            end
            sip[i] = rsrc
            i = i + 1
        elseif i == 1 then
            -- No resource has been started yet, so there is nothing to
            -- continue.
            error("First row MUST have a path value.", 2)
        else
            -- Continuation of values from a previous row.
            local rsrc = sip[i - 1]
            -- NOTE(review): this makes `path` take part in the loop below;
            -- harmless when `path` is single-valued, but it appends a
            -- duplicate otherwise — confirm against config.md.single_values.
            row.path = prev_path
            for k, v in pairs(row) do
                if v ~= "" then -- skip empty strings.
                    if config.md.single_values[k] then
                        -- It doesn't make much sense to overwrite, maybe
                        -- throw an error?
                        rsrc[k] = v
                    else
                        print("Value: " .. v)
                        print("Inserting at row " .. i - 1)
                        -- The column may have been empty in every earlier
                        -- row of this resource, in which case no list exists
                        -- yet.
                        local values = rsrc[k]
                        if values == nil then
                            values = {}
                            rsrc[k] = values
                        end
                        table.insert(values, v)
                    end
                end
            end
        end
    end

    return sip
end
--- Validate a SIP.
-- Not implemented yet: the body is empty, so nothing is checked and nothing
-- is returned.
-- @param sip  SIP table (currently unused).
function M.validate(sip)
    -- TODO
end
--- Copy the file resources of a SIP into the checksum-addressed store.
--
-- For every list entry of `sip` whose source (`sip.root_path .. rsrc.path`)
-- is a regular file, the content is streamed in chunks of
-- `config.fs.stream_chunk_size` to a temporary file under
-- `config.fs.ores_path .. "tmp/"` while its BLAKE2b digest is computed. The
-- temporary file is then moved to
-- `<ores_path><digest 1-4>/<digest 5-8>/<digest>`. On success the entry is
-- updated in place: `b2checksum` gets the digest and `path` the final
-- location. Entries that are not files (e.g. directories) are left untouched.
--
-- @tparam table sip  SIP as returned by the generate_sip_* functions.
-- @raise If a file resource has no identifier, a file cannot be opened or
--   written, or the final move fails.
M.deposit = function(sip)
    -- root_path is nil when the CSV path had no directory part; resource
    -- paths are then used as given.
    local root_path = sip.root_path or ""
    local tmp_dir = config.fs.ores_path .. "tmp/"
    local chunk_size = config.fs.stream_chunk_size

    for i, rsrc in ipairs(sip) do
        -- generate_sip_v1 stores the identifier as `id`, generate_sip_v2 as
        -- `pas_id`; accept either.
        local rsrc_id = rsrc.id or rsrc.pas_id
        print(("Processing resource #%d of %d: %s"):format(i, #sip, rsrc_id))

        local in_path = root_path .. rsrc.path
        -- If it's a directory, skip processing.
        if path.isfile(in_path) then
            if not rsrc_id then
                error(("Resource #%d has no id."):format(i), 2)
            end
            local tmp_path = tmp_dir .. rsrc_id
            dir.makepath(tmp_dir)

            -- Binary mode keeps the bytes (and therefore the checksum)
            -- identical on platforms that translate line endings.
            local ifh = assert(io.open(in_path, "rb"))
            local ofh, open_err = io.open(tmp_path, "wb")
            if not ofh then
                ifh:close()
                error(open_err)
            end

            -- Called with a chunk, b2 feeds the hash; called without
            -- arguments, it yields the final digest.
            local b2 = cksum.blake2b()
            while true do
                local chunk = ifh:read(chunk_size)
                if not chunk then break end
                b2(chunk)
                assert(ofh:write(chunk))
            end
            ofh:close()
            ifh:close()

            local checksum = b2()
            local out_dir = ("%s%s/%s/"):format(
                config.fs.ores_path,
                checksum:sub(1, 4),
                checksum:sub(5, 8))
            local out_path = out_dir .. checksum
            dir.makepath(out_dir)

            print(("Moving file %s"):format(rsrc_id))
            local moved, move_err = dir.movefile(tmp_path, out_path)
            if not moved then
                error(("Could not move %s to %s: %s"):format(
                    tmp_path, out_path, tostring(move_err)))
            end

            -- Only record the new location once the file is really there.
            rsrc.b2checksum = checksum
            rsrc.path = out_path
        end
    end
end
- return M
|