local io = io

local csv = require "csv"
local dir = require "pl.dir"
local plpath = require "pl.path"

local term = require "lsup.term"
local triple = require "lsup.triple"
local graph = require "lsup.graph"

local mc = require "pocket_archive.monocypher"
local pkar = require "pocket_archive"

local M = {}  -- Submission module


-- Adapted from lua-nĂșcleo.
local function escape_pattern(s)
    local matches = {
        ["^"] = "%^";
        ["$"] = "%$";
        ["("] = "%(";
        [")"] = "%)";
        ["%"] = "%%";
        ["."] = "%.";
        ["["] = "%[";
        ["]"] = "%]";
        ["*"] = "%*";
        ["+"] = "%+";
        ["-"] = "%-";
        ["?"] = "%?";
        ["\0"] = "%z";
    }

    return (s:gsub(".", matches))
end


-- For idgen(). Makes a 62-character pool with ~5.95 bits of entropy per character.
local chpool = {}
for i = 48, 57 do table.insert(chpool, i) end   -- 0-9
for i = 65, 90 do table.insert(chpool, i) end   -- A-Z
for i = 97, 122 do table.insert(chpool, i) end  -- a-z

--[[ Generate a random, reader-friendly ID.

A 16-character ID drawn from the 62-symbol #chpool defined above has an
entropy of ~95 bits, which should be plenty for a small repository.
]]
M.idgen = function(len)
    local charlist = {}
    for i = 1, (len or pkar.config.id.len) do
        table.insert(charlist, string.char(chpool[math.random(1, #chpool)]))
    end

    return table.concat(charlist)
end


--[=[
M.generate_sip_v1 = function(path)
    --[[ Version 1 CSV parsing.

    Each row has 3 cells: reference path (ref), metadata key (k), value (v).
    Each v cell contains one value. Properties for the same ref can be added
    in successive rows without repeating the ref cell. Multiple values can be
    added in successive rows without repeating the ref and k cells.
    --]]
    local sub_data = assert(csv.open(path))

    local md = {}
    local prev_ref, prev_k
    -- Collate metadata.
    local i = 1
    for row in sub_data:lines() do
        local ref, k, v = table.unpack(row)
        -- Nil-out empty cells (they come through as "").
        if ref == "" then ref = nil end
        if k == "" then k = nil end
        if v == "" then v = nil end
        print("Parsing row:", ref, k, v)
        -- v can be a legit false value.
        if ref and not k and v == nil then
            -- This can be a placeholder for ordering purposes.
            md[ref] = md[ref] or {}
            goto continue
        elseif v == nil then goto continue
        else
            -- If ref or k are missing, reuse the previous one.
            if ref then prev_ref = ref
            else
                if not prev_ref then
                    -- If column 1 is empty, it must have been set in a
                    -- previous row.
                    error(string.format(
                            "Reference in column 1, row %d not found!", i), 2)
                end
                ref = prev_ref
            end
            if k then prev_k = k
            else
                if not prev_k then
                    -- If column 2 is empty, it must have been set in a
                    -- previous row.
                    error(string.format(
                            "Property key in column 2, row %d not found!", i), 2)
                end
                k = prev_k
            end
        end

        md[ref] = md[ref] or {id = M.idgen(), path = ref, _sort = i}
        md[ref][k] = md[ref][k] or {}
        if k == "type" then md[ref][k] = v
        else table.insert(md[ref][k], v) end

        ::continue::
        i = i + 1
    end

    -- Move md to an ordered list.
    local mdlist = {root_path = path:match("(.*/)")}
    for _, v in pairs(md) do table.insert(mdlist, v) end
    table.sort(mdlist, function(a, b) return (a._sort < b._sort) end)

    -- Infer structure from paths and row ordering.
    for i, v in ipairs(mdlist) do
        for j = i + 1, #mdlist do
            --print(string.format("comparing %s : %s", v.path, mdlist[j].path))
            if not v["next"] and
                    mdlist[j].path:match("(.*/)") == v.path:match("(.*/)") then
                --print("next match.")
                v["next"] = mdlist[j].path
            end
            if not v.firstChild and
                    mdlist[j].path:match("^" .. escape_pattern(v.path)) then
                --print("First child match.")
                v.firstChild = mdlist[j].path
            end
        end
        v._sort = nil
    end

    return mdlist
end
--]=]


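--[[ Illustrative input sketch (an assumption for clarity, not project docs):
generate_sip_v2 below expects a CSV with a header row and a `path` column.
The other column names shown here (dc:title, dc:creator) are hypothetical;
which keys are single-valued depends on pkar.config.md.single_values. A row
with an empty `path` cell continues the previous resource, adding values to
its multi-valued keys:

    path,dc:title,dc:creator
    img/cover.jpg,Cover image,Jane Doe
    ,,John Doe
    img/page1.jpg,Page 1,

which would roughly yield:

    sip[1] = {
        id = <generated>, path = "img/cover.jpg",
        ["dc:title"] = "Cover image",
        ["dc:creator"] = {["Jane Doe"] = true, ["John Doe"] = true},
    }
--]]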
M.generate_sip_v2 = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    local sip = {root_path = path:match("(.*/)")}

    local prev_path
    local i = 1
    for row in sub_data:lines() do
        print("Processing row: " .. i)
        print("Row path: " .. row["path"])
        if row["path"] ~= "" then
            prev_path = row["path"]
            -- New row.
            sip[i] = {id = M.idgen()}
            for k, v in pairs(row) do
                if v == "" then goto cont1 end  -- Skip empty strings.
                if pkar.config.md.single_values[k] then sip[i][k] = v
                else sip[i][k] = {[v] = true} end  -- Multi-values are a set.
                ::cont1::
            end
            i = i + 1
        else
            -- Continuation of values from a previous row.
            if i == 1 then
                error("First row MUST have a path value.", 2)
            elseif not prev_path then
                error(("No path information at row %d"):format(i), 2)
            else
                row.path = prev_path
                for k, v in pairs(row) do
                    if v == "" then goto cont2 end  -- Skip empty strings.
                    if pkar.config.md.single_values[k] then
                        -- It doesn't make much sense to overwrite; maybe
                        -- throw an error?
                        sip[i - 1][k] = v
                    else
                        print("Value: " .. v)
                        print("Inserting at row " .. (i - 1))
                        -- The key may not have appeared yet in this
                        -- resource's first row.
                        sip[i - 1][k] = sip[i - 1][k] or {}
                        sip[i - 1][k][v] = true
                    end
                    ::cont2::
                end
            end
        end
    end

    -- Infer structure from paths and row ordering.
    for i, v in ipairs(sip) do
        for j = i + 1, #sip do
            --print(string.format("comparing %s : %s", v.path, sip[j].path))
            if not v["pas:next"] and
                    sip[j].path:match("(.*/)") == v.path:match("(.*/)") then
                --print("next match.")
                v["pas:next"] = sip[j].path
            end
            if not v["pas:firstChild"] and
                    sip[j].path:match("^" .. escape_pattern(v.path)) then
                --print("First child match.")
                v["pas:firstChild"] = sip[j].path
            end
        end
        v._sort = nil
    end

    return sip
end


M.validate = function(sip)
    -- TODO
end


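--[[ Illustrative sketch (an assumption for clarity, not project docs): the
function below turns a resource table, such as the ones built by
generate_sip_v2 and deposit, into RDF triples. Given hypothetical values

    {
        id = "A1b2C3d4E5f6G7h8",
        path = "0123/4567a/0123...",  -- truncated example object path
        ["dc:title"] = "Cover image",
        ["dc:creator"] = {["Jane Doe"] = true, ["John Doe"] = true},
    }

it fully replaces the graph keyed on <par:A1b2C3d4E5f6G7h8> (prefix expansion
depends on pkar.nsm), writing one literal triple per scalar value and one per
member of each set-valued key, so dc:creator contributes two triples while
path and dc:title contribute one each.
--]]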
M.update_rsrc_md = function(rsrc)
    -- TODO use a transaction when lsup_lua supports it.
    local triples = {}
    local s = term.new_iriref("par:" .. rsrc.id, pkar.nsm)
    local gr = graph.new(pkar.store, s.data, pkar.nsm)

    rsrc.id = nil  -- Exclude from metadata scan.
    for k, v in pairs(rsrc) do
        print("Adding attribute:", k, v)
        local p = term.new_iriref(k, pkar.nsm)
        if type(v) == "table" then
            for vv, _ in pairs(v) do
                table.insert(triples, triple.new(s, p, term.new_lit(vv)))
            end
        else
            table.insert(triples, triple.new(s, p, term.new_lit(v)))
        end
    end

    -- This is a full replacement.
    print("Removing triples.")
    gr:remove()
    print("Adding triples.")
    -- TODO implement lsup_lua fn to add a single triple and add triples in
    -- the previous loop.
    gr:add(triples)
end


M.update_md = function(sip)
end


M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
        -- TODO Wrap this chunk into a txn. Each row is atomic.
        print(("Processing resource #%d of %d: %s"):format(i, #sip, rsrc.id))
        local in_path = sip.root_path .. rsrc.path

        -- If it's a directory, skip file processing.
        if not plpath.isfile(in_path) then goto continue end

        do
            local tmp_dir = pkar.config.fs.ores_path .. "tmp/"
            local tmp_path = tmp_dir .. rsrc.id
            dir.makepath(tmp_dir)

            local ifh = assert(io.open(in_path, "r"))
            local ofh = assert(io.open(tmp_path, "w"))

            local hash_it = mc.new_blake2b()
            local fsize = 0
            print(("Hashing %s"):format(in_path))
            while true do
                local chunk = ifh:read(pkar.config.fs.stream_chunk_size)
                if not chunk then break end
                hash_it:update(chunk)
                ofh:write(chunk)
                fsize = fsize + #chunk
            end
            local checksum = hash_it:final(true)
            rsrc["premis:hasMessageDigest"] = {["blake2:" .. checksum] = true}
            rsrc["dc:extent"] = fsize
            ofh:close()
            ifh:close()

            local out_dir = ("%s%s/%s/"):format(
                    pkar.config.fs.ores_path,
                    checksum:sub(1, 4), checksum:sub(5, 9))
            local out_path = out_dir .. checksum:sub(1, 32)
            rsrc.path = out_path
            dir.makepath(out_dir)
            print(("Moving file %s to %s"):format(tmp_path, rsrc.path))
            dir.movefile(tmp_path, rsrc.path)
        end

        ::continue::
        local tstamp = os.date("!%Y-%m-%dT%TZ")
        rsrc["dc:created"] = tstamp
        rsrc["dc:modified"] = tstamp

        M.update_rsrc_md(rsrc)
    end
end


return M
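--[[ Minimal usage sketch (assumptions, not part of the module): the require
path and the CSV location below are hypothetical, validate() is still a stub,
and deposit() relies on pkar.config and the lsup store being initialized by
the host application.

    local sub = require "pocket_archive.submission"

    local sip = sub.generate_sip_v2("/path/to/submission/pas.csv")
    sub.validate(sip)  -- Currently a no-op.
    sub.deposit(sip)   -- Hashes and stores files, then writes metadata graphs.
--]]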