123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319 |
- local io = io
- local csv = require "csv"
- local dir = require "pl.dir"
- local plpath = require "pl.path"
- local term = require "lsup.term"
- local triple = require "lsup.triple"
- local graph = require "lsup.graph"
- local mc = require "pocket_archive.monocypher"
- local pkar = require "pocket_archive"
- local M = {} -- Submission module
-- Adapted from lua-núcleo
--[[
Escape a string so that it can be embedded in a Lua pattern and matched
literally.

Every pattern magic character (^ $ ( ) % . [ ] * + - ?) is prefixed with "%".

@param s (string) Text to escape.
@return (string) The escaped text. Exactly one value is returned (the
    substitution count of gsub is dropped).
]]
local function escape_pattern(s)
    -- "%%%0" = a literal "%" followed by the matched magic character.
    local escaped = s:gsub("[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0")

    -- Lua 5.1 and LuaJIT cannot hold an embedded NUL in a pattern and need
    -- the %z class instead. From Lua 5.2 on a NUL is an ordinary pattern
    -- character and %z is deprecated (in 5.3+ it would match a literal "z"),
    -- so it must be left alone there.
    if _VERSION == "Lua 5.1" then
        escaped = escaped:gsub("%z", "%%z")
    end

    return escaped
end
-- For idgen(). Builds a 62-character pool (0-9, A-Z, a-z), i.e. about 5.95
-- bits of entropy per character. The characters are stored ready-made so
-- that idgen() does not have to convert a byte code on every pick.
local chpool = {}
for i = 48, 57 do chpool[#chpool + 1] = string.char(i) end  -- 0-9
for i = 65, 90 do chpool[#chpool + 1] = string.char(i) end  -- A-Z
for i = 97, 122 do chpool[#chpool + 1] = string.char(i) end  -- a-z

--[[
Generate a random, reader-friendly ID.

A 16-character ID drawn from the 62 symbols of chpool has an entropy of
about 95.3 bits, which should be plenty for a small repository.

NOTE: the characters are picked with math.random(). Lua versions before 5.4
do not seed that generator automatically, so the application is expected to
call math.randomseed() once at start-up; otherwise every run produces the
same sequence of IDs.

@param len (number, optional) Length of the ID. Defaults to
    pkar.config.id.len.
@return (string) The generated ID.
]]
M.idgen = function(len)
    local charlist = {}
    for i = 1, (len or pkar.config.id.len) do
        charlist[i] = chpool[math.random(1, #chpool)]
    end

    return table.concat(charlist)
end
- --[=[
- M.generate_sip_v1 = function(path)
- --[[
- Version 1 CSV parsing.
- Each row has 3 cells: reference path (ref), metadata key (k), value (v).
- Each v cell contains one value.
- Properties for the same ref can be added in successive rows without
- repeating the ref cell.
- Multiple values can be added in successive rows without repeating the ref
- and v cells.
- --]]
- local sub_data = assert(csv.open(path))
- local md = {}
- local prev_ref, prev_k
- -- Collate metadata.
- local i = 1
- for row in sub_data:lines() do
- ref, k, v = table.unpack(row)
- -- nil-out empty cells (they come through as "")
- if ref == "" then ref = nil end
- if k == "" then k = nil end
- if v == "" then v = nil end
- print("Parsing row:", ref, k, v)
- -- v can be a legit false value.
- if ref and not k and v == nil then
- -- This can be a placeholder for ordering purposes.
- md[ref] = md_ref or {}
- goto continue
- elseif v == nil then
- goto continue
- else
- -- If ref or k are missing, reuse the previous one.
- if ref then prev_ref = ref
- else
- if not prev_ref then
- -- If column 1 is empty, it must have been set in a
- -- previous row.
- error(string.format(
- "Reference in column 1, row %d not found!", i), 2)
- end
- ref = prev_ref
- end
- if k then prev_k = k
- else
- if not prev_k then
- -- If column 2 is empty, it must have been set in a
- -- previous row.
- error(string.format(
- "Property key in column 2, row %d not found!", i), 2)
- end
- k = prev_k
- end
- end
- md[ref] = md[ref] or {id = M.idgen(), path = ref, _sort = i}
- md[ref][k] = md[ref][k] or {}
- if k == "type" then
- md[ref][k] = v
- else
- table.insert(md[ref][k], v)
- end
- ::continue::
- i = i + 1
- end
- -- Move md to an ordered list.
- mdlist = {root_path = path:match("(.*/)")}
- for _, v in pairs(md) do table.insert(mdlist, v) end
- table.sort(mdlist, function (a, b) return (a._sort < b._sort) end)
- -- Infer structure from paths and row ordering.
- for i, v in ipairs(mdlist) do
- for j = i + 1, #mdlist do
- --print(string.format("comparing %s : %s", v.path, mdlist[j].path))
- if not v["next"] and
- mdlist[j].path:match("(.*/)") == v.path:match("(.*/)") then
- --print("next match.")
- v["next"] = mdlist[j].path
- end
- if not v.firstChild and
- mdlist[j].path:match("^" .. escape_pattern(v.path)) then
- --print("First child match.")
- v.firstChild = mdlist[j].path
- end
- end
- v._sort = nil
- end
- return mdlist
- end
- --]=]
--[[
Build a submission information package (SIP) from a version 2 CSV file.

The CSV is opened with a header row; each column name becomes a metadata key.
A row with a non-empty `path` cell starts a new resource. A row with an empty
`path` cell adds further values to the resource started by the closest
previous row that had a path.

Keys listed in pkar.config.md.single_values hold a plain value; every other
key holds a set ({[value] = true}). Empty cells are skipped.

After all rows are read, `pas:next` (next sibling) and `pas:firstChild` are
inferred from the paths and the row order.

NOTE(review): the structure inference calls string methods on the `path`
field, so `path` is presumably configured as a single-valued key — confirm.

@param path (string) Path of the CSV file.
@return (table) List of resource tables, plus a `root_path` field holding
    the directory part of `path` including the trailing slash ("" when
    `path` has no directory part).
]]
M.generate_sip_v2 = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    -- A bare file name has no directory part. Fall back to "" so that
    -- root_path can always be concatenated with a resource path.
    local sip = {root_path = path:match("(.*/)") or ""}
    local single_values = pkar.config.md.single_values

    -- Index of the NEXT resource to be created; sip[i - 1] is the resource
    -- that continuation rows add to.
    local i = 1
    for row in sub_data:lines() do
        print("Processing row: " .. i)
        if row["path"] == nil then
            error("CSV row has no `path` column.", 2)
        end
        print("Row path: " .. row["path"])

        if row["path"] ~= "" then
            -- New resource.
            local rsrc = {id = M.idgen()}
            for k, v in pairs(row) do
                if v ~= "" then  -- skip empty strings.
                    if single_values[k] then rsrc[k] = v
                    else rsrc[k] = {[v] = true} end  -- Multi-values are a set.
                end
            end
            sip[i] = rsrc
            i = i + 1
        else
            -- Continuation of values from a previous row.
            if i == 1 then
                error("First row MUST have a path value.", 2)
            end
            local rsrc = sip[i - 1]
            for k, v in pairs(row) do
                if v ~= "" then  -- skip empty strings.
                    if single_values[k] then
                        -- It doesn't make much sense to overwrite, maybe
                        -- throw an error?
                        rsrc[k] = v
                    else
                        print("Value: " .. v)
                        print("Inserting at row " .. i - 1)
                        -- The set does not exist yet if the cell was empty
                        -- in all previous rows of this resource.
                        local set = rsrc[k]
                        if set == nil then
                            set = {}
                            rsrc[k] = set
                        end
                        set[v] = true
                    end
                end
            end
        end
    end
    -- Release the CSV handle if the reader exposes a close method.
    if sub_data.close then sub_data:close() end

    -- Infer structure from paths and row ordering.
    for idx, v in ipairs(sip) do
        local parent = v.path:match("(.*/)")
        -- A child lives under "<path>/". Requiring the separator keeps
        -- "docs.txt" from being taken for a child of "docs".
        local child_pat = "^" .. escape_pattern((v.path:gsub("/+$", "")) .. "/")

        for j = idx + 1, #sip do
            local other = sip[j].path
            --print(string.format("comparing %s : %s", v.path, other))
            if not v["pas:next"] and other:match("(.*/)") == parent then
                --print("next match.")
                v["pas:next"] = other
            end
            if not v["pas:firstChild"] and other:match(child_pat) then
                --print("First child match.")
                v["pas:firstChild"] = other
            end
            -- Both links found: nothing more to learn from later rows.
            if v["pas:next"] and v["pas:firstChild"] then break end
        end
    end

    return sip
end
--[[
Validate a SIP. Not implemented yet: the function does nothing and returns
no value.

@param sip (table) The SIP to validate.
]]
function M.validate(sip)
    -- TODO
end
--[[
Replace the stored metadata of one resource with the contents of `rsrc`.

The resource's graph is named after the IRI "par:<rsrc.id>". Every field of
`rsrc` except `id` becomes one triple per value: table fields are treated as
sets (their keys are the values), any other field is a single value. The
existing graph is removed before the new triples are added.

@param rsrc (table) Resource metadata. Must have an `id` field. The table is
    not modified.
]]
M.update_rsrc_md = function(rsrc)
    -- TODO use a transaction when lsup_lua supports it.
    local triples = {}
    local s = term.new_iriref("par:" .. rsrc.id, pkar.nsm)
    local gr = graph.new(pkar.store, s.data, pkar.nsm)

    for k, v in pairs(rsrc) do
        -- The ID is the subject, not an attribute: exclude it from the
        -- metadata scan without removing it from the caller's table.
        if k ~= "id" then
            print("Adding attribute:", k, v)
            local p = term.new_iriref(k, pkar.nsm)
            if type(v) == "table" then
                for vv in pairs(v) do
                    triples[#triples + 1] = triple.new(s, p, term.new_lit(vv))
                end
            else
                triples[#triples + 1] = triple.new(s, p, term.new_lit(v))
            end
        end
    end

    -- This is a full replacement.
    print("Removing triples.")
    gr:remove()
    print("Adding triples.")
    -- TODO implement lsup_lua fn to add a single triple and add triples in
    -- the previous loop.
    gr:add(triples)
end
--[[
Placeholder with an empty body: does nothing and returns no value.

@param sip (table) Currently unused.
]]
function M.update_md(sip)
end
--[[
Deposit all resources of a SIP.

For each resource, in order:
- If sip.root_path .. rsrc.path is a regular file, it is streamed to a
  temporary file under <ores_path>/tmp/ while being hashed. The resource
  then gets `premis:hasMessageDigest` ("blake2:<checksum>") and `dc:extent`
  (byte count), the file is moved to
  <ores_path>/<checksum[1..4]>/<checksum[5..9]>/<checksum[1..32]>, and
  rsrc.path is set to that location.
- Anything that is not a regular file (e.g. a directory) skips the file
  processing.
- `dc:created` and `dc:modified` are set to the current UTC time and the
  metadata is stored with M.update_rsrc_md().

@param sip (table) SIP list with a `root_path` field. The resource tables
    are modified in place.
]]
M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
        -- TODO Wrap this chunk into a txn. Each row is atomic.
        print(("Processing resource #%d of %d: %s"):format(i, #sip, rsrc.id))

        local in_path = sip.root_path .. rsrc.path
        -- If it's a directory, skip file processing.
        if plpath.isfile(in_path) then
            local tmp_dir = pkar.config.fs.ores_path .. "tmp/"
            local tmp_path = tmp_dir .. rsrc.id
            dir.makepath(tmp_dir)

            -- Binary mode: the payload must be copied and hashed byte for
            -- byte on every platform.
            local ifh = assert(io.open(in_path, "rb"))
            local ofh, open_err = io.open(tmp_path, "wb")
            if not ofh then
                ifh:close()
                error(open_err, 0)
            end

            local hash_it = mc.new_blake2b()
            local chunk_size = pkar.config.fs.stream_chunk_size
            local fsize = 0
            print(("Hashing %s"):format(in_path))
            while true do
                local chunk = ifh:read(chunk_size)
                if not chunk then break end
                hash_it:update(chunk)
                -- Fail loudly on a short write (e.g. disk full) instead of
                -- depositing a truncated file.
                assert(ofh:write(chunk))
                fsize = fsize + #chunk
            end
            local checksum = hash_it:final(true)
            rsrc["premis:hasMessageDigest"] = {["blake2:" .. checksum] = true}
            rsrc["dc:extent"] = fsize

            ofh:close()
            ifh:close()

            local out_dir = ("%s%s/%s/"):format(
                    pkar.config.fs.ores_path,
                    checksum:sub(1, 4),
                    checksum:sub(5, 9))
            local out_path = out_dir .. checksum:sub(1,32)
            rsrc.path = out_path
            dir.makepath(out_dir)
            print(("Moving file %s to %s"):format(tmp_path, rsrc.path))
            dir.movefile(tmp_path, rsrc.path)
        end

        -- %H:%M:%S is the portable spelling of %T, which is not available
        -- on every C library.
        local tstamp = os.date("!%Y-%m-%dT%H:%M:%SZ")
        rsrc["dc:created"] = tstamp
        rsrc["dc:modified"] = tstamp
        M.update_rsrc_md(rsrc)
    end
end
- return M
|