-- Standard and third-party libraries.
local io = io
local csv = require "csv"
local dir = require "pl.dir"
local libmagic = require "libmagic"
local plpath = require "pl.path"
local pp = require "pl.pretty"

-- Volksdata RDF store bindings.
local term = require "volksdata.term"
local triple = require "volksdata.triple"
local graph = require "volksdata.graph"

-- Project modules.
local pkar = require "pocket_archive"
local model = require "pocket_archive.model"
local mc = require "pocket_archive.monocypher"
local transformers = require "pocket_archive.transformers"
local validator = require "pocket_archive.validator"

local logger = pkar.logger
local dbg = require "debugger"

-- "nil" table - for missing key fallback in chaining.
local NT = {}

local M = {} -- Submission module
-- Adapted from lua-núcleo
-- Map of Lua pattern magic characters to their escaped forms.
-- Hoisted to module scope: the original rebuilt this table on every call.
local PATTERN_ESCAPES = {
    ["^"] = "%^";
    ["$"] = "%$";
    ["("] = "%(";
    [")"] = "%)";
    ["%"] = "%%";
    ["."] = "%.";
    ["["] = "%[";
    ["]"] = "%]";
    ["*"] = "%*";
    ["+"] = "%+";
    ["-"] = "%-";
    ["?"] = "%?";
    ["\0"] = "%z";
}

--- Escape a string so it matches literally when used as a Lua pattern.
-- @tparam string s Arbitrary string.
-- @treturn string `s` with all pattern magic characters %-escaped.
local function escape_pattern(s)
    -- Parentheses truncate gsub's second return value (the match count).
    return (s:gsub(".", PATTERN_ESCAPES))
end
- --[[
- Only generate a thumbnail for pas:File types.
- Non-file resources may be assigned a thumbnail from a contained file
- or from a stock type icon in the metadata population phase.
- --]]
- --[=[
- local function generate_thumbnail(rsrc, sip_root, tn_dir)
- local mconf = model.models[rsrc["pas:contentType"]]
- if not mconf.types["pas:File"] then return end
- local txconf = (mconf.transformers or NT).thumbnail or {fn = "type_icon"}
- local src = plpath.join(sip_root, rsrc["pas:sourcePath"])
- local dest_fname = rsrc.id:gsub("^par:", "")
- local ext = txconf.ext or plpath.extension(src)
- local dest = plpath.join(tn_dir, dest_fname .. ext)
- assert(transformers[txconf.fn](
- src, dest, table.unpack(txconf or NT)))
- local deliverable = dest:gsub(pkar.config.htmlgen.out_dir, "..")
- logger:debug("thumbnail: ", dest)
- return dest
- end
- --]=]
-- Initialize libmagic database.
local magic = libmagic.open(libmagic.MIME_TYPE, libmagic.NO_CHECK_COMPRESS)
assert(magic:load())

-- For idgen(). Builds a pool of 62 alphanumeric code points
-- (0-9, A-Z, a-z), giving ~5.95 bits of entropy per character.
local chpool = {}
for _, range in ipairs({{48, 57}, {65, 90}, {97, 122}}) do
    for cp = range[1], range[2] do chpool[#chpool + 1] = cp end
end
--[[
Generate a random, reader-friendly ID.

With the 62-symbol pool defined above, a 16-character ID carries roughly
95 bits of entropy, which should be plenty for a small repository.
]]
M.idgen = function(len)
    local buf = {}
    for _ = 1, (len or pkar.config.id.len) do
        buf[#buf + 1] = string.char(chpool[math.random(#chpool)])
    end
    return table.concat(buf)
end
--- Parse a submission CSV into a SIP (Submission Information Package) table.
-- Each CSV row with a non-empty "pas:sourcePath" starts a new resource;
-- a row with an empty path continues the previous resource, adding values
-- to its multi-valued fields. After parsing, "pas:next" (next sibling in
-- the same directory) and "pas:first" (first contained resource) links are
-- inferred from source paths and row ordering.
-- @tparam string path Path to the submission CSV file.
-- @treturn table Array of resource tables, plus a `root_path` field
--   (directory portion of `path`, trailing slash included).
M.generate_sip = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    -- NOTE(review): root_path is nil if `path` has no directory component;
    -- the plpath.join below would then error. Confirm callers always pass
    -- a path with a slash.
    local sip = {root_path = path:match("(.*/)")}
    local tn_dir = plpath.join(sip.root_path, "proc", "tn")
    dir.makepath(tn_dir)

    local prev_path
    local i = 0          -- Index of the current resource in `sip`.
    local row_n = 2 -- Skipping header row.
    for row in sub_data:lines() do
        logger:debug("Row path: ", row["pas:sourcePath"])
        logger:debug("Parsing row:", pp.write(row))
        if row["pas:sourcePath"] ~= "" then
            i = i + 1
            logger:info(
                    ("Processing LL resource #%d at row #%d.")
                    :format(i, row_n))
            prev_path = row["pas:sourcePath"]
            -- New row.
            -- Use the submitter-provided id if present, otherwise mint one.
            local id
            if row.id then
                id = "par:" .. row.id
                row.id = nil
            else id = "par:" .. M.idgen() end
            sip[i] = {id = id}
            for k, v in pairs(row) do
                if v == "" then goto cont1 end -- skip empty strings.
                if pkar.config.md.single_values[k] then sip[i][k] = v
                else sip[i][k] = {[v] = true} end -- Multi-values are a set.
                ::cont1::
            end
            --[[
            -- Generate thumbnail for files.
            local rsrc_path = plpath.join(
                    sip.root_path, sip[i]["pas:sourcePath"])
            if plpath.isfile(rsrc_path) then
                --require "debugger"()
                sip[i]["pas:thumbnail"] = generate_thumbnail(
                        sip[i], sip.root_path, tn_dir)
            end
            --]]
        else
            -- Continuation of values from a previous row.
            if i < 1 then
                error("First row MUST have a path value.", 2)
            elseif not prev_path then
                -- NOTE(review): this message reports the resource index `i`,
                -- not the CSV row number `row_n` — confirm which is intended.
                error(("No path information at row %d"):format(i), 2)
            else
                for k, v in pairs(row) do
                    if v == "" then goto cont2 end -- skip empty strings.
                    if pkar.config.md.single_values[k] then
                        -- It doesn't make much sense to overwrite, maybe throw an error?
                        error(
                                ("On CSV row #%d: field %s is single-valued.")
                                :format(row_n, k))
                    else
                        logger:debug("Value: ", v)
                        logger:debug("Inserting at row ", i - 1)
                        -- NOTE(review): assumes key `k` already exists on the
                        -- resource; a continuation row introducing a new
                        -- multi-valued key would index a nil table here.
                        sip[i][k][v] = true
                    end
                    ::cont2::
                end
                -- NOTE(review): `row` is not read again after this point in
                -- the visible code; assignment appears to have no effect.
                row["pas:sourcePath"] = prev_path
            end
        end
        row_n = row_n + 1
    end

    -- Infer structure from paths and row ordering.
    for i, v in ipairs(sip) do
        local rmod = model.parse_model(v["pas:contentType"])
        -- "pas:next": first later resource whose source path sits in the
        -- same directory as this one.
        if rmod.properties["pas:next"] then
            for j = i + 1, #sip do
                if not v["pas:next"] and
                        sip[j]["pas:sourcePath"]:match("(.*/)") ==
                        v["pas:sourcePath"]:match("(.*/)") then
                    v["pas:next"] = sip[j].id
                end
            end
        end
        -- "pas:first": first later resource whose source path is prefixed
        -- by this resource's path (i.e. its first contained resource).
        if rmod.properties["pas:first"] then
            for j = i + 1, #sip do
                if not v["pas:first"] and
                        sip[j]["pas:sourcePath"]:match(
                            "^" .. escape_pattern(v["pas:sourcePath"])
                        ) then
                    v["pas:first"] = sip[j].id
                end
            end
        end
    end
    --require "debugger"()

    return sip
end
--[[ Convert a SIP resource table to an in-memory Volksdata graph.

Returns the graph and the subject IRI term for the resource.
--]]
M.rsrc_to_graph = function(rsrc)
    local rmod = model.parse_model(rsrc["pas:contentType"])
    logger:info("Updating resource md: ", pp.write(rsrc))
    local s = term.new_iriref_ns(rsrc.id)
    -- FIX: was an accidental global; keep the triple buffer local.
    local triples = {}
    for k, v in pairs(rsrc) do
        -- id is the subject, it won't be an attribute.
        if k == "id" then goto skip end
        logger:debug(("Adding attribute: %s = %s"):format(k, pp.write(v)))
        local p = term.new_iriref_ns(k)
        local o
        -- Datatype declared for this property in the model config, if any.
        local datatype = ((rmod.properties or NT)[k] or NT).type
        local rdf_type_str = pkar.config.md.datatypes[datatype]
        local rdf_type
        if rdf_type_str then
            rdf_type = term.new_iriref_ns(rdf_type_str).data
        end
        -- Force all fields to be multi-valued (a set keyed by value).
        if type(v) ~= "table" then v = {[v] = true} end
        for vv in pairs(v) do
            if k == "pas:contentType" then
                vv = "pas:" .. vv
            end
            if datatype == "resource" then
                o = term.new_iriref_ns(vv)
            elseif datatype == "ext_resource" then
                o = term.new_iriref(vv)
            else o = term.new_lit(vv, rdf_type) end
            table.insert(triples, triple.new(s, p, o))
        end
        ::skip::
    end
    -- Assert every rdf:type in the model lineage.
    for _, m in ipairs(rmod.lineage) do
        table.insert(
                triples, triple.new(s, pkar.RDF_TYPE, term.new_iriref_ns(m)))
    end
    local gr = graph.new(nil)
    -- This is a full replacement.
    logger:info("Removing triples.")
    gr:remove(s)
    logger:info("Adding triples.")
    -- TODO implement volksdata_lua fn to add a single triple and add triples
    -- in the previous loop.
    gr:add(triples)
    return gr, s
end
--- Validate a resource graph and, if acceptable, persist it to the store.
-- Raises on validation errors; logs warnings and notices.
-- @param tmp_gr In-memory graph holding the resource's triples.
-- @param s Subject term of the resource being validated.
-- @return The result of copying `tmp_gr` into the permanent store graph.
M.store_updates = function(tmp_gr, s)
    -- TODO use a transaction when volksdata_lua supports it.
    logger:debug("Graph: ", tmp_gr:encode("ttl"))
    local report = validator.validate(tmp_gr, s)
    local level = report.max_level
    if level == "ERROR" then
        error("Validation raised errors: " .. pp.write(report))
    elseif level == "WARN" then
        logger:warn("Validation raised warnings: " .. pp.write(report))
    elseif level == "NOTICE" then
        logger:warn("Validation raised notices: " .. pp.write(report))
    end
    local stored_gr = graph.new(pkar.store, term.DEFAULT_CTX)
    return tmp_gr:copy(stored_gr)
end
--- Deposit a SIP: ingest files into storage and persist resource metadata.
-- For each resource with a source file, the file is streamed into a temp
-- location while a BLAKE2b checksum is computed, MIME type and size are
-- recorded on the resource, and the file is moved into content-addressed
-- storage sharded by the checksum's leading byte pairs. Metadata is then
-- validated and stored. Finally the SIP's "proc" directory is removed.
-- @tparam table sip SIP table as returned by M.generate_sip.
M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
        -- TODO Wrap this chunk into a txn. Each row is atomic.
        logger:debug(("Processing resource #%d of %d: %s"):format(
                i, #sip, rsrc.id))
        local in_path = sip.root_path .. rsrc["pas:sourcePath"]
        local fext = plpath.extension(in_path)
        -- If it's a directory, skip file processing.
        if not plpath.isfile(in_path) then goto continue end
        do
            local tmp_dir = plpath.join(pkar.config.fs.ores_path, "tmp/")
            -- FIX: `_` leaked to the global scope; declare both locals.
            local _, file_ext = plpath.splitext(in_path)
            local tmp_path = tmp_dir .. rsrc.id .. file_ext
            dir.makepath(tmp_dir)
            local ifh = assert(io.open(in_path, "r"))
            -- MIME type detected from the open handle via libmagic.
            rsrc["dc:format"] = {[magic:filehandle(ifh)] = true}
            local hash_it = mc.new_blake2b()
            local fsize = 0
            logger:debug("Hashing ", in_path)
            local ofh = assert(io.open(tmp_path, "w"))
            -- Stream-copy in chunks, hashing and sizing as we go.
            while true do
                -- FIX: `chunk` was an accidental global.
                local chunk = ifh:read(pkar.config.fs.stream_chunk_size)
                if not chunk then break end
                hash_it:update(chunk)
                ofh:write(chunk)
                fsize = fsize + #chunk
            end
            local checksum = hash_it:final(true)
            rsrc["premis:hasMessageDigest"] = {
                    ["urn:blake2:" .. checksum] = true}
            rsrc["dc:extent"] = fsize
            ofh:close()
            ifh:close()
            -- Move file into content-addressed storage, sharded by the
            -- first two checksum byte pairs.
            local out_dir = plpath.join(
                    pkar.config.fs.ores_path,
                    checksum:sub(1, 2),
                    checksum:sub(3, 4))
            local out_path = plpath.join(out_dir, checksum:sub(1, 32) .. fext)
            dir.makepath(out_dir)
            logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
            dir.movefile(tmp_path, out_path)
            rsrc["pas:path"] = out_path
            -- Copy thumbnail if existing.
            if rsrc["pas:thumbnail"] then
                -- FIX: `src_path` was an accidental global.
                local src_path = rsrc["pas:thumbnail"]
                out_path = plpath.join(
                        out_dir, plpath.basename(src_path))
                logger:debug(("Moving file %s to %s"):format(src_path, out_path))
                dir.movefile(src_path, out_path)
                rsrc["pas:thumbnail"] = out_path
            end
        end
        ::continue::
        -- FIX: `tstamp` was an accidental global. Declared after the label,
        -- which is legal for the `goto` above (the jump precedes its scope).
        local tstamp = os.date("!%Y-%m-%dT%TZ")
        rsrc["dc:created"] = tstamp
        rsrc["dc:modified"] = tstamp
        M.store_updates(M.rsrc_to_graph(rsrc))
    end
    -- Remove processing directory.
    dir.rmtree(plpath.join(sip.root_path, "proc"))
end
- return M