local io = io

local csv = require "csv"
local dir = require "pl.dir"
local libmagic = require "libmagic"
local plpath = require "pl.path"
local pp = require "pl.pretty"

local term = require "volksdata.term"
local triple = require "volksdata.triple"
local graph = require "volksdata.graph"

local pkar = require "pocket_archive"
local model = require "pocket_archive.model"
local mc = require "pocket_archive.monocypher"
local transformers = require "pocket_archive.transformers"
local validator = require "pocket_archive.validator"

local logger = pkar.logger
local dbg = require "debugger"

-- "nil" table - for missing key fallback in chaining.
local NT = {}

local M = {} -- Submission module


--[[
  Escape every Lua pattern magic character in `s` so that the result can be
  embedded in a pattern and match `s` literally.

  Adapted from lua-núcleo.
--]]
local function escape_pattern(s)
    local matches = {
        ["^"] = "%^";
        ["$"] = "%$";
        ["("] = "%(";
        [")"] = "%)";
        ["%"] = "%%";
        ["."] = "%.";
        ["["] = "%[";
        ["]"] = "%]";
        ["*"] = "%*";
        ["+"] = "%+";
        ["-"] = "%-";
        ["?"] = "%?";
        ["\0"] = "%z";
    }

    -- Characters without an entry in `matches` are left untouched by gsub.
    -- The extra parentheses drop gsub's second return value (the count).
    return (s:gsub(".", matches))
end


--[[
  Only generate a thumbnail for pas:File types. Non-file resources may be
  assigned a thumbnail from a contained file or from a stock type icon in the
  metadata population phase.
--]]
--[=[
local function generate_thumbnail(rsrc, sip_root, tn_dir)
    local mconf = model.models[rsrc["pas:contentType"]]
    if not mconf.types["pas:File"] then return end

    local txconf = (mconf.transformers or NT).thumbnail or {fn = "type_icon"}
    local src = plpath.join(sip_root, rsrc["pas:sourcePath"])
    local dest_fname = rsrc.id:gsub("^par:", "")
    local ext = txconf.ext or plpath.extension(src)
    local dest = plpath.join(tn_dir, dest_fname .. ext)
    assert(transformers[txconf.fn](
            src, dest, table.unpack(txconf or NT)))
    local deliverable = dest:gsub(pkar.config.htmlgen.out_dir, "..")
    logger:debug("thumbnail: ", dest)

    return dest
end
--]=]


-- Initialize libmagic database.
local magic = libmagic.open(
        libmagic.MIME_TYPE, libmagic.NO_CHECK_COMPRESS)
assert(magic:load())


-- For idgen(). Makes a 62-character pool (10 digits + 26 upper-case +
-- 26 lower-case letters) with ~5.95 bits of entropy per char.
local chpool = {}
for i = 48, 57 do table.insert(chpool, i) end   -- 0-9
for i = 65, 90 do table.insert(chpool, i) end   -- A-Z
for i = 97, 122 do table.insert(chpool, i) end  -- a-z


--[[
  Generate a random, reader-friendly ID.

  A 16-character ID with the above defined #chpool of 62 symbols has an
  entropy of about 95.3 bits, which should be plenty for a small repository.

  @param len (optional) Number of characters to generate. Defaults to
    pkar.config.id.len.
  @return String of `len` characters drawn from chpool via math.random.
]]
M.idgen = function(len)
    local charlist = {}
    for _ = 1, (len or pkar.config.id.len) do
        table.insert(charlist, string.char(chpool[math.random(1, #chpool)]))
    end

    return table.concat(charlist)
end


--[[
  Build a SIP table from a submission CSV file.

  The CSV is read with a header row. A row with a non-empty "pas:sourcePath"
  starts a new resource; a row with an empty "pas:sourcePath" adds values to
  the multi-valued fields of the resource started by the closest preceding
  row. After all rows are read, "pas:next" and "pas:first" links are inferred
  from source paths and row order for the content models that declare those
  properties.

  @param path Path of the CSV file.
  @return Table with a `root_path` key (directory portion of `path`, with
    trailing slash, or "" if `path` has no directory portion) and one
    resource table per array index. Each resource has an `id`, single-valued
    fields as plain values and multi-valued fields as sets
    ({[value] = true}).
--]]
M.generate_sip = function(path)
    local sub_data = assert(csv.open(path, {header = true}))
    -- A bare file name has no directory portion: fall back to an empty
    -- prefix (i.e. the working directory) rather than passing nil to
    -- plpath.join below.
    local sip = {root_path = path:match("(.*/)") or ""}

    local tn_dir = plpath.join(sip.root_path, "proc", "tn")
    dir.makepath(tn_dir)

    local prev_path
    local i = 0
    local row_n = 2  -- Skipping header row.
    for row in sub_data:lines() do
        logger:debug("Row path: ", row["pas:sourcePath"])
        logger:debug("Parsing row:", pp.write(row))
        if row["pas:sourcePath"] ~= "" then
            i = i + 1
            logger:info(
                    ("Processing LL resource #%d at row #%d.")
                    :format(i, row_n))
            prev_path = row["pas:sourcePath"]

            -- New row.
            local id
            -- An empty cell is read as "", which is truthy in Lua: treat it
            -- like a missing ID, otherwise the resource ID would be the
            -- bare prefix "par:".
            if row.id and row.id ~= "" then
                id = "par:" .. row.id
                row.id = nil
            else
                id = "par:" .. M.idgen()
            end
            sip[i] = {id = id}
            for k, v in pairs(row) do
                if v == "" then goto cont1 end  -- skip empty strings.
                if pkar.config.md.single_values[k] then
                    sip[i][k] = v
                else
                    sip[i][k] = {[v] = true}  -- Multi-values are a set.
                end
                ::cont1::
            end
            --[[
            -- Generate thumbnail for files.
            local rsrc_path = plpath.join(
                    sip.root_path, sip[i]["pas:sourcePath"])
            if plpath.isfile(rsrc_path) then
                --require "debugger"()
                sip[i]["pas:thumbnail"] = generate_thumbnail(
                        sip[i], sip.root_path, tn_dir)
            end
            --]]
        else
            -- Continuation of values from a previous row.
            if i < 1 then
                error("First row MUST have a path value.", 2)
            elseif not prev_path then
                error(("No path information at row %d"):format(row_n), 2)
            else
                for k, v in pairs(row) do
                    if v == "" then goto cont2 end  -- skip empty strings.
                    if pkar.config.md.single_values[k] then
                        -- It doesn't make much sense to overwrite, maybe
                        -- throw an error?
                        error(
                                ("On CSV row #%d: field %s is single-valued.")
                                :format(row_n, k))
                    else
                        logger:debug("Value: ", v)
                        logger:debug("Inserting at row ", i - 1)
                        -- The field may have been empty in the row that
                        -- started this resource, in which case no set
                        -- exists yet.
                        local values = sip[i][k]
                        if values == nil then
                            values = {}
                            sip[i][k] = values
                        end
                        values[v] = true
                    end
                    ::cont2::
                end
                row["pas:sourcePath"] = prev_path
            end
        end
        row_n = row_n + 1
    end

    -- Infer structure from paths and row ordering.
    for i, v in ipairs(sip) do
        local rmod = model.parse_model(v["pas:contentType"])
        if rmod.properties["pas:next"] then
            -- First following resource located in the same directory.
            for j = i + 1, #sip do
                if not v["pas:next"] and
                        sip[j]["pas:sourcePath"]:match("(.*/)") ==
                        v["pas:sourcePath"]:match("(.*/)") then
                    v["pas:next"] = sip[j].id
                end
            end
        end
        if rmod.properties["pas:first"] then
            -- First following resource whose path starts with this
            -- resource's path.
            for j = i + 1, #sip do
                if not v["pas:first"] and
                        sip[j]["pas:sourcePath"]:match(
                            "^" .. escape_pattern(v["pas:sourcePath"])
                        ) then
                    v["pas:first"] = sip[j].id
                end
            end
        end
    end
    --require "debugger"()

    return sip
end


--[[
  Convert a SIP resource table to an in-memory Volksdata graph.

  Every key of `rsrc` except `id` becomes a predicate; every value (or every
  member of a value set) becomes an object whose term type depends on the
  property type declared in the resource's content model. One rdf:type
  triple is added for each entry of the model lineage.

  @param rsrc Resource table, as found in a SIP.
  @return The new graph, and the subject term built from `rsrc.id`.
--]]
M.rsrc_to_graph = function(rsrc)
    local rmod = model.parse_model(rsrc["pas:contentType"])
    logger:info("Updating resource md: ", pp.write(rsrc))

    local s = term.new_iriref_ns(rsrc.id)
    -- Must be local: an undeclared `triples` would leak into (and be shared
    -- through) the global environment.
    local triples = {}
    for k, v in pairs(rsrc) do
        -- id is the subject, it won't be an attribute.
        if k == "id" then goto skip end

        logger:debug(("Adding attribute: %s = %s"):format(k, pp.write(v)))
        local p = term.new_iriref_ns(k)
        local o
        local datatype = ((rmod.properties or NT)[k] or NT).type
        local rdf_type_str = pkar.config.md.datatypes[datatype]
        local rdf_type
        if rdf_type_str then
            rdf_type = term.new_iriref_ns(rdf_type_str).data
        end

        -- Force all fields to be multi-valued.
        if type(v) ~= "table" then v = {[v] = true} end

        for vv in pairs(v) do
            if k == "pas:contentType" then
                vv = "pas:" .. vv
            end
            if datatype == "resource" then
                o = term.new_iriref_ns(vv)
            elseif datatype == "ext_resource" then
                o = term.new_iriref(vv)
            else
                o = term.new_lit(vv, rdf_type)
            end
            table.insert(triples, triple.new(s, p, o))
        end
        ::skip::
    end
    for _, m in ipairs(rmod.lineage) do
        table.insert(
                triples,
                triple.new(s, pkar.RDF_TYPE, term.new_iriref_ns(m)))
    end

    local gr = graph.new(nil)

    -- This is a full replacement.
    --require "debugger"()
    logger:info("Removing triples.")
    gr:remove(s)
    logger:info("Adding triples.")
    -- TODO implement volksdata_lua fn to add a single triple and add triples
    -- in the previous loop.
    gr:add(triples)

    return gr, s
end


--[[
  Validate a temporary graph and copy it into the persistent store.

  @param tmp_gr Graph holding the resource triples.
  @param s Subject term of the resource, passed on to the validator.
  @return The result of copying `tmp_gr` into the store-backed graph.
  Raises an error if the validation report's maximum level is "ERROR";
  "WARN" and "NOTICE" reports are logged as warnings.
--]]
M.store_updates = function(tmp_gr, s)
    -- TODO use a transaction when volksdata_lua supports it.
    logger:debug("Graph: ", tmp_gr:encode("ttl"))

    local val_report = validator.validate(tmp_gr, s)
    if val_report.max_level == "ERROR" then
        error(
                "Validation raised errors: " .. pp.write(val_report))
    elseif val_report.max_level == "WARN" then
        logger:warn(
                "Validation raised warnings: " .. pp.write(val_report))
    elseif val_report.max_level == "NOTICE" then
        logger:warn(
                "Validation raised notices: " .. pp.write(val_report))
    end

    local stored_gr = graph.new(pkar.store, term.DEFAULT_CTX)

    return tmp_gr:copy(stored_gr)
end


--[[
  Deposit all the resources of a SIP.

  For each resource backed by a regular file, the file is streamed to a
  temporary location while its BLAKE2b checksum and size are computed, then
  moved to a checksum-derived path under pkar.config.fs.ores_path. File
  metadata (dc:format, premis:hasMessageDigest, dc:extent, pas:path) and
  timestamps are set on the resource, which is then converted to a graph,
  validated and stored. Finally the "proc" directory under the SIP root is
  removed.

  @param sip SIP table as generated by M.generate_sip. Its resource tables
    are modified in place.
--]]
M.deposit = function(sip)
    for i, rsrc in ipairs(sip) do
        -- TODO Wrap this chunk into a txn. Each row is atomic.
        logger:debug(("Processing resource #%d of %d: %s"):format(
                i, #sip, rsrc.id))

        local in_path = sip.root_path .. rsrc["pas:sourcePath"]
        local fext = plpath.extension(in_path)
        -- If it's a directory, skip file processing.
        if not plpath.isfile(in_path) then goto continue end

        do
            local tmp_dir = plpath.join(pkar.config.fs.ores_path, "tmp/")
            local tmp_path = tmp_dir .. rsrc.id .. fext
            dir.makepath(tmp_dir)

            -- Binary mode: the copy and the checksum must be byte-exact on
            -- platforms that translate line endings in text mode.
            local ifh = assert(io.open(in_path, "rb"))

            rsrc["dc:format"] = {[magic:filehandle(ifh)] = true}

            -- Copy file and calculate checksum.
            local hash_it = mc.new_blake2b()
            local fsize = 0
            logger:debug("Hashing ", in_path)
            local ofh = assert(io.open(tmp_path, "wb"))
            while true do
                local chunk = ifh:read(pkar.config.fs.stream_chunk_size)
                if not chunk then break end
                hash_it:update(chunk)
                -- Fail loudly on a short write rather than depositing a
                -- truncated copy.
                assert(ofh:write(chunk))
                fsize = fsize + #chunk
            end
            local checksum = hash_it:final(true)
            rsrc["premis:hasMessageDigest"] = {
                    ["urn:blake2:" .. checksum] = true}
            rsrc["dc:extent"] = fsize

            ofh:close()
            ifh:close()

            -- Move the copy to its final, checksum-derived location.
            local out_dir, out_path
            out_dir = plpath.join(
                    pkar.config.fs.ores_path,
                    checksum:sub(1, 2),
                    checksum:sub(3, 4))
            out_path = plpath.join(out_dir, checksum:sub(1,32) .. fext)
            dir.makepath(out_dir)
            logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
            -- NOTE(review): the result of dir.movefile is not checked here;
            -- confirm whether a failed move should abort the deposit.
            dir.movefile(tmp_path, out_path)
            rsrc["pas:path"] = out_path

            -- Copy thumbnail if existing.
            if rsrc["pas:thumbnail"] then
                local src_path = rsrc["pas:thumbnail"]
                out_path = plpath.join(
                        out_dir, plpath.basename(src_path))
                logger:debug(("Moving file %s to %s"):format(src_path, out_path))
                dir.movefile(src_path, out_path)
                rsrc["pas:thumbnail"] = out_path
            end
        end

        ::continue::

        -- UTC timestamp.
        local tstamp = os.date("!%Y-%m-%dT%TZ")
        rsrc["dc:created"] = tstamp
        rsrc["dc:modified"] = tstamp

        M.store_updates(M.rsrc_to_graph(rsrc))
    end

    -- Remove processing directory.
    dir.rmtree(plpath.join(sip.root_path, "proc"))
end


return M