|
@@ -1,9 +1,30 @@
|
|
|
|
+--[[ Deposit module.
|
|
|
|
+
|
|
|
|
+This module takes care of the complete deposit process (except for the back
|
|
|
|
+end storage, which is called here but defined in the repo module).
|
|
|
|
+
|
|
|
|
+The deposit process is carried out in several steps:
|
|
|
|
+
|
|
|
|
+- SIP generation (`generate_sip()`): scans the laundry list CSV and builds a
|
|
|
|
+ temporary data structure with the found metadata; generates unique IDs for
|
|
|
|
+ resources; infers some implicit relationships from the position of the CSV
|
|
|
|
+ rows and folder layout; adds system-controlled metadata.
|
|
|
|
+- File staging (`deposit()`): scans through the generated SIP, identifies the
|
|
|
|
+ files, calculates their checksums, and moves them to temporary storage; adds
|
|
|
|
+ checksums to the metadata. TODO allow user-provided metadata and validation
|
|
|
|
+- Graph generation: generates an RDF graph for each resource in the SIP.
|
|
|
|
+- Permanent storage: pushes the RDF graph to permanent store (via functions in
|
|
|
|
+ the `repo` module), which includes content model validation; if this
|
|
|
|
+ succeeds, related files are also moved from the staging area to the archival
|
|
|
|
+ store.
|
|
|
|
+
|
|
|
|
+--]]
|
|
local io = io
|
|
local io = io
|
|
|
|
|
|
local csv = require "ftcsv"
|
|
local csv = require "ftcsv"
|
|
local dir = require "pl.dir"
|
|
local dir = require "pl.dir"
|
|
local libmagic = require "libmagic"
|
|
local libmagic = require "libmagic"
|
|
-local plpath = require "pl.path"
|
|
|
|
|
|
+local path = require "pl.path"
|
|
local pp = require "pl.pretty"
|
|
local pp = require "pl.pretty"
|
|
|
|
|
|
local term = require "volksdata.term"
|
|
local term = require "volksdata.term"
|
|
@@ -15,7 +36,6 @@ local model = require "pocket_archive.model"
|
|
local mc = require "pocket_archive.monocypher"
|
|
local mc = require "pocket_archive.monocypher"
|
|
local repo = require "pocket_archive.repo"
|
|
local repo = require "pocket_archive.repo"
|
|
local transformers = require "pocket_archive.transformers"
|
|
local transformers = require "pocket_archive.transformers"
|
|
-local validator = require "pocket_archive.validator"
|
|
|
|
|
|
|
|
local logger = pkar.logger
|
|
local logger = pkar.logger
|
|
local dbg = require "debugger"
|
|
local dbg = require "debugger"
|
|
@@ -57,17 +77,27 @@ M.idgen = function(len)
|
|
end
|
|
end
|
|
|
|
|
|
|
|
|
|
-M.generate_sip = function(path)
|
|
|
|
- local sip = {root_path = path:match("(.*/)")}
|
|
|
|
|
|
+M.generate_sip = function(src_path)
|
|
|
|
+ local sip = {root_path = src_path:match("(.*/)")}
|
|
|
|
+ local src_dir = path.dirname(src_path)
|
|
path_to_uri = {}
|
|
path_to_uri = {}
|
|
|
|
|
|
- local tn_dir = plpath.join(sip.root_path, "proc", "tn")
|
|
|
|
|
|
+ local tn_dir = path.join(sip.root_path, "proc", "tn")
|
|
dir.makepath(tn_dir)
|
|
dir.makepath(tn_dir)
|
|
|
|
|
|
local prev_path
|
|
local prev_path
|
|
|
|
|
|
local i = 0
|
|
local i = 0
|
|
- for row_n, row in csv.parseLine(path) do
|
|
|
|
|
|
+ for row_n, row in csv.parseLine(src_path) do
|
|
|
|
+ local has_content
|
|
|
|
+ for k, v in pairs(row) do
|
|
|
|
+ -- Change "" to nil.
|
|
|
|
+ if v == "" then row[k] = nil
|
|
|
|
+ else has_content = true end
|
|
|
|
+ end
|
|
|
|
+ -- Skip empty lines.
|
|
|
|
+ if not has_content then goto skip end
|
|
|
|
+
|
|
logger:debug("Row path: ", row.source_path)
|
|
logger:debug("Row path: ", row.source_path)
|
|
logger:debug("Parsing row:", pp.write(row))
|
|
logger:debug("Parsing row:", pp.write(row))
|
|
if #row.source_path > 0 then
|
|
if #row.source_path > 0 then
|
|
@@ -81,7 +111,7 @@ M.generate_sip = function(path)
|
|
-- Add to path to URI map for later referencing.
|
|
-- Add to path to URI map for later referencing.
|
|
path_to_uri[row.source_path] = sip[i].id
|
|
path_to_uri[row.source_path] = sip[i].id
|
|
for k, v in pairs(row) do
|
|
for k, v in pairs(row) do
|
|
- if v == "" then goto cont1 end -- skip empty strings.
|
|
|
|
|
|
+ if not v then goto cont1 end -- skip empty strings.
|
|
if pkar.config.md.single_values[k] then sip[i][k] = v
|
|
if pkar.config.md.single_values[k] then sip[i][k] = v
|
|
-- Multi-values are ordered in the SIP for further processing.
|
|
-- Multi-values are ordered in the SIP for further processing.
|
|
else sip[i][k] = {v} end
|
|
else sip[i][k] = {v} end
|
|
@@ -90,9 +120,9 @@ M.generate_sip = function(path)
|
|
|
|
|
|
--[[
|
|
--[[
|
|
-- Generate thumbnail for files.
|
|
-- Generate thumbnail for files.
|
|
- local rsrc_path = plpath.join(
|
|
|
|
|
|
+ local rsrc_path = path.join(
|
|
sip.root_path, sip[i].source_path)
|
|
sip.root_path, sip[i].source_path)
|
|
- if plpath.isfile(rsrc_path) then
|
|
|
|
|
|
+ if path.isfile(rsrc_path) then
|
|
--require "debugger"()
|
|
--require "debugger"()
|
|
sip[i].thumbnail = generate_thumbnail(
|
|
sip[i].thumbnail = generate_thumbnail(
|
|
sip[i], sip.root_path, tn_dir)
|
|
sip[i], sip.root_path, tn_dir)
|
|
@@ -106,7 +136,7 @@ M.generate_sip = function(path)
|
|
error(("No path information at row %d"):format(i), 2)
|
|
error(("No path information at row %d"):format(i), 2)
|
|
else
|
|
else
|
|
for k, v in pairs(row) do
|
|
for k, v in pairs(row) do
|
|
- if v == "" then goto cont2 end -- skip empty strings.
|
|
|
|
|
|
+ if not v then goto cont2 end -- skip empty strings.
|
|
if pkar.config.md.single_values[k] then
|
|
if pkar.config.md.single_values[k] then
|
|
-- It doesn't make much sense to overwrite, maybe throw an error?
|
|
-- It doesn't make much sense to overwrite, maybe throw an error?
|
|
error(
|
|
error(
|
|
@@ -122,13 +152,31 @@ M.generate_sip = function(path)
|
|
row.source_path = prev_path
|
|
row.source_path = prev_path
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
+ ::skip::
|
|
row_n = row_n + 1
|
|
row_n = row_n + 1
|
|
end
|
|
end
|
|
-- Infer structure from paths and row ordering.
|
|
-- Infer structure from paths and row ordering.
|
|
for i, v in ipairs(sip) do
|
|
for i, v in ipairs(sip) do
|
|
local rmod = model.types[v.content_type]
|
|
local rmod = model.types[v.content_type]
|
|
|
|
+ dbg.assert(v.source_path)
|
|
|
|
+ local fpath = path.join(src_dir, v.source_path)
|
|
--dbg.assert(rmod)
|
|
--dbg.assert(rmod)
|
|
v.has_member = v.has_member or {}
|
|
v.has_member = v.has_member or {}
|
|
|
|
+ -- Create implicit members from single-file artifact.
|
|
|
|
+ if rmod.types.artifact and path.isfile(fpath) then
|
|
|
|
+ local file_id = "par:" .. M.idgen()
|
|
|
|
+ -- Insert file resource and move it into a new sub-folder.
|
|
|
|
+ table.insert(sip, {
|
|
|
|
+ content_type = rmod.default_fmodel or "file",
|
|
|
|
+ id = file_id,
|
|
|
|
+ label = path.basename(v.source_path),
|
|
|
|
+ source_path = v.source_path,
|
|
|
|
+ })
|
|
|
|
+ sip[i].has_file = file_id
|
|
|
|
+ sip[i].pref_rep = file_id
|
|
|
|
+ sip[i].source_path = nil
|
|
|
|
+ goto skip
|
|
|
|
+ end
|
|
for j = i + 1, #sip do
|
|
for j = i + 1, #sip do
|
|
if sip[j].source_path:match(
|
|
if sip[j].source_path:match(
|
|
"^" .. pkar.escape_pattern(v.source_path))
|
|
"^" .. pkar.escape_pattern(v.source_path))
|
|
@@ -142,6 +190,7 @@ M.generate_sip = function(path)
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|
|
|
|
+ ::skip::
|
|
end
|
|
end
|
|
|
|
|
|
return sip
|
|
return sip
|
|
@@ -255,15 +304,18 @@ M.deposit = function(sip)
|
|
logger:debug(("Processing resource #%d of %d: %s"):format(
|
|
logger:debug(("Processing resource #%d of %d: %s"):format(
|
|
i, #sip, rsrc.id))
|
|
i, #sip, rsrc.id))
|
|
|
|
|
|
- local in_path = sip.root_path .. rsrc.source_path
|
|
|
|
- local fext = plpath.extension(in_path)
|
|
|
|
|
|
+ local in_path, fext
|
|
|
|
+ if not rsrc.source_path then goto continue end
|
|
|
|
+
|
|
|
|
+ in_path = sip.root_path .. rsrc.source_path
|
|
|
|
+ fext = path.extension(in_path)
|
|
-- If it's a directory, skip file processing.
|
|
-- If it's a directory, skip file processing.
|
|
- if not plpath.isfile(in_path) then goto continue end
|
|
|
|
|
|
+ if not path.isfile(in_path) then goto continue end
|
|
|
|
|
|
do
|
|
do
|
|
- local tmp_dir = plpath.join(pkar.config.fs.ores_path, "tmp/")
|
|
|
|
|
|
+ local tmp_dir = path.join(pkar.config.fs.ores_path, "tmp/")
|
|
local file_ext
|
|
local file_ext
|
|
- _, file_ext = plpath.splitext(in_path)
|
|
|
|
|
|
+ _, file_ext = path.splitext(in_path)
|
|
local tmp_path = tmp_dir .. rsrc.id .. file_ext
|
|
local tmp_path = tmp_dir .. rsrc.id .. file_ext
|
|
dir.makepath(tmp_dir)
|
|
dir.makepath(tmp_dir)
|
|
|
|
|
|
@@ -290,11 +342,11 @@ M.deposit = function(sip)
|
|
|
|
|
|
-- Copy file and calculate checksum.
|
|
-- Copy file and calculate checksum.
|
|
local out_dir, out_path
|
|
local out_dir, out_path
|
|
- out_dir = plpath.join(
|
|
|
|
|
|
+ out_dir = path.join(
|
|
pkar.config.fs.ores_path,
|
|
pkar.config.fs.ores_path,
|
|
checksum:sub(1, 2),
|
|
checksum:sub(1, 2),
|
|
checksum:sub(3, 4))
|
|
checksum:sub(3, 4))
|
|
- out_path = plpath.join(out_dir, checksum:sub(1,32) .. fext)
|
|
|
|
|
|
+ out_path = path.join(out_dir, checksum:sub(1,32) .. fext)
|
|
dir.makepath(out_dir)
|
|
dir.makepath(out_dir)
|
|
logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
|
|
logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
|
|
dir.movefile(tmp_path, out_path)
|
|
dir.movefile(tmp_path, out_path)
|
|
@@ -303,8 +355,8 @@ M.deposit = function(sip)
|
|
-- Copy thumbnail if existing.
|
|
-- Copy thumbnail if existing.
|
|
if rsrc.thumbnail then
|
|
if rsrc.thumbnail then
|
|
src_path = rsrc.thumbnail
|
|
src_path = rsrc.thumbnail
|
|
- out_path = plpath.join(
|
|
|
|
- out_dir, plpath.basename(src_path))
|
|
|
|
|
|
+ out_path = path.join(
|
|
|
|
+ out_dir, path.basename(src_path))
|
|
logger:debug(("Moving file %s to %s"):format(src_path, out_path))
|
|
logger:debug(("Moving file %s to %s"):format(src_path, out_path))
|
|
dir.movefile(src_path, out_path)
|
|
dir.movefile(src_path, out_path)
|
|
rsrc.thumbnail = out_path
|
|
rsrc.thumbnail = out_path
|
|
@@ -320,7 +372,7 @@ M.deposit = function(sip)
|
|
end
|
|
end
|
|
|
|
|
|
-- Remove processing directory.
|
|
-- Remove processing directory.
|
|
- dir.rmtree(plpath.join(sip.root_path, "proc"))
|
|
|
|
|
|
+ dir.rmtree(path.join(sip.root_path, "proc"))
|
|
end
|
|
end
|
|
|
|
|
|
|
|
|