View source

WIP convert submission into a class.

scossu 2 days ago
parent
commit
5eef07e13c
4 files changed with 227 additions and 191 deletions
  1. doc/user_guide/docs/roadmap.md (+6 -3)
  2. src/submission.lua (+210 -183)
  3. src/util/pkar.lua (+5 -3)
  4. src/util/watcher.lua (+6 -2)

+ 6 - 3
doc/user_guide/docs/roadmap.md

@@ -54,12 +54,14 @@ usage and opportunities for expanding adoption in relevant areas.
 
 - ✖︎ Management UI & API
     - ✖︎ Deposit via single tar or zip file submission
--  submission
+-  submission
     - ✓ Watch local folder and trigger submission
         - ✓ Option to regenerate site after submission
         - ✓ Option to clean up sources & LL on success
     - ✓ Submission report
     - ✓ Deleting resources
+    - ❏ Prevent modification of system properties
+    - ⚒ Allow updating a file's metadata if it's only present in the archive
 - ✓ Proper collection handling
     - ✓ Dedicated template
     - ✓ Link to markdown doc for presentation page
@@ -98,13 +100,14 @@ usage and opportunities for expanding adoption in relevant areas.
 
 ## ❏ Post-release wishlist
 
-(will be turned into separate release plans)
+(will be grouped into separate release plans)
 
+- Provided checksum verification
 - Multilingual support
 - Schema definition validator
 - Incremental build
 - Rebuild only site assets
 - Custom templating
-- Auto relatioships inference
+- Auto relationships inference
 - Markdown support for property values
 

+ 210 - 183
src/submission.lua

@@ -5,7 +5,7 @@ end storage, which is called here but defined in the repo module).
 
 The deposit process is carried out in several steps:
 
-- SIP generation (`generate_sip()`): scans the laundry list CSV and builds a
+- SIP generation (`parse_ll()`): scans the laundry list CSV and builds a
   temporary data structure with the found metadata; generates unique IDs for
   resources; infers some implicit relationships from the position of the CSV
   rows and folder layout; adds system-controlled metadata.
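
For orientation, a minimal sketch of the kind of entry parse_ll() builds per laundry-list row; only the fields visible in this diff are shown and every value is invented.

    -- Hypothetical shape of the data parse_ll() assembles (values invented):
    local sub = {id = "sub:Jf2kQ8rT", sip = {}, sip_ids = {}}
    sub.sip[1] = {
        id = "par:k3G9mQ2a7LpZxTqW",  -- normalized row.id, or a fresh idgen() value
        sub_id = sub.id,              -- ID of the owning submission
        sub = sub,                    -- back-reference to the Submission object
        -- further metadata comes from the CSV row (not shown in this diff)
    }
    sub.sip_ids[sub.sip[1].id] = true  -- registered in the shared SIP ID set
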
@@ -46,9 +46,6 @@ local logger = pkar.logger
 -- "nil" table - for missing key fallback in chaining.
 local NT = {}
 
--- Submission ID and name.
-local sub_id, sub_name
-
 -- Initialize libmagic database.
 local magic = libmagic.open(libmagic.MIME_TYPE, libmagic.NO_CHECK_COMPRESS )
 assert(magic:load())
@@ -61,7 +58,30 @@ for i = 65, 90  do table.insert(chpool, i) end  -- A-Z
 for i = 97, 122 do table.insert(chpool, i) end  -- a-z
 
 
-local function generate_sip(sub)
+--[[
+Generate a random, reader-friendly ID.
+
+A 16-character ID drawn from the above-defined #chpool of 60 symbols has an
+entropy of 94.5 bits, which should be plenty for a medium-sized repository.
+]]
+local function idgen(len)
+    local charlist = {}
+    for i = 1, (len or pkar.config.id.len) do
+        table.insert(charlist, string.char(chpool[math.random(1, #chpool)]))
+    end
+
+    return table.concat(charlist)
+end
+
+
+--[[--
+Parse laundry list and generate the basic SIP.
+
+@tparam Submission sub Submission object to populate.
+
+@return true on success; false, error report on failure.
+]]
+local function parse_ll(sub)
     local tn_dir = path.join(sub.root_path, "proc", "tn")
     dir.makepath(tn_dir)
 
@@ -89,8 +109,9 @@ local function generate_sip(sub)
 
             sub.sip[i] = {
                 -- Normalize provided ID or generate random ID if not provided.
-                id = "par:" .. (row.id or sub:idgen()),
+                id = "par:" .. (row.id or idgen()),
                 sub_id = sub.id,
+                sub = sub,
             }
             prev_id = row.id
             sub.sip_ids[sub.sip[i].id] = true  -- Add to common SIP ID set.
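
The entropy figure quoted in the idgen() comment is just len * log2(pool size); a quick sanity check, using the 60-symbol pool the comment mentions:

    -- ID entropy in bits: length times log2 of the character-pool size.
    local function id_entropy(len, pool_size)
        return len * math.log(pool_size) / math.log(2)
    end
    print(id_entropy(16, 60))  --> ~94.5, matching the docstring above
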
@@ -135,163 +156,85 @@ end
 
 
 --[[--
-Process SIP files and metadata.
-During this step, files are moved to a staging area, their checksums are
+Process a file resource.
+
+During this step, the file is moved to a staging area, its checksum is
 calculated, and some more technical metadata are extracted and added to the
-D-Res.
+file's D-Res.
 
-@tparam table rsrc Resource from the SIP parsed by #{generate_sip}.
---]]
-local function process_rsrc(sub, rsrc)
-    -- Infer structure from paths and row ordering.
-    local rmod = model.types[rsrc.content_type]
-    --require "debugger".assert(rmod)
-    local fpath = path.join(sub.root_path, rsrc.source_path)
-    --dbg.assert(rmod)
-    rsrc.has_member = rsrc.has_member or {}
-    -- Create implicit members from single-file artifact.
-    if rmod.types.artifact and path.isfile(fpath) then
-        local file_id = "par:" .. sub:idgen()
-        sub.sip_ids[file_id] = true
-        -- Insert file resource and move it into a new sub-folder.
-        table.insert(sub.sip, {
-            content_type = rmod.default_fmodel or "file",
-            id = file_id,
-            sub_id = sub.id,
-            label = path.basename(rsrc.source_path),
-            source_path = rsrc.source_path,
-        })
-        rsrc.has_file = file_id
-        rsrc.pref_rep = file_id
-        rsrc.source_path = nil
-        goto skip
-    end
-    for j = i + 1, #sub.sip do
-        if sub.sip[j].source_path:match(
-            "^" .. pkar.escape_ptn(rsrc.source_path))
-        then
-            local rel_path = sub.sip[j].source_path:sub(#rsrc.source_path + 2)
-            logger:debug("rel_path: " .. rel_path)
-            if not rel_path:match("/") then
-                logger:debug(("Adding member %s to %s"):format(
-                        rel_path, rsrc.source_path))
-                table.insert(rsrc.has_member, sub.sip[j].id)
-            end
-        end
+@tparam table rsrc Resource table to be updated.
+]]
+local function process_file(rsrc)
+    local src_path = path.join(rsrc.sub.root_path, rsrc.source_path)
+    local tmp_dir = path.join(pkar.config.fs.ores_path, "tmp/")
+    local fext
+    _, fext = path.splitext(src_path)
+    local tmp_path = tmp_dir .. rsrc.id .. fext
+    dir.makepath(tmp_dir)
+
+    local ifh = io.open(src_path, "r")
+    -- Abort if the source file cannot be opened.
+    if not ifh then error("Cannot open source file: " .. src_path) end
-    ::skip::
 
-    local in_path, fext
-    if not rsrc.source_path then goto skip_file_proc end
-    -- FIXME need to account for file MD-only updates.
-
-    in_path = path.join(sub.root_path, rsrc.source_path)
-    fext = path.extension(in_path)
-    -- If it's a directory, skip file processing.
-    if path.isdir(in_path) then goto skip_file_proc end
-
-    do
-        local tmp_dir = path.join(pkar.config.fs.ores_path, "tmp/")
-        local file_ext
-        _, file_ext = path.splitext(in_path)
-        local tmp_path = tmp_dir .. rsrc.id .. file_ext
-        dir.makepath(tmp_dir)
-
-        local ifh = io.open(in_path, "r")
-        -- If the source file is not in the SIP, check if it is already in the
-        -- archive. In that case, this is considered a metadata-only update.
-        if not ifh then
-            if repo.gr:contains(triple.new(
-                term.new_iriref_ns("par:" .. rsrc.id),
-                pkar.RDF_TYPE,
-                model.id_to_uri("file")
-            )) then
-                logger:info("Metadata-only update on file: " .. rsrc.id)
-                goto skip_file_proc
-            else
-                error(
-                    "No source path was provided and no file with this ID \z
-                    was found in the archive: " .. rsrc.id)
-            end
-        end
-
-        rsrc.format = {magic:filehandle(ifh)}
-        local hash_it = mc.new_blake2b()
-        local fsize = 0
-        logger:debug("Hashing ", in_path)
-        local ofh = assert(io.open(tmp_path, "w"))
-        while true do
-            chunk = ifh:read(pkar.config.fs.stream_chunk_size)
-            if not chunk then break end
-            hash_it:update(chunk)
-            ofh:write(chunk)
-            fsize = fsize + #chunk
-        end
-        local checksum = hash_it:final(true)
-        rsrc.checksum = {"blake2:" .. checksum}
-        rsrc.size = fsize
-
-        ofh:close()
-        ifh:close()
-
-        -- Copy file and calculate checksum.
-        local out_dir, out_path
-        out_dir = path.join(
-                pkar.config.fs.ores_path,
-                checksum:sub(1, 2),
-                checksum:sub(3, 4))
-        out_path = path.join(out_dir, checksum:sub(1,32) .. fext)
-        dir.makepath(out_dir)
-        logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
-        dir.movefile(tmp_path, out_path)
-        rsrc.archive_path = out_path
-
-        -- Copy thumbnail if existing.
-        if rsrc.thumbnail then
-            src_path = rsrc.thumbnail
-            out_path = path.join(
-                    out_dir, path.basename(src_path))
-            logger:debug(("Moving file %s to %s"):format(src_path, out_path))
-            dir.movefile(src_path, out_path)
-            rsrc.thumbnail = out_path
-        end
+    rsrc.format = {magic:filehandle(ifh)}
+    local hash_it = mc.new_blake2b()
+    local fsize = 0
+    logger:debug("Hashing ", src_path)
+    local ofh = assert(io.open(tmp_path, "w"))
+    while true do
+        chunk = ifh:read(pkar.config.fs.stream_chunk_size)
+        if not chunk then break end
+        hash_it:update(chunk)
+        ofh:write(chunk)
+        fsize = fsize + #chunk
+    end
+    local checksum = hash_it:final(true)
+    rsrc.checksum = {"blake2:" .. checksum}
+    rsrc.size = fsize
+
+    ofh:close()
+    ifh:close()
+
+    -- Copy file and calculate checksum.
+    local out_dir, out_path
+    out_dir = path.join(
+            pkar.config.fs.ores_path,
+            checksum:sub(1, 2),
+            checksum:sub(3, 4))
+    out_path = path.join(out_dir, checksum:sub(1,32) .. fext)
+    dir.makepath(out_dir)
+    logger:debug(("Moving file %s to %s"):format(tmp_path, out_path))
+    dir.movefile(tmp_path, out_path)
+    rsrc.archive_path = out_path
+
+    -- Copy thumbnail if existing.
+    if rsrc.thumbnail then
+        src_path = rsrc.thumbnail
+        out_path = path.join(
+                out_dir, path.basename(src_path))
+        logger:debug(("Moving file %s to %s"):format(src_path, out_path))
+        dir.movefile(src_path, out_path)
+        rsrc.thumbnail = out_path
     end
 
-    ::skip_file_proc::
-
-    local tstamp = os.date("!%Y-%m-%dT%TZ")
-    rsrc.submitted = tstamp
-    rsrc.last_modified = tstamp
-
-    local tmp_gr, s
-    tmp_gr, s = rsrc_to_graph(rsrc)
-
-    local val_report = validator.validate(tmp_gr, s)
-    if val_report.max_level == "ERROR" then error(
-        "Validation raised errors: " .. pp.write(val_report))
-    elseif val_report.max_level == "WARN" then logger:warn(
-        "Validation raised warnings: " .. pp.write(val_report))
-    elseif val_report.max_level == "NOTICE" then logger:warn(
-        "Validation raised notices: " .. pp.write(val_report)) end
-
-    repo.store_updates(tmp_gr, s)
-    logger:info("Stored: ", s.data)
+    return true
 end
 
 
 --[[--
 Convert a SIP resource table to an in-memory Volksdata graph.
 ]]
-local function rsrc_to_graph(sub, rsrc)
+local function rsrc_to_graph(rsrc)
     local rmod = model.types[rsrc.content_type]
-    logger:debug("Updating resource md: ", pp.write(rsrc))
+    --logger:debug("Updating resource md: ", pp.write(rsrc))
 
     local s = term.new_iriref_ns(rsrc.id)
     local gr = graph.new(nil)
+    local skip_props = {id = true, sub = true}
 
     it = gr:add_init()
     for prop, v in pairs(rsrc) do
-        if prop == "id" then goto skip end
+        if skip_props[prop] then goto skip end
         logger:debug(("Adding attribute: %s = %s"):format(prop, pp.write(v)))
         local p = model.id_to_uri[prop]
         if not p then
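
For reference, a sketch of the two-level, checksum-derived layout that process_file() produces when moving a file into the opaque resource store; the digest, extension, and store root below are invented, and the path module is assumed to be Penlight's pl.path, as elsewhere in this file.

    local path = require "pl.path"
    local ores_path = "/var/pkar/ores"  -- hypothetical store root
    local checksum = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b"  -- fake hex digest
    local out_dir = path.join(ores_path, checksum:sub(1, 2), checksum:sub(3, 4))
    local out_path = path.join(out_dir, checksum:sub(1, 32) .. ".tif")
    print(out_path)  --> /var/pkar/ores/9f/86/9f86d081884c7d659a2feaa0c55ad015.tif
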
@@ -321,7 +264,7 @@ local function rsrc_to_graph(sub, rsrc)
                 -- "par:" could have been added previously.
                 local rel_id = "par:" .. vv:gsub("^par:", "")
                 if
-                    not sub.sip_ids[rel_id]
+                    not rsrc.sub.sip_ids[rel_id]
                     and not repo.gr:contains(triple.new(
                         term.new_iriref_ns(rel_id),
                         pkar.RDF_TYPE,
@@ -329,7 +272,7 @@ local function rsrc_to_graph(sub, rsrc)
                 ))
                 then
                     -- Convert local path to URIs.
-                    local uri = sub.path_to_uri[vv]
+                    local uri = rsrc.sub.path_to_uri[vv]
                     if not uri then error(
                         ("Not a valid path: %s for property: %s on res: %s")
                         :format(vv, prop, rsrc.id))
@@ -352,9 +295,9 @@ local function rsrc_to_graph(sub, rsrc)
             local proxy_s
             for i, vv in ipairs(v) do
                 -- Add linked list proxies.
-                local brick_id = "par:" .. sub:idgen()
+                local brick_id = "par:" .. idgen()
                 local brick_uri = term.new_iriref_ns(brick_id)
-                sub.sip_ids[brick_id] = true
+                rsrc.sub.sip_ids[brick_id] = true
                 if i == 1 then
                     proxy_s = s
                     it:add_iter(triple.new(
@@ -407,6 +350,93 @@ local function rsrc_to_graph(sub, rsrc)
 end
 
 
+--[[--
+Process SIP files and metadata.
+
+@tparam table rsrc Resource from the SIP parsed by #{parse_ll}.
+--]]
+local function process_rsrc(rsrc)
+    local rmod = model.types[rsrc.content_type]
+    --require "debugger".assert(rmod)
+
+    -- BEGIN file check and processing.
+    if rmod.types.file then
+        if rsrc.source_path then
+            local in_path = path.join(rsrc.sub.root_path, rsrc.source_path)
+            if path.isfile(in_path) then process_file(rsrc, in_path)
+            else error(
+                "File type provided for " .. rsrc.id ..
+                " but the source path " .. in_path .. " is not a file.")
+            end
+        else
+            -- If it's a file and no path is provided, look for it in the repo.
+            -- This will obviously fail if no ID was provided either, and
+            -- a random one was just generated.
+            if repo.gr:contains(triple.new(
+                term.new_iriref_ns("par:" .. rsrc.id),
+                pkar.RDF_TYPE,
+                model.id_to_uri("file")
+            )) then
+                -- File is in the repo. This is a metadata-only update.
+                logger:info("Metadata-only update on file: " .. rsrc.id)
+            else
+                -- No file found. That's an error.
+                error(
+                    "No source path was provided and no file with this ID \z
+                    was found in the archive: " .. rsrc.id)
+            end
+        end
+    end
+    -- END file processing.
+
+    ::skip_file_proc::
+
+    -- BEGIN metadata processing.
+    local tstamp = os.date("!%Y-%m-%dT%TZ")
+    rsrc.submitted = tstamp
+    rsrc.last_modified = tstamp
+
+    -- Infer structure from paths and row ordering.
+    --require "debugger".assert(rmod)
+    local fpath = rsrc.source_path and path.join(rsrc.sub.root_path, rsrc.source_path)
+    rsrc.has_member = rsrc.has_member or {}
+    -- Create implicit members from single-file artifact.
+    if rmod.types.artifact and fpath and path.isfile(fpath) then
+        local file_id = "par:" .. idgen()
+        rsrc.sub.sip_ids[file_id] = true
+        -- Insert file resource. It will be processed as part of the SIP table.
+        table.insert(rsrc.sub.sip, {
+            content_type = rmod.default_fmodel or "file",
+            id = file_id,
+            sub_id = rsrc.sub.id,
+            label = path.basename(rsrc.source_path),
+            source_path = rsrc.source_path,
+        })
+        rsrc.has_file = file_id
+        rsrc.pref_rep = file_id
+        rsrc.source_path = nil
+        goto skip
+    end
+    ::skip::
+
+    -- END metadata processing.
+
+    local tmp_gr, s
+    tmp_gr, s = rsrc_to_graph(rsrc)
+
+    local val_report = validator.validate(tmp_gr, s)
+    if val_report.max_level == "ERROR" then error(
+        "Validation raised errors: " .. pp.write(val_report))
+    elseif val_report.max_level == "WARN" then logger:warn(
+        "Validation raised warnings: " .. pp.write(val_report))
+    elseif val_report.max_level == "NOTICE" then logger:warn(
+        "Validation raised notices: " .. pp.write(val_report)) end
+
+    repo.store_updates(tmp_gr, s)
+    logger:info("Stored: ", s.data)
+end
+
+
 local function add_sub_meta(sub)
     -- Add triples for submission metadata directly to the stored graph.
     local it = repo.gr:add_init()
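
To make the implicit-member handling in process_rsrc() above concrete, a hypothetical before/after for an artifact row whose source path is a single file (IDs and paths invented):

    -- Before: the laundry-list row describes an artifact backed by one file.
    local rsrc = {content_type = "artifact", id = "par:abc123", source_path = "scan.tif"}
    -- After process_rsrc(): a new "file" resource is appended to the SIP and the
    -- artifact references it instead of carrying the raw path itself.
    rsrc = {content_type = "artifact", id = "par:abc123",
            has_file = "par:Xy7Kp2Qw", pref_rep = "par:Xy7Kp2Qw"}
    -- ...while sub.sip gains {content_type = "file", id = "par:Xy7Kp2Qw",
    --                        label = "scan.tif", source_path = "scan.tif", ...}
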
@@ -419,7 +449,7 @@ local function add_sub_meta(sub)
         it:add_iter(triple.new(
             sub.uri,
             term.new_iriref_ns("rdfs:label"),
-            term.new_lit(sub_name)
+            term.new_lit(sub.name)
         ))
     end
     local tstamp = os.date("!%Y-%m-%dT%TZ")
@@ -471,6 +501,8 @@ end
 -- Public class & members
 --
 
+local Submission = {}
+
 
 --[[--
 Create a new submission.
@@ -486,7 +518,7 @@ function Submission:new (ll_path, report_path)
     local sub = {
         ll_path = ll_path,
         report_path = report_path,
-        id = "sub:" .. sub:idgen(),
+        id = "sub:" .. idgen(),
         name = ll_path:match("pkar_submission[%-_%.](.*)%.csv"),
         sip = {},
         root_path = path.dirname(ll_path),
@@ -499,9 +531,8 @@ function Submission:new (ll_path, report_path)
     self.__index = self
     setmetatable(sub, self)
 
-    local rc, ret = xpcall(generate_sip, debug.traceback, sub)
-    if rc then sub.sip = ret
-    else
+    local rc, ret = xpcall(parse_ll, debug.traceback, sub)
+    if not rc then
         return nil, generate_report(report_path, {
             result = "failure",
             message = "An error occurred while parsing the SIP.",
@@ -516,22 +547,6 @@ function Submission:new (ll_path, report_path)
 end
 
 
---[[
-Generate a random, reader-friendly ID.
-
-A 16-character ID with the above defined #chpool of 60 smybols has an entropy
-of 94.5 bits, which should be plenty for a medium-sized repository.
-]]
-function Submission:idgen(len)
-    local charlist = {}
-    for i = 1, (len or pkar.config.id.len) do
-        table.insert(charlist, string.char(chpool[math.random(1, #chpool)]))
-    end
-
-    return table.concat(charlist)
-end
-
-
 --[[--
 Deposit resources from a SIP.
 @tparam string ll_path Path of the laundry list. All SIP source references are
@@ -550,10 +565,7 @@ function Submission:deposit(ll_path, cleanup)
         logger:debug(("Processing resource #%d of %d: %s"):format(
                 i, #self.sip, rsrc.id))
 
-        local rc, ret = xpcall(
-            process_rsrc,
-            debug.traceback,
-            self, rsrc, self.root_path)
+        local rc, ret = xpcall(process_rsrc, debug.traceback, rsrc)
         if not rc then
             return generate_report(report_path, {
                 result = "failure",
@@ -566,6 +578,20 @@ function Submission:deposit(ll_path, cleanup)
                 },
             })
         end
+        -- Look ahead for resources under this container and add membership.
+        for j = i + 1, #rsrc.sub.sip do
+            if rsrc.sub.sip[j].source_path:match(
+                "^" .. pkar.escape_ptn(rsrc.source_path))
+            then
+                local rel_path = rsrc.sub.sip[j].source_path:sub(#rsrc.source_path + 2)
+                logger:debug("rel_path: " .. rel_path)
+                if not rel_path:match("/") then
+                    logger:debug(("Adding member %s to %s"):format(
+                            rel_path, rsrc.source_path))
+                    table.insert(rsrc.has_member, rsrc.sub.sip[j].id)
+                end
+            end
+        end
     end
 
     rc, ret = xpcall(add_sub_meta, debug.traceback, self)
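
A toy check of the look-ahead membership rule added above, with invented paths: only rows sitting directly under the container's source path (no further "/" in the relative part) become members.

    local parent = "box1"  -- hypothetical container source_path
    for _, child in ipairs({"box1/item1.tif", "box1/sub/x.tif"}) do
        local rel_path = child:sub(#parent + 2)
        print(rel_path, rel_path:match("/") and "skipped" or "member")
    end
    --> item1.tif   member
    --> sub/x.tif   skipped
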
@@ -612,13 +638,14 @@ function Submission:deposit(ll_path, cleanup)
 end
 
 
-function Submission.reset_ores()
-    if path.isdir(pkar.config.fs.ores_path) then
-        logger:warn("Removing existing opaque resource store.")
-        dir.rmtree(pkar.config.fs.ores_path)
-    end
-    dir.makepath(pkar.config.fs.ores_path)
-end
-
-
-return Submission
+return {
+    Submission = Submission,
+    idgen = idgen,
+    reset_ores = function()
+        if path.isdir(pkar.config.fs.ores_path) then
+            logger:warn("Removing existing opaque resource store.")
+            dir.rmtree(pkar.config.fs.ores_path)
+        end
+        dir.makepath(pkar.config.fs.ores_path)
+    end,
+}

+ 5 - 3
src/util/pkar.lua

@@ -14,7 +14,7 @@ local cmdoc = require "pocket_archive.cmdoc"
 local model = require "pocket_archive.model"
 local pres = require "pocket_archive.presentation"
 local repo = require "pocket_archive.repo"
-local sub = require "pocket_archive.submission"
+local submission = require "pocket_archive.submission"
 
 
 cli.locale "en_US"  -- TODO set with multilingual support.
@@ -31,7 +31,7 @@ init = cli.command {
         if a == "yes" then
             io.write("Alright, you asked for it.\n")
             repo.reset_store()
-            sub.reset_ores()
+            submission.reset_ores()
             pres.reset_site()
         else io.write("Chicken out.\n")
         end
@@ -63,8 +63,10 @@ deposit = cli.command {
 
     function(args)
         --require "debugger"()
-        local report = sub.deposit(args.path, args.cleanup)
+        local sub, report = submission.Submission:new(args.path)
+        if sub then report = sub:deposit(args.cleanup) end
         io.write(json.encode(report))
+        io.write("\n")
     end
 }
 

+ 6 - 2
src/util/watcher.lua

@@ -11,7 +11,7 @@ local watchdog = require "watchdog"
 local pkar = require "pocket_archive"
 local logger = pkar.logger
 local pres = require "pocket_archive.presentation"
-local sub = require "pocket_archive.submission"
+local submission = require "pocket_archive.submission"
 
 
 local running = true
@@ -96,7 +96,11 @@ cli.program {
                     logger:info("Starting submission with pid: " .. mypid)
                     local fpath = path.join(args.path, ev.name)
                     local sub_rc, sub_ret = pcall(
-                        sub.deposit,
+                        function(fpath, cleanup, report_path)
+                            local sub = submission.Submission:new(
+                                fpath, report_path)
+                            return sub:deposit(cleanup)
+                        end,
                         fpath,
                         args.cleanup,
                         fpath:gsub("%.csv$", "-report.json")
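
Taken together, the call pattern this commit moves toward, sketched with invented paths; the failure handling follows the nil-plus-report return visible in Submission:new above.

    local submission = require "pocket_archive.submission"

    local ll_path = "/data/inbox/pkar_submission-demo.csv"  -- hypothetical
    local sub = submission.Submission:new(
            ll_path, ll_path:gsub("%.csv$", "-report.json"))
    -- On a parse failure :new() returns nil plus a failure report.
    if not sub then error("Could not parse the laundry list: " .. ll_path) end
    local report = sub:deposit(true)  -- true: clean up sources & LL on success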