Browse Source

Create submission metadata; dump laundry list (LL) for submission.

scossu 2 weeks ago
parent
commit
6ee77c0641

+ 3 - 2
config/app.lua

@@ -63,14 +63,15 @@ return {
 
     -- Namespace prefixes to populate the Pocket Archive NS map.
     namespace = {
-        b2 = "urn:blake2:",
+        b2 = "blake2:",
         dc = "http://purl.org/dc/terms/",
         foaf = "http://xmlns.com/foaf/0.1/",
-        par = "urn:pkar:resource/",
+        par = "http://id.pkar.knowledgetx.com/resource/",
         pas = "http://id.pkar.knowledgetx.com/schema#",
         premis = "http://id.loc.gov/vocabulary/preservation/",
         rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
         rdfs = "http://www.w3.org/2000/01/rdf-schema#",
+        sub = "http://id.pkar.knowledgetx.com/submission/",
         xsd = "http://www.w3.org/2001/XMLSchema#",
     },
 

+ 1 - 1
config/model/typedef/anything.lua

@@ -34,7 +34,7 @@ return {
             uri = "pas:submissionID",
             description = "Unique ID for the submission that the resource \z
                 was created or updated in.",
-            type = "string",
+            type = "resource",
             min_cardinality = 1,
         },
         ext_id = {

+ 5 - 7
config/model/typedef/collection.lua

@@ -8,19 +8,17 @@ return {
         pref_rep = {
             uri = "pas:hasPreferredRepresentation",
             label = "Preferred representation",
-            description =
-                [[Preferred representation. Used to generate
-                a thumbnail (for a visual item) or sample (for non-visual
-                materials such as audio).]],
+            description = "Preferred representation. Used to generate \z
+                a thumbnail (for a visual item) or sample (for non-visual \z
+                materials such as audio).",
             type = "resource",
             range = {file = true},
         },
         long_description = {
             uri = "pas:longDescription",
             label = "Long description",
-            description =
-                [[Text document in Markdown format with a detailed
-                description of the collection.]],
+            description = "Text document in Markdown format with a detailed \z
+                description of the collection.",
             type = "resource",
             range = {file = true},
         },

+ 4 - 6
config/model/typedef/file.lua

@@ -8,9 +8,8 @@ return {
         archive_path = {
             uri = "pas:archivePath",
             label = "Archival path",
-            description =
-              [[Path of the preserved resource, relative to the archival
-              root.]],
+            description = "Path of the preserved resource, relative to the \z
+                archival root.",
             type = "string",
             min_cardinality = 1,
             max_cardinality = 1,
@@ -44,9 +43,8 @@ return {
         checksum = {
             uri = "premis:hasMessageDigest",
             label = "Checksum",
-            description = [[
-                File checksum formatted as an URN:
-                `urn:<algorithm>:<hex digest>`]],
+            description = "File checksum formatted as an URN: \z
+                `<algorithm>:<hex digest>`",
             type = "ext_resource",
             min_cardinality = 1,
         },

+ 3 - 2
doc/roadmap.md

@@ -70,14 +70,15 @@ usage and opportunities for expanding adoption in relevant areas.
 - ⚒ Generator
   - ⎊ Generate site for one collection only
   - ❏ htmlgen option for local file or webserver URL generation
-  - ❏ Generate LL (multi)
-  - ❏ Generate RDF (multi)
+  - ✓ Generate LL for submission
+  - ✖︎ Generate RDF (multi) [addressed by dump archive RDF]
 - ❏ Front end
   - ❏ Category browsing
   - ❏ Improve search indexing
   - ❏ Enhanced styling and access
 - ❏ Testing
     - ❏ Unit tests
+    - ❏ Roundtrip submission, download LL, update, resubmission
     - ❏ >100 resource data set
 - ❏ Documentation
   - ❏ Break main sections off README

+ 36 - 9
src/generator.lua

@@ -533,7 +533,7 @@ M.generate_search_idx = function(s, mconf)
 end
 
 
-M.generate_ll = function(s)
+local function get_tdata(s)
     local res_gr = repo.get_rsrc(s)
     local mconf = get_mconf(s)
 
@@ -543,7 +543,7 @@ M.generate_ll = function(s)
             content_type = mconf.id,
         },
     }
-    for p, ots_it in pairs(res_gr:connections(s, term.LINK_OUTBOUND)) do
+    for p, ots_it in res_gr:connections(s, term.LINK_OUTBOUND):iter() do
         local pname = model.uri_to_id[nsm.denormalize_uri(p.data)]
         --if p == pkar.RDF_TYPE then goto skip_p end
         if not pname then goto skip_p end
@@ -565,10 +565,34 @@ M.generate_ll = function(s)
         end
         ::skip_p::
     end
-    -- FIXME ftcsv encodes nil values as `"nil"`. See
-    -- https://github.com/FourierTransformer/ftcsv/issues/46
 
-    return csv.encode(tdata, {encodeNilAs = ""})
+    return tdata
+end
+
+M.generate_res_ll = function(s)
+    local tdata = get_tdata(s)
+    return csv.encode(tdata, {
+        encodeNilAs = "", allowEmpty = true, fieldsToKeep = model.pnames
+    })
+end
+
+
+M.generate_sub_ll = function(s)
+    -- TODO this is quite inefficient. Rewrite this and generate_res_ll in a
+    -- streaming application. https://github.com/rgamble/libcsv looks better
+    -- than any Lua solution.
+    local res_ts = repo.gr:term_set(
+        model.id_to_uri.sub_id, triple.POS_P,
+        s, triple.POS_O
+    )
+    local dip = {}
+    for s in res_ts:iter() do
+        for _, row in ipairs(get_tdata(s)) do table.insert(dip, row) end
+    end
+
+    return csv.encode(dip, {
+        encodeNilAs = "", allowEmpty = true, fieldsToKeep = model.pnames
+    })
 end
 
 
@@ -602,9 +626,9 @@ end
 
 
 M.generate_resources = function(coll_id)
-    -- TODO It's more complicated than this. Each member in the collection
-    -- must be scanned recursively for outbound links and visited links must
-    -- be noted down to avoid loops.
+    -- TODO implement update only for one collection.
+    -- Each member in the collection must be scanned recursively for outbound
+    -- links and visited links must be added to a set to avoid loops.
     --[[
     if coll_id then
         subject_ts = repo.gr:term_set(
@@ -613,7 +637,10 @@ M.generate_resources = function(coll_id)
         )
     else subjects_ts = repo.gr:unique_terms(triple.POS_S) end
     --]]
-    subjects_ts = repo.gr:unique_terms(triple.POS_S)
+    subjects_ts = repo.gr:term_set(
+        pkar.RDF_TYPE, triple.POS_P,
+        term.new_iriref_ns("pas:Anything"), triple.POS_O
+    )
 
     -- Initialize the JSON template with an opening brace.
     local ofh = assert(io.open(index_path, "w"))

+ 47 - 7
src/model.lua

@@ -10,10 +10,27 @@ local logger = pkar.logger
 local dbg = require "debugger"
 
 
+-- "nil" table.
+local NT = {}
+
+local no_ll_pnames = {
+    archive_path = true,
+    checksum = true,
+    size = true,
+    submitted = true,
+    sub_id = true,
+}
+
+
 local M = {
     -- Parsed typedef configurations.
     types = {},
 
+    -- Field names. Order is kept in encoding laundry lists. The hardcoded
+    -- values come first, all others are harvested from the typedef
+    -- configuration and ordered alphabetically.
+    pnames = {"content_type", "id", "source_path"},
+
     -- Term-to-URI map. URIs are volksdata.Term objects.
     id_to_uri = {},
 
@@ -101,15 +118,38 @@ local function parse_model(mod_id)
 end
 
 
--- Collect all type names from config file names.
-for _, fpath in ipairs(dir.getfiles(
-            path.join(MODEL_PATH, "typedef"), "*.lua")) do
-    local mname = path.basename(fpath):gsub(".lua$", "")
-    local typedef = parse_model(mname)
+local function setup_model()
+    -- Temp store (set) for property names.
+    local all_pnames = {}
+    -- Collect all type names from config file names.
+    for _, fpath in ipairs(dir.getfiles(
+                path.join(MODEL_PATH, "typedef"), "*.lua")) do
+        local mname = path.basename(fpath):gsub(".lua$", "")
+        local typedef = parse_model(mname)
+
+        -- Store parsed typedef configurations.
+        M.types[mname] = typedef
 
-    -- Store parsed typedef configurations.
-    M.types[mname] = typedef
+        -- Store unique prop names.
+        for pn in pairs(typedef.properties or NT) do
+            if not no_ll_pnames[pn] then all_pnames[pn] = true end
+        end
+    end
+
+    -- Remove hardcoded prop names, reorder, and append to module's pnames.
+    local pnames_ordered = {}
+    for _, hcpn in ipairs(M.pnames) do
+        all_pnames[hcpn] = nil
+    end
+    for pn in pairs(all_pnames) do
+        table.insert(pnames_ordered, pn)
+    end
+    table.sort(pnames_ordered)
+    for _, pn in ipairs(pnames_ordered) do table.insert(M.pnames, pn) end
+    logger:debug("Property names ordered: " .. table.concat(M.pnames, ", "))
 end
 
 
+setup_model()
+
 return M

+ 52 - 13
src/submission.lua

@@ -52,6 +52,8 @@ local path_to_uri
 -- Track IDs in SIP to validate links created in a submission.
 local sip_ids
 
+-- Submission ID and name.
+local sub_id, sub_name
 
 -- Initialize libmagic database.
 local magic = libmagic.open(libmagic.MIME_TYPE, libmagic.NO_CHECK_COMPRESS )
@@ -85,7 +87,8 @@ local function generate_sip(ll_path)
     if not path.isfile(ll_path) then error(ll_path .. " is not a file.", 2) end
 
     -- Submission ID sticks to all the resources.
-    local sub_id = idgen()
+    sub_id = "sub:" .. idgen()
+    sub_name = ll_path:match("pkar_submission[%-_%.](.*)%.csv")
 
     local sip = {root_path = path.dirname(ll_path)}
     path_to_uri = {}
@@ -94,7 +97,7 @@ local function generate_sip(ll_path)
     local tn_dir = path.join(sip.root_path, "proc", "tn")
     dir.makepath(tn_dir)
 
-    local prev_path
+    local prev_id
 
     local i = 0
     for row_n, row in csv.parseLine(ll_path) do
@@ -107,20 +110,21 @@ local function generate_sip(ll_path)
         -- Skip empty lines.
         if not has_content then goto skip end
 
-        logger:debug("Row path: ", row.source_path or "")
         logger:debug("Parsing row:", pp.write(row))
-        if row.source_path then
+        -- content_type is the only real mandatory entry.
+        if row.content_type then
             i = i + 1
+            -- New row.
             logger:info(
                     ("Processing LL resource #%d at row #%d.")
                     :format(i, row_n))
-            prev_path = row.source_path
-            -- New row.
+
             sip[i] = {
                 -- Normalize provided ID or generate random ID if not provided.
                 id = "par:" .. (row.id or idgen()),
                 sub_id = sub_id,
             }
+            prev_id = row.id
             sip_ids[sip[i].id] = true  -- Add to common sip ID set.
             for k, v in pairs(row) do
                 if not v or k == "id" then goto cont1 end  -- skip empty strings.
@@ -135,7 +139,7 @@ local function generate_sip(ll_path)
             -- Continuation of values from a previous row.
             if i < 1 then
                 error("First row MUST have a path value.", 2)
-            elseif not prev_path then
+            elseif not prev_id then
                 error(("No path information at row %d"):format(i), 2)
             else
                 for k, v in pairs(row) do
@@ -151,7 +155,7 @@ local function generate_sip(ll_path)
                     end
                     ::cont2::
                 end
-                row.source_path = prev_path
+                row.id = prev_id
             end
         end
         ::skip::
@@ -237,6 +241,8 @@ local function rsrc_to_graph(rsrc)
         for i, vv in ipairs(v) do
             if prop == "content_type" then
                 o = term.new_iriref_ns(rmod.uri)
+            elseif prop == "sub_id" then
+                o = term.new_iriref_ns(vv)
             elseif pconf.type == "resource" then
                 -- "par:" could have been added previously.
                 local rel_id = "par:" .. vv:gsub("^par:", "")
@@ -249,19 +255,21 @@ local function rsrc_to_graph(rsrc)
                 ))
                 then
                     -- Convert local path to URIs.
-                    v[i] = path_to_uri[vv]
-                    if not v[i] then error(
+                    local uri = path_to_uri[vv]
+                    if not uri then error(
                         ("Not a valid path: %s for property: %s on res: %s")
                         :format(vv, prop, rsrc.id))
                     end
-                    logger:debug("Converted path ".. vv .. " to URI: " .. v[i])
+                    v[i] = uri
+                    logger:debug("Converted path ".. vv .. " to URI: " .. uri)
                 else v[i] = rel_id
                 end
                 --if not v[i]:find("^par:") then dbg() end
                 o = term.new_iriref_ns(v[i])
             elseif pconf.type == "ext_resource" then
                 o = term.new_iriref(vv)
-            else o = term.new_lit(vv, rdf_type) end
+            else o = term.new_lit(vv, rdf_type)
+            end
             it:add_iter(triple.new(s, p, o))
         end
 
@@ -311,6 +319,8 @@ local function rsrc_to_graph(rsrc)
         end
         ::skip::
     end
+
+    -- Add resource lineage triples.
     for i, m in ipairs(rmod.lineage) do
         it:add_iter(triple.new(
             s, pkar.RDF_TYPE,
@@ -337,6 +347,7 @@ local M = {
 
 M.deposit = function(ll_path, cleanup)
     local sip = generate_sip(ll_path)
+    local tstamp
 
     for i, rsrc in ipairs(sip) do
         -- TODO Wrap this chunk into a txn. Each row is atomic.
@@ -373,7 +384,7 @@ M.deposit = function(ll_path, cleanup)
                 fsize = fsize + #chunk
             end
             local checksum = hash_it:final(true)
-            rsrc.checksum = {"urn:blake2:" .. checksum}
+            rsrc.checksum = {"blake2:" .. checksum}
             rsrc.size = fsize
 
             ofh:close()
@@ -423,6 +434,34 @@ M.deposit = function(ll_path, cleanup)
         logger:info("Stored: ", s.data)
     end
 
+    -- Add triples for submission metadata directly to the stored graph.
+    local it = repo.gr:add_init()
+    local sub_uri = term.new_iriref_ns(sub_id)
+    it:add_iter(triple.new(
+        sub_uri,
+        pkar.RDF_TYPE,
+        term.new_iriref_ns("par:Submission")
+    ))
+    if sub_name then
+        it:add_iter(triple.new(
+            sub_uri,
+            term.new_iriref_ns("rdfs:label"),
+            term.new_lit(sub_name)
+        ))
+    end
+    tstamp = os.date("!%Y-%m-%dT%TZ")
+    it:add_iter(triple.new(
+        sub_uri,
+        model.id_to_uri.submitted,
+        term.new_lit(tstamp, "xsd:dateTime", nil, true)
+    ))
+    it:add_iter(triple.new(
+        sub_uri,
+        model.id_to_uri.last_modified,
+        term.new_lit(tstamp, "xsd:dateTime", nil, true)
+    ))
+    it:add_done()
+
     -- Remove processing directory.
     local proc_dir = path.join(sip.root_path, "proc")
     if path.isdir(proc_dir) then dir.rmtree(proc_dir) end

+ 29 - 14
src/util/pkar.lua

@@ -81,43 +81,58 @@ dump_res = cli.command {
     cli.flag "o,output" {
         "Output file. If not specified, output to stdout.",
         type = cli.string,
-        default = "",
+        default = io.stdout,
     },
 
     function(args)
         local s = term.new_iriref_ns(args.id)
-        args.output = args.output or io.stdout
-        local fh = assert(io.open(args.output, "w"))
+        io.output(args.output)
         for chunk in repo.serialize_rsrc(s, args.format) do
-            fh:write(chunk)
+            io.write(chunk)
+        end
+        io.close()  -- This will fail for io.stdout, but it's OK.
+        if args.output ~= io.stdout then
+            print("File written to ", args.output)
         end
-        fh:close()
-        print("File written to ", args.output)
     end,
 }
 
 dump_ll = cli.command {
-    "Generate a laundry list for a stored resource.",
+    "Generate a laundry list for a stored resource or a whole submission.",
 
     cli.positional "id" {
         "ID of the resource, prefixed by `par:`",
         type = cli.string,
     },
+
     cli.flag "o,output" {
         "Output file. If not specified, output to stdout.",
         type = cli.string,
-        default = "",
+        default = io.stdout,
     },
 
     function(args)
         local s = term.new_iriref_ns(args.id)
-        local out = gen.generate_ll(s)
-        if args.output ~= "" then
-            local fh = assert(io.open(args.output, "w"))
-            fh:write(out)
-            fh:close()
+        io.output(args.output)
+        if args.id:find("^sub:") then
+            -- Dump whole submission.
+            --[[
+            local co = coroutine.create(gen.generate_sub_ll)
+            while true do
+                local r, res = coroutine.resume(co, s)
+                if not r then break end
+                if res then io.write(res) end
+            end
+            --]]
+            io.write(gen.generate_sub_ll(s))
+        else
+            -- One resource only.
+            io.write(gen.generate_res_ll(s))
+        end
+        io.close()  -- This will fail for io.stdout, but it's OK.
+        if args.output ~= io.stdout then
             print("File written to ", args.output)
-        else print(out) end
+        end
     end
 }
 

+ 3 - 0
src/validator.lua

@@ -14,6 +14,9 @@ local E_RANGE = "Range error"
 
 local M = {}
 
+-- "nil" table.
+local NT = {}
+
 
 M.validate = function(gr, s)
     ctype = gr:attr(s, model.id_to_uri.content_type):iter()() or NT

+ 0 - 0
test/sample_submission/demo01/pkar_submission.csv → test/sample_submission/demo01/pkar_submission-demo.csv