Browse Source

Various improvements:

* Maintain constant buffer size on encoding.
* Add env_init() to Python module.
* Implement graph_store() (not tested).
Stefano Cossu 3 years ago
parent
commit
6b11b4252c
9 changed files with 104 additions and 61 deletions
  1. 6 1
      README.md
  2. 5 3
      TODO.md
  3. 1 0
      cpython/lsup_rdf/__init__.py
  4. 4 4
      cpython/py_graph.h
  5. 21 0
      cpython/py_lsup_rdf.c
  6. 22 0
      include/graph.h
  7. 2 0
      setup.py
  8. 26 53
      src/codec/nt_lexer.re
  9. 17 0
      src/graph.c

+ 6 - 1
README.md

@@ -102,4 +102,9 @@ unset and let the software adjust it to the hardware architecture. By default,
 it is set to 1Tb for 64-bit systems and 4Gb for 32-bit systems. The map size by
 itself does not use up any extra resources.
 
-
+`LSUP_RDF_STREAM_CHUNK_SIZE`: Size of RDF decoding buffer, i.e., maximum size
+of a chunk of RDF data fed to the parser when decoding an RDF file into a graph.
+This should be larger than the maximum expected size of a single term in your
+RDF source. The default value is 8192, which is mildly conservative. If you
+experience parsing errors on decoding, and they happen to be on a term such as
+a very long string literal, try recompiling the library with a larger value.

+ 5 - 3
TODO.md

@@ -8,21 +8,23 @@
 - *D* Hash table back end
 - *D* Namespace manager
 - *D* N3 serialization / deserialization
+- *D* Environment
+- *D* Better error handling
+- *D* Logging
+- *D* Store graph
 - *W* Python bindings
     - *D* Basic module framework
     - *D* term, triple, graph modules
     - *D* Codec integration
     - *D* Graph remove and lookup ops
     - *W* Namespace module
+    - *P* Query and slicing methods
     - *P* Tests
-- *P* Environment
 - *P* Turtle serialization / deserialization
-- *P* Better error handling
 
 
 ## Non-critical for MVP
 
-- Logging
 - Term and triple validation
 - NQ codec
 - TriG codec

+ 1 - 0
cpython/lsup_rdf/__init__.py

@@ -7,6 +7,7 @@ import importlib.util
 import sys
 
 import _lsup_rdf
+from _lsup_rdf import env_init
 
 
 pkg_path = _lsup_rdf.__file__

+ 4 - 4
cpython/py_graph.h

@@ -253,7 +253,7 @@ Graph_new_from_rdf (PyTypeObject *cls, PyObject *args)
     char *err;
     codec->decode_graph (fh, &res->ob_struct, &ct, &err);
 
-    TRACE ("Decoded %lu triples.\n", ct);
+    log_debug ("Decoded %lu triples.", ct);
     if (UNLIKELY (err)) {
         PyErr_SetString (PyExc_IOError, err);
         return NULL;
@@ -324,7 +324,7 @@ Graph_add (PyObject *self, PyObject *triples)
             goto finalize;
         }
 
-        TRACE ("Inserting triple #%lu\n", i);
+        log_trace ("Inserting triple #%lu", i);
 
         LSUP_triple_serialize (((TripleObject *)trp_obj)->ob_struct, sspo);
         LSUP_rc db_rc = LSUP_graph_add_iter (it, sspo);
@@ -395,7 +395,7 @@ static PyObject *Graph_remove (PyObject *self, PyObject *args)
         PyErr_SetString (PyExc_SystemError, "Error removing triples.");
         goto finally;
     }
-    TRACE ("Removed %lu triples.", ct);
+    log_debug ("Removed %lu triples.", ct);
 
 finally:
     if (rc < 0) return NULL;
@@ -422,7 +422,7 @@ static PyObject *Graph_lookup (PyObject *self, PyObject *args)
         rc = -1;
         goto finally;
     }
-    TRACE ("Found %lu triples.", ct);
+    log_debug ("Found %lu triples.", ct);
 
     // Initialize the generator object.
     it_obj = PyObject_New (

+ 21 - 0
cpython/py_lsup_rdf.c

@@ -15,11 +15,32 @@
 #include "py_namespace.h"
 
 
+static PyObject *
+env_init (PyObject *self)
+{
+    if (LSUP_init() != LSUP_OK) {
+        PyErr_SetString (PyExc_SystemError, "Error initializing environment.");
+        return NULL;
+    }
+
+    Py_RETURN_NONE;
+}
+
+
+static PyMethodDef lsup_rdf_methods[] = {
+    {
+        "env_init", (PyCFunction)env_init, METH_NOARGS,
+        "Initialize the LSUP_RDF environment."
+    },
+    {NULL}
+};
+
 static PyModuleDef _lsup_rdf_pkg = {
     PyModuleDef_HEAD_INIT,
     .m_name = "_lsup_rdf",
     .m_doc = "Lakesuperior RDF package.",
     .m_size = -1,
+    .m_methods = lsup_rdf_methods,
 };
 
 PyMODINIT_FUNC

+ 22 - 0
include/graph.h

@@ -80,6 +80,28 @@ LSUP_Graph *
 LSUP_graph_copy (const LSUP_Graph *src);
 
 
+/** @brief Copy the contents of a graph into a permanent store.
+ *
+ * It is possible to store a memory graph, a RAMdisk MDB graph, or a
+ * permanently stored graph into another environment.
+ *
+ * The namespace map associated with the graph is stored into the destination
+ * as well, except for existing namespaces and prefixes.
+ *
+ * @param[in] src Graph to store.
+ *
+ * @param[in] env Environment to copy to. If NULL, it is set to the default
+ *  LSUP store. This makes it possible to copy MDB graphs across different
+ *  environments. If the source is an MDB graph and the environment is the same
+ *  as the source's, no change occurs.
+ *
+ * @return LSUP_OK on success; LSUP_NOACTION if the graph is already stored in
+ *  the same environment; <0 on error.
+ */
+LSUP_rc
+LSUP_graph_store (const LSUP_Graph *src, const LSUP_Env *env);
+
+
 /** Perform a boolean operation between two graphs.
  *
  * This method yields a new graph as the result of the operation.

+ 2 - 0
setup.py

@@ -17,6 +17,7 @@ sources = (
         path.join(EXT_DIR, 'openldap', 'libraries', 'liblmdb', 'mdb.c'),
         path.join(EXT_DIR, 'openldap', 'libraries', 'liblmdb', 'midl.c'),
         path.join(EXT_DIR, 'xxHash', 'xxhash.c'),
+        path.join(EXT_DIR, 'log', 'src', 'log.c'),
     ]
 )
 
@@ -48,6 +49,7 @@ setup(
                 ROOT_DIR,
                 INCL_DIR,
                 path.join(EXT_DIR, 'uthash', 'src'),
+                path.join(EXT_DIR, 'log', 'src'),
             ],
             libraries=['uuid'],
             extra_compile_args=compile_args,

+ 26 - 53
src/codec/nt_lexer.re

@@ -12,32 +12,27 @@
 /**
  * Max chunk size passed to scanner at each iteration.
  */
-#define CHUNK_SIZE 256
-
-/* Max possible token size. If a matching patten is not found, the scanner
- * keeps pulling data from input until a) a match is unambiguously found, or
- * not found; or b) EOF is reached; or c) the size of the buffer being searched
- * exceeds this size. Setting this to 0 disables any limit, which means that a
- * bad token might consume the whole input and, possibly, exhaust the available
- * memory and throw an error.
- */
-#define MAX_TOKEN_SIZE 8192
+#ifdef LSUP_RDF_STREAM_CHUNK_SIZE
+#define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE
+#else
+#define CHUNK_SIZE 8192
+#endif
 
 
 typedef struct {
-    FILE *          file;           // Input file handle.
-    YYCTYPE *       buf,            // Start of buffer.
-            *       lim,            // Position after the last available input
-                                    //   character (YYLIMIT).
-            *       cur,            // Next input character to be read
-                                    //   (YYCURSOR)
-            *       mar,            // Most recent match (YYMARKER)
-            *       tok,            // Start of current token.
-            *       bol;            // Address of the beginning of the current
-                                    //   line (for debugging).
-    unsigned        line;           // Current line no. (for debugging).
-    unsigned        ct;             // Number of parsed triples.
-    bool            eof;            // if we have reached EOF.
+    FILE *          fh;                 // Input file handle.
+    YYCTYPE         buf[CHUNK_SIZE + 1],// Start of buffer.
+            *       lim,                // Position after the last available
+                                        //   input character (YYLIMIT).
+            *       cur,                // Next input character to be read
+                                        //   (YYCURSOR)
+            *       mar,                // Most recent match (YYMARKER)
+            *       tok,                // Start of current token.
+            *       bol;                // Address of the beginning of the
+                                        //   current line (for debugging).
+    unsigned        line;               // Current line no. (for debugging).
+    unsigned        ct;                 // Number of parsed triples.
+    bool            eof;                // if we have reached EOF.
     /*!stags:re2c format = "YYCTYPE *@@;"; */
 } ParseIterator;
 
@@ -70,7 +65,7 @@ static int fill(ParseIterator *it)
     it->cur -= shift;
     it->mar -= shift;
     it->tok -= shift;
-    it->lim += fread(it->lim, 1, shift, it->file);
+    it->lim += fread(it->lim, 1, shift, it->fh);
     /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
     it->lim[0] = 0;
     it->eof |= it->lim < it->buf + CHUNK_SIZE;
@@ -78,10 +73,9 @@ static int fill(ParseIterator *it)
 }
 
 
-static void parse_init(ParseIterator *it, FILE *file)
+static void parse_init(ParseIterator *it, FILE *fh)
 {
-    it->file = file;
-    it->buf = malloc (CHUNK_SIZE + 1);
+    it->fh = fh;
     it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE;
     it->line = 1;
     it->bol = it->buf;
@@ -92,25 +86,6 @@ static void parse_init(ParseIterator *it, FILE *file)
 }
 
 
-// TODO Make buffer extensible if a token is larger than the current buf size.
-static int __attribute__((unused)) extend (ParseIterator *it)
-{
-    size_t delta = YYLIMIT - it->buf + CHUNK_SIZE;
-    YYCTYPE *tmp = realloc (it->buf, delta);
-    if (!tmp) return ENOMEM;
-
-    it->lim += delta;
-
-    it->buf = tmp;
-
-    return 0;
-}
-
-
-static void parse_done (ParseIterator *it)
-{ free (it->buf); }
-
-
 /** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
  */
 static YYCTYPE *unescape_unicode (const YYCTYPE *esc_str, size_t size)
@@ -323,15 +298,14 @@ loop:
 LSUP_rc
 LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 {
-    FILE *stream = fmemopen ((void *)rep, strlen (rep), "r");
+    FILE *fh = fmemopen ((void *)rep, strlen (rep), "r");
 
     ParseIterator it;
-    parse_init (&it, stream);
+    parse_init (&it, fh);
 
     int ttype = lex (&it, term);
 
-    parse_done (&it);
-    fclose (stream);
+    fclose (fh);
 
     switch (ttype) {
         case T_IRIREF:
@@ -344,13 +318,13 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 }
 
 LSUP_rc
-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct, char **err_p)
+LSUP_nt_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
 {
     *err_p = NULL;
     *gr_p = NULL;
 
     ParseIterator parse_it;
-    parse_init (&parse_it, stream);
+    parse_init (&parse_it, fh);
 
     void *parser = ParseAlloc (malloc);
 
@@ -407,7 +381,6 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct, char **err_p)
 finally:
     Parse (parser, 0, NULL, it);
     ParseFree (parser, free);
-    parse_done (&parse_it);
 
     LSUP_graph_add_done (it);
     LSUP_term_free (term);

+ 17 - 0
src/graph.c

@@ -153,6 +153,23 @@ LSUP_graph_copy (const Graph *src)
 }
 
 
+LSUP_rc
+LSUP_graph_store (const LSUP_Graph *src, const LSUP_Env *env)
+{
+    if (!env) env = LSUP_default_env;
+    if (src->store_type == LSUP_STORE_MDB && src->env == env)
+        return LSUP_NOACTION;
+
+    LSUP_Graph *dest = LSUP_graph_new_env (env, LSUP_STORE_MDB);
+    if (UNLIKELY (!dest)) return LSUP_DB_ERR;
+
+    LSUP_rc rc = graph_copy_contents (src, dest);
+    if (UNLIKELY (rc != LSUP_OK)) return LSUP_DB_ERR;
+
+    return LSUP_OK;
+}
+
+
 // TODO support boolean ops between any types of graphs.
 Graph *
 LSUP_graph_bool_op(