Browse Source

Add Python methods for codec; naming changes.

Stefano Cossu 3 years ago
parent
commit
df5909aa1f
6 changed files with 254 additions and 87 deletions
  1. 1 0
      TODO.md
  2. 155 14
      cpython/py_graph.h
  3. 49 31
      include/codec_base.h
  4. 3 1
      setup.py
  5. 35 30
      src/codec_nt.c
  6. 11 11
      test/test_codec_nt.c

+ 1 - 0
TODO.md

@@ -12,6 +12,7 @@ P = pending; W = working on it; D = done.
 - P Environment
 - *P* Turtle serialization / deserialization
 - P* Better error handling
+- P Python tests
 
 
 ## Non-critical for MVP

+ 155 - 14
cpython/py_graph.h

@@ -7,9 +7,69 @@
 #include <structmember.h>
 
 #include "graph.h"
+#include "codec_nt.h"
 #include "py_triple.h"
 
 
+/*
+ * String iterator for codec output.
+ *
+ * Yields one string (one or more lines) at a time.
+ */
+typedef struct {
+    PyObject_HEAD
+    // Codec iterator handle. Assigned by Graph_encode and finalized in
+    // StringIterator_dealloc via the codec's encode_graph_done callback.
+    LSUP_CodecIterator *it;
+    // NOTE(review): not referenced by any code visible in this change;
+    // presumably meant to hold the last encoded string — confirm or remove.
+    char *line;
+} StringIteratorObject;
+
+
+/*
+ * Deallocate a string iterator.
+ *
+ * Finalizes the underlying codec iterator through the encode_graph_done
+ * callback of the codec that created it, then releases the memory of the
+ * Python object itself.
+ */
+static void
+StringIterator_dealloc (StringIteratorObject *it_obj)
+{
+    // The codec iterator may be NULL if encoder initialization failed.
+    if (it_obj->it) it_obj->it->codec->encode_graph_done (it_obj->it);
+
+    // Counterpart of the PyObject_New call that created the object; without
+    // it the object memory is never returned to the allocator.
+    PyObject_Del (it_obj);
+}
+
+
+/*
+ * Yield the next encoded RDF string.
+ *
+ * Runs one iteration of the codec's encode_graph_iter callback and converts
+ * the resulting C string into a Python str.
+ *
+ * Returns a new reference to a str on success. Returns NULL with ValueError
+ * set if the codec reports an error, or NULL with no exception set (which
+ * the interpreter turns into StopIteration) when the codec returns LSUP_END.
+ */
+static PyObject *
+StringIterator_next (StringIteratorObject *it_obj)
+{
+    unsigned char *rdf_str = NULL;
+    LSUP_rc rc = it_obj->it->codec->encode_graph_iter (it_obj->it, &rdf_str);
+    if (rc != LSUP_OK) {
+        if (rc != LSUP_END)
+            PyErr_SetString (PyExc_ValueError, "Error encoding graph.");
+
+        // If not an error, this raises StopIteration.
+        return NULL;
+    }
+
+    // PyUnicode_FromString copies the data and already returns a new
+    // reference owned by the caller: no additional Py_INCREF is needed (it
+    // would leak every yielded string).
+    PyObject *rdf_obj = PyUnicode_FromString ((char*)rdf_str);
+
+    // The encoded buffer is handed over to the caller by the codec and is no
+    // longer needed once copied into the Python string.
+    free (rdf_str);
+
+    // NULL here means the conversion failed; the exception is already set.
+    return rdf_obj;
+}
+
+/*
+ * String iterator type.
+ *
+ * Objects of this type are never generated from Python code, rather from
+ * Graph_encode, hence the type has no special new or init function.
+ */
+// NOTE(review): no PyType_Ready call for this type is visible in this
+// change — confirm that the module initialization readies it.
+PyTypeObject StringIteratorType = {
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
+    .tp_name            = "graph.StringIterator",
+    .tp_basicsize       = sizeof (StringIteratorObject),
+    .tp_itemsize        = 0,
+    .tp_flags           = Py_TPFLAGS_DEFAULT,
+    // No tp_new / tp_init: instances are only created from C code.
+    .tp_dealloc         = (destructor) StringIterator_dealloc,
+    // The object is its own iterator.
+    .tp_iter            = PyObject_SelfIter,
+    // Yields one encoded RDF string per call; NULL ends the iteration.
+    .tp_iternext        = (iternextfunc)StringIterator_next,
+};
+
+
+/*
+ * Graph stuff.
+ */
+
 typedef struct {
     PyObject_HEAD
     LSUP_Graph *ob_struct;
@@ -73,6 +133,7 @@ static PyGetSetDef Graph_getsetters[] = {
     {NULL}
 };
 
+
 static PyObject *
 Graph_copy (PyTypeObject *cls, PyObject *src)
 {
@@ -88,6 +149,44 @@ Graph_copy (PyTypeObject *cls, PyObject *src)
 }
 
 
+/*
+ * Class method: create a graph by decoding RDF data.
+ *
+ * Python arguments:
+ *   - data: str or bytes-like object containing the serialized RDF.
+ *   - type: str, name of the codec to use. Only "nt" is supported.
+ *
+ * Returns a new reference to a graph object, or NULL with an exception set:
+ * ValueError for an unsupported codec or a decoding failure, MemoryError if
+ * the Python object cannot be allocated.
+ *
+ * NOTE(review): this function parses a tuple, so its method table entry
+ * should use METH_CLASS | METH_VARARGS rather than METH_CLASS alone.
+ */
+static PyObject *
+Graph_new_from_rdf (PyTypeObject *cls, PyObject *args)
+{
+    // The "s*" format fills a Py_buffer structure provided by the caller; it
+    // must be an actual structure, not an uninitialized pointer to one.
+    Py_buffer buf;
+    const char *type;
+    if (! PyArg_ParseTuple (args, "s*s", &buf, &type)) return NULL;
+
+    PyObject *ret = NULL;
+    LSUP_Graph *gr = NULL;
+
+    const LSUP_Codec *codec;
+    if (strcmp (type, "nt") == 0) codec = &nt_codec;
+    // TODO other codecs here.
+    else {
+        PyErr_SetString (PyExc_ValueError, "Unsupported codec.");
+        goto finalize;
+    }
+
+    // NOTE(review): the codec tests pass a FILE * as the first argument of
+    // decode_graph; confirm that a raw memory buffer is accepted here,
+    // otherwise wrap it in a stream (e.g. fmemopen) first.
+    size_t ct = 0;
+    char *err = NULL;
+    LSUP_rc rc = codec->decode_graph (buf.buf, &gr, &ct, &err);
+
+    // Fail on a negative return code or on a reported error string. The
+    // previous check was inverted and bailed out on *successful* decoding.
+    // NOTE(review): ownership of `err` and of a partially decoded graph on
+    // failure is not visible from here; neither is freed.
+    if (UNLIKELY (rc < 0 || err)) {
+        PyErr_Format (
+                PyExc_ValueError, "Error decoding graph: %s",
+                err ? err : "unknown error");
+        goto finalize;
+    }
+    TRACE ("Decoded %lu triples.\n", (unsigned long) ct);
+
+    // Allocate the Python wrapper only once there is a graph to wrap, so
+    // that a half-initialized object never reaches the type's deallocator.
+    GraphObject *res = (GraphObject *) cls->tp_alloc (cls, 0);
+    if (UNLIKELY (!res)) {
+        // tp_alloc has already set the exception.
+        LSUP_graph_free (gr);
+        goto finalize;
+    }
+    res->ob_struct = gr;
+
+    // tp_alloc returns a new reference; no additional Py_INCREF is needed.
+    ret = (PyObject *) res;
+
+finalize:
+    PyBuffer_Release (&buf);
+
+    return ret;
+}
+
+
 static PyObject *
 Graph_richcmp (PyObject *self, PyObject *other, int op)
 {
@@ -135,7 +234,7 @@ Graph_add (PyObject *self, PyObject *triples)
     PyObject *trp_obj = NULL;
     int rc = 0;
     size_t i;
-    LSUP_SerTriple *sspo = STRP_DUMMY;
+    LSUP_SerTriple *sspo = LSUP_striple_new (BUF_DUMMY, BUF_DUMMY, BUF_DUMMY);
     LSUP_GraphIterator *it = LSUP_graph_add_init (
             ((GraphObject *)self)->ob_struct);
 
@@ -173,11 +272,14 @@ finalize:
 }
 
 
+// TODO
 static int Graph_remove (PyObject *self, PyObject *s, PyObject *p, PyObject *o)
 {
     return 0;
 }
 
+
+// TODO
 static PyObject *Graph_lookup (
         PyObject *self, PyObject *s, PyObject *p, PyObject *o)
 {
@@ -185,8 +287,43 @@ static PyObject *Graph_lookup (
 }
 
 
+/*
+ * Encode a graph into RDF.
+ *
+ * Python arguments:
+ *   - type: str, name of the codec to use. Only "nt" is supported.
+ *
+ * Returns a new reference to a string iterator that yields the encoded RDF
+ * one fragment at a time, or NULL with an exception set: ValueError for an
+ * unsupported codec, RuntimeError if the encoder cannot be initialized,
+ * MemoryError if the iterator object cannot be allocated.
+ *
+ * NOTE(review): the returned iterator does not hold a reference to the
+ * graph object, while the codec documentation requires the graph to outlive
+ * the encoding loop. Consider keeping a strong reference in the iterator.
+ */
+static PyObject *
+Graph_encode (PyObject *self, PyObject *args)
+{
+    const char *type;
+
+    if (! PyArg_ParseTuple (args, "s", &type)) return NULL;
+
+    const LSUP_Codec *codec;
+    if (strcmp (type, "nt") == 0) codec = &nt_codec;
+    // TODO other codecs here.
+    else {
+        PyErr_SetString (PyExc_ValueError, "Unsupported codec.");
+        return NULL;
+    }
+
+    LSUP_CodecIterator *it = codec->encode_graph_init (
+            ((GraphObject *)self)->ob_struct);
+    if (UNLIKELY (!it)) {
+        PyErr_SetString (
+                PyExc_RuntimeError, "Error initializing graph encoder.");
+        return NULL;
+    }
+
+    // Initialize the generator object.
+    StringIteratorObject *it_obj = PyObject_New (
+            StringIteratorObject, &StringIteratorType);
+    if (UNLIKELY (!it_obj)) {
+        // Do not leak the codec iterator if the wrapper cannot be created.
+        codec->encode_graph_done (it);
+        return NULL;
+    }
+
+    // PyObject_New does not zero the object: set every member explicitly.
+    it_obj->it = it;
+    it_obj->line = NULL;
+
+    // PyObject_New returns a new reference; no additional Py_INCREF needed.
+    return (PyObject *)it_obj;
+}
+
+
 static PyMethodDef Graph_methods[] = {
     {"copy", (PyCFunction) Graph_copy, METH_CLASS, "Copy a graph."},
+    {
+        "from_rdf", (PyCFunction) Graph_new_from_rdf, METH_CLASS,
+        "Create a graph from a RDF file."
+    },
     {"add", (PyCFunction) Graph_add, METH_O, "Add triples to a graph."},
     {
         "remove", (PyCFunction) Graph_remove, METH_VARARGS,
@@ -196,6 +333,10 @@ static PyMethodDef Graph_methods[] = {
         "lookup", (PyCFunction) Graph_lookup, METH_VARARGS,
         "Look triples in a graph by matching a pattern."
     },
+    {
+        "to_rdf", (PyCFunction) Graph_encode, METH_VARARGS,
+        "Encode a graph into a RDF byte buffer."
+    },
     {NULL},
 };
 
@@ -253,19 +394,19 @@ static PySequenceMethods Graph_seq_methods = {
 
 PyTypeObject GraphType = {
     PyVarObject_HEAD_INIT(NULL, 0)
-    .tp_name = "graph.Graph",
-    .tp_doc = "RDF graph",
-    .tp_basicsize = sizeof (GraphObject),
-    .tp_itemsize = 0,
-    .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_new = PyType_GenericNew,
-    .tp_init = (initproc) Graph_init,
-    .tp_dealloc = (destructor) Graph_dealloc,
-    .tp_getset = Graph_getsetters,
-    .tp_methods = Graph_methods,
-    .tp_richcompare = (richcmpfunc) Graph_richcmp,
-    .tp_as_number = &Graph_number_methods,
-    .tp_as_sequence = &Graph_seq_methods,
+    .tp_name            = "graph.Graph",
+    .tp_doc             = "RDF graph",
+    .tp_basicsize       = sizeof (GraphObject),
+    .tp_itemsize        = 0,
+    .tp_flags           = Py_TPFLAGS_DEFAULT,
+    .tp_new             = PyType_GenericNew,
+    .tp_init            = (initproc) Graph_init,
+    .tp_dealloc         = (destructor) Graph_dealloc,
+    .tp_getset          = Graph_getsetters,
+    .tp_methods         = Graph_methods,
+    .tp_richcompare     = (richcmpfunc) Graph_richcmp,
+    .tp_as_number       = &Graph_number_methods,
+    .tp_as_sequence     = &Graph_seq_methods,
 };
 
 #endif

+ 49 - 31
include/codec_base.h

@@ -4,25 +4,29 @@
 #include "graph.h"
 
 
+typedef struct codec_t LSUP_Codec;
+
+
 /** @brief Codec iterator type.
  *
- * This structure holds state data including input and output for encoding and
- * decoding RDF. Normally it should not be inspected ormanipulated directly,
+ * This structure holds state data including input and output for encoding a
+ * graph into RDF. Normally it should not be inspected or manipulated directly,
  * but rather passed to codec iteration functions for processing RDF.
  *
  * NOTE: This should be used as an opaque handle, however it is exposed here
  * for easier inclusion into each codec.
  */
 typedef struct codec_iter_t {
-    char *              rep;        // String representation of a RDF fragment.
-    LSUP_Triple *       trp;        // RDF fragment being encoded or decoded.
+    const LSUP_Codec *  codec;      // Codec that generated this iterator.
+    LSUP_Triple *       trp;        // RDF fragment being encoded.
     LSUP_GraphIterator *gr_it;      // Graph iterator.
-    LSUP_NSMap *        nsm;        // Namespace map.
+    const LSUP_NSMap *  nsm;        // Namespace map.
     size_t              cur;        // Internal cursor.
     LSUP_rc             rc;         // Internal return code.
-    char *              str_s;      // Temporary string.
-    char *              str_p;      // Temporary string.
-    char *              str_o;      // Temporary string.
+    char *              rep,        // String representation of a RDF fragment.
+         *              str_s,      // Temporary string.
+         *              str_p,      // Temporary string.
+         *              str_o;      // Temporary string.
 } LSUP_CodecIterator;
 
 
@@ -59,6 +63,9 @@ typedef LSUP_rc (*term_enc_fn_t)(
  * This prototype is to be implemented by graph encoding loops. It should
  * create an iterator and perform all initial setup for finding triples.
  *
+ * Implementations MUST set the "codec" member of the iterator to the address
+ * of the codec that generated it.
+ *
  * @param[in] gr The graph to be encoded. The graph's namespace map is used by the
  * codec for namespace prefixing. The graph may only be freed after the loop is
  * finalized.
@@ -71,24 +78,35 @@ typedef LSUP_CodecIterator * (*gr_encode_init_fn_t)(const LSUP_Graph *gr);
 
 /** @brief Perform one encoding iteration.
  *
- * Implementations of this prototype should perform all the steps to encode one
- * or more triples into a complete RDF fragment representing a complete triple
- * or a set thereof. The input unit is up to the implementation.
+ * Implementations of this prototype MUST perform all the steps to encode one
+ * or more complete triples into an RDF fragment representing those triples.
+ * The input and output units are up to the implementation and a caller SHOULD
+ * assume that multiple lines may be yielded at each iteration.
  *
  * @param[in] it Iterator handle.
  *
  * @param[out] res Handle to be populated with a string obtained from encoding.
- * This pointer must be passed initialized (it may be NULL) and should be
- * eventually freed manually at the end of the loop (it is reallocated at each
- * iteration, so memory from a previous iteration may be overwritten with new
- * data).
+ *  The output data should be UTF-8 [TODO or UTF-16] encoded. This pointer
+ *  should be eventually freed manually at the end of the loop. It is
+ *  reallocated at each iteration, so memory from a previous iteration may be
+ *  overwritten with new data.
  *
  * @return LSUP_OK if a new token was processed; LSUP_END if the end of the
  *  loop was reached.
  */
-typedef LSUP_rc (*gr_codec_iter_fn_t)(LSUP_CodecIterator *it, void **res);
+typedef LSUP_rc (*gr_encode_iter_fn_t)(
+        LSUP_CodecIterator *it, unsigned char **res);
+
 
-typedef void (*gr_codec_done_fn_t)(LSUP_CodecIterator *it);
+/** @brief Finalize an encoding operation.
+ *
+ * Implementations SHOULD use this function to perform all necessary steps to
+ * clean up memory and free the iterator handle after a graph has been
+ * completely encoded.
+ *
+ * @param[in] it Iterator handle.
+ */
+typedef void (*gr_encode_done_fn_t)(LSUP_CodecIterator *it);
 
 
 /** @brief Prototype for decoding a string into a LSUP_Term.
@@ -143,33 +161,33 @@ typedef LSUP_rc (*gr_decode_fn_t)(
  * - mimetype: MIME type (32-char max) associated with the codec.
  * - extension: File extension associated with the serialized file.
  *
- * - term_encoder: Encode a single term.
+ * - encode_term: Encode a single term.
  *
- * - gr_encode_init: Initialize a graph decoding loop.
- * - gr_encode_iter: Run one iteration of encoding on one or more triples.
- * - gr_encode_done: Finalize the encoding loop and free the support data.
+ * - encode_graph_init: Initialize a graph encoding loop.
+ * - encode_graph_iter: Run one iteration of encoding on one or more triples.
+ * - encode_graph_done: Finalize the encoding loop and free the support data.
  *
- * - term_decoder: Decode a single term.
- * - gr_decoder: Decode a RDF document into a graph.
+ * - decode_term: Decode a single term.
+ * - decode_graph: Decode a RDF document into a graph.
  *
  * For documentation on the individual encoding and decoding callbacks, see the
  * related function prototypes.
  */
-typedef struct codec_t {
+struct codec_t {
     char                name[16];       // Name of the codec.
     char                mimetype[32];   // MIME type associated with the codec.
     char                extension[8];   // Serialized file extension.
 
     // Encoding.
-    term_enc_fn_t       term_encoder;   // Term encoder function.
+    term_enc_fn_t       encode_term;    // Term encoder function.
 
-    gr_encode_init_fn_t gr_encode_init; // Graph encoder initialization.
-    gr_codec_iter_fn_t  gr_encode_iter; // Graph encoder iteration.
-    gr_codec_done_fn_t  gr_encode_done; // Graph encoder finalization.
+    gr_encode_init_fn_t encode_graph_init; // Graph encoder initialization.
+    gr_encode_iter_fn_t encode_graph_iter; // Graph encoder iteration.
+    gr_encode_done_fn_t encode_graph_done; // Graph encoder finalization.
 
     // Decoding.
-    term_decode_fn_t    term_decoder;   // Term decoder function.
-    gr_decode_fn_t      gr_decoder;     // Graph decoder function.
-} LSUP_Codec;
+    term_decode_fn_t    decode_term;   // Term decoder function.
+    gr_decode_fn_t      decode_graph;     // Graph decoder function.
+};
 
 #endif

+ 3 - 1
setup.py

@@ -11,6 +11,7 @@ EXT_DIR = path.join(ROOT_DIR, 'ext')
 
 sources = (
     glob(path.join(SRC_DIR, '*.c')) +
+    glob(path.join(SRC_DIR, 'codec', '*.c')) +
     glob(path.join(MOD_DIR, '*.c')) +
     [
         path.join(EXT_DIR, 'openldap', 'libraries', 'liblmdb', 'mdb.c'),
@@ -22,7 +23,7 @@ sources = (
 debug = True
 
 compile_args = [
-    '-std=c99',
+    # '-std=c99',
 ]
 if debug:
     compile_args.extend(['-DDEBUG', '-g3', '-O0'])
@@ -44,6 +45,7 @@ setup(
             "_lsup_rdf",
             sources,
             include_dirs=[
+                ROOT_DIR,
                 INCL_DIR,
                 path.join(EXT_DIR, 'uthash', 'src'),
             ],

+ 35 - 30
src/codec_nt.c

@@ -98,27 +98,13 @@ term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 
 
 static LSUP_CodecIterator *
-gr_to_nt_init (const LSUP_Graph *gr)
-{
-    LSUP_CodecIterator *it;
-    MALLOC_GUARD (it, NULL);
-    LSUP_Triple lut = {NULL, NULL, NULL};
-
-    it->gr_it = LSUP_graph_lookup(gr, &lut, &it->cur);
-    it->nsm = LSUP_graph_namespace (gr);
-    it->cur = 0;
-    it->trp = LSUP_triple_new (TERM_DUMMY, TERM_DUMMY, TERM_DUMMY);
-    it->rep = NULL;
-    it->str_s = NULL;
-    it->str_p = NULL;
-    it->str_o = NULL;
-
-    return it;
-}
+gr_to_nt_init (const LSUP_Graph *gr);
 
 
 static LSUP_rc
-gr_to_nt_iter (LSUP_CodecIterator *it, void **res) {
+gr_to_nt_iter (LSUP_CodecIterator *it, unsigned char **res) {
+    *res = NULL;
+
     LSUP_rc rc = LSUP_graph_iter_next (it->gr_it, it->trp);
     if (rc != LSUP_OK) return rc;
 
@@ -126,15 +112,13 @@ gr_to_nt_iter (LSUP_CodecIterator *it, void **res) {
     term_to_nt (it->trp->p, it->nsm, &it->str_p);
     term_to_nt (it->trp->o, it->nsm, &it->str_o);
 
-    char *tmp = realloc (
+    // 3 term separators + dot + newline + terminal = 6
+    unsigned char *tmp = realloc (
             *res, strlen (it->str_s) + strlen (it->str_p)
             + strlen (it->str_o) + 6);
-    if (UNLIKELY (!tmp)) {
-        *res = NULL;
-        return LSUP_MEM_ERR;
-    }
+    if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
 
-    sprintf (tmp, "%s %s %s .\n", it->str_s, it->str_p, it->str_o);
+    sprintf ((char*)tmp, "%s %s %s .\n", it->str_s, it->str_p, it->str_o);
     *res = tmp;
 
     it->cur++;
@@ -161,14 +145,14 @@ const LSUP_Codec nt_codec = {
     .mimetype           = "application/n-triples",
     .extension          = "nt",
 
-    .term_encoder       = term_to_nt,
+    .encode_term        = term_to_nt,
 
-    .gr_encode_init     = gr_to_nt_init,
-    .gr_encode_iter     = gr_to_nt_iter,
-    .gr_encode_done     = gr_to_nt_done,
+    .encode_graph_init  = gr_to_nt_init,
+    .encode_graph_iter  = gr_to_nt_iter,
+    .encode_graph_done  = gr_to_nt_done,
 
-    .term_decoder       = LSUP_nt_parse_term,
-    .gr_decoder         = LSUP_nt_parse_doc,
+    .decode_term        = LSUP_nt_parse_term,
+    .decode_graph       = LSUP_nt_parse_doc,
 };
 
 
@@ -190,6 +174,27 @@ static inline char replace_char(const char c) {
 }
 
 
+/*
+ * Initialize a N-Triples encoding loop for a graph.
+ *
+ * Allocates a codec iterator, starts a lookup with an all-NULL triple
+ * pattern on the graph, and sets up the namespace map and the scratch
+ * triple and strings used by the iteration function.
+ *
+ * Returns the new iterator handle. MALLOC_GUARD is given NULL as the value
+ * to return if the allocation fails.
+ */
+static LSUP_CodecIterator *
+gr_to_nt_init (const LSUP_Graph *gr)
+{
+    LSUP_CodecIterator *it;
+    MALLOC_GUARD (it, NULL);
+    LSUP_Triple lut = {NULL, NULL, NULL};
+
+    // Required by the codec interface: lets generic callers reach the
+    // iteration and finalization callbacks from the iterator alone.
+    it->codec = &nt_codec;
+
+    // The lookup writes into it->cur (presumably a match count — TODO
+    // confirm); that value is discarded right below, where the member is
+    // reset to serve as the iteration cursor.
+    it->gr_it = LSUP_graph_lookup(gr, &lut, &it->cur);
+    it->nsm = LSUP_graph_namespace (gr);
+    it->cur = 0;
+
+    // Previously left indeterminate.
+    it->rc = LSUP_OK;
+
+    it->trp = LSUP_triple_new (TERM_DUMMY, TERM_DUMMY, TERM_DUMMY);
+    it->rep = NULL;
+    it->str_s = NULL;
+    it->str_p = NULL;
+    it->str_o = NULL;
+
+    return it;
+}
+
+
 /** @brief Add escape character (backslash) to illegal literal characters.
  */
 static LSUP_rc

+ 11 - 11
test/test_codec_nt.c

@@ -133,18 +133,18 @@ test_encode_nt_term()
     char *out = NULL;
 
     // Test that passing a NS map has no effect.
-    EXPECT_PASS (nt_codec.term_encoder (terms[0], nsm, &out));
+    EXPECT_PASS (nt_codec.encode_term (terms[0], nsm, &out));
     EXPECT_STR_EQ (out, end_nt[0]);
 
     for (int i = 0; i < TERM_CT - 2; i++) {
-        EXPECT_PASS (nt_codec.term_encoder (terms[i], NULL, &out));
+        EXPECT_PASS (nt_codec.encode_term (terms[i], NULL, &out));
         EXPECT_STR_EQ (out, end_nt[i]);
     }
 
-    EXPECT_INT_EQ (nt_codec.term_encoder (terms[8], NULL, &out), LSUP_VALUE_ERR);
+    EXPECT_INT_EQ (nt_codec.encode_term (terms[8], NULL, &out), LSUP_VALUE_ERR);
     ASSERT (out == NULL, "Encoding of undefined term should be NULL!");
 
-    EXPECT_INT_EQ (nt_codec.term_encoder (terms[9], NULL, &out), LSUP_VALUE_ERR);
+    EXPECT_INT_EQ (nt_codec.encode_term (terms[9], NULL, &out), LSUP_VALUE_ERR);
     ASSERT (out == NULL, "Encoding of undefined term should be NULL!");
 
     free (out);
@@ -164,17 +164,17 @@ static int test_encode_nt_graph()
     LSUP_graph_add_trp (gr, trp, &ins);
 
     char *out = calloc (1, 1);
-    LSUP_CodecIterator *it = nt_codec.gr_encode_init (gr);
+    LSUP_CodecIterator *it = nt_codec.encode_graph_init (gr);
     ASSERT (it != NULL, "Error creating codec iterator!");
 
-    char *tmp = NULL;
+    char *tmp;
     LSUP_rc rc;
-    while ((rc = nt_codec.gr_encode_iter (it, (void**)&tmp)) != LSUP_END) {
+    while ((rc = nt_codec.encode_graph_iter (it, (unsigned char **)&tmp)) != LSUP_END) {
         ASSERT (rc >= 0, "Encoding step failed!");
         out = realloc (out, strlen(out) + strlen (tmp) + 1);
         out = strcat (out, tmp);
     }
-    nt_codec.gr_encode_done (it);
+    nt_codec.encode_graph_done (it);
     free (tmp);
     LSUP_graph_free (gr);
     //printf("Serialized graph: %s\n", out);
@@ -193,7 +193,7 @@ test_decode_nt_term()
 {
     for (int i = 0; i < TERM_CT - 2; i++) {
         LSUP_Term *term;
-        EXPECT_PASS (nt_codec.term_decoder (start_nt[i], NULL, &term));
+        EXPECT_PASS (nt_codec.decode_term (start_nt[i], NULL, &term));
         LSUP_term_free (term);
     }
 
@@ -209,7 +209,7 @@ test_decode_nt_graph()
     LSUP_Graph *gr;
     size_t ct;
     char *err;
-    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct, &err));
+    EXPECT_PASS (nt_codec.decode_graph (input, &gr, &ct, &err));
 
     fclose (input);
 
@@ -234,7 +234,7 @@ test_decode_nt_bad_graph()
     LSUP_Graph *gr;
     size_t ct;
     char *err;
-    EXPECT_INT_EQ (nt_codec.gr_decoder (input, &gr, &ct, &err), LSUP_VALUE_ERR);
+    EXPECT_INT_EQ (nt_codec.decode_graph (input, &gr, &ct, &err), LSUP_VALUE_ERR);
 
     TRACE ("Error: %s", err);
     ASSERT (strstr (err, "`dc:title") != NULL, "Wrong error string report!");