ソースを参照

N-Triples term encoder + light tests.

Stefano Cossu 3 年 前
コミット
7ecd6a539c
8 ファイル変更 386 行追加 20 行削除
  1. 6 4
      cpython/py_lsup_rdf.c
  2. 83 0
      include/codec_base.h
  3. 10 0
      include/codec_nt.h
  4. 8 14
      include/term.h
  5. 177 0
      src/codec_nt.c
  6. 2 2
      src/term.c
  7. 2 0
      test.c
  8. 98 0
      test/test_codec_nt.c

+ 6 - 4
cpython/py_lsup_rdf.c

@@ -42,10 +42,12 @@ PyInit_term()
     PyObject *m = PyModule_Create(&term_mod);
     if (m == NULL) return NULL;
 
-#define ENTRY(a, b) \
-    if (PyModule_AddIntConstant (m, "TERM_" #a, b) < 0) return NULL;
-    TTYPE_TBL
-#undef ENTRY
+    if (
+        PyModule_AddIntConstant (m, "TERM_UNDEFINED", LSUP_TERM_UNDEFINED) < 0
+        || PyModule_AddIntConstant (m, "TERM_URI", LSUP_TERM_URI) < 0
+        || PyModule_AddIntConstant (m, "TERM_BNODE", LSUP_TERM_BNODE) < 0
+        || PyModule_AddIntConstant (m, "TERM_LITERAL", LSUP_TERM_LITERAL) < 0
+    ) return NULL;
 
     Py_INCREF(&TermType);
     if (PyModule_AddObject(m, "Term", (PyObject *) &TermType) < 0) {

+ 83 - 0
include/codec_base.h

@@ -0,0 +1,83 @@
+#ifndef _LSUP_CODEC_BASE_H
+#define _LSUP_CODEC_BASE_H
+
+#include "graph.h"
+
+
+/** @brief Term encoder callback type.
+ *
+ * @param[in] term Single term handle.
+ *
+ * @param[in] nsm Namespace map. May be NULL for no prefix shortening.
+ *
+ * @param[out] rep Pointer to a string to be filled with the encoded term. The
+ *  caller is in charge of freeing the string after use. Its value is
+ *  undefined on error.
+ *
+ * @return LSUP_OK on successful encoding; <0 for other errors.
+ */
+typedef LSUP_rc (*term_enc_fn_t)(
+        const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep);
+
+
+/** @brief Term decoder callback type.
+ *
+ * TODO Contract not finalized; the notes below are read off the signature.
+ *
+ * @param[in] rep String to decode (presumably a serialized term in the
+ *  codec's format -- confirm).
+ *
+ * @param[in] nsm Namespace map.
+ *
+ * @param[out] term Pointer that receives the decoded term handle.
+ *  NOTE(review): ownership of the resulting term is not specified here.
+ *
+ * @return An LSUP_rc code; presumably LSUP_OK on success and <0 on error,
+ *  mirroring the encoder callbacks -- confirm.
+ */
+typedef LSUP_rc (*term_dec_fn_t)(
+        const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term);
+
+
+/** @brief Triple encoder callback type.
+ *
+ * @param[in] trp Triple(s) to encode, passed as an array. NOTE(review): the
+ *  signature carries no element count; how the end of the array is
+ *  determined is not specified here -- confirm.
+ *
+ * @param[in] nsm Namespace map.
+ *
+ * @param[out] rep Pointer to a string that receives the encoded output.
+ *
+ * @return LSUP_OK on successful encoding; <0 for other errors.
+ */
+typedef LSUP_rc (*trp_enc_fn_t)(
+        const LSUP_Triple trp[], const LSUP_NSMap *nsm, char **rep);
+
+
+/** @brief Triple decoder callback type.
+ *
+ * TODO Contract not finalized; the notes below are read off the signature.
+ *
+ * @param[in] rep String to decode.
+ *
+ * @param[in] nsm Namespace map.
+ *
+ * @param[out] trp Pointer that receives the decoded triple(s).
+ *  NOTE(review): ownership and element count are not specified here.
+ *
+ * @return An LSUP_rc code; presumably LSUP_OK on success and <0 on error,
+ *  mirroring the encoder callbacks -- confirm.
+ */
+typedef LSUP_rc (*trp_dec_fn_t)(
+        const char *rep, const LSUP_NSMap *nsm, LSUP_Triple **trp);
+
+
+/** @brief Graph encoder callback type.
+ *
+ * @param[in] gr Graph handle to encode.
+ *
+ * @param[out] rep Pointer to a string that receives the encoded graph.
+ *
+ * @return LSUP_OK on successful encoding; <0 for other errors.
+ */
+typedef LSUP_rc (*gr_enc_fn_t)(const LSUP_Graph *gr, char **rep);
+
+
+/** @brief Graph decoder callback type.
+ *
+ * TODO Contract not finalized; the notes below are read off the signature.
+ *
+ * @param[in] rep String to decode.
+ *
+ * @param[out] gr Pointer that receives the decoded graph handle.
+ *  NOTE(review): ownership of the resulting graph is not specified here.
+ *
+ * @return An LSUP_rc code; presumably LSUP_OK on success and <0 on error,
+ *  mirroring the encoder callbacks -- confirm.
+ */
+typedef LSUP_rc (*gr_dec_fn_t)(const char *rep, LSUP_Graph **gr);
+
+
+/** @brief Codec structure.
+ *
+ * An instance of this structure is usually defined at compile time (see
+ * examples in "include/codec_*.h" and "src/codec_*.c") and should have the
+ * following defined:
+ *
+ * - name: A brief, human-readable string identifying the codec. The field is
+ *   16 bytes wide, i.e. at most 15 characters if it is to remain a
+ *   NUL-terminated C string.
+ * - mimetype: MIME type associated with the codec (32-byte field).
+ * - extension: File extension associated with the serialized file (8-byte
+ *   field).
+ * - term_encoder: Callback function for encoding a single term.
+ * - term_decoder: Callback function for decoding a single term.
+ * - trp_encoder: Callback function for encoding triples.
+ * - trp_decoder: Callback function for decoding triples.
+ * - gr_encoder: Callback function for encoding a graph.
+ * - gr_decoder: Callback function for decoding a graph.
+ *
+ * There is no validation enforced, but at least the name, mimetype and
+ * extension, as well as one or more encoding functions and their respective
+ * decoding functions, should be defined in a codec.
+ */
+typedef struct codec_t {
+    char                name[16];       // Name of the codec.
+    char                mimetype[32];   // MIME type associated with the codec.
+    char                extension[8];   // Serialized file extension.
+    term_enc_fn_t       term_encoder;   // Term encoder function.
+    term_dec_fn_t       term_decoder;   // Term decoder function.
+    trp_enc_fn_t        trp_encoder;    // Triple encoder function.
+    trp_dec_fn_t        trp_decoder;    // Triple decoder function.
+    gr_enc_fn_t         gr_encoder;     // Graph encoder function.
+    gr_dec_fn_t         gr_decoder;     // Graph decoder function.
+} LSUP_Codec;
+
+#endif

+ 10 - 0
include/codec_nt.h

@@ -0,0 +1,10 @@
+#ifndef _LSUP_CODEC_NT_H
+#define _LSUP_CODEC_NT_H
+
+#include "codec_base.h"
+
+/** @brief N-Triples codec.
+ *
+ * Read-only codec instance; the callbacks are reached through its members,
+ * e.g. `nt_codec.term_encoder (term, nsm, &out)`.
+ */
+extern const LSUP_Codec nt_codec;
+
+#endif

+ 8 - 14
include/term.h

@@ -18,27 +18,21 @@
 
 typedef XXH64_hash_t LSUP_TermHash64;
 typedef char langtag[LANG_SIZE];
+typedef char LSUP_term_type;
 
-#define TTYPE_TBL \
-    ENTRY (UNDEFINED,     0) \
-    ENTRY (URI,           1) \
-    ENTRY (BNODE,         2) \
-    ENTRY (LITERAL,       3)
-
-typedef enum LSUP_term_type {
-#define ENTRY(a, b) LSUP_TERM_##a = b,
-    TTYPE_TBL
-#undef ENTRY
-} LSUP_term_type;
+#define LSUP_TERM_UNDEFINED      0
+#define LSUP_TERM_URI            1
+#define LSUP_TERM_BNODE          2
+#define LSUP_TERM_LITERAL        3
 
 typedef struct LSUP_Term {
-    LSUP_term_type type;
+    char *data;
+    char *datatype;
     // This language variable currently supports a 2-digit ISO 639 language
     // code and a 2-character ISO 3166-1 country code, separated by a hyphen.
     // See https://tools.ietf.org/html/bcp47#section-2.1
     langtag lang;
-    char *datatype;
-    char *data;
+    LSUP_term_type type;
 } LSUP_Term;
 
 

+ 177 - 0
src/codec_nt.c

@@ -0,0 +1,177 @@
+#include "codec_nt.h"
+
+/** @brief List of characters to be escaped in serialized literals.
+ *
+ * Used as the reject set for strcspn() when scanning literal data.
+ *
+ * https://www.w3.org/TR/n-triples/#grammar-production-ECHAR
+ */
+#define LIT_ECHAR "\t\b\n\r\f\"\'\\"
+
+/** @brief Regex of characters to be escaped in serialized IRIs.
+ *
+ * NOTE(review): the `\x00` escape places a NUL byte right after the opening
+ * bracket, so any C string function (or regex compiler taking a C string)
+ * will only see "[". Confirm the intended pattern before using this macro.
+ *
+ * https://www.w3.org/TR/n-triples/#grammar-production-IRIREF
+ */
+#define IRI_ECHAR_PTN "[\x00-\x20<>\"\\{\\}\\|\\^`\\\\]"
+
+/** @brief Default NT literal type.
+ *
+ * Literals carrying this data type are serialized without a `^^` suffix.
+ */
+#define XSD_STRING "http://www.w3.org/2001/XMLSchema#string"
+
+
+/* * * Static prototypes. * * */
+
+static LSUP_rc escape_lit (const char *in, char **out_p);
+
+
+/* * * Codec functions. * * */
+
+/** @brief Encode a single term as an N-Triples string.
+ *
+ * URIs are wrapped in angle brackets, blank nodes are prefixed with `_:` and
+ * literals are double-quoted with their data escaped by escape_lit(). A
+ * literal data type other than xsd:string is appended after `^^`, and a
+ * non-empty language tag after `@`.
+ *
+ * NOTE(review): the data type IRI is appended verbatim, without enclosing
+ * angle brackets, and both the data type and the language tag are emitted if
+ * both are set. This matches the existing tests but should be checked against
+ * the N-Triples grammar.
+ *
+ * @param[in] term Term to encode. Its `data` member must be a valid string.
+ *
+ * @param[in] nsm Namespace map. Unused: this encoder always writes full IRIs.
+ *
+ * @param[out] out_p Receives a newly allocated string that the caller must
+ *  free, or NULL on failure.
+ *
+ * @return LSUP_OK on success; LSUP_VALUE_ERR if the term type cannot be
+ *  encoded; LSUP_MEM_ERR on allocation failure; otherwise the error returned
+ *  by escape_lit().
+ */
+static LSUP_rc
+term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
+{
+    (void) nsm;
+
+    LSUP_rc rc = LSUP_OK;
+    char *out = NULL;
+
+    switch (term->type) {
+        case LSUP_TERM_URI: {
+            // '<' + data + '>' + NUL.
+            const size_t buf_len = strlen (term->data) + 3;
+
+            out = malloc (buf_len);
+            if (UNLIKELY (!out)) {
+                rc = LSUP_MEM_ERR;
+                break;
+            }
+            snprintf (out, buf_len, "<%s>", term->data);
+            break;
+        }
+
+        case LSUP_TERM_LITERAL: {
+            // Escape first: the buffer must be sized on the escaped length,
+            // which may exceed the length of the raw data.
+            char *escaped = NULL;
+            rc = escape_lit (term->data, &escaped);
+            if (rc != LSUP_OK) break;
+
+            // Always suppress xsd:string data type.
+            const char *dtype = (
+                    term->datatype
+                    && strcmp (term->datatype, XSD_STRING) != 0)
+                ? term->datatype : NULL;
+            const char *lang = term->lang[0] != '\0' ? term->lang : NULL;
+
+            size_t buf_len = strlen (escaped) + 3;          // "" + NUL
+            if (dtype) buf_len += strlen (dtype) + 2;       // ^^
+            if (lang) buf_len += strlen (lang) + 1;         // @
+
+            out = malloc (buf_len);
+            if (UNLIKELY (!out)) {
+                free (escaped);
+                rc = LSUP_MEM_ERR;
+                break;
+            }
+
+            snprintf (
+                    out, buf_len, "\"%s\"%s%s%s%s", escaped,
+                    dtype ? "^^" : "", dtype ? dtype : "",
+                    lang ? "@" : "", lang ? lang : "");
+            free (escaped);
+            break;
+        }
+
+        case LSUP_TERM_BNODE: {
+            // '_' + ':' + data + NUL.
+            const size_t buf_len = strlen (term->data) + 3;
+
+            out = malloc (buf_len);
+            if (UNLIKELY (!out)) {
+                rc = LSUP_MEM_ERR;
+                break;
+            }
+            snprintf (out, buf_len, "_:%s", term->data);
+            break;
+        }
+
+        default:
+            rc = LSUP_VALUE_ERR;
+    }
+
+    *out_p = out;
+    return rc;
+}
+
+
+/** @brief Decode an N-Triples string into a term (not implemented yet).
+ *
+ * All arguments are currently ignored.
+ *
+ * @return Always LSUP_NOT_IMPL_ERR.
+ */
+static LSUP_rc
+nt_to_term (const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term)
+{
+    (void) rep;
+    (void) nsm;
+    (void) term;
+
+    return LSUP_NOT_IMPL_ERR; // TODO
+}
+
+
+/** @brief Encode a graph as an N-Triples document (not implemented yet).
+ *
+ * All arguments are currently ignored.
+ *
+ * @return Always LSUP_NOT_IMPL_ERR.
+ */
+static LSUP_rc
+gr_to_nt (const LSUP_Graph *gr, char **rep)
+{
+    (void) gr;
+    (void) rep;
+
+    return LSUP_NOT_IMPL_ERR; // TODO
+}
+
+
+/** @brief Decode an N-Triples document into a graph (not implemented yet).
+ *
+ * All arguments are currently ignored.
+ *
+ * @return Always LSUP_NOT_IMPL_ERR.
+ */
+static LSUP_rc
+nt_to_gr (const char *rep, LSUP_Graph **gr)
+{
+    (void) rep;
+    (void) gr;
+
+    return LSUP_NOT_IMPL_ERR; // TODO
+}
+
+
+/* N-Triples codec instance. No triple-level callbacks are provided, so those
+ * members are NULL.
+ */
+const LSUP_Codec nt_codec = {
+    .name           = "N-Triples",
+    .mimetype       = "application/n-triples",
+    .extension      = "nt",
+
+    .term_encoder   = term_to_nt,
+    .term_decoder   = nt_to_term,
+
+    .trp_encoder    = NULL,
+    .trp_decoder    = NULL,
+
+    .gr_encoder     = gr_to_nt,
+    .gr_decoder     = nt_to_gr,
+};
+
+
+/* * * Other internal functions. * * */
+
+/** Map a control character to the letter of its escape sequence.
+ *
+ *  Tab, backspace, newline, carriage return and form feed become 't', 'b',
+ *  'n', 'r' and 'f' respectively; any other character is returned unchanged.
+ *  The leading backslash is not produced here and must be written by the
+ *  caller.
+ */
+static inline char replace_char(const char c) {
+    if (c == '\t') return 't';
+    if (c == '\b') return 'b';
+    if (c == '\n') return 'n';
+    if (c == '\r') return 'r';
+    if (c == '\f') return 'f';
+
+    return c;
+}
+
+
+/** @brief Add escape character (backslash) to illegal literal characters.
+ *
+ * Every character of `in` that appears in LIT_ECHAR is written as a backslash
+ * followed by the result of replace_char(); all other characters are copied
+ * unchanged.
+ *
+ * @param[in] in NUL-terminated string to escape.
+ *
+ * @param[out] out_p Receives a newly allocated, NUL-terminated escaped string
+ *  that the caller must free. Not modified on failure.
+ *
+ * @return LSUP_OK on success; LSUP_MEM_ERR if the output buffer could not be
+ *  allocated.
+ */
+static LSUP_rc
+escape_lit (const char *in, char **out_p)
+{
+    const size_t in_len = strlen (in);
+
+    // Each escaped character takes one extra byte for the backslash.
+    size_t esc_ct = 0;
+    for (
+            size_t i = strcspn (in, LIT_ECHAR);
+            i < in_len;
+            i += strcspn (in + i + 1, LIT_ECHAR) + 1) {
+        esc_ct ++;
+    }
+
+    char *out = malloc (in_len + esc_ct + 1);
+    if (UNLIKELY (!out)) return LSUP_MEM_ERR;
+
+    size_t i = 0, j = 0;
+    while (i < in_len) {
+        // Copy the run of ordinary characters up to the next one to escape.
+        const size_t run = strcspn (in + i, LIT_ECHAR);
+        memcpy (out + j, in + i, run);
+        i += run;
+        j += run;
+        if (i >= in_len) break;
+
+        out[j++] = '\\';
+        out[j++] = replace_char (in[i++]);
+    }
+    out[j] = '\0';
+
+    *out_p = out;
+    return LSUP_OK;
+}

+ 2 - 2
src/term.c

@@ -193,7 +193,7 @@ LSUP_term_serialize (const LSUP_Term *term, LSUP_Buffer *sterm)
 
         if (strlen (term->lang) > 0) {
             lang_idx = size;
-            size += LANG_SIZE;
+            size += strlen (term->lang) + 1;
         }
     }
 
@@ -211,7 +211,7 @@ LSUP_term_serialize (const LSUP_Term *term, LSUP_Buffer *sterm)
 
         // Copy lang tag.
         if (strlen (term->lang) > 0)
-            memcpy (sterm->addr + lang_idx, term->lang, LANG_SIZE);
+            strcpy (sterm->addr + lang_idx, term->lang);
     }
 
     return LSUP_OK;

+ 2 - 0
test.c

@@ -1,5 +1,6 @@
 #include "test_term.c"
 #include "test_namespace.c"
+#include "test_codec_nt.c"
 #include "test_store_ht.c"
 #include "test_store_mdb.c"
 #include "test_graph.c"
@@ -16,6 +17,7 @@ int main(int argc, char **argv) {
     if (
         term_tests() ||
         namespace_tests() ||
+        codec_nt_tests() ||
         store_ht_tests() ||
         store_mdb_tests() ||
         graph_tests() ||

+ 98 - 0
test/test_codec_nt.c

@@ -0,0 +1,98 @@
+#include "test.h"
+#include "codec_nt.h"
+
+/* Exercise nt_codec.term_encoder on URIs, literals (plain, language-tagged,
+ * typed and with characters needing escapes), a blank node and two terms
+ * that are expected to be rejected with LSUP_VALUE_ERR.
+ */
+static int
+test_encode_nt_term()
+{
+    LSUP_Term *uri1 = LSUP_uri_new ("urn:local:s1");
+    LSUP_Term *uri2 = LSUP_uri_new ("http://example.org/p1");
+    LSUP_Term *lit1 = LSUP_term_new (LSUP_TERM_LITERAL, "hello", NULL, NULL);
+    LSUP_Term *lit2 = LSUP_term_new (
+            LSUP_TERM_LITERAL, "hello", NULL, "en-US");
+    LSUP_Term *lit3 = LSUP_term_new (
+            LSUP_TERM_LITERAL, "hello",
+            "http://www.w3.org/2001/XMLSchema#string", "es-ES");
+    LSUP_Term *lit4 = LSUP_term_new (
+            LSUP_TERM_LITERAL, "25",
+            "http://www.w3.org/2001/XMLSchema#integer", NULL);
+    LSUP_Term *lit5 = LSUP_term_new (
+            LSUP_TERM_LITERAL, "This \\is\\ a \"multi-line\"\n'literal'\t.",
+            NULL, NULL);
+    LSUP_Term *bnode1 = LSUP_term_new (LSUP_TERM_BNODE, "bn1", NULL, NULL);
+    LSUP_Term *undef1 = LSUP_term_new (
+            LSUP_TERM_UNDEFINED, "bogus", NULL, NULL);
+    LSUP_Term *undef2 = TERM_DUMMY;
+
+    // NOTE(review): nsm is never released in this function -- confirm
+    // whether a matching free call is needed.
+    LSUP_NSMap *nsm = LSUP_nsmap_new();
+    LSUP_nsmap_add (nsm, "local", "urn:local:");
+    LSUP_nsmap_add (nsm, "ext", "http://example.org");
+
+    char *out;
+    EXPECT_PASS (nt_codec.term_encoder (uri1, NULL, &out));
+    EXPECT_STR_EQ (out, "<urn:local:s1>");
+    free (out);
+
+    // Passing a namespace map must not shorten the IRI in the output.
+    EXPECT_PASS (nt_codec.term_encoder (uri1, nsm, &out));
+    EXPECT_STR_EQ (out, "<urn:local:s1>");
+    free (out);
+
+    EXPECT_PASS (nt_codec.term_encoder (uri2, NULL, &out));
+    EXPECT_STR_EQ (out, "<http://example.org/p1>");
+    free (out);
+
+    EXPECT_PASS (nt_codec.term_encoder (lit1, NULL, &out));
+    EXPECT_STR_EQ (out, "\"hello\"");
+    free (out);
+
+    EXPECT_PASS (nt_codec.term_encoder (lit2, NULL, &out));
+    EXPECT_STR_EQ (out, "\"hello\"@en-US");
+    free (out);
+
+    // The xsd:string data type is expected to be suppressed in the output.
+    EXPECT_PASS (nt_codec.term_encoder (lit3, NULL, &out));
+    EXPECT_STR_EQ (
+            out, "\"hello\"@es-ES");
+    free (out);
+
+    // NOTE(review): the expected data type IRI has no enclosing angle
+    // brackets -- check against the N-Triples literal grammar.
+    EXPECT_PASS (nt_codec.term_encoder (lit4, NULL, &out));
+    EXPECT_STR_EQ (
+            out, "\"25\"^^http://www.w3.org/2001/XMLSchema#integer");
+    free (out);
+
+    // Backslashes, quotes, newline and tab must each come out escaped.
+    EXPECT_PASS (nt_codec.term_encoder (lit5, NULL, &out));
+    EXPECT_STR_EQ (
+            out,
+            "\"This \\\\is\\\\ a \\\"multi-line\\\"\\n\\'literal\\'\\t.\"");
+    free (out);
+
+    EXPECT_PASS (nt_codec.term_encoder (bnode1, NULL, &out));
+    EXPECT_STR_EQ (out, "_:bn1");
+    free (out);
+
+    // Unencodable term types must yield LSUP_VALUE_ERR and a NULL string.
+    EXPECT_INT_EQ (nt_codec.term_encoder (undef1, NULL, &out), LSUP_VALUE_ERR);
+    ASSERT (out == NULL, "Encoding of undefined term should be NULL!");
+    free (out);
+
+    EXPECT_INT_EQ (nt_codec.term_encoder (undef2, NULL, &out), LSUP_VALUE_ERR);
+    ASSERT (out == NULL, "Encoding of undefined term should be NULL!");
+    free (out);
+
+    LSUP_term_free (uri1);
+    LSUP_term_free (uri2);
+    LSUP_term_free (lit1);
+    LSUP_term_free (lit2);
+    LSUP_term_free (lit3);
+    LSUP_term_free (lit4);
+    LSUP_term_free (lit5);
+    LSUP_term_free (bnode1);
+    LSUP_term_free (undef1);
+    LSUP_term_free (undef2);
+
+    return 0;
+}
+
+
+/* Entry point for the N-Triples codec test group; called from main() in
+ * test.c. Returns 0 once the listed tests have been run through RUN.
+ */
+int codec_nt_tests()
+{
+    RUN (test_encode_nt_term);
+    return 0;
+}