Browse Source

Rearrange token parsing; start testing bad triples.

Stefano Cossu 2 years ago
parent
commit
91491632bf
10 changed files with 182 additions and 122 deletions
  1. 4 1
      TODO.md
  2. 18 0
      include/codec.h
  3. 4 4
      include/namespace.h
  4. 1 0
      src/codec.c
  5. 17 32
      src/codec/codec_nt.c
  6. 15 31
      src/codec/codec_ttl.c
  7. 10 12
      src/codec/grammar_ttl.y
  8. 95 33
      src/codec/lexer_ttl.re
  9. 9 9
      src/namespace.c
  10. 9 0
      test/test_codec_ttl.c

+ 4 - 1
TODO.md

@@ -25,20 +25,23 @@
 - *D* Flexible store interface
 - *D* Transaction control
 - *W* Turtle serialization / deserialization
+  - *D* TTL decoder
+  - *W* TTL encoder
 - *P* Full UTF-8 support
 - *P* Extended tests
     - *P* C API
+      - *P* Codec (full W3C suite + custom cases)
     - *P* Python API
 
 
 ## Non-critical for MVP
 
+- Graph checksum and semantic equality
 - Term and triple validation
 - Enhanced graph operations
     - Extract unique terms and 2-term tuples
 - NQ codec
 - TriG codec
-- UTF-16 support
 
 
 ## Long-term

+ 18 - 0
include/codec.h

@@ -248,6 +248,23 @@ inline uint8_t
 { return (uint8_t *) strndup ((char *) str, size); }
 
 
+/** @brief Get the letter that represents a control character in an escape.
+ *
+ * Maps tab, backspace, newline, carriage return and form feed to the letters
+ * `t`, `b`, `n`, `r` and `f` respectively. Any other character is returned
+ * unchanged.
+ *
+ * Only the character following the backslash is produced here; the escape
+ * backslash is to be added separately.
+ *
+ * @param[in] c Character to be escaped.
+ *
+ * @return Letter to write after the backslash, or `c` itself if it is not
+ *  one of the characters listed above.
+ */
+static inline char
+escape_char (const char c) {
+    switch (c) {
+        case '\t': return 't';
+        case '\b': return 'b';
+        case '\n': return 'n';
+        case '\r': return 'r';
+        case '\f': return 'f';
+        default: return c;
+    }
+}
+
+
 /** @brief Unescape a single character.
  *
  * Convert escaped special characters such as `\t`, `\n`, etc. into their
@@ -273,6 +290,7 @@ unescape_char (const char c)
     }
 }
 
+
 /** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
  *
  * @param[in] esc_str Escaped string.

+ 4 - 4
include/namespace.h

@@ -79,7 +79,7 @@ LSUP_nsmap_remove (LSUP_NSMap *map, const char *pfx);
  *  should not be modified directly.
  */
 const char *
-LSUP_nsmap_get_ns (LSUP_NSMap *map, const char *pfx);
+LSUP_nsmap_get_ns (const LSUP_NSMap *map, const char *pfx);
 
 
 /** @brief Get the prefix for a namespace.
@@ -91,7 +91,7 @@ LSUP_nsmap_get_ns (LSUP_NSMap *map, const char *pfx);
  * @return Found prefix, or NULL if the namespace is not mapped.
  */
 const char *
-LSUP_nsmap_get_pfx (LSUP_NSMap *map, const char *ns);
+LSUP_nsmap_get_pfx (const LSUP_NSMap *map, const char *ns);
 
 
 /** @brief Convert a namespace-prefixed string to a FQ URI string if mapped.
@@ -109,7 +109,7 @@ LSUP_nsmap_get_pfx (LSUP_NSMap *map, const char *ns);
  */
 LSUP_rc
 LSUP_nsmap_normalize_uri (
-        LSUP_NSMap *map, const char *pfx_uri, char **fq_uri);
+        const LSUP_NSMap *map, const char *pfx_uri, char **fq_uri);
 
 
 /** @brief Convert a FQ URI string to a prefixed string if the prefix is found.
@@ -133,7 +133,7 @@ LSUP_nsmap_normalize_uri (
  */
 LSUP_rc
 LSUP_nsmap_denormalize_uri (
-        LSUP_NSMap *map, const char *fq_uri, char **pfx_uri);
+        const LSUP_NSMap *map, const char *fq_uri, char **pfx_uri);
 
 
 /** @brief Dump all entries of a namespace map.

+ 1 - 0
src/codec.c

@@ -220,6 +220,7 @@ LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol)
  * Extern inline functions.
  */
 
+char escape_char (const char c);
 char unescape_char (const char c);
 uint8_t *uint8_dup (const uint8_t *str);
 uint8_t *uint8_ndup (const uint8_t *str, size_t size);

+ 17 - 32
src/codec/codec_nt.c

@@ -24,20 +24,24 @@ static LSUP_rc
 term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 {
     LSUP_rc rc;
-    char *out = NULL, *tmp, *escaped;
+    char *out = NULL, *escaped;
     const char *metadata = NULL;
     size_t buf_len;
 
     // Free previous content if not NULL.
     if (*out_p != NULL) out = realloc (*out_p, 0);
 
+    char *data = term->data;
     switch (term->type) {
+        case LSUP_TERM_NS_IRIREF:
+            LSUP_nsmap_normalize_uri (nsm, term->data, &data);
         case LSUP_TERM_IRIREF:
-            tmp = realloc (out, strlen (term->data) + 3);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, strlen (data) + 3);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
-            sprintf (out, "<%s>", term->data);
+            sprintf (out, "<%s>", data);
+
+            if (data != term->data) free (data);
             rc = LSUP_OK;
             break;
 
@@ -55,9 +59,8 @@ term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
                 buf_len += strlen (metadata) + 4; // Room for ^^<>
             }
 
-            tmp = realloc (out, buf_len);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, buf_len);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "\"%s\"", escaped);
             free (escaped);
@@ -81,9 +84,8 @@ term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
                 buf_len += strlen (metadata) + 1; // Room for @
             }
 
-            tmp = realloc (out, buf_len);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, buf_len);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "\"%s\"", escaped);
             free (escaped);
@@ -96,9 +98,8 @@ term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
             break;
 
         case LSUP_TERM_BNODE:
-            tmp = realloc (out, strlen (term->data) + 3);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, strlen (term->data) + 3);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "_:%s", term->data);
             rc = LSUP_OK;
@@ -181,23 +182,7 @@ const LSUP_Codec nt_codec = {
 };
 
 
-/* * * Other internal functions. * * */
-
-/** Replace non-printable characters with their literal byte.
- *
- *  Escape backslash is to be added separately.
- */
-static inline char replace_char(const char c) {
-    switch (c) {
-        case '\t': return 't';
-        case '\b': return 'b';
-        case '\n': return 'n';
-        case '\r': return 'r';
-        case '\f': return 'f';
-        default: return c;
-    }
-}
-
+/* * * Other static functions. * * */
 
 static LSUP_CodecIterator *
 gr_to_nt_init (const LSUP_Graph *gr)
@@ -247,7 +232,7 @@ escape_lit (const char *in, char **out_p)
         if (i >= strlen (in)) break;
 
         out[j++] = '\\';
-        out[j++] = replace_char (in[i++]);
+        out[j++] = escape_char (in[i++]);
         boundary = strcspn (in + i, LIT_ECHAR);
     }
 

+ 15 - 31
src/codec/codec_ttl.c

@@ -33,11 +33,14 @@ term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 
     switch (term->type) {
         case LSUP_TERM_IRIREF:
-            tmp = realloc (out, strlen (term->data) + 3);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            LSUP_nsmap_denormalize_uri (nsm, term->data, &out);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
+            rc = LSUP_OK;
+            break;
 
-            sprintf (out, "<%s>", term->data);
+        case LSUP_TERM_NS_IRIREF:
+            out = strdup (term->data);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
             rc = LSUP_OK;
             break;
 
@@ -55,9 +58,8 @@ term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
                 buf_len += strlen (metadata) + 4; // Room for ^^<>
             }
 
-            tmp = realloc (out, buf_len);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, buf_len);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "\"%s\"", escaped);
             free (escaped);
@@ -81,9 +83,8 @@ term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
                 buf_len += strlen (metadata) + 1; // Room for @
             }
 
-            tmp = realloc (out, buf_len);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, buf_len);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "\"%s\"", escaped);
             free (escaped);
@@ -96,9 +97,8 @@ term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
             break;
 
         case LSUP_TERM_BNODE:
-            tmp = realloc (out, strlen (term->data) + 3);
-            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
-            out = tmp;
+            out = realloc (out, strlen (term->data) + 3);
+            if (UNLIKELY (!out)) return LSUP_MEM_ERR;
 
             sprintf (out, "_:%s", term->data);
             rc = LSUP_OK;
@@ -181,23 +181,7 @@ const LSUP_Codec ttl_codec = {
 };
 
 
-/* * * Other internal functions. * * */
-
-/** Replace non-printable characters with their literal byte.
- *
- *  Escape backslash is to be added separately.
- */
-static inline char replace_char(const char c) {
-    switch (c) {
-        case '\t': return 't';
-        case '\b': return 'b';
-        case '\n': return 'n';
-        case '\r': return 'r';
-        case '\f': return 'f';
-        default: return c;
-    }
-}
-
+/* * * Other static functions. * * */
 
 static LSUP_CodecIterator *
 gr_to_ttl_init (const LSUP_Graph *gr)
@@ -247,7 +231,7 @@ escape_lit (const char *in, char **out_p)
         if (i >= strlen (in)) break;
 
         out[j++] = '\\';
-        out[j++] = replace_char (in[i++]);
+        out[j++] = escape_char (in[i++]);
         boundary = strcspn (in + i, LIT_ECHAR);
     }
 

+ 10 - 12
src/codec/grammar_ttl.y

@@ -64,11 +64,11 @@ turtleDoc   ::= statements EOF .
 statements  ::= statements statement .
 statements  ::= .
 
-statement   ::= directive ows EOS .
-statement   ::= triples ows EOS .
+statement   ::= directive .
+statement   ::= triples .
 
-directive 	::= prefixID .
-directive   ::= base .
+directive 	::= prefixID EOS .
+directive   ::= base EOS .
 
 prefixID    ::= PREFIX WS PFX(P) ows IRIREF(N) . {
                 LSUP_nsmap_add (state->nsm, P, N);
@@ -82,7 +82,7 @@ base        ::= BASE WS IRIREF(D) . {
             }
 
 // WS before predicate is optional because pred has leading WS already.
-triples 	::= subject(S) predObjList(L) . [EOS] {
+triples 	::= subject(S) predObjList(L) EOS . [EOS] {
                 size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
                 state->ct += ct;
                 log_trace ("Added %lu triples.", ct);
@@ -156,27 +156,27 @@ literal(A)  ::= BOOLEAN(D) . {
             }
 
 %type blank { LSUP_Term * }
-blank(A)    ::= nodeID(D) . {
+blank(A)    ::= BNODE_ID(D) . {
                 A = LSUP_term_new (LSUP_TERM_BNODE, D, NULL);
             }
-blank(A)    ::= LBRACKET ows RBRACKET . {
+blank(A)    ::= LBRACKET RBRACKET . {
                 A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
             }
-blank(A)    ::= LBRACKET ows predObjList(L) ows RBRACKET . {
+blank(A)    ::= LBRACKET predObjList(L) RBRACKET . {
                 A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
                 state->ct += LSUP_spo_list_add_triples (state->it, A, L);
 
                 LSUP_pred_obj_list_free (L);
             }
 blank       ::= collection .
-blank(A)    ::= LPAREN ows RPAREN . {
+blank(A)    ::= LPAREN RPAREN . {
                 A = LSUP_iriref_new ("rdf:nil", state->nsm);
             }
 
 // "collection" is the subject of the first collection item.
 %type collection { LSUP_Term * }
 // Collection triples are added here to the graph.
-collection(A) ::= LPAREN ows itemList(L) ows RPAREN . {
+collection(A) ::= LPAREN itemList(L) RPAREN . {
                 A = LSUP_bnode_add_collection (state->it, L);
             }
 
@@ -212,8 +212,6 @@ qname(A)    ::= COLON IDNAME(D) . {
             }
 qname(A)    ::= COLON . { A = strndup (":", 2); }
 
-nodeID(A)   ::= BNODE_PFX IDNAME(D) . { A = D; }
-
 ows         ::= WS.
 ows         ::=.
 

+ 95 - 33
src/codec/lexer_ttl.re

@@ -133,9 +133,17 @@ loop: // Start new token.
 
     *token_p = NULL;
 
-cont: // Continue token parsing.
+cont: // Continue token parsing. Do not move token start pointer.
     /*!re2c
 
+    * {
+        log_warn (
+            "Invalid token @ %lu: %s (\\x%x)",
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
+
+        return -1;
+    }
+
     $ {
         log_trace ("End of document.");
         return T_EOF;
@@ -148,10 +156,9 @@ cont: // Continue token parsing.
         goto loop;
     }
 
-    '@prefix' {
-        log_trace ("'@prefix' keyword.");
-        return T_PREFIX;
-    }
+    [\x22]{3} { goto lchar; }
+
+    [\x22] { goto schar; }
 
     "true" | "false" {
         *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok);
@@ -167,11 +174,6 @@ cont: // Continue token parsing.
         return T_IRIREF;
     }
 
-    NCWS 'a' / WS {
-        log_trace ("RDF type shorthand 'a'.");
-        return T_RDF_TYPE;
-    }
-
     NSTART_CHAR NAME_CHAR* {
         *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
         log_trace ("ID name: %s", *token_p);
@@ -186,18 +188,21 @@ cont: // Continue token parsing.
         return T_PFX;
     }
 
-    [\x22]{3} { goto lchar; }
+    WS {
+        log_trace ("Whitespace.");
+        return T_WS;
+    }
 
-    [\x22] SCHAR* [\x22] {
-        *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
-        log_trace ("Long string: %s", *token_p);
+    '@prefix' {
+        log_trace ("'@prefix' keyword.");
 
-        return T_STRING;
+        return T_PREFIX;
     }
 
-    WS {
-        log_trace ("Whitespace.");
-        return T_WS;
+    '@base' {
+        log_trace ("'@base' keyword.");
+
+        return T_BASE;
     }
 
     '@' [a-z]+ ('-' [a-z0-9]+)* {
@@ -242,13 +247,13 @@ cont: // Continue token parsing.
         return T_DECIMAL;
     }
 
-    '(' { return T_LPAREN; }
+    '(' WS? { return T_LPAREN; }
 
-    ')' { return T_RPAREN; }
+    WS? ')' { return T_RPAREN; }
 
-    '[' { return T_LBRACKET; }
+    '[' WS? { return T_LBRACKET; }
 
-    ']' { return T_RBRACKET; }
+    WS? ']' { return T_RBRACKET; }
 
     ':' { return T_COLON; }
 
@@ -262,28 +267,23 @@ cont: // Continue token parsing.
         return T_EOS;
     }
 
-    '_:' { return T_BNODE_PFX; }
+    '_:' { goto bnode_id; }
 
     '^^' { return T_DTYPE_MARKER; }
 
-    '@base' {return T_BASE; }
-
     COMMENT {
         log_trace ("Comment: `%s`", it->tok);
         goto loop;
     }
 
-    * {
-        log_warn (
-            "Invalid token @ %lu: %s (\\x%x)",
-            YYCURSOR - it->buf - 1, it->tok, *it->tok);
-
-        return -1;
+    WS "a" / WS {
+        log_trace ("RDF type shorthand 'a'.");
+        return T_RDF_TYPE;
     }
 
     */
 
-lchar:
+schar:
     /*!re2c
 
     * {
@@ -294,6 +294,32 @@ lchar:
         return -1;
     }
 
+    $ {
+        log_warn ("Unterminated string!");
+
+        return -1;
+    }
+
+    SCHAR {
+        log_trace (
+                "Continue string token at position %lu: %c",
+                YYCURSOR - it->tok, *YYCURSOR);
+
+        goto schar;
+    }
+
+    [\x22] {
+        *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
+        log_trace ("String: %s", *token_p);
+
+        return T_STRING;
+    }
+
+    */
+
+lchar:
+    /*!re2c
+
     $ {
         log_warn ("Unterminated long string!");
 
@@ -312,7 +338,43 @@ lchar:
 
         return T_STRING;
     }
-     */
+
+    * {
+        log_warn (
+            "Invalid token @ %lu: %s (\\x%x)",
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
+
+        return -1;
+    }
+
+    */
+
+bnode_id:
+    /*!re2c
+
+    NSTART_CHAR NAME_CHAR* {
+        *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
+        log_trace ("ID name: %s", *token_p);
+
+        return T_BNODE_ID;
+    }
+
+    * {
+        log_warn (
+            "Invalid token @ %lu: %s (\\x%x)",
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
+
+        return -1;
+    }
+
+    $ {
+        log_trace ("End of document.");
+
+        return T_EOF;
+    }
+
+    */
+
 }
 
 

+ 9 - 9
src/namespace.c

@@ -117,22 +117,22 @@ LSUP_nsmap_remove (NSMap *map, const char *pfx)
 
 
 const char *
-LSUP_nsmap_get_ns (NSMap *map, const char *pfx)
+LSUP_nsmap_get_ns (const NSMap *map, const char *pfx)
 {
     NSEntry entry_s;
     strncpy (entry_s.pfx, pfx, PFX_LEN);
-    NSEntry *entry = hashmap_get (map, &entry_s);
+    NSEntry *entry = hashmap_get ((NSMap *)map, &entry_s);
 
     return (entry) ? entry->ns : NULL;
 }
 
 
 const char *
-LSUP_nsmap_get_pfx (NSMap *map, const char *ns)
+LSUP_nsmap_get_pfx (const NSMap *map, const char *ns)
 {
     const NSEntry *entry;
     size_t i = 0;
-    while (hashmap_iter (map, &i, (void **) &entry)) {
+    while (hashmap_iter ((NSMap *)map, &i, (void **) &entry)) {
         if (strncmp (entry->ns, ns, strlen (ns)) == 0)
             return entry->pfx;
     }
@@ -143,7 +143,7 @@ LSUP_nsmap_get_pfx (NSMap *map, const char *ns)
 
 LSUP_rc
 LSUP_nsmap_normalize_uri (
-        NSMap *map, const char *pfx_uri, char **fq_uri_p)
+        const NSMap *map, const char *pfx_uri, char **fq_uri_p)
 {
     char *fq_uri = NULL;
 
@@ -157,7 +157,7 @@ LSUP_nsmap_normalize_uri (
 
     ns_pfx pfx;
     strncpy (pfx, pfx_uri, pfx_len);
-    pfx[pfx_len] = 0;
+    pfx[pfx_len] = '\0';
 
     /*
     Namespace *entry;
@@ -166,7 +166,7 @@ LSUP_nsmap_normalize_uri (
             break;
     }
     */
-    const char *ns = LSUP_nsmap_get_ns (map, pfx);
+    const char *ns = LSUP_nsmap_get_ns ((NSMap *)map, pfx);
 
     if (ns) {
         // -1 for :, +1 for terminator.
@@ -188,7 +188,7 @@ LSUP_nsmap_normalize_uri (
 
 LSUP_rc
 LSUP_nsmap_denormalize_uri (
-        NSMap *map, const char *fq_uri, char **pfx_uri_p)
+        const NSMap *map, const char *fq_uri, char **pfx_uri_p)
 {
     /*
      * This is different from LSUP_nsmap_get_ns, in that the URI being looked
@@ -199,7 +199,7 @@ LSUP_nsmap_denormalize_uri (
     const NSEntry *entry;
     const char *pfx = NULL;
     size_t i = 0, offset;
-    while (hashmap_iter (map, &i, (void **) &entry)) {
+    while (hashmap_iter ((NSMap *)map, &i, (void **) &entry)) {
         offset = strlen(entry->ns);
         if (strncmp (entry->ns, fq_uri, offset) == 0) {
             pfx = entry->pfx;

+ 9 - 0
test/test_codec_ttl.c

@@ -38,6 +38,15 @@ test_w3c_pos()
         EXPECT_PASS (codec.decode_graph (test_stream, &gr, &ct, &err));
         EXPECT_INT_EQ (LSUP_graph_size (gr), nt_ct); // Just count NT lines.
     }
+    for (int i = 0; i <= W3C_NEG_TEST_CT; i++) {
+        sprintf (test_fname, "test/assets/ttl/bad-%02d.ttl", i);
+        FILE *test_stream = fopen (test_fname, "r");
+        log_info ("Testing %s", test_fname);
+
+        LSUP_rc rc = codec.decode_graph (test_stream, &gr, &ct, &err);
+        log_info ("rc: %d", rc);
+        ASSERT (rc != LSUP_OK, "Test meant to fail passed!");
+    }
 
     return 0;
 }