Browse Source

Handle syntax errors; pass light W3C tests.

Stefano Cossu 2 years ago
parent
commit
6fe9304fab
3 changed files with 47 additions and 52 deletions
  1. 1 1
      src/codec/codec_ttl.c
  2. 29 12
      src/codec/grammar_ttl.y
  3. 17 39
      src/codec/lexer_ttl.re

+ 1 - 1
src/codec/codec_ttl.c

@@ -24,7 +24,7 @@ static LSUP_rc
 term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 {
     LSUP_rc rc;
-    char *out = NULL, *tmp, *escaped;
+    char *out = NULL, *escaped;
     const char *metadata = NULL;
     size_t buf_len;
 

+ 29 - 12
src/codec/grammar_ttl.y

@@ -11,6 +11,14 @@
  */
 
 #include "codec.h"
+
+/*
+ * Disable all error recovery processing in the parser push-down
+ * automaton. From SQLite grammar file parse.y
+ */
+// FIXME This throws a compile error.
+//#define YYNOERRORRECOVERY 1
+
 }
 
 %name TTLParse
@@ -40,7 +48,7 @@
 
 %token_prefix "T_"
 %token_type { char * }
-%token_destructor { free ($$); }
+%token_destructor { (void) state; free ($$); }
 
 /* NULL-terminated array of object term handles. */
 %type objList           { LSUP_Term ** }
@@ -54,6 +62,7 @@
 
 %extra_argument         { LSUP_TTLParserState *state }
 
+// Low- to high-priority.
 %left PERIOD .
 %left SEMICOLON .
 %left COMMA .
@@ -72,9 +81,9 @@ statements  ::= .
 statement 	::= prefixID .
 statement   ::= base .
 statement   ::= triples .
+statement   ::= WS .
 
-
-prefixID    ::= PREFIX(P) WS IRIREF(N) PERIOD . {
+prefixID    ::= PREFIX(P) WS IRIREF(N) ows PERIOD . {
                 LSUP_nsmap_add (state->nsm, P, N);
             }
 
@@ -82,7 +91,7 @@ base        ::= BASE IRIREF(D) PERIOD . {
                 state->base = LSUP_iriref_new (D, NULL);
             }
 
-triples 	::= subject(S) predObjList(L) PERIOD . {
+triples 	::= subject(S) WS predObjList(L) PERIOD . {
                 size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
                 state->ct += ct;
                 state->rc = LSUP_OK;
@@ -90,18 +99,25 @@ triples 	::= subject(S) predObjList(L) PERIOD . {
                 LSUP_term_free (S);
                 LSUP_pred_obj_list_free (L);
             }
+triples 	::= subject(S) WS predObjList(L) SEMICOLON PERIOD . [PERIOD] {
+                size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
+                state->ct += ct;
+                state->rc = LSUP_OK;
+                log_trace ("Added %lu triples.", ct);
+                LSUP_term_free (S);
+                LSUP_pred_obj_list_free (L);
+            }
+triples     ::= subject WS error EOF . {
+                log_warn ("Error symbol popped.");
+            }
 
 %type predObjList       { LSUP_PredObjList * }
 %destructor predObjList { LSUP_pred_obj_list_free ($$); }
-predObjList(A) ::= predicate(P) objectList(O) SEMICOLON . {
+predObjList(A) ::= predicate(P) WS objectList(O) . [SEMICOLON] {
                 A = LSUP_pred_obj_list_new();
                 LSUP_pred_obj_list_add (A, P, O);
             }
-predObjList(A) ::= predicate(P) objectList(O) . [SEMICOLON] {
-                A = LSUP_pred_obj_list_new();
-                LSUP_pred_obj_list_add (A, P, O);
-            }
-predObjList(A) ::= predObjList(L) SEMICOLON predicate(P) objectList(O) . {
+predObjList(A) ::= predObjList(L) SEMICOLON predicate(P) WS objectList(O) . {
                 LSUP_pred_obj_list_add (L, P, O);
                 A = L;
             }
@@ -167,7 +183,7 @@ blank(A)    ::= BNODE_ID(D) . {
 blank(A)    ::= LBRACKET RBRACKET . [BNODE_ID] {
                 A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
             }
-blank(A)    ::= LBRACKET ows predObjList(L) ows RBRACKET . [BNODE_ID] {
+blank(A)    ::= LBRACKET predObjList(L) RBRACKET . [BNODE_ID] {
                 A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
                 state->ct += LSUP_spo_list_add_triples (state->it, A, L);
 
@@ -188,7 +204,7 @@ collection(A) ::= LPAREN itemList(L) RPAREN . {
 %type itemList { LSUP_Term ** }
 // Freed when the item list in the collection gets added to the graph.
 %destructor itemList {}
-itemList(A) ::= itemList(L) object(O) . { A = LSUP_obj_list_add (L, O); }
+itemList(A) ::= itemList(L) WS object(O) . { A = LSUP_obj_list_add (L, O); }
 itemList(A) ::= object(O) . {
                 A = calloc (sizeof (*A), 2);
                 A[0] = O;
@@ -210,6 +226,7 @@ resource(A) ::= QNAME(D) . { A = LSUP_iriref_new (D, state->nsm); }
 ows         ::= WS .
 ows         ::= .
 
+
 /*
  * This has been adapted from
  * https://www.w3.org/TeamSubmission/turtle/#sec-grammar-grammar :

+ 17 - 39
src/codec/lexer_ttl.re

@@ -120,7 +120,7 @@ static int lex (ParseIterator *it, YYCTYPE **token_p)
 
     // Constructs.
     COMMENT         = '#' ( [^\n\r] )*;
-    WS              = NCWS+ | COMMENT;
+    WS              = (NCWS+ | COMMENT)+;
     INTEGER         = ('-' | '+')? [0-9]+;
     EXPONENT        = [eE] INTEGER;
     DOUBLE          = ('-' | '+') ? ([0-9]+ '.' [0-9]* EXPONENT
@@ -135,7 +135,6 @@ loop: // Start new token.
 
     *token_p = NULL;
 
-cont: // Continue token parsing. Do not move token start pointer.
     /*!re2c
 
     * {
@@ -203,9 +202,11 @@ cont: // Continue token parsing. Do not move token start pointer.
         return T_BNODE_ID;
     }
 
-
     WS {
-        log_trace ("Whitespace.");
+        uint8_t *ws = uint8_ndup (it->tok, YYCURSOR - it->tok);
+        log_trace ("Whitespace: '%s'", ws);
+        free (ws);
+
         return T_WS;
     }
 
@@ -251,21 +252,25 @@ cont: // Continue token parsing. Do not move token start pointer.
         return T_DECIMAL;
     }
 
-    '(' { return T_LPAREN; }
+    '(' WS? { return T_LPAREN; }
 
-    ')' { return T_RPAREN; }
+    WS? ')' { return T_RPAREN; }
 
-    '[' { return T_LBRACKET; }
+    '[' WS? { return T_LBRACKET; }
 
-    ']' { return T_RBRACKET; }
+    WS? ']' { return T_RBRACKET; }
 
     ':' { return T_COLON; }
 
-    ';' { return T_SEMICOLON; }
+    WS? ';' WS? {
+        log_trace ("End of object list.");
+
+        return T_SEMICOLON;
+    }
 
-    ',' { return T_COMMA; }
+    WS? ',' WS? { return T_COMMA; }
 
-    '.' {
+    WS? '.' {
         log_trace ("End of statement #%u.", it->stmt);
         it->stmt++;
         return T_PERIOD;
@@ -278,7 +283,7 @@ cont: // Continue token parsing. Do not move token start pointer.
         goto loop;
     }
 
-    "a" / WS {
+    "a" {
         log_trace ("RDF type shorthand 'a'.");
         return T_RDF_TYPE;
     }
@@ -340,33 +345,6 @@ lchar:
     }
 
     */
-
-bnode_id:
-    /*!re2c
-
-    NSTART_CHAR NAME_CHAR* {
-        *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
-        log_trace ("ID name: %s", *token_p);
-
-        return T_BNODE_ID;
-    }
-
-    * {
-        log_warn (
-            "Invalid token @ %lu: %s (\\x%x)",
-            YYCURSOR - it->buf - 1, it->tok, *it->tok);
-
-        return -1;
-    }
-
-    $ {
-        log_trace ("End of document.");
-
-        return T_EOF;
-    }
-
-    */
-
 }