Browse Source

Detailed parse error report.

Stefano Cossu 3 years ago
parent
commit
ac98d2b97a
6 changed files with 112 additions and 33 deletions
  1. 6 3
      TODO.md
  2. 18 2
      include/codec_base.h
  3. 1 1
      include/nt_parser.h
  4. 2 2
      src/codec/nt_grammar.y
  5. 50 24
      src/codec/nt_lexer.re
  6. 35 1
      test/test_codec_nt.c

+ 6 - 3
TODO.md

@@ -7,16 +7,19 @@ P = pending; W = working on it; D = done.
 - *D* LMDB back end
 - D* Hash table back end
 - D* Python bindings
-- W* Namespace manager
-- *W* Better error handling
-- *P* N3 serialization / deserialization
+- D* Namespace manager
+- W* N3 serialization / deserialization
+- P Environment
 - *P* Turtle serialization / deserialization
+- P* Better error handling
 
 
 ## Non-critical for MVP
 
 - Logging
 - Term and triple validation
+- NQ codec
+- TriG codec
 
 
 ## Long-term

+ 18 - 2
include/codec_base.h

@@ -26,6 +26,18 @@ typedef struct codec_iter_t {
 } LSUP_CodecIterator;
 
 
+/** @brief Parse error information.
+ *
+ */
+/* TODO A plain string will suffice for now.
+typedef struct parse_error_t {
+    unsigned int        line;       // Line number where the error occurred.
+    unsigned int        linec;      // Position in line of the offending token.
+    char *              token;      // String representation of the token.
+} LSUP_ParseError;
+*/
+
+
 /** @brief Term encoder callback type.
  *
  * @param[in] term Single term handle.
@@ -111,10 +123,14 @@ typedef LSUP_rc (*term_decode_fn_t)(
  *  Implementations may choose not not use this, and they must account for the
  *  value to be NULL.
  *
+ * @param[out] err Pointer to error info string. If no error occurs, it yields
+ *  NULL.
+ *
  * @return Implementations MUST return LSUP_OK on success and a negative value
  *  on parsing error.
  */
-typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
+typedef LSUP_rc (*gr_decode_fn_t)(
+        FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);
 
 
 /** @brief Codec structure.
@@ -136,7 +152,7 @@ typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
  * - term_decoder: Decode a single term.
  * - gr_decoder: Decode a RDF document into a graph.
  *
- * For documentation on the individual encoding and decoding callbaks, see the
+ * For documentation on the individual encoding and decoding callbacks, see the
  * related function prototypes.
  */
 typedef struct codec_t {

+ 1 - 1
include/nt_parser.h

@@ -33,6 +33,6 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term);
  *  encountered. TODO Add line/char info for parsing error
  */
 LSUP_rc
-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct);
+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct, char **err);
 
 #endif

+ 2 - 2
src/codec/nt_grammar.y

@@ -30,7 +30,7 @@
 
 // Rules.
 
-ntriplesDoc ::= triples EOF. { printf (" Start of document.\n"); }
+ntriplesDoc ::= triples EOF. { TRACE (STR, "Parsed N-Triples document.\n"); }
 
 triples     ::= eol.
 triples     ::= triple eol.
@@ -64,4 +64,4 @@ eol         ::= EOL.
 eol         ::= eol EOL.
 
 ws          ::=.
-ws          ::= WS. { printf ("WS in grammar.\n"); }
+ws          ::= WS.

+ 50 - 24
src/codec/nt_lexer.re

@@ -27,15 +27,17 @@
 typedef struct {
     FILE *          file;           // Input file handle.
     YYCTYPE *       buf,            // Start of buffer.
-            *       lim,            // Position after the last
-                                    //   available input character
-                                    //   (YYLIMIT)
+            *       lim,            // Position after the last available input
+                                    //   character (YYLIMIT).
             *       cur,            // Next input character to be read
                                     //   (YYCURSOR)
             *       mar,            // Most recent match (YYMARKER)
-            *       tok;            // Start of current token.
-    size_t          ct;             // Number of parsed triples.
-    int             eof;            // if we have reached EOF (T|F)
+            *       tok,            // Start of current token.
+            *       bol;            // Address of the beginning of the current
+                                    //   line (for debugging).
+    unsigned        line;           // Current line no. (for debugging).
+    unsigned        ct;             // Number of parsed triples.
+    bool            eof;            // if we have reached EOF.
     /*!stags:re2c format = "YYCTYPE *@@;"; */
 } ParseIterator;
 
@@ -49,7 +51,7 @@ static int fill(ParseIterator *it)
     if (shift < 1) {
         return 2;
     }
-    printf ("Shifting bytes: %lu\n", shift);
+    TRACE ("Shifting bytes: %lu\n", shift);
     memmove(it->buf, it->tok, it->lim - it->tok);
     it->lim -= shift;
     it->cur -= shift;
@@ -68,6 +70,8 @@ static void parse_init(ParseIterator *it, FILE *file)
     it->file = file;
     it->buf = malloc (CHUNK_SIZE + 1);
     it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE;
+    it->line = 1;
+    it->bol = it->buf;
     it->ct = 0;
     it->eof = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
@@ -173,8 +177,9 @@ loop:
 
     // For unresolved and partially resolved inconsistencies of the spec, see
     // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
-    WS                  = [\x09\x20]+;
-    EOL                 = WS? [\x0D\x0A]+;
+    _WS                 = [\x09\x20];
+    WS                  = _WS+;
+    EOL                 = [\x0D\x0A] (_WS | [\x0D\x0A])*;
     DOT                 = [.];
     HEX                 = [0-9A-Fa-f];
     ECHAR               = [\\] [tbnrf"'\\];
@@ -187,25 +192,27 @@ loop:
     LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
 
     IRIREF              = [<] IRI_CHARS [>];
-    LITERAL             = LITERAL_QUOTE @lit_data_e WS? ("^^" WS? @dtype_s IRIREF | @lang_s LANGTAG)?;
+    LITERAL             = LITERAL_QUOTE @lit_data_e _WS* ("^^" _WS* @dtype_s IRIREF | @lang_s LANGTAG)?;
     BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
     COMMENT             = "#" .*;
 
 
     EOL {
-        printf("End of line.\n");
+        it->line ++;
+        it->bol = YYCURSOR;
+        TRACE("New line: #%u.\n", it->line);
         return T_EOL;
     }
 
     $ {
-        printf("End of buffer.\n");
+        TRACE(STR, "End of buffer.\n");
         return T_EOF;
     }
 
     IRIREF {
         YYCTYPE *data = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
 
-        printf ("URI data: %s\n", data);
+        TRACE ("URI data: %s\n", data);
 
         *term = LSUP_uri_new ((char*)data);
         free (data);
@@ -217,7 +224,7 @@ loop:
         // Only unescape Unicode from data.
         size_t size = lit_data_e - it->tok - 2;
         YYCTYPE *data = unescape_unicode (it->tok + 1, size);
-        printf ("Literal data: %s\n", data);
+        TRACE ("Literal data: %s\n", data);
 
         YYCTYPE *datatype = NULL, *lang = NULL;
 
@@ -226,7 +233,7 @@ loop:
             datatype = malloc (size);
             memcpy (datatype, dtype_s + 1, size);
             datatype [size - 1] = '\0';
-            printf ("datatype: %s\n", datatype);
+            TRACE ("datatype: %s\n", datatype);
         }
 
         if (lang_s) {
@@ -234,7 +241,7 @@ loop:
             lang = malloc (size);
             memcpy (lang, lang_s + 1, size);
             lang [size - 1] = '\0';
-            printf ("lang: %s\n", lang);
+            TRACE ("lang: %s\n", lang);
         }
 
         *term = LSUP_term_new (LSUP_TERM_LITERAL, (char*)data, (char*)datatype, (char*)lang);
@@ -249,7 +256,7 @@ loop:
     BNODE {
         YYCTYPE *data = unescape_unicode (it->tok + 2, YYCURSOR - it->tok - 1);
 
-        printf ("BNode data: %s\n", data);
+        TRACE ("BNode data: %s\n", data);
 
         *term = LSUP_term_new (LSUP_TERM_BNODE, (char*)data, NULL, NULL);
         free (data);
@@ -258,14 +265,14 @@ loop:
     }
 
     DOT {
-        printf ("End of triple.\n");
+        TRACE (STR, "End of triple.\n");
         it->ct ++;
 
         return T_DOT;
     }
 
     WS {
-        printf("Separator.\n");
+        TRACE (STR, "Separator.\n");
 
         return T_WS;
     }
@@ -275,14 +282,14 @@ loop:
         YYCTYPE *data = malloc (size);
         memcpy (data, it->tok, size);
         data [size - 1] = '\0';
-        printf ("Comment: `%s`\n", data);
+        TRACE ("Comment: `%s`\n", data);
         free (data);
 
         goto loop;
     }
 
     * {
-        printf (
+        TRACE (
             "Invalid token @ %lu: %s (\\x%x)\n",
             YYCURSOR - it->buf - 1, it->tok, *it->tok);
 
@@ -317,8 +324,11 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 }
 
 LSUP_rc
-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct, char **err_p)
 {
+    *err_p = NULL;
+    *gr_p = NULL;
+
     ParseIterator parse_it;
     parse_init (&parse_it, stream);
 
@@ -332,7 +342,23 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
         int ttype = lex (&parse_it, &term);
 
         if (ttype == -1) {
-            fprintf(stderr, "Parse error.\n");
+            char token[16];
+            strncpy (token, (const char *)parse_it.tok, 15);
+
+            char *err_start = "Parse error near token `";
+
+            char err_info [64];
+            sprintf(
+                    err_info, "[...]' at line %u, character %ld.\n",
+                    parse_it.line, parse_it.cur - parse_it.bol);
+
+            size_t err_size = strlen (err_start) + strlen (token)
+                    + strlen (err_info) + 1;
+            char *err_str = malloc (err_size);
+            sprintf (err_str, "%s%s%s", err_start, token, err_info);
+
+            *err_p = err_str;
+
             goto fail;
         }
 
@@ -347,7 +373,7 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
 
     if (ct) *ct = parse_it.ct;
 
-    TRACE ("Parsed %lu triples.\n", parse_it.ct);
+    TRACE ("Parsed %u triples.\n", parse_it.ct);
     TRACE ("Graph size: %lu\n", LSUP_graph_size (gr));
 
     LSUP_term_free (term);

+ 35 - 1
test/test_codec_nt.c

@@ -71,6 +71,14 @@ static char *start_doc = (
     "# FREE WHITESPACE!\n\n\n\n\n\n\n      \n\n\n\n"
 );
 
+
+static char *bad_doc = (
+    "<urn:local:s1> <http://example.org/p1> \"hello\" . #  Comment here.\n"
+    "<urn:local:s1> <http://example.org/p1>\"hello\"@es-ES .\n"
+    "<urn:local:s1> dc:title \"Bad Data.\" ."
+);
+
+
 // End result NT document as it should be produced by the NT codec.
 // Lines should not be checked in strict order.
 static char *end_doc[6] = {
@@ -200,10 +208,12 @@ test_decode_nt_graph()
 
     LSUP_Graph *gr;
     size_t ct;
-    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct));
+    char *err;
+    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct, &err));
 
     fclose (input);
 
+    ASSERT (err == NULL, "Error string is not NULL!");
     EXPECT_INT_EQ (ct, 6);
     EXPECT_INT_EQ (LSUP_graph_size (gr), 6);
 
@@ -216,6 +226,29 @@ test_decode_nt_graph()
 }
 
 
+static int
+test_decode_nt_bad_graph()
+{
+    FILE *input = fmemopen ((void *)bad_doc, strlen (start_doc), "r");
+
+    LSUP_Graph *gr;
+    size_t ct;
+    char *err;
+    EXPECT_INT_EQ (nt_codec.gr_decoder (input, &gr, &ct, &err), LSUP_VALUE_ERR);
+
+    TRACE ("Error: %s", err);
+    ASSERT (strstr (err, "`dc:title") != NULL, "Wrong error string report!");
+    ASSERT (strstr (err, "line 3") != NULL, "Wrong error line report!");
+    ASSERT (strstr (err, "character 16") != NULL, "Wrong error char report!");
+
+    fclose (input);
+
+    LSUP_graph_free (gr);
+
+    return 0;
+}
+
+
 int codec_nt_tests()
 {
     LSUP_Term **terms = init_terms();
@@ -225,6 +258,7 @@ int codec_nt_tests()
     RUN (test_encode_nt_graph);
     RUN (test_decode_nt_term);
     RUN (test_decode_nt_graph);
+    RUN (test_decode_nt_bad_graph);
 
     free_terms(terms);