4 years ago · ac98d2b97a
--- a/TODO.md
+++ b/TODO.md
@@ -7,16 +7,19 @@ P = pending; W = working on it; D = done.
 
															 - *D* LMDB back end
														
 
															 - *D* Hash table back end
														
 
															 - *D* Python bindings
														
 
															-- *W* Namespace manager
														
 
															-- *W* Better error handling
														
 
															-- *P* N3 serialization / deserialization
														
 
															+- *D* Namespace manager
														
 
															+- *W* N3 serialization / deserialization
														
 
															+- *P* Environment
														
 
															 - *P* Turtle serialization / deserialization
														
 
															+- *P* Better error handling
														
 
															 ## Non-critical for MVP
														
 
															 - Logging
														
 
															 - Term and triple validation
														
 
															+- NQ codec
														
 
															+- TriG codec
														
 
															 ## Long-term
														
--- a/include/codec_base.h
+++ b/include/codec_base.h
@@ -26,6 +26,18 @@ typedef struct codec_iter_t {
 
															 } LSUP_CodecIterator;
														
 
															+/** @brief Parse error information.
														
 
															+ *
														
 
															+ */
														
 
															+/* TODO A plain string will suffice for now.
														
 
															+typedef struct parse_error_t {
														
 
															+    unsigned int        line;       // Line number where the error occurred.
														
 
															+    unsigned int        linec;      // Position in line of the offending token.
														
 
															+    char *              token;      // String representation of the token.
														
 
															+} LSUP_ParseError;
														
 
															+*/
														
 
															+
														
 
															+
														
 
															 /** @brief Term encoder callback type.
														
 
															  *
														
 
															  * @param[in] term Single term handle.
														
@@ -111,10 +123,14 @@ typedef LSUP_rc (*term_decode_fn_t)(
 
															  *  Implementations may choose not to use this, and they must account for the
														
 
															  *  value to be NULL.
														
 
															  *
														
 
															+ * @param[out] err Pointer to error info string. If no error occurs, it yields
														
 
															+ *  NULL.
														
 
															+ *
														
 
															  * @return Implementations MUST return LSUP_OK on success and a negative value
														
 
															  *  on parsing error.
														
 
															  */
														
 
															-typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
														
 
															+typedef LSUP_rc (*gr_decode_fn_t)(
														
 
															+        FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);
														
 
															 /** @brief Codec structure.
														
@@ -136,7 +152,7 @@ typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
 
															  * - term_decoder: Decode a single term.
														
 
															  * - gr_decoder: Decode a RDF document into a graph.
														
 
															  *
														
 
															- * For documentation on the individual encoding and decoding callbaks, see the
														
 
															+ * For documentation on the individual encoding and decoding callbacks, see the
														
 
															  * related function prototypes.
														
 
															  */
														
 
															 typedef struct codec_t {
														
--- a/include/nt_parser.h
+++ b/include/nt_parser.h
@@ -33,6 +33,6 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term);
 
															  *  encountered. TODO Add line/char info for parsing error
														
 
															  */
														
 
															 LSUP_rc
														
 
															-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct);
														
 
															+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct, char **err);
														
 
															 #endif
														
--- a/src/codec/nt_grammar.y
+++ b/src/codec/nt_grammar.y
@@ -30,7 +30,7 @@
 
															 // Rules.
														
 
															-ntriplesDoc ::= triples EOF. { printf (" Start of document.\n"); }
														
 
															+ntriplesDoc ::= triples EOF. { TRACE (STR, "Parsed N-Triples document.\n"); }
														
 
															 triples     ::= eol.
														
 
															 triples     ::= triple eol.
														
@@ -64,4 +64,4 @@ eol         ::= EOL.
 
															 eol         ::= eol EOL.
														
 
															 ws          ::=.
														
 
															-ws          ::= WS. { printf ("WS in grammar.\n"); }
														
 
															+ws          ::= WS.
														
--- a/src/codec/nt_lexer.re
+++ b/src/codec/nt_lexer.re
@@ -27,15 +27,17 @@
 
															 typedef struct {
														
 
															     FILE *          file;           // Input file handle.
														
 
															     YYCTYPE *       buf,            // Start of buffer.
														
 
															-            *       lim,            // Position after the last
														
 
															-                                    //   available input character
														
 
															-                                    //   (YYLIMIT)
														
 
															+            *       lim,            // Position after the last available input
														
 
															+                                    //   character (YYLIMIT).
														
 
															             *       cur,            // Next input character to be read
														
 
															                                     //   (YYCURSOR)
														
 
															             *       mar,            // Most recent match (YYMARKER)
														
 
															-            *       tok;            // Start of current token.
														
 
															-    size_t          ct;             // Number of parsed triples.
														
 
															-    int             eof;            // if we have reached EOF (T|F)
														
 
															+            *       tok,            // Start of current token.
														
 
															+            *       bol;            // Address of the beginning of the current
														
 
															+                                    //   line (for debugging).
														
 
															+    unsigned        line;           // Current line no. (for debugging).
														
 
															+    unsigned        ct;             // Number of parsed triples.
														
 
															+    bool            eof;            // if we have reached EOF.
														
 
															     /*!stags:re2c format = "YYCTYPE *@@;"; */
														
 
															 } ParseIterator;
														
@@ -49,7 +51,7 @@ static int fill(ParseIterator *it)
 
															     if (shift < 1) {
														
 
															         return 2;
														
 
															     }
														
 
															-    printf ("Shifting bytes: %lu\n", shift);
														
 
															+    TRACE ("Shifting bytes: %lu\n", shift);
														
 
															     memmove(it->buf, it->tok, it->lim - it->tok);
														
 
															     it->lim -= shift;
														
 
															     it->cur -= shift;
														
@@ -68,6 +70,8 @@ static void parse_init(ParseIterator *it, FILE *file)
 
															     it->file = file;
														
 
															     it->buf = malloc (CHUNK_SIZE + 1);
														
 
															     it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE;
														
 
															+    it->line = 1;
														
 
															+    it->bol = it->buf;
														
 
															     it->ct = 0;
														
 
															     it->eof = 0;
														
 
															     /*!stags:re2c format = "it->@@ = NULL; "; */
														
@@ -173,8 +177,9 @@ loop:
 
															     // For unresolved and partially resolved inconsistencies of the spec, see
														
 
															     // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
														
 
															-    WS                  = [\x09\x20]+;
														
 
															-    EOL                 = WS? [\x0D\x0A]+;
														
 
															+    _WS                 = [\x09\x20];
														
 
															+    WS                  = _WS+;
														
 
															+    EOL                 = [\x0D\x0A] (_WS | [\x0D\x0A])*;
														
 
															     DOT                 = [.];
														
 
															     HEX                 = [0-9A-Fa-f];
														
 
															     ECHAR               = [\\] [tbnrf"'\\];
														
@@ -187,25 +192,27 @@ loop:
 
															     LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
														
 
															     IRIREF              = [<] IRI_CHARS [>];
														
 
															-    LITERAL             = LITERAL_QUOTE @lit_data_e WS? ("^^" WS? @dtype_s IRIREF | @lang_s LANGTAG)?;
														
 
															+    LITERAL             = LITERAL_QUOTE @lit_data_e _WS* ("^^" _WS* @dtype_s IRIREF | @lang_s LANGTAG)?;
														
 
															     BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
														
 
															     COMMENT             = "#" .*;
														
 
															     EOL {
														
 
															-        printf("End of line.\n");
														
 
															+        it->line ++;
														
 
															+        it->bol = YYCURSOR;
														
 
															+        TRACE("New line: #%u.\n", it->line);
														
 
															         return T_EOL;
														
 
															     }
														
 
															     $ {
														
 
															-        printf("End of buffer.\n");
														
 
															+        TRACE(STR, "End of buffer.\n");
														
 
															         return T_EOF;
														
 
															     }
														
 
															     IRIREF {
														
 
															         YYCTYPE *data = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
														
 
															-        printf ("URI data: %s\n", data);
														
 
															+        TRACE ("URI data: %s\n", data);
														
 
															         *term = LSUP_uri_new ((char*)data);
														
 
															         free (data);
														
@@ -217,7 +224,7 @@ loop:
 
															         // Only unescape Unicode from data.
														
 
															         size_t size = lit_data_e - it->tok - 2;
														
 
															         YYCTYPE *data = unescape_unicode (it->tok + 1, size);
														
 
															-        printf ("Literal data: %s\n", data);
														
 
															+        TRACE ("Literal data: %s\n", data);
														
 
															         YYCTYPE *datatype = NULL, *lang = NULL;
														
@@ -226,7 +233,7 @@ loop:
 
															             datatype = malloc (size);
														
 
															             memcpy (datatype, dtype_s + 1, size);
														
 
															             datatype [size - 1] = '\0';
														
 
															-            printf ("datatype: %s\n", datatype);
														
 
															+            TRACE ("datatype: %s\n", datatype);
														
 
															         }
														
 
															         if (lang_s) {
														
@@ -234,7 +241,7 @@ loop:
 
															             lang = malloc (size);
														
 
															             memcpy (lang, lang_s + 1, size);
														
 
															             lang [size - 1] = '\0';
														
 
															-            printf ("lang: %s\n", lang);
														
 
															+            TRACE ("lang: %s\n", lang);
														
 
															         }
														
 
															         *term = LSUP_term_new (LSUP_TERM_LITERAL, (char*)data, (char*)datatype, (char*)lang);
														
@@ -249,7 +256,7 @@ loop:
 
															     BNODE {
														
 
															         YYCTYPE *data = unescape_unicode (it->tok + 2, YYCURSOR - it->tok - 1);
														
 
															-        printf ("BNode data: %s\n", data);
														
 
															+        TRACE ("BNode data: %s\n", data);
														
 
															         *term = LSUP_term_new (LSUP_TERM_BNODE, (char*)data, NULL, NULL);
														
 
															         free (data);
														
@@ -258,14 +265,14 @@ loop:
 
															     }
														
 
															     DOT {
														
 
															-        printf ("End of triple.\n");
														
 
															+        TRACE (STR, "End of triple.\n");
														
 
															         it->ct ++;
														
 
															         return T_DOT;
														
 
															     }
														
 
															     WS {
														
 
															-        printf("Separator.\n");
														
 
															+        TRACE (STR, "Separator.\n");
														
 
															         return T_WS;
														
 
															     }
														
@@ -275,14 +282,14 @@ loop:
 
															         YYCTYPE *data = malloc (size);
														
 
															         memcpy (data, it->tok, size);
														
 
															         data [size - 1] = '\0';
														
 
															-        printf ("Comment: `%s`\n", data);
														
 
															+        TRACE ("Comment: `%s`\n", data);
														
 
															         free (data);
														
 
															         goto loop;
														
 
															     }
														
 
															     * {
														
 
															-        printf (
														
 
															+        TRACE (
														
 
															             "Invalid token @ %lu: %s (\\x%x)\n",
														
 
															             YYCURSOR - it->buf - 1, it->tok, *it->tok);
														
@@ -317,8 +324,11 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 
															 }
														
 
															 LSUP_rc
														
 
															-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
														
 
															+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct, char **err_p)
														
 
															 {
														
 
															+    *err_p = NULL;
														
 
															+    *gr_p = NULL;
														
 
															+
														
 
															     ParseIterator parse_it;
														
 
															     parse_init (&parse_it, stream);
														
@@ -332,7 +342,23 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
 
															         int ttype = lex (&parse_it, &term);
														
 
															         if (ttype == -1) {
														
 
															-            fprintf(stderr, "Parse error.\n");
														
 
															+            // Build a heap-allocated error message for the caller: up to 15
														
 
															+            // bytes of the offending token plus its line and column.
														
 
															+            // strncpy does not NUL-terminate when the source holds 15 bytes
														
 
															+            // or more, so the terminator is written explicitly.
														
 
															+            char token[16];
														
 
															+            strncpy (token, (const char *)parse_it.tok, 15);
														
 
															+            token[15] = '\0';
														
 
															+
														
 
															+            char *err_start = "Parse error near token `";
														
 
															+
														
 
															+            // Bounded formatting; the pointer difference is a ptrdiff_t and
														
 
															+            // is converted to match the %ld specifier.
														
 
															+            char err_info [64];
														
 
															+            snprintf(
														
 
															+                    err_info, sizeof (err_info),
														
 
															+                    "[...]' at line %u, character %ld.\n",
														
 
															+                    parse_it.line, (long)(parse_it.cur - parse_it.bol));
														
 
															+
														
 
															+            size_t err_size = strlen (err_start) + strlen (token)
														
 
															+                    + strlen (err_info) + 1;
														
 
															+            char *err_str = malloc (err_size);
														
 
															+            // If the allocation fails, the reported error string stays NULL.
														
 
															+            if (err_str) {
														
 
															+                snprintf (
														
 
															+                        err_str, err_size, "%s%s%s",
														
 
															+                        err_start, token, err_info);
														
 
															+            }
														
 
															+
														
 
															+            *err_p = err_str;
														
 
															+
														
 
															             goto fail;
														
 
															         }
														
@@ -347,7 +373,7 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
 
															     if (ct) *ct = parse_it.ct;
														
 
															-    TRACE ("Parsed %lu triples.\n", parse_it.ct);
														
 
															+    TRACE ("Parsed %u triples.\n", parse_it.ct);
														
 
															     TRACE ("Graph size: %lu\n", LSUP_graph_size (gr));
														
 
															     LSUP_term_free (term);
														
--- a/test/test_codec_nt.c
+++ b/test/test_codec_nt.c
@@ -71,6 +71,14 @@ static char *start_doc = (
 
															     "# FREE WHITESPACE!\n\n\n\n\n\n\n      \n\n\n\n"
														
 
															 );
														
 
															+
														
 
															+static char *bad_doc = (  // Bad input: parse must fail at dc:title, line 3.
														
 
															+    "<urn:local:s1> <http://example.org/p1> \"hello\" . #  Comment here.\n"
														
 
															+    "<urn:local:s1> <http://example.org/p1>\"hello\"@es-ES .\n"
														
 
															+    "<urn:local:s1> dc:title \"Bad Data.\" ."
														
 
															+);
														
 
															+
														
 
															+
														
 
															 // End result NT document as it should be produced by the NT codec.
														
 
															 // Lines should not be checked in strict order.
														
 
															 static char *end_doc[6] = {
														
@@ -200,10 +208,12 @@ test_decode_nt_graph()
 
															     LSUP_Graph *gr;
														
 
															     size_t ct;
														
 
															-    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct));
														
 
															+    char *err;
														
 
															+    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct, &err));
														
 
															     fclose (input);
														
 
															+    ASSERT (err == NULL, "Error string is not NULL!");
														
 
															     EXPECT_INT_EQ (ct, 6);
														
 
															     EXPECT_INT_EQ (LSUP_graph_size (gr), 6);
														
@@ -216,6 +226,29 @@ test_decode_nt_graph()
 
															 }
														
 
															+static int
														
 
															+test_decode_nt_bad_graph()
														
 
															+{
														
 
															+    FILE *input = fmemopen ((void *)bad_doc, strlen (start_doc), "r");
														
 
															+
														
 
															+    LSUP_Graph *gr;
														
 
															+    size_t ct;
														
 
															+    char *err;
														
 
															+    EXPECT_INT_EQ (nt_codec.gr_decoder (input, &gr, &ct, &err), LSUP_VALUE_ERR);
														
 
															+
														
 
															+    TRACE ("Error: %s", err);
														
 
															+    ASSERT (strstr (err, "`dc:title") != NULL, "Wrong error string report!");
														
 
															+    ASSERT (strstr (err, "line 3") != NULL, "Wrong error line report!");
														
 
															+    ASSERT (strstr (err, "character 16") != NULL, "Wrong error char report!");
														
 
															+
														
 
															+    fclose (input);
														
 
															+
														
 
															+    LSUP_graph_free (gr);
														
 
															+
														
 
															+    return 0;
														
 
															+}
														
 
															+
														
 
															+
														
 
															 int codec_nt_tests()
														
 
															 {
														
 
															     LSUP_Term **terms = init_terms();
														
@@ -225,6 +258,7 @@ int codec_nt_tests()
 
															     RUN (test_encode_nt_graph);
														
 
															     RUN (test_decode_nt_term);
														
 
															     RUN (test_decode_nt_graph);
														
 
															+    RUN (test_decode_nt_bad_graph);
														
 
															     free_terms(terms);