4 years ago · ac98d2b97a
--- a/TODO.md
+++ b/TODO.md
@@ -7,16 +7,19 @@ P = pending; W = working on it; D = done.
 
				 - *D* LMDB back end
			
 
				 - *D* Hash table back end
			
 
				 - *D* Python bindings
			
 
				-- *W* Namespace manager
			
 
				-- *W* Better error handling
			
 
				-- *P* N3 serialization / deserialization
			
 
				+- *D* Namespace manager
			
 
				+- *W* N3 serialization / deserialization
			
 
				+- *P* Environment
			
 
				 - *P* Turtle serialization / deserialization
			
 
				+- *P* Better error handling
			
 
				 
			
 
				 
			
 
				 ## Non-critical for MVP
			
 
				 
			
 
				 - Logging
			
 
				 - Term and triple validation
			
 
				+- NQ codec
			
 
				+- TriG codec
			
 
				 
			
 
				 
			
 
				 ## Long-term
			
--- a/include/codec_base.h
+++ b/include/codec_base.h
@@ -26,6 +26,18 @@ typedef struct codec_iter_t {
 
				 } LSUP_CodecIterator;
			
 
				 
			
 
				 
			
 
				+/** @brief Parse error information.
			
 
				+ *
			
 
				+ */
			
 
				+/* TODO A plain string will suffice for now.
			
 
				+typedef struct parse_error_t {
			
 
				+    unsigned int        line;       // Line number where the error occurred.
			
 
				+    unsigned int        linec;      // Position in line of the offending token.
			
 
				+    char *              token;      // String representation of the token.
			
 
				+} LSUP_ParseError;
			
 
				+*/
			
 
				+
			
 
				+
			
 
				 /** @brief Term encoder callback type.
			
 
				  *
			
 
				  * @param[in] term Single term handle.
			
@@ -111,10 +123,14 @@ typedef LSUP_rc (*term_decode_fn_t)(
 
				  *  Implementations may choose not to use this, and they must account for the
			
 
				  *  value to be NULL.
			
 
				  *
			
 
				+ * @param[out] err Pointer to error info string. If no error occurs, it yields
			
 
				+ *  NULL.
			
 
				+ *
			
 
				  * @return Implementations MUST return LSUP_OK on success and a negative value
			
 
				  *  on parsing error.
			
 
				  */
			
 
				-typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
			
 
				+typedef LSUP_rc (*gr_decode_fn_t)(
			
 
				+        FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);
			
 
				 
			
 
				 
			
 
				 /** @brief Codec structure.
			
@@ -136,7 +152,7 @@ typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
 
				  * - term_decoder: Decode a single term.
			
 
				  * - gr_decoder: Decode a RDF document into a graph.
			
 
				  *
			
 
				- * For documentation on the individual encoding and decoding callbaks, see the
			
 
				+ * For documentation on the individual encoding and decoding callbacks, see the
			
 
				  * related function prototypes.
			
 
				  */
			
 
				 typedef struct codec_t {
			
--- a/include/nt_parser.h
+++ b/include/nt_parser.h
@@ -33,6 +33,6 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term);
 
				  *  encountered. TODO Add line/char info for parsing error
			
 
				  */
			
 
				 LSUP_rc
			
 
				-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct);
			
 
				+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct, char **err);
			
 
				 
			
 
				 #endif
			
--- a/src/codec/nt_grammar.y
+++ b/src/codec/nt_grammar.y
@@ -30,7 +30,7 @@
 
				 
			
 
				 // Rules.
			
 
				 
			
 
				-ntriplesDoc ::= triples EOF. { printf (" Start of document.\n"); }
			
 
				+ntriplesDoc ::= triples EOF. { TRACE (STR, "Parsed N-Triples document.\n"); }
			
 
				 
			
 
				 triples     ::= eol.
			
 
				 triples     ::= triple eol.
			
@@ -64,4 +64,4 @@ eol         ::= EOL.
 
				 eol         ::= eol EOL.
			
 
				 
			
 
				 ws          ::=.
			
 
				-ws          ::= WS. { printf ("WS in grammar.\n"); }
			
 
				+ws          ::= WS.
			
--- a/src/codec/nt_lexer.re
+++ b/src/codec/nt_lexer.re
@@ -27,15 +27,17 @@
 
				 typedef struct {
			
 
				     FILE *          file;           // Input file handle.
			
 
				     YYCTYPE *       buf,            // Start of buffer.
			
 
				-            *       lim,            // Position after the last
			
 
				-                                    //   available input character
			
 
				-                                    //   (YYLIMIT)
			
 
				+            *       lim,            // Position after the last available input
			
 
				+                                    //   character (YYLIMIT).
			
 
				             *       cur,            // Next input character to be read
			
 
				                                     //   (YYCURSOR)
			
 
				             *       mar,            // Most recent match (YYMARKER)
			
 
				-            *       tok;            // Start of current token.
			
 
				-    size_t          ct;             // Number of parsed triples.
			
 
				-    int             eof;            // if we have reached EOF (T|F)
			
 
				+            *       tok,            // Start of current token.
			
 
				+            *       bol;            // Address of the beginning of the current
			
 
				+                                    //   line (for debugging).
			
 
				+    unsigned        line;           // Current line no. (for debugging).
			
 
				+    unsigned        ct;             // Number of parsed triples.
			
 
				+    bool            eof;            // if we have reached EOF.
			
 
				     /*!stags:re2c format = "YYCTYPE *@@;"; */
			
 
				 } ParseIterator;
			
 
				 
			
@@ -49,7 +51,7 @@ static int fill(ParseIterator *it)
 
				     if (shift < 1) {
			
 
				         return 2;
			
 
				     }
			
 
				-    printf ("Shifting bytes: %lu\n", shift);
			
 
				+    TRACE ("Shifting bytes: %lu\n", shift);
			
 
				     memmove(it->buf, it->tok, it->lim - it->tok);
			
 
				     it->lim -= shift;
			
 
				     it->cur -= shift;
			
@@ -68,6 +70,8 @@ static void parse_init(ParseIterator *it, FILE *file)
 
				     it->file = file;
			
 
				     it->buf = malloc (CHUNK_SIZE + 1);
			
 
				     it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE;
			
 
				+    it->line = 1;
			
 
				+    it->bol = it->buf;
			
 
				     it->ct = 0;
			
 
				     it->eof = 0;
			
 
				     /*!stags:re2c format = "it->@@ = NULL; "; */
			
@@ -173,8 +177,9 @@ loop:
 
				 
			
 
				     // For unresolved and partially resolved inconsistencies of the spec, see
			
 
				     // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
			
 
				-    WS                  = [\x09\x20]+;
			
 
				-    EOL                 = WS? [\x0D\x0A]+;
			
 
				+    _WS                 = [\x09\x20];
			
 
				+    WS                  = _WS+;
			
 
				+    EOL                 = [\x0D\x0A] (_WS | [\x0D\x0A])*;
			
 
				     DOT                 = [.];
			
 
				     HEX                 = [0-9A-Fa-f];
			
 
				     ECHAR               = [\\] [tbnrf"'\\];
			
@@ -187,25 +192,27 @@ loop:
 
				     LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
			
 
				 
			
 
				     IRIREF              = [<] IRI_CHARS [>];
			
 
				-    LITERAL             = LITERAL_QUOTE @lit_data_e WS? ("^^" WS? @dtype_s IRIREF | @lang_s LANGTAG)?;
			
 
				+    LITERAL             = LITERAL_QUOTE @lit_data_e _WS* ("^^" _WS* @dtype_s IRIREF | @lang_s LANGTAG)?;
			
 
				     BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
			
 
				     COMMENT             = "#" .*;
			
 
				 
			
 
				 
			
 
				     EOL {
			
 
				-        printf("End of line.\n");
			
 
				+        it->line ++;
			
 
				+        it->bol = YYCURSOR;
			
 
				+        TRACE("New line: #%u.\n", it->line);
			
 
				         return T_EOL;
			
 
				     }
			
 
				 
			
 
				     $ {
			
 
				-        printf("End of buffer.\n");
			
 
				+        TRACE(STR, "End of buffer.\n");
			
 
				         return T_EOF;
			
 
				     }
			
 
				 
			
 
				     IRIREF {
			
 
				         YYCTYPE *data = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
			
 
				 
			
 
				-        printf ("URI data: %s\n", data);
			
 
				+        TRACE ("URI data: %s\n", data);
			
 
				 
			
 
				         *term = LSUP_uri_new ((char*)data);
			
 
				         free (data);
			
@@ -217,7 +224,7 @@ loop:
 
				         // Only unescape Unicode from data.
			
 
				         size_t size = lit_data_e - it->tok - 2;
			
 
				         YYCTYPE *data = unescape_unicode (it->tok + 1, size);
			
 
				-        printf ("Literal data: %s\n", data);
			
 
				+        TRACE ("Literal data: %s\n", data);
			
 
				 
			
 
				         YYCTYPE *datatype = NULL, *lang = NULL;
			
 
				 
			
@@ -226,7 +233,7 @@ loop:
 
				             datatype = malloc (size);
			
 
				             memcpy (datatype, dtype_s + 1, size);
			
 
				             datatype [size - 1] = '\0';
			
 
				-            printf ("datatype: %s\n", datatype);
			
 
				+            TRACE ("datatype: %s\n", datatype);
			
 
				         }
			
 
				 
			
 
				         if (lang_s) {
			
@@ -234,7 +241,7 @@ loop:
 
				             lang = malloc (size);
			
 
				             memcpy (lang, lang_s + 1, size);
			
 
				             lang [size - 1] = '\0';
			
 
				-            printf ("lang: %s\n", lang);
			
 
				+            TRACE ("lang: %s\n", lang);
			
 
				         }
			
 
				 
			
 
				         *term = LSUP_term_new (LSUP_TERM_LITERAL, (char*)data, (char*)datatype, (char*)lang);
			
@@ -249,7 +256,7 @@ loop:
 
				     BNODE {
			
 
				         YYCTYPE *data = unescape_unicode (it->tok + 2, YYCURSOR - it->tok - 1);
			
 
				 
			
 
				-        printf ("BNode data: %s\n", data);
			
 
				+        TRACE ("BNode data: %s\n", data);
			
 
				 
			
 
				         *term = LSUP_term_new (LSUP_TERM_BNODE, (char*)data, NULL, NULL);
			
 
				         free (data);
			
@@ -258,14 +265,14 @@ loop:
 
				     }
			
 
				 
			
 
				     DOT {
			
 
				-        printf ("End of triple.\n");
			
 
				+        TRACE (STR, "End of triple.\n");
			
 
				         it->ct ++;
			
 
				 
			
 
				         return T_DOT;
			
 
				     }
			
 
				 
			
 
				     WS {
			
 
				-        printf("Separator.\n");
			
 
				+        TRACE (STR, "Separator.\n");
			
 
				 
			
 
				         return T_WS;
			
 
				     }
			
@@ -275,14 +282,14 @@ loop:
 
				         YYCTYPE *data = malloc (size);
			
 
				         memcpy (data, it->tok, size);
			
 
				         data [size - 1] = '\0';
			
 
				-        printf ("Comment: `%s`\n", data);
			
 
				+        TRACE ("Comment: `%s`\n", data);
			
 
				         free (data);
			
 
				 
			
 
				         goto loop;
			
 
				     }
			
 
				 
			
 
				     * {
			
 
				-        printf (
			
 
				+        TRACE (
			
 
				             "Invalid token @ %lu: %s (\\x%x)\n",
			
 
				             YYCURSOR - it->buf - 1, it->tok, *it->tok);
			
 
				 
			
@@ -317,8 +324,11 @@ LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 
				 }
			
 
				 
			
 
				 LSUP_rc
			
 
				-LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
			
 
				+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct, char **err_p)
			
 
				 {
			
 
				+    *err_p = NULL;
			
 
				+    *gr_p = NULL;
			
 
				+
			
 
				     ParseIterator parse_it;
			
 
				     parse_init (&parse_it, stream);
			
 
				 
			
@@ -332,7 +342,23 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
 
				         int ttype = lex (&parse_it, &term);
			
 
				 
			
 
				         if (ttype == -1) {
			
 
				-            fprintf(stderr, "Parse error.\n");
			
 
				+            char token[16];
			
 
				+            strncpy (token, (const char *)parse_it.tok, 15);
			
 
				+
			
 
				+            char *err_start = "Parse error near token `";
			
 
				+
			
 
				+            char err_info [64];
			
 
				+            sprintf(
			
 
				+                    err_info, "[...]' at line %u, character %ld.\n",
			
 
				+                    parse_it.line, parse_it.cur - parse_it.bol);
			
 
				+
			
 
				+            size_t err_size = strlen (err_start) + strlen (token)
			
 
				+                    + strlen (err_info) + 1;
			
 
				+            char *err_str = malloc (err_size);
			
 
				+            sprintf (err_str, "%s%s%s", err_start, token, err_info);
			
 
				+
			
 
				+            *err_p = err_str;
			
 
				+
			
 
				             goto fail;
			
 
				         }
			
 
				 
			
@@ -347,7 +373,7 @@ LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
 
				 
			
 
				     if (ct) *ct = parse_it.ct;
			
 
				 
			
 
				-    TRACE ("Parsed %lu triples.\n", parse_it.ct);
			
 
				+    TRACE ("Parsed %u triples.\n", parse_it.ct);
			
 
				     TRACE ("Graph size: %lu\n", LSUP_graph_size (gr));
			
 
				 
			
 
				     LSUP_term_free (term);
			
--- a/test/test_codec_nt.c
+++ b/test/test_codec_nt.c
@@ -71,6 +71,14 @@ static char *start_doc = (
 
				     "# FREE WHITESPACE!\n\n\n\n\n\n\n      \n\n\n\n"
			
 
				 );
			
 
				 
			
 
				+
			
 
				+static char *bad_doc = (
			
 
				+    "<urn:local:s1> <http://example.org/p1> \"hello\" . #  Comment here.\n"
			
 
				+    "<urn:local:s1> <http://example.org/p1>\"hello\"@es-ES .\n"
			
 
				+    "<urn:local:s1> dc:title \"Bad Data.\" ."
			
 
				+);
			
 
				+
			
 
				+
			
 
				 // End result NT document as it should be produced by the NT codec.
			
 
				 // Lines should not be checked in strict order.
			
 
				 static char *end_doc[6] = {
			
@@ -200,10 +208,12 @@ test_decode_nt_graph()
 
				 
			
 
				     LSUP_Graph *gr;
			
 
				     size_t ct;
			
 
				-    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct));
			
 
				+    char *err;
			
 
				+    EXPECT_PASS (nt_codec.gr_decoder (input, &gr, &ct, &err));
			
 
				 
			
 
				     fclose (input);
			
 
				 
			
 
				+    ASSERT (err == NULL, "Error string is not NULL!");
			
 
				     EXPECT_INT_EQ (ct, 6);
			
 
				     EXPECT_INT_EQ (LSUP_graph_size (gr), 6);
			
 
				 
			
@@ -216,6 +226,29 @@ test_decode_nt_graph()
 
				 }
			
 
				 
			
 
				 
			
 
				+/** @brief Decoding a malformed N-Triples document must fail with a report.
			
 
				+ *
			
 
				+ * Feeds bad_doc to the graph decoder and verifies that it returns
			
 
				+ * LSUP_VALUE_ERR and that the error string names the offending token, its
			
 
				+ * line and its character position.
			
 
				+ *
			
 
				+ * @return 0 if all checks pass.
			
 
				+ */
			
 
				+static int
			
 
				+test_decode_nt_bad_graph()
			
 
				+{
			
 
				+    // The stream size must be that of the document actually being read.
			
 
				+    FILE *input = fmemopen ((void *)bad_doc, strlen (bad_doc), "r");
			
 
				+    ASSERT (input != NULL, "Could not open bad document stream!");
			
 
				+
			
 
				+    LSUP_Graph *gr = NULL;
			
 
				+    size_t ct;
			
 
				+    char *err = NULL;
			
 
				+    LSUP_rc rc = nt_codec.gr_decoder (input, &gr, &ct, &err);
			
 
				+
			
 
				+    // Close the stream before the checks, as test_decode_nt_graph does, so
			
 
				+    // that it is released even if a check ends the test early.
			
 
				+    fclose (input);
			
 
				+
			
 
				+    EXPECT_INT_EQ (rc, LSUP_VALUE_ERR);
			
 
				+
			
 
				+    // Guard the strstr() calls below against a missing error report.
			
 
				+    ASSERT (err != NULL, "Error string is NULL!");
			
 
				+    TRACE ("Error: %s", err);
			
 
				+    ASSERT (strstr (err, "`dc:title") != NULL, "Wrong error string report!");
			
 
				+    ASSERT (strstr (err, "line 3") != NULL, "Wrong error line report!");
			
 
				+    ASSERT (strstr (err, "character 16") != NULL, "Wrong error char report!");
			
 
				+
			
 
				+    // NOTE(review): err looks heap-allocated by the decoder and is not
			
 
				+    // released here; confirm ownership and free it if the caller owns it.
			
 
				+
			
 
				+    LSUP_graph_free (gr);
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 int codec_nt_tests()
			
 
				 {
			
 
				     LSUP_Term **terms = init_terms();
			
@@ -225,6 +258,7 @@ int codec_nt_tests()
 
				     RUN (test_encode_nt_graph);
			
 
				     RUN (test_decode_nt_term);
			
 
				     RUN (test_decode_nt_graph);
			
 
				+    RUN (test_decode_nt_bad_graph);
			
 
				 
			
 
				     free_terms(terms);