Browse Source

Integrate NT parser into codec framework.

Stefano Cossu 4 years ago
parent
commit
5d823693aa
5 changed files with 177 additions and 164 deletions
  1. 12 9
      Makefile
  2. 36 44
      include/codec_base.h
  3. 38 0
      include/nt_parser.h
  4. 80 72
      src/codec/nt_lexer.re
  5. 11 39
      src/codec_nt.c

+ 12 - 9
Makefile

@@ -1,18 +1,18 @@
 CODEC_DIR=src/codec
 CC=gcc
-CFLAGS+= -Wall -D_XOPEN_SOURCE=500
-INCLUDE=-Iinclude -Iext/xxHash -Iext/openldap/libraries/liblmdb -Iext/uthash/src
+CFLAGS+= -Wall
+INCLUDE=-I. -Iinclude -Iext/xxHash -Iext/openldap/libraries/liblmdb \
+	-Iext/uthash/src
 LIB=-luuid -lpthread
 SRC=ext/xxHash/xxhash.c ext/openldap/libraries/liblmdb/mdb.c \
-	ext/openldap/libraries/liblmdb/midl.c src/*.c
+	ext/openldap/libraries/liblmdb/midl.c src/*.c src/codec/*_grammar.c \
+	src/codec/*_parser.c
 
-.PHONY: build test lint profile
+.PHONY: build_parsers lint profile
 
 default: test
 
-build_parsers:; $(MAKE) -C $(CODEC_DIR)
-
-build:
+build: build_parsers
 	$(CC) \
 		$(CFLAGS) -Werror
 		$(INCLUDE) \
@@ -28,7 +28,7 @@ lint:
 		-posix-lib \
 		test.c
 
-test:
+test: build_parsers
 	$(CC) \
 		$(CFLAGS) -g3 -DDEBUG \
 		$(INCLUDE) -Itest \
@@ -59,10 +59,13 @@ valgrind:
 memcheck: test valgrind
 
 
-profile:
+profile: build_parsers
 	$(CC) \
 		$(CFLAGS) \
 		$(INCLUDE) \
 		$(LIB) \
 		$(SRC) profile.c \
 		-o bin/profile
+
+
+build_parsers:; $(MAKE) -C $(CODEC_DIR)

+ 36 - 44
include/codec_base.h

@@ -63,35 +63,19 @@ typedef LSUP_rc (*term_dec_fn_t)(
 typedef LSUP_CodecIterator * (*gr_encode_init_fn_t)(const LSUP_Graph *gr);
 
 
-/** @brief Initialize a graph decoding loop.
+/** @brief Perform one encoding iteration.
  *
- * This prototype is to be implemented by graph decoding loops. It should
- * create an iterator and perform all initial setup for tokenizing text into
- * processing units.
- *
- * @param[in] rep The RDF string to be decoded. Any namespace prefixes defined in
- *  this string will be used for decoding.
- *
- * @return A codec iterator handle to be passed to a #gr_codec_iter_fn_t
- * function and, eventually, to a #gr_codec_done_fn_t function.
- */
-typedef LSUP_CodecIterator * (*gr_decode_init_fn_t)(const char *rep);
-
-
-/** @brief Perform one encoding or decoding iteration.
- *
- * This prototype is used for both encoding and decoding function. It should
- * perform all the steps to either encode one or more triples into a complete
- * RDF fragment representing a complete triple or a set thereof, or to parse
- * a RDF string into one or more complete triples.
+ * Implementations of this prototype should perform all the steps to encode one
+ * or more triples into a complete RDF fragment representing a complete triple
+ * or a set thereof. The input unit is up to the implementation.
  *
  * @param[in] it Iterator handle.
  *
- * @param[out] res Handle to be populated with the data obtained from encoding
- * (a string) or decoding (a NULL-terminated array of triples). This pointer
- * must be passed initialized (it may be NULL) and should be eventually
- * freed manually at the end of the loop (it is reallocated at each iteration,
- * so memory from a previous iteration may be overwritten with new data).
+ * @param[out] res Handle to be populated with a string obtained from encoding.
+ * This pointer must be passed initialized (it may be NULL) and should be
+ * eventually freed manually at the end of the loop (it is reallocated at each
+ * iteration, so memory from a previous iteration may be overwritten with new
+ * data).
  *
  * @return LSUP_OK if a new token was processed; LSUP_END if the end of the
  *  loop was reached.
@@ -101,11 +85,24 @@ typedef LSUP_rc (*gr_codec_iter_fn_t)(LSUP_CodecIterator *it, void **res);
 typedef void (*gr_codec_done_fn_t)(LSUP_CodecIterator *it);
 
 
-/** TODO
+/** @brief Prototype for decoding a complete RDF document into a graph.
+ *
+ * Implementations SHOULD consume data from the file handle in chunks.
+ *
+ * @param[in] rep Open file handle pointing to the RDF data. Implementations
+ * MUST NOT close the file handle.
+ *
+ * @param[out] gr Pointer to a graph handle to be generated from decoding.
+ *
+ * @param[out] ct If not NULL, it may be populated with the number of triples
+ *  parsed (which may be different from the resulting graph size).
+ *  Implementations may choose not to use this, and they must account for the
+ *  value to be NULL.
+ *
+ * @return Implementations MUST return LSUP_OK on success and a negative value
+ *  on parsing error.
  */
-typedef LSUP_rc (*gr_dec_fn_t)(const char *rep, LSUP_Graph **gr);
-
-
+typedef LSUP_rc (*gr_decode_fn_t)(FILE *rep, LSUP_Graph **gr, size_t *ct);
 
 
 /** @brief Codec structure.
@@ -115,19 +112,17 @@ typedef LSUP_rc (*gr_dec_fn_t)(const char *rep, LSUP_Graph **gr);
  * following defined:
  *
 * - name: A brief (16-char max), human-readable name to identify the codec.
- * - mimetype: MIME type associated with the codec.
+ * - mimetype: MIME type (32-char max) associated with the codec.
  * - extension: File extension associated with the serialized file.
  *
- * - term_encoder: Callback function for encoding a single term.
- * - term_decoder: Callback function for decoding a single term.
+ * - term_encoder: Encode a single term.
  *
 * - gr_encode_init: Initialize a graph encoding loop.
  * - gr_encode_iter: Run one iteration of encoding on one or more triples.
  * - gr_encode_done: Finalize the encoding loop and free the support data.
  *
- * - gr_decode_init: Initialize a graph decoding loop.
- * - gr_decode_iter: Run one iteration of decoding on one or more text lines.
- * - gr_decode_done: Finalize the decoding loop and free the support data.
+ * - term_decoder: Decode a single term.
+ * - gr_decoder: Decode a RDF document into a graph.
  *
 * For documentation on the individual encoding and decoding callbacks, see the
  * related function prototypes.
@@ -137,19 +132,16 @@ typedef struct codec_t {
     char                mimetype[32];   // MIME type associated with the codec.
     char                extension[8];   // Serialized file extension.
 
-    // Term encoding and decoding.
+    // Encoding.
     term_enc_fn_t       term_encoder;   // Term encoder function.
-    term_dec_fn_t       term_decoder;   // Term decoder function.
 
-    // Graph encoding.
     gr_encode_init_fn_t gr_encode_init; // Graph encoder initialization.
-    gr_codec_iter_fn_t  gr_encode_iter; // Graph encoder initialization.
-    gr_codec_done_fn_t  gr_encode_done; // Graph encoder initialization.
+    gr_codec_iter_fn_t  gr_encode_iter; // Graph encoder iteration.
+    gr_codec_done_fn_t  gr_encode_done; // Graph encoder finalization.
 
-    // Graph decoding.
-    gr_decode_init_fn_t gr_decode_init; // Graph decoder initialization.
-    gr_codec_iter_fn_t  gr_decode_iter; // Graph decoder initialization.
-    gr_codec_done_fn_t  gr_decode_done; // Graph decoder initialization.
+    // Decoding.
+    term_dec_fn_t       term_decoder;   // Term decoder function.
+    gr_decode_fn_t      gr_decoder;      // Graph decoder function.
 } LSUP_Codec;
 
 #endif

+ 38 - 0
include/nt_parser.h

@@ -0,0 +1,38 @@
+#ifndef _LSUP_NT_PARSER_H
+#define _LSUP_NT_PARSER_H
+
+#include "graph.h"
+
+/** @brief Parse a single term.
+ *
+ * @param[in] rep N-Triples representation as a character string.
+ *
+ * @param[in] map Unused: there is no namespace prefixing in N-triples. Kept
+ *  for interface compatibility. May be NULL.
+ *
+ * @param[out] term Term to be created from the string.
+ *
+ * @return LSUP_OK on success, LSUP_VALUE_ERR if the string is not valid
+ *  N-Triples syntax for a IRI ref, Literal or BNode.
+ */
+LSUP_rc
+LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term);
+
+
+/** @brief Parse a N-Triples document from a file handle.
+ *
+ * @param[in] stream Open file handle for the N-Triples document.
+ *
+ * @param[out] gr Pointer to a graph handle to be created. The new graph will
+ *  have a random UUID URN.
+ *
+ * @param[out] ct If not NULL it is populated with the number of triples
+ *  parsed. This may be more than the triples in the resulting graph.
+ *
+ * @return LSUP_OK on success, LSUP_VALUE_ERR if a parsing error was
+ *  encountered. TODO Add line/char info for parsing errors.
+ */
+LSUP_rc
+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr, size_t *ct);
+
+#endif

+ 80 - 72
src/codec/nt_lexer.re

@@ -1,18 +1,13 @@
-#include <errno.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
 #include "graph.h"
 #include "src/codec/nt_grammar.h"
+#include "nt_parser.h"
 
 
 #define YYCTYPE     unsigned char
-#define YYCURSOR    in->cur
-#define YYMARKER    in->mar
-#define YYLIMIT     in->lim
-#define YYFILL      fill(in) == 0
+#define YYCURSOR    it->cur
+#define YYMARKER    it->mar
+#define YYLIMIT     it->lim
+#define YYFILL      fill(it) == 0
 
 /**
  * Max chunk size passed to scanner at each iteration.
@@ -42,61 +37,61 @@ typedef struct {
     size_t          ct;             // Number of parsed triples.
     int             eof;            // if we have reached EOF (T|F)
     /*!stags:re2c format = "YYCTYPE *@@;"; */
-} Input;
+} ParseIterator;
 
 
-static int fill(Input *in)
+static int fill(ParseIterator *it)
 {
-    if (in->eof) {
+    if (it->eof) {
         return 1;
     }
-    const size_t shift = in->tok - in->buf;
+    const size_t shift = it->tok - it->buf;
     if (shift < 1) {
         return 2;
     }
     printf ("Shifting bytes: %lu\n", shift);
-    memmove(in->buf, in->tok, in->lim - in->tok);
-    in->lim -= shift;
-    in->cur -= shift;
-    in->mar -= shift;
-    in->tok -= shift;
-    in->lim += fread(in->lim, 1, shift, in->file);
-    /*!stags:re2c format = "if (in->@@) in->@@ -= shift; "; */
-    in->lim[0] = 0;
-    in->eof |= in->lim < in->buf + CHUNK_SIZE;
+    memmove(it->buf, it->tok, it->lim - it->tok);
+    it->lim -= shift;
+    it->cur -= shift;
+    it->mar -= shift;
+    it->tok -= shift;
+    it->lim += fread(it->lim, 1, shift, it->file);
+    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
+    it->lim[0] = 0;
+    it->eof |= it->lim < it->buf + CHUNK_SIZE;
     return 0;
 }
 
 
-static void init(Input *in, FILE *file)
+static void parse_init(ParseIterator *it, FILE *file)
 {
-    in->file = file;
-    in->buf = malloc (CHUNK_SIZE + 1);
-    in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
-    in->ct = 0;
-    in->eof = 0;
-    /*!stags:re2c format = "in->@@ = NULL; "; */
-    fill (in);
+    it->file = file;
+    it->buf = malloc (CHUNK_SIZE + 1);
+    it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE;
+    it->ct = 0;
+    it->eof = 0;
+    /*!stags:re2c format = "it->@@ = NULL; "; */
+    fill (it);
 }
 
 
 // TODO Make buffer extensible if a token is larger than the current buf size.
-static int __attribute__((unused)) extend (Input *in)
+static int __attribute__((unused)) extend (ParseIterator *it)
 {
-    size_t delta = YYLIMIT - in->buf + CHUNK_SIZE;
-    YYCTYPE *tmp = realloc (in->buf, delta);
+    size_t delta = YYLIMIT - it->buf + CHUNK_SIZE;
+    YYCTYPE *tmp = realloc (it->buf, delta);
     if (!tmp) return ENOMEM;
 
-    in->lim += delta;
+    it->lim += delta;
 
-    in->buf = tmp;
+    it->buf = tmp;
 
     return 0;
 }
 
 
-static void done (Input *in)
-{ free (in->buf); }
+static void parse_done (ParseIterator *it)
+{ free (it->buf); }
 
 
 /** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
@@ -157,13 +152,13 @@ void ParseFree();
 
 // Lexer.
 
-static int lex (Input *in, LSUP_Term **term)
+static int lex (ParseIterator *it, LSUP_Term **term)
 {
     const YYCTYPE *lit_data_e, *dtype_s, *lang_s;
 
 loop:
 
-    in->tok = in->cur;
+    it->tok = it->cur;
 
     *term = NULL;
 
@@ -171,7 +166,7 @@ loop:
     re2c:eof = 0;
     re2c:flags:8 = 1;
     re2c:flags:tags = 1;
-    re2c:tags:expression = "in->@@";
+    re2c:tags:expression = "it->@@";
     re2c:api:style = functions;
     re2c:define:YYFILL:naked = 1;
 
@@ -208,7 +203,7 @@ loop:
     }
 
     IRIREF {
-        YYCTYPE *data = unescape_unicode (in->tok + 1, YYCURSOR - in->tok - 2);
+        YYCTYPE *data = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
 
         printf ("URI data: %s\n", data);
 
@@ -220,8 +215,8 @@ loop:
 
     LITERAL {
         // Only unescape Unicode from data.
-        size_t size = lit_data_e - in->tok - 2;
-        YYCTYPE *data = unescape_unicode (in->tok + 1, size);
+        size_t size = lit_data_e - it->tok - 2;
+        YYCTYPE *data = unescape_unicode (it->tok + 1, size);
         printf ("Literal data: %s\n", data);
 
         YYCTYPE *datatype = NULL, *lang = NULL;
@@ -252,7 +247,7 @@ loop:
     }
 
     BNODE {
-        YYCTYPE *data = unescape_unicode (in->tok + 2, YYCURSOR - in->tok - 1);
+        YYCTYPE *data = unescape_unicode (it->tok + 2, YYCURSOR - it->tok - 1);
 
         printf ("BNode data: %s\n", data);
 
@@ -264,7 +259,7 @@ loop:
 
     DOT {
         printf ("End of triple.\n");
-        in->ct ++;
+        it->ct ++;
 
         return T_DOT;
     }
@@ -273,13 +268,12 @@ loop:
         printf("Separator.\n");
 
         return T_WS;
-        //goto loop;
     }
 
     COMMENT {
-        size_t size = YYCURSOR - in->tok + 1;
+        size_t size = YYCURSOR - it->tok + 1;
         YYCTYPE *data = malloc (size);
-        memcpy (data, in->tok, size);
+        memcpy (data, it->tok, size);
         data [size - 1] = '\0';
         printf ("Comment: `%s`\n", data);
         free (data);
@@ -290,7 +284,7 @@ loop:
     * {
         printf (
             "Invalid token @ %lu: %s (\\x%x)\n",
-            YYCURSOR - in->buf - 1, in->tok, *in->tok);
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
 
         return -1;
     }
@@ -299,22 +293,33 @@ loop:
 }
 
 
-int main(int argc, char *argv[])
+LSUP_rc
+LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
 {
-    Input input;
+    FILE *stream = fmemopen ((void *)rep, strlen (rep), "r");
 
-    if (argc != 2) {
-        fprintf (stderr, "One argument required.\n");
-        return -1;
-    }
+    ParseIterator it;
+    parse_init (&it, stream);
 
-    FILE *fh = fopen (argv[1], "r");
-    if (!fh) {
-        fprintf (stderr, "Error opening file.\n");
-        return -1;
+    int ttype = lex (&it, term);
+
+    parse_done (&it);
+
+    switch (ttype) {
+        case T_IRIREF:
+        case T_LITERAL:
+        case T_BNODE:
+            return LSUP_OK;
+        default:
+            return LSUP_VALUE_ERR;
     }
+}
 
-    init (&input, fh);
+LSUP_rc
+LSUP_nt_parse_doc (FILE *stream, LSUP_Graph **gr_p, size_t *ct)
+{
+    ParseIterator parse_it;
+    parse_init (&parse_it, stream);
 
     void *parser = ParseAlloc (malloc);
 
@@ -323,17 +328,14 @@ int main(int argc, char *argv[])
     LSUP_Term *term = NULL;
 
     for (;;) {
-        int ttype = lex (&input, &term);
+        int ttype = lex (&parse_it, &term);
 
         if (ttype == -1) {
             fprintf(stderr, "Parse error.\n");
-            break;
+            goto fail;
         }
 
-        printf ("Token #%d\n", ttype);
-
         Parse (parser, ttype, term, it);
-        //if (term) LSUP_term_free (term);
 
         if (ttype == T_EOF) break;
     };
@@ -342,17 +344,23 @@ int main(int argc, char *argv[])
 
     LSUP_graph_add_done (it);
 
-    if (term) LSUP_term_free (term);
+    if (ct) *ct = parse_it.ct;
+
+    TRACE ("Parsed %lu triples.\n", parse_it.ct);
+    TRACE ("Graph size: %lu\n", LSUP_graph_size (gr));
+
+    LSUP_term_free (term);
 
     ParseFree (parser, free);
-    fclose (fh);
-    done (&input);
+    parse_done (&parse_it);
+
+    *gr_p = gr;
 
-    printf ("Parsed %lu triples.\n", input.ct);
+    return LSUP_OK;
 
-    printf ("Graph size: %lu\n", LSUP_graph_size (gr));
+fail:
     LSUP_graph_free (gr);
 
-    return 0;
+    return LSUP_VALUE_ERR;
 }
 

+ 11 - 39
src/codec_nt.c

@@ -1,4 +1,5 @@
 #include "codec_nt.h"
+#include "nt_parser.h"
 
 /** @brief List of characters to be escaped in serialized literals.
  *
@@ -96,14 +97,6 @@ term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 }
 
 
-static LSUP_rc
-nt_to_term (const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term)
-{
-    // TODO
-    return LSUP_NOT_IMPL_ERR;
-}
-
-
 static LSUP_CodecIterator *
 gr_to_nt_init (const LSUP_Graph *gr)
 {
@@ -163,40 +156,19 @@ gr_to_nt_done (LSUP_CodecIterator *it)
 }
 
 
-static LSUP_CodecIterator *
-nt_to_gr_init (const char *rep)
-{
-    // TODO
-    return NULL;
-}
-
-
-static LSUP_rc
-nt_to_gr_iter (LSUP_CodecIterator *it, void **res)
-{
-    // TODO
-    return LSUP_NOT_IMPL_ERR;
-}
+const LSUP_Codec nt_codec = {
+    .name               = "N-Triples",
+    .mimetype           = "application/n-triples",
+    .extension          = "nt",
 
+    .term_encoder       = term_to_nt,
 
-static void
-nt_to_gr_done (LSUP_CodecIterator *it) {
-    free (it);
-}
+    .gr_encode_init     = gr_to_nt_init,
+    .gr_encode_iter     = gr_to_nt_iter,
+    .gr_encode_done     = gr_to_nt_done,
 
-
-const LSUP_Codec nt_codec = {
-    .name           = "N-Triples",
-    .mimetype       = "application/n-triples",
-    .extension      = "nt",
-    .term_encoder   = term_to_nt,
-    .term_decoder   = nt_to_term,
-    .gr_encode_init = gr_to_nt_init,
-    .gr_encode_iter = gr_to_nt_iter,
-    .gr_encode_done = gr_to_nt_done,
-    .gr_decode_init = nt_to_gr_init,
-    .gr_decode_iter = nt_to_gr_iter,
-    .gr_decode_done = nt_to_gr_done,
+    .term_decoder       = LSUP_nt_parse_term,
+    .gr_decoder         = LSUP_nt_parse_doc,
 };