Browse Source

Initial N-Triples grammar file.

Stefano Cossu 4 years ago
parent
commit
d634579018
3 changed files with 155 additions and 84 deletions
  1. 6 3
      Makefile
  2. 67 0
      src/codec/nt_grammar
  3. 82 81
      src/codec/nt_lexer.re

+ 6 - 3
Makefile

@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS+= -Wall -std=c11 -D_XOPEN_SOURCE=500
+CFLAGS+= -Wall -D_XOPEN_SOURCE=500
 INCLUDE=-Iinclude -Iext/xxHash -Iext/openldap/libraries/liblmdb -Iext/uthash/src
 LIB=-luuid -lpthread
 SRC=ext/xxHash/xxhash.c ext/openldap/libraries/liblmdb/mdb.c \
@@ -35,12 +35,15 @@ test:
 
 
 test_lexer:
-	re2c src/codec/nt.re -o src/codec/nt.c -T --case-ranges && \
+	cd src/codec; \
+	lemon -T/usr/share/lemon/lempar.c nt_grammar && \
+	re2c nt_lexer.re -o nt_lexer.c -T --case-ranges && \
+	cd ../../; \
 	$(CC) \
 		$(CFLAGS) -g3 -DDEBUG \
 		$(INCLUDE) -I. \
 		$(LIB) \
-		$(SRC) src/codec/nt.c \
+		$(SRC) src/codec/nt_lexer.c src/codec/nt_grammar.c \
 		-o bin/test_lexer
 
 

+ 67 - 0
src/codec/nt_grammar

@@ -0,0 +1,67 @@
+%include {
+
+/** @brief Lemon parser grammar for N-Triples.
+ *
+ * The `lemon' parser generator executable must be in your PATH:
+ * https://sqlite.org/src/doc/trunk/doc/lemon.html
+ *
+ * To generate the parser, run: `lemon ${FILE}'
+ */
+
+#include "graph.h"
+}
+
+// All terminals share one value type; token codes in the generated header get the T_ prefix.
+%token_type { LSUP_Term * }
+%token_prefix "T_"
+// Value types of the non-terminals, with the destructor Lemon runs when it discards one.
+%type triple            { LSUP_SerTriple * }
+%destructor triple      { LSUP_striple_free ($$); }
+%type subject           { LSUP_Term * }
+%destructor subject     { LSUP_term_free ($$); }
+%type predicate         { LSUP_Term * }
+%destructor predicate   { LSUP_term_free ($$); }
+%type object            { LSUP_Term * }
+%destructor object      { LSUP_term_free ($$); }
+%default_type           { void * }
+// Extra argument passed on each Parse() call and exposed to rule actions under the name it.
+%extra_argument         { LSUP_GraphIterator *it }
+
+
+// Rules.
+// Top level rule. NOTE(review): its action runs on reduction, i.e. after EOF, despite the message.
+ntriplesDoc ::= triples EOF. { printf (" Start of document.\n"); }
+// Lines: blank line(s) only, or any number of triples each ended by at least one EOL.
+triples     ::= eol.
+triples     ::= triple eol.
+triples     ::= triples triple eol.
+// One statement: subject, predicate and object with optional whitespace, closed by DOT.
+triple(A)   ::= ws subject(S) ws predicate(P) ws object(O) ws DOT. {
+                // Build the serialized triple from buffers made out of the three terms.
+                A = LSUP_striple_new (
+                    LSUP_buffer_new_from_term (S),
+                    LSUP_buffer_new_from_term (P),
+                    LSUP_buffer_new_from_term (O)
+                );
+                // Add the triple via the iterator supplied as extra argument.
+                LSUP_graph_add_iter (it, A);
+                // NOTE(review): assumes the buffers above do not alias the terms freed here - confirm.
+                LSUP_term_free (S);
+                LSUP_term_free (P);
+                LSUP_term_free (O);
+            }
+// Term rules carry no action. NOTE(review): S, P and O presumably receive the token value - confirm.
+subject     ::= IRIREF.
+subject     ::= BNODE.
+
+predicate   ::= IRIREF.
+
+object      ::= IRIREF.
+object      ::= BNODE.
+object      ::= LITERAL.
+// One or more consecutive EOL tokens.
+eol         ::= EOL.
+eol         ::= eol EOL.
+// Optional whitespace: empty, or a single WS token.
+ws          ::=.
+ws          ::= WS. { printf ("WS in grammar.\n"); }

+ 82 - 81
src/codec/nt.re → src/codec/nt_lexer.re

@@ -4,7 +4,9 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "term.h"
+#include "graph.h"
+#include "src/codec/nt_grammar.h"
+
 
 #define YYCTYPE     unsigned char
 #define YYCURSOR    in->cur
@@ -27,19 +29,6 @@
 #define MAX_TOKEN_SIZE 8192
 
 
-typedef enum {
-    T_UNDEFINED,    // Undefined token.
-    T_EOF,          // end of buffer.
-    T_EOL,          // End of line.
-    T_SEP,          // Separator (whitespace).
-    T_DOT,          // End of triple.
-    T_IRIREF,       // IRI reference.
-    T_LITERAL,      // Literal term.
-    T_BNODE,        // Blank node.
-    T_COMMENT,      // Comment.
-} TokenType;
-
-
 typedef struct {
     FILE *          file;           // Input file handle.
     char *          buf,            // Start of buffer.
@@ -50,13 +39,14 @@ typedef struct {
                                     //   (YYCURSOR)
          *          mar,            // Most recent match (YYMARKER)
          *          tok;            // Start of current token.
+    size_t          ct;             // Number of parsed triples.
     int             eof;            // if we have reached EOF (T|F)
     /*!stags:re2c format = "char *@@;"; */
 } Input;
 
 
-typedef struct {
-    TokenType       type;           // Token type (enum).
+typedef struct token_t {            // Lexer token: type code plus its term, if any.
+    int             type;           // Token type (T_* code from nt_grammar.h).
     LSUP_Term *     term;           // Token data (e.g. char*,  Term*)
 } Token;
 
@@ -87,8 +77,9 @@ static int fill(Input *in)
 static void init(Input *in, FILE *file)
 {
     in->file = file;
-    in->buf = malloc(CHUNK_SIZE + 1);
+    in->buf = malloc (CHUNK_SIZE + 1);
     in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
+    in->ct = 0;
     in->eof = 0;
     /*!stags:re2c format = "in->@@ = NULL; "; */
     fill (in);
@@ -114,16 +105,24 @@ static void done (Input *in)
 { free (in->buf); }
 
 
-static Token *lex (Input *in)
+// Parser interface (implemented in the Lemon-generated nt_grammar.c).
+// Full prototypes let the compiler check the calls made from main().
+void *ParseAlloc (void *(*mallocProc)(size_t));
+void Parse (void *yyp, int yymajor, LSUP_Term *yyminor, LSUP_GraphIterator *it);
+void ParseFree (void *p, void (*freeProc)(void *));
+
+
+// Lexer.
+
+static int lex (Input *in, LSUP_Term **term)
 {
     const char *lit_data_e, *dtype_s, *lang_s;
 
+loop:
+
     in->tok = in->cur;
 
-    // Returned token.
-    Token *ret = malloc (sizeof (*ret));
-    ret->term = NULL;
-    ret->type = T_UNDEFINED;
+    *term = NULL;
 
     /*!re2c
     re2c:eof = 0;
@@ -137,40 +136,46 @@ static Token *lex (Input *in)
     // For unresolved and partially resolved inconsistencies of the spec, see
     // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
     WS                  = [\x09\x20]+;
-    OPT_WS              = WS?;
-    EOL                 = OPT_WS [\x0D\x0A]+;
+    EOL                 = WS? [\x0D\x0A]+;
     DOT                 = [.];
     HEX                 = [0-9A-Fa-f];
     ECHAR               = [\\] [tbnrf"'\\];
     UCHAR               = "\\u" HEX{4} | "\\U" HEX{8};
     PN_CHARS_BASE       = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
     PN_CHARS_U          = PN_CHARS_BASE | '_' | ':';
-    PN_CHARS            = PN_CHARS_U | '-' |
-                            [0-9\u00B7\u0300-\u036F\u203F-\u2040];
+    PN_CHARS            = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
     IRI_CHARS           = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
     LITERAL_QUOTE       = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
     LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
 
     IRIREF              = [<] IRI_CHARS [>];
-    LITERAL             = LITERAL_QUOTE @lit_data_e OPT_WS
-                            ("^^" OPT_WS @dtype_s IRIREF | @lang_s LANGTAG)?;
-    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")*
-                            PN_CHARS)?);
+    LITERAL             = LITERAL_QUOTE @lit_data_e WS? ("^^" WS? @dtype_s IRIREF | @lang_s LANGTAG)?;
+    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
     COMMENT             = "#" .*;
 
 
     EOL {
         printf("End of line.\n");
-        ret->type = T_EOL;
-
-        return ret;
+        return T_EOL;
     }
 
     $ {
         printf("End of buffer.\n");
-        ret->type = T_EOF;
+        return T_EOF;
+    }
+
+    IRIREF {
+        size_t size = YYCURSOR - in->tok - 1;
+        char *data = malloc (size);
+        memcpy (data, in->tok + 1, size);
+        data [size - 1] = '\0';
 
-        return ret;
+        printf ("URI data (%lu): %s\n", strlen (data), data);
+
+        *term = LSUP_uri_new (data);
+        free (data);
+
+        return T_IRIREF;
     }
 
     LITERAL {
@@ -198,30 +203,13 @@ static Token *lex (Input *in)
             printf ("lang (%lu): %s\n", strlen (lang), lang);
         }
 
-        ret->type = T_LITERAL;
-        ret->term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
+        *term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
 
         free (data);
-        if (datatype) free (datatype);
-        if (lang) free (lang);
+        free (datatype);
+        free (lang);
 
-        return ret;
-    }
-
-    IRIREF {
-        size_t size = YYCURSOR - in->tok - 1;
-        char *data = malloc (size);
-        memcpy (data, in->tok + 1, size);
-        data [size - 1] = '\0';
-
-        printf ("URI data (%lu): %s\n", strlen (data), data);
-
-        ret->type = T_IRIREF;
-        ret->term = LSUP_uri_new (data);
-
-        free (data);
-
-        return ret;
+        return T_LITERAL;
     }
 
     BNODE {
@@ -232,28 +220,24 @@ static Token *lex (Input *in)
 
         printf ("BNode data (%lu): %s\n", strlen(data), data);
 
-        ret->type = T_IRIREF;
-        ret->term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
-
+        *term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
         free (data);
 
-        return ret;
+        return T_BNODE;
     }
 
     DOT {
         printf ("End of triple.\n");
+        in->ct ++;
 
-        ret->type = T_DOT;
-
-        return ret;
+        return T_DOT;
     }
 
     WS {
         printf("Separator.\n");
 
-        ret->type = T_SEP;
-
-        return ret;
+        return T_WS;
+        //goto loop;
     }
 
     COMMENT {
@@ -264,9 +248,7 @@ static Token *lex (Input *in)
         printf ("Comment: `%s`\n", data);
         free (data);
 
-        ret->type = T_COMMENT;
-
-        return ret;
+        goto loop;
     }
 
     * {
@@ -274,12 +256,13 @@ static Token *lex (Input *in)
             "Invalid token @ %lu: %s (\\x%x)\n",
             YYCURSOR - in->buf - 1, in->tok, *in->tok);
 
-        return ret;
+        return -1;
     }
 
     */
 }
 
+
 int main(int argc, char *argv[])
 {
     Input input;
@@ -297,25 +280,43 @@ int main(int argc, char *argv[])
 
     init (&input, fh);
 
-    Token *tok = NULL;
-    while ((tok = lex (&input))->type != T_EOF){
-        printf ("Token #%d\n", tok->type);
-        if (tok->term) LSUP_term_free (tok->term);
-        free (tok);
-        if (tok->type == T_UNDEFINED) {
-            printf("Error.\n");
+    void *parser = ParseAlloc (malloc);
+
+    LSUP_Graph *gr = LSUP_graph_new (LSUP_STORE_MEM);
+    LSUP_GraphIterator *it = LSUP_graph_add_init (gr);
+    LSUP_Term *term = NULL;
+
+    for (;;) {
+        int ttype = lex (&input, &term);
+
+        if (ttype == -1) {
+            fprintf(stderr, "Parse error.\n");
             break;
         }
+
+        printf ("Token #%d\n", ttype);
+
+        Parse (parser, ttype, term, it);
+        //if (term) LSUP_term_free (term);
+
+        if (ttype == T_EOF) break;
     };
 
-    if (tok) {
-        if (tok->term) LSUP_term_free (tok->term);
-        free (tok);
-    }
+    Parse (parser, 0, NULL, it);
+
+    LSUP_graph_add_done (it);
 
+    if (term) LSUP_term_free (term);
+
+    ParseFree (parser, free);
     fclose (fh);
     done (&input);
 
+    printf ("Parsed %lu triples.\n", input.ct);
+
+    printf ("Graph size: %lu\n", LSUP_graph_size (gr));
+    LSUP_graph_free (gr);
+
     return 0;
 }