Browse Source

Wire lexer into library; lexer returns struct.

Stefano Cossu 4 years ago
parent
commit
a66f3fdc6e
2 changed files with 143 additions and 57 deletions
  1. 11 1
      Makefile
  2. 132 56
      src/codec/nt.re

+ 11 - 1
Makefile

@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS+= -Wall -std=c99 -D_XOPEN_SOURCE=500
+CFLAGS+= -Wall -std=c11 -D_XOPEN_SOURCE=500
 INCLUDE=-Iinclude -Iext/xxHash -Iext/openldap/libraries/liblmdb -Iext/uthash/src
 LIB=-luuid -lpthread
 SRC=ext/xxHash/xxhash.c ext/openldap/libraries/liblmdb/mdb.c \
@@ -34,6 +34,16 @@ test:
 		-o bin/test
 
 
+# Generate the N-Triples lexer from its re2c source, then compile it together
+# with the library sources into a standalone debug binary (bin/test_lexer).
+# Libraries are listed after the sources so the linker sees the references
+# before it decides which libraries are needed.
+test_lexer:
+	re2c src/codec/nt.re -o src/codec/nt.c -T --case-ranges && \
+	$(CC) \
+		$(CFLAGS) -g3 -DDEBUG \
+		$(INCLUDE) -I. \
+		$(SRC) src/codec/nt.c \
+		$(LIB) \
+		-o bin/test_lexer
+
+
 valgrind:
 	valgrind \
 	--leak-check=full --show-leak-kinds=all --track-origins=yes \

+ 132 - 56
src/codec/nt.re

@@ -1,11 +1,10 @@
-#include <assert.h>
 #include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-//#include "term.h"
+#include "term.h"
 
 #define YYCTYPE     unsigned char
 #define YYCURSOR    in->cur
@@ -29,6 +28,7 @@
 
 
 typedef enum {
+    T_UNDEFINED,    // Undefined token.
     T_EOF,          // end of buffer.
     T_EOL,          // End of line.
     T_SEP,          // Separator (whitespace).
@@ -37,7 +37,7 @@ typedef enum {
     T_LITERAL,      // Literal term.
     T_BNODE,        // Blank node.
     T_COMMENT,      // Comment.
-} Token;
+} TokenType;
 
 
 typedef struct {
@@ -55,6 +55,12 @@ typedef struct {
 } Input;
 
 
+typedef struct {                    // Value handed back by lex().
+    TokenType       type;           // Token kind; lex() starts at T_UNDEFINED.
+    LSUP_Term *     term;           // Term built by a rule; NULL if none.
+} Token;
+
+
 static int fill(Input *in)
 {
     if (in->eof) {
@@ -71,7 +77,7 @@ static int fill(Input *in)
     in->mar -= shift;
     in->tok -= shift;
     in->lim += fread(in->lim, 1, shift, in->file);
-    /*!stags:re2c format = "if (in->@@) in->@@ -= shift;"; */
+    /*!stags:re2c format = "if (in->@@) in->@@ -= shift; "; */
     in->lim[0] = 0;
     in->eof |= in->lim < in->buf + CHUNK_SIZE;
     return 0;
@@ -84,6 +90,7 @@ static void init(Input *in, FILE *file)
     in->buf = malloc(CHUNK_SIZE + 1);
     in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
     in->eof = 0;
+    /*!stags:re2c format = "in->@@ = NULL; "; */
     fill (in);
 }
 
@@ -107,12 +114,17 @@ static void done (Input *in)
 { free (in->buf); }
 
 
-static int lex (Input *in)
+static Token *lex (Input *in)
 {
     const char *lit_data_e, *dtype_s, *lang_s;
 
     in->tok = in->cur;
 
+    // Returned token.
+    Token *ret = malloc (sizeof (*ret));
+    ret->term = NULL;
+    ret->type = T_UNDEFINED;
+
     /*!re2c
     re2c:eof = 0;
     re2c:flags:8 = 1;
@@ -122,121 +134,185 @@ static int lex (Input *in)
     re2c:define:YYFILL:naked = 1;
 
 
-    EOL                 = [\r]?[\n];
-    SEP                 = [\x09\x20]+;
+    // For unresolved and partially resolved inconsistencies of the spec, see
+    // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
+    WS                  = [\x09\x20]+;
+    OPT_WS              = WS?;
+    EOL                 = OPT_WS [\x0D\x0A]+;
     DOT                 = [.];
     HEX                 = [0-9A-Fa-f];
     ECHAR               = [\\] [tbnrf"'\\];
     UCHAR               = "\\u" HEX{4} | "\\U" HEX{8};
     PN_CHARS_BASE       = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
     PN_CHARS_U          = PN_CHARS_BASE | '_' | ':';
-    PN_CHARS            = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
+    PN_CHARS            = PN_CHARS_U | '-' |
+                            [0-9\u00B7\u0300-\u036F\u203F-\u2040];
     IRI_CHARS           = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
     LITERAL_QUOTE       = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
-    LANGTAG             = [@] [a-zA-Z]+ ("-"[a-zA-Z0-9]+)*;
+    LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
 
     IRIREF              = [<] IRI_CHARS [>];
-    LITERAL             = LITERAL_QUOTE @lit_data_e ([^]{2} @dtype_s IRIREF | @lang_s LANGTAG)?;
-    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
+    LITERAL             = LITERAL_QUOTE @lit_data_e OPT_WS
+                            ("^^" OPT_WS @dtype_s IRIREF | @lang_s LANGTAG)?;
+    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")*
+                            PN_CHARS)?);
     COMMENT             = "#" .*;
 
 
-    EOL     {
+    EOL {
         printf("End of line.\n");
-        return T_EOL;
-    }
-
-    COMMENT {
-        char *data = strndup (in->tok, YYCURSOR - in->tok);
-
-        printf ("Comment: `%s`\n", data);
-        free (data);
+        ret->type = T_EOL;
 
-        return T_COMMENT;
+        return ret;
     }
 
-    $       {
+    $ {
         printf("End of buffer.\n");
-        return T_EOF;
+        ret->type = T_EOF;
+
+        return ret;
     }
 
     LITERAL {
-        // Tags are not shifted in fill() so shift is calculated manually
-        // from a tag placed at the beginning of the match.
-        //size_t shift = (size_t)s - (size_t)in->tok;
-
-        char *data = strndup (in->tok + 1, lit_data_e - in->tok - 2);
+        size_t size = lit_data_e - in->tok - 1;
+        char *data = malloc (size);
+        memcpy (data, in->tok + 1, size);
+        data [size -1] = '\0';
         printf ("Literal data (%lu): %s\n", strlen(data), data);
-        free (data);
 
         char *datatype = NULL, *lang = NULL;
+
         if (dtype_s) {
-            datatype = strndup (dtype_s + 1, YYCURSOR - dtype_s - 2);
+            size = YYCURSOR - dtype_s - 1;
+            datatype = malloc (size);
+            memcpy (datatype, dtype_s + 1, size);
+            datatype [size - 1] = '\0';
             printf ("datatype (%lu): %s\n", strlen(datatype), datatype);
-            free (datatype);
         }
 
         if (lang_s) {
-            lang = strndup (lang_s, YYCURSOR - lang_s);
+            size = YYCURSOR - lang_s + 1;
+            lang = malloc (size);
+            memcpy (lang, lang_s, size);
+            lang [size - 1] = '\0';
             printf ("lang (%lu): %s\n", strlen (lang), lang);
-            free (lang);
         }
 
-        //LSUP_Term lit = LSUP_term_new (data, datatype, lang);
-        return T_LITERAL;
+        ret->type = T_LITERAL;
+        ret->term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
+
+        free (data);
+        if (datatype) free (datatype);
+        if (lang) free (lang);
+
+        return ret;
     }
 
-    IRIREF  {
-        //printf ("URI data (%lu): %s\n", iri_data_e - iri_data_s, iri_data_s);
+    IRIREF {
+        size_t size = YYCURSOR - in->tok - 1;
+        char *data = malloc (size);
+        memcpy (data, in->tok + 1, size);
+        data [size - 1] = '\0';
 
-        // NOTE: UTF-8 safe, but not UTF-16 safe because of \x00 code points.
-        char *data = strndup (in->tok + 1, YYCURSOR - in->tok - 2);
         printf ("URI data (%lu): %s\n", strlen (data), data);
-        //LSUP_Term uri = LSUP_uri_new (data);
+
+        ret->type = T_IRIREF;
+        ret->term = LSUP_uri_new (data);
+
         free (data);
 
-        return T_IRIREF;
+        return ret;
     }
 
-    BNODE   {
-        char *data = strndup (in->tok + 2, YYCURSOR - in->tok - 2);
+    BNODE {
+        size_t size = YYCURSOR - in->tok - 1;   // Label length + NUL.
+        char *data = malloc (size);
+        memcpy (data, in->tok + 2, size - 1);   // Label only; skip `_:`.
+        data [size - 1] = '\0';
+
+        printf ("BNode data (%zu): %s\n", strlen(data), data);
+
+        ret->type = T_BNODE;                    // Blank node, not an IRI.
+        ret->term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
+
         free (data);
-        return T_BNODE;
+
+        return ret;
     }
 
-    DOT     {
+    DOT {
         printf ("End of triple.\n");
 
-        return T_DOT;
+        ret->type = T_DOT;
+
+        return ret;
     }
 
-    SEP     { printf("Separator.\n"); return T_SEP; }
+    WS {
+        printf("Separator.\n");
 
-    *       {
-        printf ("Invalid token @ %lu: %s (\\x%x)\n", YYCURSOR - in->buf - 1, in->tok, *in->tok);
-        return -1;
+        ret->type = T_SEP;
+
+        return ret;
+    }
+
+    COMMENT {
+        size_t size = YYCURSOR - in->tok + 1;
+        char *data = malloc (size);
+        memcpy (data, in->tok, size);
+        data [size - 1] = '\0';
+        printf ("Comment: `%s`\n", data);
+        free (data);
+
+        ret->type = T_COMMENT;
+
+        return ret;
+    }
+
+    * {
+        printf (
+            "Invalid token @ %lu: %s (\\x%x)\n",
+            YYCURSOR - in->buf - 1, in->tok, *in->tok);
+
+        return ret;
     }
 
     */
 }
 
-int main()
+int main(int argc, char *argv[])
 {
     Input input;
 
-    FILE *fh = fopen ("test2.nt", "r");
+    if (argc != 2) {
+        fprintf (stderr, "One argument required.\n");
+        return -1;
+    }
+
+    FILE *fh = fopen (argv[1], "r");
+    if (!fh) {
+        fprintf (stderr, "Error opening file.\n");
+        return -1;
+    }
+
     init (&input, fh);
 
-    int tok;
-    while ((tok = lex (&input)) != T_EOF){
-        printf ("Token: %d\n", tok);
-        if (tok == -1) {
+    Token *tok = NULL;
+    while ((tok = lex (&input))->type != T_EOF){
+        printf ("Token #%d\n", tok->type);
+        if (tok->term) LSUP_term_free (tok->term);
+        free (tok);
+        if (tok->type == T_UNDEFINED) {
             printf("Error.\n");
             break;
         }
     };
 
+    if (tok) {
+        if (tok->term) LSUP_term_free (tok->term);
+        free (tok);
+    }
+
     fclose (fh);
     done (&input);