Browse Source

Prototype of re2c lexer.

Stefano Cossu 4 years ago
parent
commit
06f351face
3 changed files with 269 additions and 0 deletions
  1. 245 0
      src/codec/nt.re
  2. 15 0
      src/codec/test.nt
  3. 9 0
      src/codec/test2.nt

+ 245 - 0
src/codec/nt.re

@@ -0,0 +1,245 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+//#include "term.h"
+
+#define YYCTYPE     unsigned char   // Type of one input code unit.
+#define YYCURSOR    in->cur         // Needs a variable named `in` in scope.
+#define YYMARKER    in->mar         // Same: expands to a member of `in`.
+#define YYLIMIT     in->lim         // Same: expands to a member of `in`.
+#define YYFILL      fill(in) == 0   // True when fill() returned 0.
+
+/**
+ * Max chunk size passed to scanner at each iteration.
+ *
+ * init() allocates CHUNK_SIZE + 1 bytes for the buffer, and fill() flags EOF
+ * when the data in the buffer ends before CHUNK_SIZE bytes.
+ */
+#define CHUNK_SIZE 256
+
+/* Max possible token size. If a matching pattern is not found, the scanner
+ * keeps pulling data from input until a) a match is unambiguously found, or
+ * not found; or b) EOF is reached; or c) the size of the buffer being searched
+ * exceeds this size. Setting this to 0 disables any limit, which means that a
+ * bad token might consume the whole input and, possibly, exhaust the available
+ * memory and throw an error.
+ *
+ * NOTE(review): nothing in the code visible here reads this macro, so the
+ * limit described above appears not to be enforced yet -- confirm.
+ */
+#define MAX_TOKEN_SIZE 8192
+
+
+typedef enum {     // Codes returned by lex(); lex() also returns -1 (not listed) on error.
+    T_EOF,          // End of input; the main() loop stops on this value.
+    T_EOL,          // End of line (optional CR followed by LF).
+    T_SEP,          // Separator (run of tabs and/or spaces).
+    T_DOT,          // End of triple (a single dot).
+    T_IRIREF,       // IRI reference enclosed in angle brackets.
+    T_LITERAL,      // Quoted literal, optionally with datatype or language tag.
+    T_BNODE,        // Blank node label starting with "_:".
+    T_COMMENT,      // Comment: "#" up to the end of the line.
+} Token;
+
+
+typedef struct {                    // Scanner state shared by init(), fill(), lex() and done().
+    FILE *          file;           // Input file handle; read by fill().
+    char *          buf,            // Start of buffer (allocated in init(),
+                                    //   freed in done()).
+         *          lim,            // Position after the last
+                                    //   available input character
+                                    //   (YYLIMIT)
+         *          cur,            // Next input character to be read
+                                    //   (YYCURSOR)
+         *          mar,            // Most recent match (YYMARKER)
+         *          tok;            // Start of current token.
+    int             eof;            // Nonzero once fill() has seen a short read.
+    /*!stags:re2c format = "char *@@;"; */  // re2c directive: one `char *` member per s-tag.
+} Input;
+
+
+/**
+ * Refill the input buffer; this is what the YYFILL macro calls.
+ *
+ * Drops the bytes that precede the current token, slides the remaining data
+ * to the start of the buffer and reads from the file into the space that was
+ * freed. Every scanner pointer, including the re2c s-tag members, is moved
+ * back by the same amount so that it keeps designating the same input byte.
+ *
+ * @param in Scanner state; buf, tok, cur, mar, lim and file must be set.
+ *
+ * @return 0 if the buffer was shifted and a read was attempted; 1 if EOF was
+ *  already flagged; 2 if the current token starts at the very beginning of
+ *  the buffer, so no space can be freed.
+ */
+static int fill(Input *in)
+{
+    if (in->eof) {
+        return 1;
+    }
+    const size_t shift = in->tok - in->buf;
+    if (shift < 1) {
+        return 2;
+    }
+    // size_t takes %zu; %lu is only right where size_t is unsigned long.
+    printf ("Shifting bytes: %zu\n", shift);
+    memmove(in->buf, in->tok, in->lim - in->tok);
+    in->lim -= shift;
+    in->cur -= shift;
+    in->mar -= shift;
+    in->tok -= shift;
+    // Read into the space that the shift just opened up after the data.
+    in->lim += fread(in->lim, 1, shift, in->file);
+    /*!stags:re2c format = "if (in->@@) in->@@ -= shift;"; */
+    // Terminating NUL right after the data; the buffer has one spare byte.
+    in->lim[0] = 0;
+    // A buffer that is not full after the read marks the end of the input.
+    in->eof |= in->lim < in->buf + CHUNK_SIZE;
+    return 0;
+}
+
+
+/**
+ * Set up a scanner state for reading from an already open file.
+ *
+ * Allocates the buffer and loads the first chunk. The process is terminated
+ * if the buffer cannot be allocated, since this function has no way to
+ * report failure to its caller.
+ *
+ * @param in   Scanner state to initialize; previous contents are discarded.
+ * @param file Open file to read from. It is not closed by done().
+ */
+static void init(Input *in, FILE *file)
+{
+    // Null every member first: fill() tests and adjusts the s-tag pointers
+    // declared in Input, so they must not hold indeterminate values.
+    *in = (Input){0};
+
+    in->file = file;
+    in->buf = malloc(CHUNK_SIZE + 1);
+    if (!in->buf) {
+        perror ("malloc");
+        exit (EXIT_FAILURE);
+    }
+    // Placing all pointers at the end of the buffer makes the first fill()
+    // treat the whole buffer as consumed and read a full chunk into it.
+    in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
+    in->eof = 0;
+    fill (in);
+}
+
+
+// TODO Make buffer extensible if a token is larger than the current buf size.
+/**
+ * Move the input data to a new buffer with CHUNK_SIZE bytes of extra room.
+ *
+ * The data between buf and lim, the terminating byte at lim, and the
+ * positions of cur, mar, tok, lim and the s-tag members relative to the
+ * start of the buffer are all preserved. No input is read.
+ *
+ * TODO Make buffer extensible if a token is larger than the current buf size.
+ * NOTE(review): fill() still sizes its reads and its EOF test with
+ * CHUNK_SIZE, so it has to learn about the larger buffer before this
+ * function can be put to use.
+ *
+ * @param in Scanner state previously set up with init().
+ *
+ * @return 0 on success; ENOMEM if the new buffer cannot be allocated, in
+ *  which case the scanner state is left untouched.
+ */
+static int __attribute__((unused)) extend (Input *in)
+{
+    const size_t used = (size_t) (in->lim - in->buf);
+
+    // A fresh block is used instead of realloc() so that the old block is
+    // still valid while the pointers into it are translated.
+    // The extra byte holds the terminating NUL that fill() stores at lim.
+    char *grown = malloc (used + CHUNK_SIZE + 1);
+    if (!grown) return ENOMEM;
+
+    // Copy the data together with the byte at lim.
+    memcpy (grown, in->buf, used + 1);
+
+    in->cur = grown + (in->cur - in->buf);
+    in->mar = grown + (in->mar - in->buf);
+    in->tok = grown + (in->tok - in->buf);
+    in->lim = grown + used;
+    /*!stags:re2c format = "if (in->@@) in->@@ = grown + (in->@@ - in->buf);"; */
+
+    free (in->buf);
+    in->buf = grown;
+
+    return 0;
+}
+
+
+/**
+ * Release the buffer owned by a scanner state.
+ *
+ * Only the buffer is freed; the file handle is left open.
+ *
+ * @param in Scanner state whose buffer is to be freed.
+ */
+static void done (Input *in)
+{
+    char *owned = in->buf;
+
+    free (owned);
+}
+
+
+/**
+ * Scan the next token from the input and print a debug trace of it.
+ *
+ * The token starts at the current cursor. Term contents (IRI, literal,
+ * datatype, language tag, blank node label, comment) are copied only to be
+ * printed and are freed again before returning.
+ *
+ * @param in Scanner state previously set up with init().
+ *
+ * @return One of the Token values, or -1 if the input at the cursor matches
+ *  no rule or if the matched text cannot be copied for lack of memory.
+ */
+static int lex (Input *in)
+{
+    const char *lit_data_e, *dtype_s, *lang_s;
+
+    in->tok = in->cur;
+
+    /*!re2c
+    re2c:eof = 0;
+    re2c:flags:8 = 1;
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "in->@@";
+    re2c:api:style = functions;
+    re2c:define:YYFILL:naked = 1;
+
+
+    EOL                 = [\r]?[\n];
+    SEP                 = [\x09\x20]+;
+    DOT                 = [.];
+    HEX                 = [0-9A-Fa-f];
+    ECHAR               = [\\] [tbnrf"'\\];
+    UCHAR               = "\\u" HEX{4} | "\\U" HEX{8};
+    PN_CHARS_BASE       = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
+    PN_CHARS_U          = PN_CHARS_BASE | '_' | ':';
+    PN_CHARS            = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
+    IRI_CHARS           = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
+    LITERAL_QUOTE       = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
+    LANGTAG             = [@] [a-zA-Z]+ ("-"[a-zA-Z0-9]+)*;
+
+    IRIREF              = [<] IRI_CHARS [>];
+    LITERAL             = LITERAL_QUOTE @lit_data_e ("^^" @dtype_s IRIREF | @lang_s LANGTAG)?;
+    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
+    COMMENT             = "#" .*;
+
+
+    EOL     {
+        printf("End of line.\n");
+        return T_EOL;
+    }
+
+    COMMENT {
+        char *data = strndup (in->tok, YYCURSOR - in->tok);
+        if (!data) return -1;
+
+        printf ("Comment: `%s`\n", data);
+        free (data);
+
+        return T_COMMENT;
+    }
+
+    $       {
+        printf("End of buffer.\n");
+        return T_EOF;
+    }
+
+    LITERAL {
+        // lit_data_e points just past the closing quote, so the text between
+        // the quotes is two bytes shorter than the span from the token start.
+        char *data = strndup (in->tok + 1, lit_data_e - in->tok - 2);
+        if (!data) return -1;
+        printf ("Literal data (%zu): %s\n", strlen(data), data);
+        free (data);
+
+        // This code relies on dtype_s and lang_s being null whenever the
+        // alternative that carries them did not take part in the match.
+        if (dtype_s) {
+            // dtype_s points at the opening angle bracket of the datatype
+            // IRI; the brackets themselves are left out of the copy.
+            char *datatype = strndup (dtype_s + 1, YYCURSOR - dtype_s - 2);
+            if (!datatype) return -1;
+            printf ("datatype (%zu): %s\n", strlen(datatype), datatype);
+            free (datatype);
+        }
+
+        if (lang_s) {
+            // The copy starts at lang_s, so it includes the leading at-sign.
+            char *lang = strndup (lang_s, YYCURSOR - lang_s);
+            if (!lang) return -1;
+            printf ("lang (%zu): %s\n", strlen (lang), lang);
+            free (lang);
+        }
+
+        //LSUP_Term lit = LSUP_term_new (data, datatype, lang);
+        return T_LITERAL;
+    }
+
+    IRIREF  {
+        // NOTE: UTF-8 safe, but not UTF-16 safe because of \x00 code points.
+        // The enclosing angle brackets are left out of the copy.
+        char *data = strndup (in->tok + 1, YYCURSOR - in->tok - 2);
+        if (!data) return -1;
+        printf ("URI data (%zu): %s\n", strlen (data), data);
+        //LSUP_Term uri = LSUP_uri_new (data);
+        free (data);
+
+        return T_IRIREF;
+    }
+
+    BNODE   {
+        // Skip the two-byte label prefix.
+        char *data = strndup (in->tok + 2, YYCURSOR - in->tok - 2);
+        if (!data) return -1;
+        printf ("BNode data (%zu): %s\n", strlen(data), data);
+        free (data);
+        return T_BNODE;
+    }
+
+    DOT     {
+        printf ("End of triple.\n");
+
+        return T_DOT;
+    }
+
+    SEP     { printf("Separator.\n"); return T_SEP; }
+
+    *       {
+        // The position is an offset into the buffer, not into the file.
+        // The byte is converted to unsigned so that values above 0x7f are
+        // not sign-extended when printed in hexadecimal.
+        printf ("Invalid token @ %td: %s (\\x%x)\n", YYCURSOR - in->buf - 1, in->tok, (unsigned) (unsigned char) *in->tok);
+        return -1;
+    }
+
+    */
+}
+
+/**
+ * Tokenize the sample file "test2.nt" from the current directory, printing
+ * the numeric code of every token until the end of input or the first error.
+ *
+ * @return 0 once scanning has stopped, including when the lexer reported an
+ *  error; EXIT_FAILURE if the sample file cannot be opened.
+ */
+int main(void)
+{
+    FILE *fh = fopen ("test2.nt", "r");
+    if (!fh) {
+        // Without this check a null handle would reach fread() in fill().
+        perror ("test2.nt");
+        return EXIT_FAILURE;
+    }
+
+    Input input;
+    init (&input, fh);
+
+    int tok;
+    while ((tok = lex (&input)) != T_EOF) {
+        printf ("Token: %d\n", tok);
+        if (tok == -1) {
+            printf("Error.\n");
+            break;
+        }
+    }
+
+    fclose (fh);
+    done (&input);
+
+    return 0;
+}
+

+ 15 - 0
src/codec/test.nt

@@ -0,0 +1,15 @@
+<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show"^^<http://www.w3.org/2001/XMLSchema#string> . # literal with XML Schema string datatype
+<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show" . # same as above
+<http://example.org/show/218> <http://example.org/show/localName> "That Seventies Show"@en . # literal with a language tag
+<http://example.org/show/218> <http://example.org/show/localName> "Cette Série des Années Septante"@fr-be .  # literal outside of ASCII range with a region subtag
+<http://example.org/#spiderman> <http://example.org/text> "This is a multi-line\nliteral with many quotes (\"\"\"\"\")\nand two apostrophes ('')." .
+<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/atomicNumber> "2"^^<http://www.w3.org/2001/XMLSchema#integer> . # xsd:integer
+<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/specificGravity> "1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double> .     #xsd:double
+# A comment.
+# Another comment.
+<http://ex.org/longtext> <urn:myns:hasContent> "riverrun, past Eve and Adam's, from swerve of shore to bend of bay, brings us by a commodius vicus of recirculation back to Howth Castle and Environs.\nSir Tristram, violer d'amores, fr'over the short sea, had passencore rearrived from North Armorica on this side the scraggy isthmus of Europe Minor to wielderfight his penisolate war: nor had topsawyer's rocks by the stream Oconee exaggerated themselse to Laurens County's gorgios while they went doublin their mumper all the time: nor avoice from afire bellowsed mishe mishe to tauftauf thuartpeatrick: not yet, though venissoon after, had a kidscad buttended a bland old isaac: not yet, though all's fair in vanessy, were sosie sesthers wroth with twone nathandjoe. Rot a peck of pa's malt had Jhem or Shen brewed by arclight and rory end to the regginbrow was to be seen ringsome on the aquaface.\nThe fall (bababadalgharaghtakamminarronnkonnbronntonnerronntuonnthunntrovarrhounawnskawntoohoohoordenenthurnuk!) of a once wallstrait oldparr is retaled early in bed and later on life down through all christian minstrelsy. The great fall of the offwall entailed at such short notice the pftjschute of Finnegan, erse solid man, that the humptyhillhead of humself prumptly sends an unquiring one well to the west in quest of his tumptytumtoes: and their upturnpikepointandplace is at the knock out in the park where oranges have been laid to rust upon the green since devlinsfirst loved livvy." .
+#<http://example.org/utf8/ქართული ენის შესწავლა და სწავლება> <urn:utf8:語文教學・语文教学> "'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ" .
+#<http://example.org/utf8/\u202d\u202bללמוד וללמד את ה> <urn:utf8:말배우기와 가르치기> "'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ" .
+_:alice <http://xmlns.com/foaf/0.1/knows> _:bob .
+_:bob <http://xmlns.com/foaf/0.1/knows> _:alice .
+

+ 9 - 0
src/codec/test2.nt

@@ -0,0 +1,9 @@
+<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show"^^<http://www.w3.org/2001/XMLSchema#string> . # literal with XML Schema string datatype
+<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show" . # same as above
+<http://example.org/show/218> <http://example.org/show/localName> "That Seventies Show"@en . # literal with a language tag
+<http://example.org/show/218> <http://example.org/show/localName> "Cette Série des Années Septante"@fr-be .  # literal outside of ASCII range with a region subtag
+<http://example.org/#spiderman> <http://example.org/text> "This is a multi-line\nliteral with many quotes (\"\"\"\"\")\nand two apostrophes ('')."^^<http://www.w3.org/2001/XMLSchema#string> .
+<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/atomicNumber> "2"^^<http://www.w3.org/2001/XMLSchema#integer> . # xsd:integer
+<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/specificGravity> "1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double> .     #xsd:double
+# A comment.
+# Another comment.