|
@@ -1,11 +1,10 @@
|
|
|
-#include <assert.h>
|
|
|
#include <errno.h>
|
|
|
#include <stdint.h>
|
|
|
#include <stdio.h>
|
|
|
#include <stdlib.h>
|
|
|
#include <string.h>
|
|
|
|
|
|
-//#include "term.h"
|
|
|
+#include "term.h"
|
|
|
|
|
|
#define YYCTYPE unsigned char
|
|
|
#define YYCURSOR in->cur
|
|
@@ -29,6 +28,7 @@
|
|
|
|
|
|
|
|
|
typedef enum {
|
|
|
+ T_UNDEFINED, // Undefined token.
|
|
|
T_EOF, // end of buffer.
|
|
|
T_EOL, // End of line.
|
|
|
T_SEP, // Separator (whitespace).
|
|
@@ -37,7 +37,7 @@ typedef enum {
|
|
|
T_LITERAL, // Literal term.
|
|
|
T_BNODE, // Blank node.
|
|
|
T_COMMENT, // Comment.
|
|
|
-} Token;
|
|
|
+} TokenType;
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
@@ -55,6 +55,12 @@ typedef struct {
|
|
|
} Input;
|
|
|
|
|
|
|
|
|
+typedef struct {
|
|
|
+ TokenType type; // Token type (enum).
|
|
|
+ LSUP_Term * term; // Token data (e.g. char*, Term*)
|
|
|
+} Token;
|
|
|
+
|
|
|
+
|
|
|
static int fill(Input *in)
|
|
|
{
|
|
|
if (in->eof) {
|
|
@@ -71,7 +77,7 @@ static int fill(Input *in)
|
|
|
in->mar -= shift;
|
|
|
in->tok -= shift;
|
|
|
in->lim += fread(in->lim, 1, shift, in->file);
|
|
|
- /*!stags:re2c format = "if (in->@@) in->@@ -= shift;"; */
|
|
|
+ /*!stags:re2c format = "if (in->@@) in->@@ -= shift; "; */
|
|
|
in->lim[0] = 0;
|
|
|
in->eof |= in->lim < in->buf + CHUNK_SIZE;
|
|
|
return 0;
|
|
@@ -84,6 +90,7 @@ static void init(Input *in, FILE *file)
|
|
|
in->buf = malloc(CHUNK_SIZE + 1);
|
|
|
in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
|
|
|
in->eof = 0;
|
|
|
+ /*!stags:re2c format = "in->@@ = NULL; "; */
|
|
|
fill (in);
|
|
|
}
|
|
|
|
|
@@ -107,12 +114,17 @@ static void done (Input *in)
|
|
|
{ free (in->buf); }
|
|
|
|
|
|
|
|
|
-static int lex (Input *in)
|
|
|
+static Token *lex (Input *in)
|
|
|
{
|
|
|
const char *lit_data_e, *dtype_s, *lang_s;
|
|
|
|
|
|
in->tok = in->cur;
|
|
|
|
|
|
+ // Returned token.
|
|
|
+ Token *ret = malloc (sizeof (*ret));
|
|
|
+ ret->term = NULL;
|
|
|
+ ret->type = T_UNDEFINED;
|
|
|
+
|
|
|
/*!re2c
|
|
|
re2c:eof = 0;
|
|
|
re2c:flags:8 = 1;
|
|
@@ -122,121 +134,185 @@ static int lex (Input *in)
|
|
|
re2c:define:YYFILL:naked = 1;
|
|
|
|
|
|
|
|
|
- EOL = [\r]?[\n];
|
|
|
- SEP = [\x09\x20]+;
|
|
|
+ // For unresolved and partially resolved inconsistencies of the spec, see
|
|
|
+ // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
|
|
|
+ WS = [\x09\x20]+;
|
|
|
+ OPT_WS = WS?;
|
|
|
+ EOL = OPT_WS [\x0D\x0A]+;
|
|
|
DOT = [.];
|
|
|
HEX = [0-9A-Fa-f];
|
|
|
ECHAR = [\\] [tbnrf"'\\];
|
|
|
UCHAR = "\\u" HEX{4} | "\\U" HEX{8};
|
|
|
PN_CHARS_BASE = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
|
|
|
PN_CHARS_U = PN_CHARS_BASE | '_' | ':';
|
|
|
- PN_CHARS = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
+ PN_CHARS = PN_CHARS_U | '-' |
|
|
|
+ [0-9\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
IRI_CHARS = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
|
|
|
LITERAL_QUOTE = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
|
|
|
- LANGTAG = [@] [a-zA-Z]+ ("-"[a-zA-Z0-9]+)*;
|
|
|
+ LANGTAG = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
|
|
|
|
|
|
IRIREF = [<] IRI_CHARS [>];
|
|
|
- LITERAL = LITERAL_QUOTE @lit_data_e ([^]{2} @dtype_s IRIREF | @lang_s LANGTAG)?;
|
|
|
- BNODE = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
|
|
|
+ LITERAL = LITERAL_QUOTE @lit_data_e OPT_WS
|
|
|
+ ("^^" OPT_WS @dtype_s IRIREF | @lang_s LANGTAG)?;
|
|
|
+ BNODE = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")*
|
|
|
+ PN_CHARS)?);
|
|
|
COMMENT = "#" .*;
|
|
|
|
|
|
|
|
|
- EOL {
|
|
|
+ EOL {
|
|
|
printf("End of line.\n");
|
|
|
- return T_EOL;
|
|
|
- }
|
|
|
-
|
|
|
- COMMENT {
|
|
|
- char *data = strndup (in->tok, YYCURSOR - in->tok);
|
|
|
-
|
|
|
- printf ("Comment: `%s`\n", data);
|
|
|
- free (data);
|
|
|
+ ret->type = T_EOL;
|
|
|
|
|
|
- return T_COMMENT;
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
- $ {
|
|
|
+ $ {
|
|
|
printf("End of buffer.\n");
|
|
|
- return T_EOF;
|
|
|
+ ret->type = T_EOF;
|
|
|
+
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
// Literal term: a quoted string with an optional datatype IRI or language
// tag. On allocation failure no term is built and the token keeps the
// T_UNDEFINED type it was initialized with.
LITERAL {
    char *data = NULL, *datatype = NULL, *lang = NULL;
    int ok = 1;

    // in->tok is the opening quote; lit_data_e sits just past the closing
    // one, so the quoted text is two bytes shorter than that span.
    size_t size = lit_data_e - in->tok - 2;
    data = malloc (size + 1);
    if (data) {
        memcpy (data, in->tok + 1, size);
        data [size] = '\0';
        printf ("Literal data (%zu): %s\n", strlen (data), data);
    } else {
        ok = 0;
    }

    if (ok && dtype_s) {
        // dtype_s is the '<' of the datatype IRI and the match ends just
        // past its '>'; both delimiters are dropped.
        size = YYCURSOR - dtype_s - 2;
        datatype = malloc (size + 1);
        if (datatype) {
            memcpy (datatype, dtype_s + 1, size);
            datatype [size] = '\0';
            printf ("datatype (%zu): %s\n", strlen (datatype), datatype);
        } else {
            ok = 0;
        }
    }

    if (ok && lang_s) {
        // lang_s is the '@' that opens the language tag; it is kept in the
        // copy. NOTE(review): confirm LSUP_term_new expects the leading '@'.
        size = YYCURSOR - lang_s;
        lang = malloc (size + 1);
        if (lang) {
            memcpy (lang, lang_s, size);
            lang [size] = '\0';
            printf ("lang (%zu): %s\n", strlen (lang), lang);
        } else {
            ok = 0;
        }
    }

    if (ok) {
        ret->type = T_LITERAL;
        ret->term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
    }

    // The scratch copies are released here in every case.
    free (data);
    free (datatype);
    free (lang);

    return ret;
}
|
|
|
|
|
|
- IRIREF {
|
|
|
- //printf ("URI data (%lu): %s\n", iri_data_e - iri_data_s, iri_data_s);
|
|
|
+ IRIREF {
|
|
|
+ size_t size = YYCURSOR - in->tok - 1;
|
|
|
+ char *data = malloc (size);
|
|
|
+ memcpy (data, in->tok + 1, size);
|
|
|
+ data [size - 1] = '\0';
|
|
|
|
|
|
- // NOTE: UTF-8 safe, but not UTF-16 safe because of \x00 code points.
|
|
|
- char *data = strndup (in->tok + 1, YYCURSOR - in->tok - 2);
|
|
|
printf ("URI data (%lu): %s\n", strlen (data), data);
|
|
|
- //LSUP_Term uri = LSUP_uri_new (data);
|
|
|
+
|
|
|
+ ret->type = T_IRIREF;
|
|
|
+ ret->term = LSUP_uri_new (data);
|
|
|
+
|
|
|
free (data);
|
|
|
|
|
|
- return T_IRIREF;
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
- BNODE {
|
|
|
- char *data = strndup (in->tok + 2, YYCURSOR - in->tok - 2);
|
|
|
+ BNODE {
|
|
|
+ size_t size = YYCURSOR - in->tok - 1;
|
|
|
+ char *data = malloc (size);
|
|
|
+ memcpy (data, in->tok + 2, size);
|
|
|
+ data [size - 1] = '\0';
|
|
|
+
|
|
|
printf ("BNode data (%lu): %s\n", strlen(data), data);
|
|
|
+
|
|
|
+ ret->type = T_IRIREF;
|
|
|
+ ret->term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
|
|
|
+
|
|
|
free (data);
|
|
|
- return T_BNODE;
|
|
|
+
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
- DOT {
|
|
|
+ DOT {
|
|
|
printf ("End of triple.\n");
|
|
|
|
|
|
- return T_DOT;
|
|
|
+ ret->type = T_DOT;
|
|
|
+
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
- SEP { printf("Separator.\n"); return T_SEP; }
|
|
|
+ WS {
|
|
|
+ printf("Separator.\n");
|
|
|
|
|
|
- * {
|
|
|
- printf ("Invalid token @ %lu: %s (\\x%x)\n", YYCURSOR - in->buf - 1, in->tok, *in->tok);
|
|
|
- return -1;
|
|
|
+ ret->type = T_SEP;
|
|
|
+
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ COMMENT {
|
|
|
+ size_t size = YYCURSOR - in->tok + 1;
|
|
|
+ char *data = malloc (size);
|
|
|
+ memcpy (data, in->tok, size);
|
|
|
+ data [size - 1] = '\0';
|
|
|
+ printf ("Comment: `%s`\n", data);
|
|
|
+ free (data);
|
|
|
+
|
|
|
+ ret->type = T_COMMENT;
|
|
|
+
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ * {
|
|
|
+ printf (
|
|
|
+ "Invalid token @ %lu: %s (\\x%x)\n",
|
|
|
+ YYCURSOR - in->buf - 1, in->tok, *in->tok);
|
|
|
+
|
|
|
+ return ret;
|
|
|
}
|
|
|
|
|
|
*/
|
|
|
}
|
|
|
|
|
|
-int main()
|
|
|
+int main(int argc, char *argv[])
|
|
|
{
|
|
|
Input input;
|
|
|
|
|
|
- FILE *fh = fopen ("test2.nt", "r");
|
|
|
+ if (argc != 2) {
|
|
|
+ fprintf (stderr, "One argument required.\n");
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+
|
|
|
+ FILE *fh = fopen (argv[1], "r");
|
|
|
+ if (!fh) {
|
|
|
+ fprintf (stderr, "Error opening file.\n");
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+
|
|
|
init (&input, fh);
|
|
|
|
|
|
- int tok;
|
|
|
- while ((tok = lex (&input)) != T_EOF){
|
|
|
- printf ("Token: %d\n", tok);
|
|
|
- if (tok == -1) {
|
|
|
+ Token *tok = NULL;
|
|
|
+ while ((tok = lex (&input))->type != T_EOF){
|
|
|
+ printf ("Token #%d\n", tok->type);
|
|
|
+ if (tok->term) LSUP_term_free (tok->term);
|
|
|
+ free (tok);
|
|
|
+ if (tok->type == T_UNDEFINED) {
|
|
|
printf("Error.\n");
|
|
|
break;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
+ if (tok) {
|
|
|
+ if (tok->term) LSUP_term_free (tok->term);
|
|
|
+ free (tok);
|
|
|
+ }
|
|
|
+
|
|
|
fclose (fh);
|
|
|
done (&input);
|
|
|
|