|
@@ -4,7 +4,9 @@
|
|
|
#include <stdlib.h>
|
|
|
#include <string.h>
|
|
|
|
|
|
-#include "term.h"
|
|
|
+#include "graph.h"
|
|
|
+#include "src/codec/nt_grammar.h"
|
|
|
+
|
|
|
|
|
|
#define YYCTYPE unsigned char
|
|
|
#define YYCURSOR in->cur
|
|
@@ -27,19 +29,6 @@
|
|
|
#define MAX_TOKEN_SIZE 8192
|
|
|
|
|
|
|
|
|
-typedef enum {
|
|
|
- T_UNDEFINED, // Undefined token.
|
|
|
- T_EOF, // end of buffer.
|
|
|
- T_EOL, // End of line.
|
|
|
- T_SEP, // Separator (whitespace).
|
|
|
- T_DOT, // End of triple.
|
|
|
- T_IRIREF, // IRI reference.
|
|
|
- T_LITERAL, // Literal term.
|
|
|
- T_BNODE, // Blank node.
|
|
|
- T_COMMENT, // Comment.
|
|
|
-} TokenType;
|
|
|
-
|
|
|
-
|
|
|
typedef struct {
|
|
|
FILE * file; // Input file handle.
|
|
|
char * buf, // Start of buffer.
|
|
@@ -50,13 +39,14 @@ typedef struct {
|
|
|
// (YYCURSOR)
|
|
|
* mar, // Most recent match (YYMARKER)
|
|
|
* tok; // Start of current token.
|
|
|
+ size_t ct; // Number of parsed triples.
|
|
|
int eof; // if we have reached EOF (T|F)
|
|
|
/*!stags:re2c format = "char *@@;"; */
|
|
|
} Input;
|
|
|
|
|
|
|
|
|
-typedef struct {
|
|
|
- TokenType type; // Token type (enum).
|
|
|
+typedef struct token_t {
|
|
|
+ int type; // Token type (enum).
|
|
|
LSUP_Term * term; // Token data (e.g. char*, Term*)
|
|
|
} Token;
|
|
|
|
|
@@ -87,8 +77,9 @@ static int fill(Input *in)
|
|
|
static void init(Input *in, FILE *file)
|
|
|
{
|
|
|
in->file = file;
|
|
|
- in->buf = malloc(CHUNK_SIZE + 1);
|
|
|
+ in->buf = malloc (CHUNK_SIZE + 1);
|
|
|
in->cur = in->mar = in->tok = in->lim = in->buf + CHUNK_SIZE;
|
|
|
+ in->ct = 0;
|
|
|
in->eof = 0;
|
|
|
/*!stags:re2c format = "in->@@ = NULL; "; */
|
|
|
fill (in);
|
|
@@ -114,16 +105,24 @@ static void done (Input *in)
|
|
|
{ free (in->buf); }
|
|
|
|
|
|
|
|
|
-static Token *lex (Input *in)
|
|
|
+// Parser interface.
|
|
|
+
|
|
|
+void *ParseAlloc();
|
|
|
+void Parse();
|
|
|
+void ParseFree();
|
|
|
+
|
|
|
+
|
|
|
+// Lexer.
|
|
|
+
|
|
|
+static int lex (Input *in, LSUP_Term **term)
|
|
|
{
|
|
|
const char *lit_data_e, *dtype_s, *lang_s;
|
|
|
|
|
|
+loop:
|
|
|
+
|
|
|
in->tok = in->cur;
|
|
|
|
|
|
- // Returned token.
|
|
|
- Token *ret = malloc (sizeof (*ret));
|
|
|
- ret->term = NULL;
|
|
|
- ret->type = T_UNDEFINED;
|
|
|
+ *term = NULL;
|
|
|
|
|
|
/*!re2c
|
|
|
re2c:eof = 0;
|
|
@@ -137,40 +136,46 @@ static Token *lex (Input *in)
|
|
|
// For unresolved and partially resolved inconsistencies of the spec, see
|
|
|
// https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
|
|
|
WS = [\x09\x20]+;
|
|
|
- OPT_WS = WS?;
|
|
|
- EOL = OPT_WS [\x0D\x0A]+;
|
|
|
+ EOL = WS? [\x0D\x0A]+;
|
|
|
DOT = [.];
|
|
|
HEX = [0-9A-Fa-f];
|
|
|
ECHAR = [\\] [tbnrf"'\\];
|
|
|
UCHAR = "\\u" HEX{4} | "\\U" HEX{8};
|
|
|
PN_CHARS_BASE = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
|
|
|
PN_CHARS_U = PN_CHARS_BASE | '_' | ':';
|
|
|
- PN_CHARS = PN_CHARS_U | '-' |
|
|
|
- [0-9\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
+ PN_CHARS = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
IRI_CHARS = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
|
|
|
LITERAL_QUOTE = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
|
|
|
LANGTAG = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
|
|
|
|
|
|
IRIREF = [<] IRI_CHARS [>];
|
|
|
- LITERAL = LITERAL_QUOTE @lit_data_e OPT_WS
|
|
|
- ("^^" OPT_WS @dtype_s IRIREF | @lang_s LANGTAG)?;
|
|
|
- BNODE = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")*
|
|
|
- PN_CHARS)?);
|
|
|
+ LITERAL = LITERAL_QUOTE @lit_data_e WS? ("^^" WS? @dtype_s IRIREF | @lang_s LANGTAG)?;
|
|
|
+ BNODE = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
|
|
|
COMMENT = "#" .*;
|
|
|
|
|
|
|
|
|
EOL {
|
|
|
printf("End of line.\n");
|
|
|
- ret->type = T_EOL;
|
|
|
-
|
|
|
- return ret;
|
|
|
+ return T_EOL;
|
|
|
}
|
|
|
|
|
|
$ {
|
|
|
printf("End of buffer.\n");
|
|
|
- ret->type = T_EOF;
|
|
|
+ return T_EOF;
|
|
|
+ }
|
|
|
+
|
|
|
+ IRIREF {
|
|
|
+ size_t size = YYCURSOR - in->tok - 1;
|
|
|
+ char *data = malloc (size);
|
|
|
+ memcpy (data, in->tok + 1, size);
|
|
|
+ data [size - 1] = '\0';
|
|
|
|
|
|
- return ret;
|
|
|
+ printf ("URI data (%lu): %s\n", strlen (data), data);
|
|
|
+
|
|
|
+ *term = LSUP_uri_new (data);
|
|
|
+ free (data);
|
|
|
+
|
|
|
+ return T_IRIREF;
|
|
|
}
|
|
|
|
|
|
LITERAL {
|
|
@@ -198,30 +203,13 @@ static Token *lex (Input *in)
|
|
|
printf ("lang (%lu): %s\n", strlen (lang), lang);
|
|
|
}
|
|
|
|
|
|
- ret->type = T_LITERAL;
|
|
|
- ret->term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
|
|
|
+ *term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
|
|
|
|
|
|
free (data);
|
|
|
- if (datatype) free (datatype);
|
|
|
- if (lang) free (lang);
|
|
|
+ free (datatype);
|
|
|
+ free (lang);
|
|
|
|
|
|
- return ret;
|
|
|
- }
|
|
|
-
|
|
|
- IRIREF {
|
|
|
- size_t size = YYCURSOR - in->tok - 1;
|
|
|
- char *data = malloc (size);
|
|
|
- memcpy (data, in->tok + 1, size);
|
|
|
- data [size - 1] = '\0';
|
|
|
-
|
|
|
- printf ("URI data (%lu): %s\n", strlen (data), data);
|
|
|
-
|
|
|
- ret->type = T_IRIREF;
|
|
|
- ret->term = LSUP_uri_new (data);
|
|
|
-
|
|
|
- free (data);
|
|
|
-
|
|
|
- return ret;
|
|
|
+ return T_LITERAL;
|
|
|
}
|
|
|
|
|
|
BNODE {
|
|
@@ -232,28 +220,24 @@ static Token *lex (Input *in)
|
|
|
|
|
|
printf ("BNode data (%lu): %s\n", strlen(data), data);
|
|
|
|
|
|
- ret->type = T_IRIREF;
|
|
|
- ret->term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
|
|
|
-
|
|
|
+ *term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
|
|
|
free (data);
|
|
|
|
|
|
- return ret;
|
|
|
+ return T_BNODE;
|
|
|
}
|
|
|
|
|
|
DOT {
|
|
|
printf ("End of triple.\n");
|
|
|
+ in->ct ++;
|
|
|
|
|
|
- ret->type = T_DOT;
|
|
|
-
|
|
|
- return ret;
|
|
|
+ return T_DOT;
|
|
|
}
|
|
|
|
|
|
WS {
|
|
|
printf("Separator.\n");
|
|
|
|
|
|
- ret->type = T_SEP;
|
|
|
-
|
|
|
- return ret;
|
|
|
+ return T_WS;
|
|
|
+ //goto loop;
|
|
|
}
|
|
|
|
|
|
COMMENT {
|
|
@@ -264,9 +248,7 @@ static Token *lex (Input *in)
|
|
|
printf ("Comment: `%s`\n", data);
|
|
|
free (data);
|
|
|
|
|
|
- ret->type = T_COMMENT;
|
|
|
-
|
|
|
- return ret;
|
|
|
+ goto loop;
|
|
|
}
|
|
|
|
|
|
* {
|
|
@@ -274,12 +256,13 @@ static Token *lex (Input *in)
|
|
|
"Invalid token @ %lu: %s (\\x%x)\n",
|
|
|
YYCURSOR - in->buf - 1, in->tok, *in->tok);
|
|
|
|
|
|
- return ret;
|
|
|
+ return -1;
|
|
|
}
|
|
|
|
|
|
*/
|
|
|
}
|
|
|
|
|
|
+
|
|
|
int main(int argc, char *argv[])
|
|
|
{
|
|
|
Input input;
|
|
@@ -297,25 +280,43 @@ int main(int argc, char *argv[])
|
|
|
|
|
|
init (&input, fh);
|
|
|
|
|
|
- Token *tok = NULL;
|
|
|
- while ((tok = lex (&input))->type != T_EOF){
|
|
|
- printf ("Token #%d\n", tok->type);
|
|
|
- if (tok->term) LSUP_term_free (tok->term);
|
|
|
- free (tok);
|
|
|
- if (tok->type == T_UNDEFINED) {
|
|
|
- printf("Error.\n");
|
|
|
+ void *parser = ParseAlloc (malloc);
|
|
|
+
|
|
|
+ LSUP_Graph *gr = LSUP_graph_new (LSUP_STORE_MEM);
|
|
|
+ LSUP_GraphIterator *it = LSUP_graph_add_init (gr);
|
|
|
+ LSUP_Term *term = NULL;
|
|
|
+
|
|
|
+ for (;;) {
|
|
|
+ int ttype = lex (&input, &term);
|
|
|
+
|
|
|
+ if (ttype == -1) {
|
|
|
+ fprintf(stderr, "Parse error.\n");
|
|
|
break;
|
|
|
}
|
|
|
+
|
|
|
+ printf ("Token #%d\n", ttype);
|
|
|
+
|
|
|
+ Parse (parser, ttype, term, it);
|
|
|
+ //if (term) LSUP_term_free (term);
|
|
|
+
|
|
|
+ if (ttype == T_EOF) break;
|
|
|
};
|
|
|
|
|
|
- if (tok) {
|
|
|
- if (tok->term) LSUP_term_free (tok->term);
|
|
|
- free (tok);
|
|
|
- }
|
|
|
+ Parse (parser, 0, NULL, it);
|
|
|
+
|
|
|
+ LSUP_graph_add_done (it);
|
|
|
|
|
|
+ if (term) LSUP_term_free (term);
|
|
|
+
|
|
|
+ ParseFree (parser, free);
|
|
|
fclose (fh);
|
|
|
done (&input);
|
|
|
|
|
|
+ printf ("Parsed %lu triples.\n", input.ct);
|
|
|
+
|
|
|
+ printf ("Graph size: %lu\n", LSUP_graph_size (gr));
|
|
|
+ LSUP_graph_free (gr);
|
|
|
+
|
|
|
return 0;
|
|
|
}
|
|
|
|