|
@@ -29,7 +29,6 @@ typedef struct {
|
|
|
// current line (for debugging).
|
|
|
unsigned line; // Current line no. (for debugging).
|
|
|
unsigned stmt; // Current statement.
|
|
|
- unsigned ct; // Number of parsed triples.
|
|
|
bool eof; // if we have reached EOF.
|
|
|
/*!stags:re2c format = "YYCTYPE *@@;"; */
|
|
|
} ParseIterator;
|
|
@@ -40,7 +39,7 @@ typedef struct {
|
|
|
} ParserToken;
|
|
|
|
|
|
|
|
|
-static int fill(ParseIterator *it)
|
|
|
+static int fill (ParseIterator *it)
|
|
|
{
|
|
|
if (it->eof) {
|
|
|
return 1;
|
|
@@ -49,7 +48,7 @@ static int fill(ParseIterator *it)
|
|
|
if (shift < 1) {
|
|
|
return 2;
|
|
|
}
|
|
|
- log_debug ("Shifting bytes: %lu", shift);
|
|
|
+ log_trace ("Shifting bytes: %lu", shift);
|
|
|
memmove(it->buf, it->tok, it->lim - it->tok);
|
|
|
it->lim -= shift;
|
|
|
it->cur -= shift;
|
|
@@ -70,7 +69,6 @@ static void parse_init (ParseIterator *it, FILE *fh)
|
|
|
it->line = 1;
|
|
|
it->stmt = 1;
|
|
|
it->bol = it->buf;
|
|
|
- it->ct = 0;
|
|
|
it->eof = 0;
|
|
|
/*!stags:re2c format = "it->@@ = NULL; "; */
|
|
|
fill (it);
|
|
@@ -110,17 +108,19 @@ loop:
|
|
|
CHAR_BASE = "\\u" HEX{4} | "\\U" HEX{8} | '\\'
|
|
|
| [\U0000005D-\U0010FFFF];
|
|
|
CHARACTER = CHAR_BASE | [\x20-\x5B];
|
|
|
- NSTART_CHAR = [a-zA-Z_] | [\u00C0-\u00D6] | [\u00D8-\u00F6]
|
|
|
+ // Prefix start character.
|
|
|
+ PSTART_CHAR = [a-zA-Z] | [\u00C0-\u00D6] | [\u00D8-\u00F6]
|
|
|
| [\u00F8-\u02FF] | [\u0370-\u037D] | [\u037F-\u1FFF]
|
|
|
| [\u200C-\u200D] | [\u2070-\u218F] | [\u2C00-\u2FEF]
|
|
|
| [\u3001-\uD7FF] | [\uF900-\uFDCF] | [\uFDF0-\uFFFD]
|
|
|
| [\U00010000-\U000EFFFF];
|
|
|
- NAME_CHAR = NSTART_CHAR | '-'
|
|
|
- | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
- ECHAR = CHARACTER | [\t\n\r];
|
|
|
- UCHAR = (CHAR_BASE | ([\x20-\x5B] \ [\x3E])) | '>';
|
|
|
- SCHAR = (CHAR_BASE | ([\x20-\x5B] \ [\x22])) | '"';
|
|
|
- LCHAR = ECHAR | ["\x09\x0A\x0D];
|
|
|
+ // Name start character.
|
|
|
+ NSTART_CHAR = PSTART_CHAR | '_';
|
|
|
+ NAME_CHAR = NSTART_CHAR | [0-9\-\u00B7\u0300-\u036F\u203F-\u2040];
|
|
|
+ ECHAR = CHARACTER | ([\\] [tnr]);
|
|
|
+ UCHAR = (CHAR_BASE | ([\x20-\x5B] \ [>])) | ([\\] [>]);
|
|
|
+ SCHAR = (CHAR_BASE | ([\x20-\x5B] \ ["])) | ([\\] ["]);
|
|
|
+ LCHAR = ECHAR | ([\\] ["]) | [\t\n\r];
|
|
|
|
|
|
// Constructs.
|
|
|
COMMENT = '#' ( [^\x0A\x0D] )*;
|
|
@@ -129,78 +129,85 @@ loop:
|
|
|
EXPONENT = [eE] INTEGER;
|
|
|
LANGUAGE = [a-z]+ ('-' [a-z0-9]+)*;
|
|
|
REL_IRI = UCHAR*;
|
|
|
-
|
|
|
- // Token aliases.
|
|
|
IRIREF = '<' REL_IRI '>';
|
|
|
- PFX_NAME = (NSTART_CHAR \ [_]) NAME_CHAR*;
|
|
|
NAME = NSTART_CHAR NAME_CHAR*;
|
|
|
- LSTRING = [\x22]{3} LCHAR [\x22]{3};
|
|
|
- STRING = [\x22] SCHAR [\x22];
|
|
|
+ PFX = PSTART_CHAR NAME_CHAR* ':';
|
|
|
+ LSTRING = [\x22]{3} LCHAR*? [\x22]{3};
|
|
|
+ STRING = [\x22] SCHAR* [\x22];
|
|
|
LANGTAG = '@' LANGUAGE;
|
|
|
DOUBLE = ('-' | '+') ? ([0-9]+ '.' [0-9]* EXPONENT
|
|
|
| '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT);
|
|
|
DECIMAL = ('-' | '+')?
|
|
|
( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ );
|
|
|
BOOLEAN = 'true' | 'false';
|
|
|
- //RDF_TYPE = NCWS 'a' / WS;
|
|
|
+ //RDF_TYPE = 'a' / WS;
|
|
|
|
|
|
|
|
|
$ {
|
|
|
- log_debug ("End of document.");
|
|
|
+ log_trace ("End of document.");
|
|
|
return T_EOF;
|
|
|
}
|
|
|
|
|
|
- '.' {
|
|
|
- log_debug ("End of statement #%u.", it->stmt);
|
|
|
- it->stmt++;
|
|
|
- return T_EOS;
|
|
|
- }
|
|
|
-
|
|
|
EOL {
|
|
|
it->line ++;
|
|
|
it->bol = YYCURSOR;
|
|
|
- log_debug ("New line: #%u.", it->line);
|
|
|
+ log_trace ("New line: #%u.", it->line);
|
|
|
goto loop;
|
|
|
}
|
|
|
|
|
|
+ '@prefix' {
|
|
|
+ log_trace ("'@prefix' keyword.");
|
|
|
+ return T_PREFIX;
|
|
|
+ }
|
|
|
+
|
|
|
IRIREF {
|
|
|
*token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
|
|
|
- log_debug ("URI data: %s", *token_p);
|
|
|
+ log_trace ("URI data: %s", *token_p);
|
|
|
|
|
|
return T_IRIREF;
|
|
|
}
|
|
|
|
|
|
- PFX_NAME {
|
|
|
- *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("Prefix name: %s", *token_p);
|
|
|
-
|
|
|
- return T_PFX_NAME;
|
|
|
+ NCWS 'a' / WS {
|
|
|
+ log_trace ("RDF type shorthand 'a'.");
|
|
|
+ return T_RDF_TYPE;
|
|
|
}
|
|
|
|
|
|
NAME {
|
|
|
- *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("name: %s", *token_p);
|
|
|
+ *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
|
|
|
+ log_trace ("name: %s", *token_p);
|
|
|
|
|
|
return T_IDNAME;
|
|
|
}
|
|
|
|
|
|
+ PFX {
|
|
|
+ *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok - 1);
|
|
|
+ log_trace ("Prefix name: '%s'", *token_p);
|
|
|
+
|
|
|
+ return T_PFX;
|
|
|
+ }
|
|
|
+
|
|
|
LSTRING {
|
|
|
- *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 4);
|
|
|
- log_debug ("Long string: %s", *token_p);
|
|
|
+ *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6);
|
|
|
+ log_trace ("Long string: %s", it->tok);
|
|
|
|
|
|
return T_STRING;
|
|
|
}
|
|
|
|
|
|
STRING {
|
|
|
*token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
|
|
|
- log_debug ("Long string: %s", *token_p);
|
|
|
+ log_trace ("Long string: %s", *token_p);
|
|
|
|
|
|
return T_STRING;
|
|
|
}
|
|
|
|
|
|
+ WS {
|
|
|
+ log_trace ("Whitespace.");
|
|
|
+ return T_WS;
|
|
|
+ }
|
|
|
+
|
|
|
LANGTAG {
|
|
|
- *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("Lang tag: %s", *token_p);
|
|
|
+ *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok);
|
|
|
+ log_trace ("Lang tag: %s", *token_p);
|
|
|
|
|
|
return T_LANGTAG;
|
|
|
}
|
|
@@ -209,8 +216,8 @@ loop:
|
|
|
// Normalize sign.
|
|
|
size_t offset = *it->tok == '+' ? 1 : 0;
|
|
|
|
|
|
- *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("Integer: %s", *token_p);
|
|
|
+ *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset);
|
|
|
+ log_trace ("Integer: %s", *token_p);
|
|
|
|
|
|
return T_INTEGER;
|
|
|
}
|
|
@@ -219,8 +226,8 @@ loop:
|
|
|
// Normalize sign.
|
|
|
size_t offset = *it->tok == '+' ? 1 : 0;
|
|
|
|
|
|
- *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("Integer: %s", *token_p);
|
|
|
+ *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset);
|
|
|
+ log_trace ("Integer: %s", *token_p);
|
|
|
|
|
|
return T_DOUBLE;
|
|
|
}
|
|
@@ -230,49 +237,56 @@ loop:
|
|
|
YYCTYPE offset = *it->tok == '+' ? 1 : 0;
|
|
|
|
|
|
// Normalize trailing zeros in fractional part.
|
|
|
- size_t size = YYCURSOR - it->tok - 1;
|
|
|
+ size_t size = YYCURSOR - it->tok - offset;
|
|
|
if (strchr ((char *)it->tok, '.'))
|
|
|
- for (YYCTYPE *i = YYCURSOR; *i == '0'; i--) *i = '\0';
|
|
|
+ for (YYCTYPE *i = YYCURSOR; *i == '0'; i--) size--;
|
|
|
|
|
|
*token_p = uint8_ndup (it->tok + offset, size);
|
|
|
- log_debug ("Integer: %s", *token_p);
|
|
|
+ log_trace ("Integer: %s", *token_p);
|
|
|
|
|
|
return T_DECIMAL;
|
|
|
}
|
|
|
|
|
|
BOOLEAN {
|
|
|
- *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok - 1);
|
|
|
- log_debug ("Boolean: %s", *token_p);
|
|
|
+ *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok);
|
|
|
+ log_trace ("Boolean: %s", *token_p);
|
|
|
|
|
|
return T_BOOLEAN;
|
|
|
}
|
|
|
|
|
|
'(' { return T_LPAREN; }
|
|
|
+
|
|
|
')' { return T_RPAREN; }
|
|
|
+
|
|
|
'[' { return T_LBRACKET; }
|
|
|
+
|
|
|
']' { return T_RBRACKET; }
|
|
|
- ';' { return T_SEMICOLON; }
|
|
|
- ',' { return T_COMMA; }
|
|
|
+
|
|
|
':' { return T_COLON; }
|
|
|
- 'a' { return T_RDF_TYPE; }
|
|
|
+
|
|
|
+ WS? ';' WS? { return T_SEMICOLON; }
|
|
|
+
|
|
|
+ WS? ',' WS? { return T_COMMA; }
|
|
|
+
|
|
|
+ WS? '.' WS? {
|
|
|
+ log_trace ("End of statement #%u.", it->stmt);
|
|
|
+ it->stmt++;
|
|
|
+ return T_EOS;
|
|
|
+ }
|
|
|
+
|
|
|
'_:' { return T_BNODE_PFX; }
|
|
|
+
|
|
|
'^^' { return T_DTYPE_MARKER; }
|
|
|
+
|
|
|
'@base' {return T_BASE; }
|
|
|
- '@prefix' {return T_PREFIX; }
|
|
|
|
|
|
COMMENT {
|
|
|
- size_t size = YYCURSOR - it->tok + 1;
|
|
|
- YYCTYPE *data = malloc (size);
|
|
|
- memcpy (data, it->tok, size);
|
|
|
- data [size - 1] = '\0';
|
|
|
- log_debug ("Comment: `%s`", data);
|
|
|
- free (data);
|
|
|
-
|
|
|
+ log_trace ("Comment: `%s`", it->tok);
|
|
|
goto loop;
|
|
|
}
|
|
|
|
|
|
* {
|
|
|
- log_debug (
|
|
|
+ log_warn (
|
|
|
"Invalid token @ %lu: %s (\\x%x)",
|
|
|
YYCURSOR - it->buf - 1, it->tok, *it->tok);
|
|
|
|
|
@@ -302,6 +316,7 @@ LSUP_ttl_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
|
|
|
goto finally;
|
|
|
}
|
|
|
state->base = NULL;
|
|
|
+ state->ct = 0;
|
|
|
|
|
|
state->nsm = LSUP_nsmap_new();
|
|
|
// TODO add basic NS, critically xsd: and rdf:
|
|
@@ -350,12 +365,12 @@ LSUP_ttl_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
|
|
|
if (ttype == T_EOF) break;
|
|
|
};
|
|
|
|
|
|
- if (ct) *ct = parse_it.ct;
|
|
|
+ if (ct) *ct = state->ct;
|
|
|
|
|
|
- log_info ("Parsed %u triples.", parse_it.ct);
|
|
|
+ log_info ("Parsed %u triples.", state->ct);
|
|
|
log_debug ("Graph size: %lu", LSUP_graph_size (gr));
|
|
|
|
|
|
- rc = parse_it.ct > 0 ? LSUP_OK : LSUP_NORESULT;
|
|
|
+ rc = state->ct > 0 ? LSUP_OK : LSUP_NORESULT;
|
|
|
*gr_p = gr;
|
|
|
|
|
|
finally:
|