#include "codec/parser_ttl.h"
#include "codec/tokens_ttl.h"


/** @brief TTL is UTF-8 encoded.
 *
 * @sa https://www.w3.org/TeamSubmission/turtle/#sec-grammar
 *
 * `char` should be considered to be UTF-8 throughout this library, however,
 * setting YYCTYPE to char generates case labels outside of the char range.
 */
#define YYCTYPE     uint8_t
#define YYCURSOR    it->cur
#define YYMARKER    it->mar
#define YYLIMIT     it->lim
#define YYFILL      fill(it) == 0


/** @brief Lexer state: a fixed-size window over the input stream.
 *
 * All pointer members point into `buf`. The last byte of `buf` is reserved
 * for the 0 sentinel that #fill() writes at `lim`.
 */
typedef struct {
    FILE *          fh;                 // Input file handle.
    YYCTYPE         buf[CHUNK_SIZE],    // Start of buffer.
            *       lim,                // Position after the last available
                                        //   input character (YYLIMIT).
            *       cur,                // Next input character to be read
                                        //   (YYCURSOR)
            *       mar,                // Most recent match (YYMARKER)
            *       tok,                // Start of current token.
            *       bol;                // Address of the beginning of the
                                        //   current line (for debugging).
    unsigned        line;               // Current line no. (for debugging).
    unsigned        stmt;               // Current statement.
    bool            eof;                // if we have reached EOF.
    /*!stags:re2c format = "YYCTYPE *@@;"; */
} ParseIterator;

/** @brief Token payload: a byte string and its size. */
typedef struct {
    YYCTYPE *       data;
    size_t          size;
} ParserToken;


/** @brief Refill the lexer buffer from the input file.
 *
 * Everything before the start of the current token is discarded: the bytes
 * from `tok` to `lim` are moved to the start of the buffer, all the position
 * pointers (and re2c tags) are rebased, and the freed space is filled with
 * fresh input. A 0 sentinel is written at the new limit.
 *
 * @param[in,out] it Lexer state.
 *
 * @return 0 on success (this is what the YYFILL macro tests for); 1 if EOF
 *  had already been reached; 2 if the current token starts at the beginning
 *  of the buffer, i.e. there is no room to shift.
 */
static int
fill (ParseIterator *it)
{
    if (it->eof) return 1;

    // Number of bytes that can be dropped from the front of the buffer.
    const size_t shift = it->tok - it->buf;
    if (shift < 1) return 2;

    // size_t is not necessarily unsigned long: cast to match the specifier.
    log_trace ("Shifting bytes: %lu", (unsigned long) shift);

    // Keep the partially scanned token; source and destination may overlap.
    memmove (it->buf, it->tok, it->lim - it->tok);
    it->lim -= shift;
    it->cur -= shift;
    it->mar -= shift;
    it->tok -= shift;
    // NOTE(review): `bol` is not rebased here, so a column computed as
    // `cur - bol` after a refill refers to a stale position. Confirm whether
    // this matters beyond debugging output.

    // Top up the buffer. A short read marks EOF further down.
    it->lim += fread (it->lim, 1, shift, it->fh);
    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
    it->lim[0] = 0;  // Sentinel.
    it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;

    return 0;
}


/** @brief Initialize the lexer state and load the first chunk of input.
 *
 * All the position pointers start at the last byte of the buffer so that the
 * first #fill() call treats the whole buffer as free space.
 *
 * @param[out] it Lexer state to initialize.
 * @param[in] fh Open file handle to read from.
 */
static void
parse_init (ParseIterator *it, FILE *fh)
{
    it->fh = fh;
    it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE - 1;
    it->line = 1;
    it->stmt = 1;
    it->bol = it->buf;
    it->eof = false;
    /*!stags:re2c format = "it->@@ = NULL; "; */

    // The return value carries no information here: `eof` was just cleared
    // and `tok` is not at the start of the buffer.
    (void) fill (it);
}


/** @brief Register a line break at the current cursor position.
 *
 * Increments the line counter and records the cursor as the start of the
 * new line.
 *
 * @param[in,out] it Lexer state.
 */
static inline void
newline (ParseIterator *it)
{
    it->line ++;
    it->bol = YYCURSOR;
    log_trace ("New line: #%u.", it->line);
}


// Parser interface. Required here to silence linters.
void *TTLParseAlloc(); void TTLParse(); void TTLParseFree(); #ifdef DEBUG void TTLParseTrace(); #endif // Lexer. static int lex (ParseIterator *it, YYCTYPE **token_p) { const YYCTYPE *pfx; /*!re2c re2c:eof = 0; re2c:flags:8 = 1; re2c:flags:tags = 1; re2c:tags:expression = "it->@@"; re2c:api:style = functions; re2c:define:YYFILL:naked = 1; // Character classes. EOL = [\n\r]; DIG = [0-9]; HEX = [\x30-\x39\x41-\x46]; CHAR_BASE = "\\u" HEX{4} | "\\U" HEX{8} | '\\' | [\U0000005D-\U0010FFFF]; CHARACTER = CHAR_BASE | [\x20-\x5B]; // Prefix start character. PSTART_CHAR = [a-zA-Z] | [\u00C0-\u00D6] | [\u00D8-\u00F6] | [\u00F8-\u02FF] | [\u0370-\u037D] | [\u037F-\u1FFF] | [\u200C-\u200D] | [\u2070-\u218F] | [\u2C00-\u2FEF] | [\u3001-\uD7FF] | [\uF900-\uFDCF] | [\uFDF0-\uFFFD] | [\U00010000-\U000EFFFF]; // Name start character. NSTART_CHAR = PSTART_CHAR | '_'; NAME_CHAR = NSTART_CHAR | [0-9\-\u00B7\u0300-\u036F\u203F-\u2040]; ECHAR = CHARACTER | ([\\] [tnr]); UCHAR = (CHAR_BASE | ([\x20-\x5B] \ [>])) | ([\\] [>]); SCHAR = (CHAR_BASE | ([\x20-\x5B] \ ["])) | ([\\] ["]); LCHAR = ECHAR | ([\\] ["]) | [\t\n\r]; // Constructs. COMMENT = '#' [^\n\r]*; WS = ([\t\x20] | EOL | COMMENT)+; INTEGER = [-+]? DIG+; EXPONENT = [eE] INTEGER; DOUBLE = [-+]? (DIG+ '.' DIG* EXPONENT | '.'? DIG+ EXPONENT); DECIMAL = [-+]? (DIG+ '.' DIG* | '.'? DIG+); */ loop: // Start new token. it->tok = it->cur; *token_p = NULL; /*!re2c * { log_warn ( "Invalid token @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } $ { log_trace ("End of document."); return T_EOF; } EOL { newline (it); goto loop; } [\x22]{3} { goto lchar; } [\x22] { goto schar; } "true" | "false" { *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok); log_trace ("Boolean: %s", *token_p); return T_BOOLEAN; } '<' UCHAR* '>' { *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 2); log_trace ("URI data: %s", *token_p); return T_IRIREF; } '@prefix' WS @pfx (PSTART_CHAR NAME_CHAR*)? 
":" { *token_p = uint8_ndup (pfx, YYCURSOR - pfx - 1); log_trace ("Prefix declaration: '%s'", *token_p); return T_PREFIX; } '@base' { log_trace ("'@base' keyword."); return T_BASE; } (PSTART_CHAR NAME_CHAR*)? ":" (NSTART_CHAR NAME_CHAR*)? { *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok); log_trace ("ID name: %s", *token_p); return T_QNAME; } '_:' NSTART_CHAR NAME_CHAR* { *token_p = uint8_ndup (it->tok + 2, YYCURSOR - it->tok - 2); log_trace ("BNode name: %s", *token_p); return T_BNODE_ID; } COMMENT { log_trace ("Comment: `%s`", it->tok); goto loop; } WS { uint8_t *ws = uint8_ndup (it->tok, YYCURSOR - it->tok); log_trace ("Whitespace: '%s'", ws); // Count newlines in mixed whitespace. // That's not great because it scans through the whole whitespace again // but it's the simplest and safest. for (size_t i = 0; i < strlen ((char *)ws); i++) if (ws[i] == '\n' || ws[i] == '\r') newline (it); free (ws); return T_WS; } '@' [a-z]+ ('-' [a-zA-Z0-9]+)* { *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 1); log_trace ("Lang tag: '%s'", *token_p); return T_LANGTAG; } INTEGER { // Normalize sign. size_t offset = *it->tok == '+' ? 1 : 0; *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset); log_trace ("Integer: %s", *token_p); return T_INTEGER; } DOUBLE { // Normalize sign. size_t offset = *it->tok == '+' ? 1 : 0; *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset); log_trace ("Integer: %s", *token_p); return T_DOUBLE; } DECIMAL { // Normalize sign. YYCTYPE offset = *it->tok == '+' ? 1 : 0; // Normalize trailing zeros in fractional part. size_t size = YYCURSOR - it->tok - offset; if (strchr ((char *)it->tok, '.')) for (YYCTYPE *i = YYCURSOR; *i == '0'; i--) size--; *token_p = uint8_ndup (it->tok + offset, size); log_trace ("Integer: %s", *token_p); return T_DECIMAL; } '(' WS? { return T_LPAREN; } WS? ')' { return T_RPAREN; } '[' WS? { return T_LBRACKET; } WS? ']' { return T_RBRACKET; } ':' { return T_COLON; } WS? ';' WS? 
{ log_trace ("End of object list."); return T_SEMICOLON; } WS? ',' WS? { return T_COMMA; } WS? '.' { log_trace ("End of statement #%u.", it->stmt); it->stmt++; return T_PERIOD; } '^^' { return T_DTYPE_MARKER; } "a" { log_trace ("RDF type shorthand 'a'."); return T_RDF_TYPE; } */ schar: /*!re2c * { log_warn ( "Invalid token in string @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } $ { log_warn ("Unterminated string!"); return -1; } SCHAR { goto schar; } [\x22] { *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2); log_trace ("String: %s", *token_p); return T_STRING; } */ lchar: /*!re2c $ { log_warn ("Unterminated long string!"); return -1; } LCHAR { goto lchar; } [\x22]{3} { *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6); log_trace ("Long string: %s", it->tok); return T_STRING; } * { log_warn ( "Invalid token in long string @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } */ } LSUP_rc LSUP_ttl_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p) { *err_p = NULL; *gr_p = NULL; LSUP_TTLParserState *state = malloc (sizeof (*state)); if (UNLIKELY (!state)) return LSUP_MEM_ERR; state->base = NULL; state->ct = 0; state->rc = LSUP_NORESULT; ParseIterator parse_it; parse_init (&parse_it, fh); void *parser = TTLParseAlloc (malloc); // TODO add basic NS, critically xsd: and rdf: LSUP_Graph *gr = LSUP_graph_new (NULL, NULL, NULL); if (UNLIKELY (!gr)) return LSUP_MEM_ERR; state->it = LSUP_graph_add_init (gr); if (UNLIKELY (!state->it)) { LSUP_graph_free (gr); return LSUP_MEM_ERR; } YYCTYPE *token; #ifdef DEBUG TTLParseTrace (stdout, "TTL Parser > "); #endif for (;;) { int ttype = lex (&parse_it, &token); if (ttype == -1) { char err_token[16] = {'\0'}; strncpy (err_token, (const char *)parse_it.tok, 15); char *err_start = "Lexer error near token `"; char err_info [64]; sprintf( err_info, "[...]' at line %u, character %ld.\n", parse_it.line, parse_it.cur - parse_it.bol); size_t 
err_size = strlen (err_start) + 16 + strlen(err_info); char *err_str = malloc (err_size); sprintf (err_str, "%s%s%s", err_start, err_token, err_info); log_error (err_str); state->rc = LSUP_PARSE_ERR; *err_p = err_str; goto finally; } TTLParse (parser, ttype, token, state); if (ttype == T_EOF) break; }; if (ct) *ct = state->ct; log_info ("Parsed %u triples.", state->ct); log_debug ("Graph size: %lu", LSUP_graph_size (gr)); *gr_p = gr; finally: LSUP_rc rc = state->rc; log_trace ("rc is %d", rc); TTLParseFree (parser, free); LSUP_graph_add_done (state->it); LSUP_term_free (state->base); free (state); if (rc < 0) LSUP_graph_free (gr); return rc; }