#include "codec/parser_ttl.h"
#include "codec/tokens_ttl.h"


/** @brief TTL is UTF-8 encoded.
 *
 * @sa https://www.w3.org/TeamSubmission/turtle/#sec-grammar
 *
 * `char` should be considered to be UTF-8 throughout this library, however,
 * setting YYCTYPE to char generates case labels outside of the char range.
 */
#define YYCTYPE     uint8_t
#define YYCURSOR    it->cur
#define YYMARKER    it->mar
#define YYLIMIT     it->lim
// Expands to a condition that is true when the buffer refill succeeded.
#define YYFILL      fill(it) == 0


/** @brief Lexer state: the input stream plus a fixed-size window over it.
 *
 * All pointer members point into `buf`.
 */
typedef struct {
    FILE *          fh;                 // Input file handle.
    YYCTYPE         buf[CHUNK_SIZE],    // Start of buffer.
            *       lim,                // Position after the last available
                                        //   input character (YYLIMIT).
            *       cur,                // Next input character to be read
                                        //   (YYCURSOR)
            *       mar,                // Most recent match (YYMARKER)
            *       tok,                // Start of current token.
            *       bol;                // Address of the beginning of the
                                        //   current line (for debugging).
    unsigned        line;               // Current line no. (for debugging).
    unsigned        stmt;               // Current statement.
    bool            eof;                // if we have reached EOF.

    /*!stags:re2c format = "YYCTYPE *@@;"; */
} ParseIterator;


/** @brief Pointer + length pair for a token's data. */
typedef struct {
    YYCTYPE *       data;
    size_t          size;
} ParserToken;


/** @brief Discard consumed input and refill the buffer from the file.
 *
 * Everything before the start of the current token is dropped, the remaining
 * bytes are moved to the start of the buffer, and the space freed is filled
 * with new data read from `it->fh`. The byte at `lim` is set to 0 afterwards.
 *
 * @param[in,out] it Lexer state to refill.
 *
 * @return 0 on success; 1 if EOF had already been reached; 2 if the current
 *  token starts at the beginning of the buffer, so nothing can be discarded
 *  to make room.
 */
static int
fill (ParseIterator *it)
{
    if (it->eof) {
        return 1;
    }

    // Number of bytes preceding the current token, i.e. how far to shift.
    const size_t shift = it->tok - it->buf;
    if (shift < 1) {
        return 2;
    }

    // `shift` is a size_t, hence %zu.
    log_trace ("Shifting bytes: %zu", shift);

    memmove (it->buf, it->tok, it->lim - it->tok);
    it->lim -= shift;
    it->cur -= shift;
    it->mar -= shift;
    it->tok -= shift;
    // NOTE(review): `bol` is not shifted here, so the column computed from
    // `cur - bol` may be off after a refill — confirm whether this matters.

    it->lim += fread (it->lim, 1, shift, it->fh);
    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
    it->lim[0] = 0;

    // A buffer that is not full after reading means the input is exhausted.
    it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;

    return 0;
}


/** @brief Set up a lexer state over an open file and load the first chunk.
 *
 * All cursors start at the last byte of the buffer, so the initial fill()
 * shifts by the whole usable buffer size and reads up to CHUNK_SIZE - 1
 * bytes at the start of the buffer.
 *
 * @param[out] it Lexer state to initialize.
 *
 * @param[in] fh Input file handle.
 */
static void
parse_init (ParseIterator *it, FILE *fh)
{
    it->fh = fh;
    it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE - 1;
    it->line = 1;
    it->stmt = 1;
    it->bol = it->buf;
    it->eof = 0;
    /*!stags:re2c format = "it->@@ = NULL; "; */
    fill (it);
}


// Parser interface. Required here to silence linters.
void *TTLParseAlloc();
void TTLParse();
void TTLParseFree();
void TTLParseTrace();


// Lexer.
/** @brief Scan the next token from the input.
 *
 * @param[in,out] it Lexer state. `tok` is set to the start of the scanned
 *  token and `cur` is advanced past it.
 *
 * @param[out] token_p Reset to NULL on every scan; for value-carrying tokens
 *  it receives the result of unescape_unicode() or uint8_ndup().
 *  NOTE(review): presumably a newly allocated string whose ownership passes
 *  to the caller — confirm against those helpers.
 *
 * @return One of the T_* token types, or -1 on unrecognized input.
 */
static int
lex (ParseIterator *it, YYCTYPE **token_p)
{
loop:
    it->tok = it->cur;
    *token_p = NULL;

    /*!re2c
    re2c:eof = 0;
    re2c:flags:8 = 1;
    re2c:flags:tags = 1;
    re2c:tags:expression = "it->@@";
    re2c:api:style = functions;
    re2c:define:YYFILL:naked = 1;

    // Character classes.
    EOL = [\x0A\x0D];
    NCWS = [\x09\x20] | EOL;
    HEX = [\x30-\x39\x41-\x46];
    CHAR_BASE = "\\u" HEX{4} | "\\U" HEX{8} | '\\' | [\U0000005D-\U0010FFFF];
    CHARACTER = CHAR_BASE | [\x20-\x5B];
    // Prefix start character.
    PSTART_CHAR = [a-zA-Z] | [\u00C0-\u00D6] | [\u00D8-\u00F6] |
        [\u00F8-\u02FF] | [\u0370-\u037D] | [\u037F-\u1FFF] |
        [\u200C-\u200D] | [\u2070-\u218F] | [\u2C00-\u2FEF] |
        [\u3001-\uD7FF] | [\uF900-\uFDCF] | [\uFDF0-\uFFFD] |
        [\U00010000-\U000EFFFF];
    // Name start character.
    NSTART_CHAR = PSTART_CHAR | '_';
    NAME_CHAR = NSTART_CHAR | [0-9\-\u00B7\u0300-\u036F\u203F-\u2040];
    ECHAR = CHARACTER | ([\\] [tnr]);
    UCHAR = (CHAR_BASE | ([\x20-\x5B] \ [>])) | ([\\] [>]);
    SCHAR = (CHAR_BASE | ([\x20-\x5B] \ ["])) | ([\\] ["]);
    LCHAR = ECHAR | ([\\] ["]) | [\t\n\r];

    // Constructs.
    COMMENT = '#' ( [^\x0A\x0D] )*;
    WS = NCWS+ | COMMENT;
    INTEGER = ('-' | '+')? [0-9]+;
    EXPONENT = [eE] INTEGER;
    LANGUAGE = [a-z]+ ('-' [a-z0-9]+)*;
    REL_IRI = UCHAR*;
    IRIREF = '<' REL_IRI '>';
    NAME = NSTART_CHAR NAME_CHAR*;
    PFX = PSTART_CHAR NAME_CHAR* ':';
    LSTRING = [\x22]{3} LCHAR*? [\x22]{3};
    STRING = [\x22] SCHAR* [\x22];
    LANGTAG = '@' LANGUAGE;
    DOUBLE = ('-' | '+') ? ([0-9]+ '.' [0-9]* EXPONENT
        | '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT);
    DECIMAL = ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ );
    BOOLEAN = 'true' | 'false';
    //RDF_TYPE = 'a' / WS;

    $ {
        log_trace ("End of document.");
        return T_EOF;
    }

    EOL {
        it->line ++;
        it->bol = YYCURSOR;
        log_trace ("New line: #%u.", it->line);
        goto loop;
    }

    '@prefix' {
        log_trace ("'@prefix' keyword.");
        return T_PREFIX;
    }

    IRIREF {
        // Strip the enclosing angle brackets.
        *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
        log_trace ("URI data: %s", *token_p);
        return T_IRIREF;
    }

    NCWS 'a' / WS {
        log_trace ("RDF type shorthand 'a'.");
        return T_RDF_TYPE;
    }

    NAME {
        *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
        log_trace ("name: %s", *token_p);
        return T_IDNAME;
    }

    PFX {
        // Drop the trailing colon.
        *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok - 1);
        log_trace ("Prefix name: '%s'", *token_p);
        return T_PFX;
    }

    LSTRING {
        // Strip the triple quotes on both ends.
        *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6);
        log_trace ("Long string: %s", *token_p);
        return T_STRING;
    }

    STRING {
        // Strip the enclosing quotes.
        *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
        log_trace ("String: %s", *token_p);
        return T_STRING;
    }

    WS {
        log_trace ("Whitespace.");
        return T_WS;
    }

    LANGTAG {
        // Skip the leading '@'; the length must shrink by the same amount,
        // otherwise one byte past the end of the token is copied.
        *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 1);
        log_trace ("Lang tag: %s", *token_p);
        return T_LANGTAG;
    }

    INTEGER {
        // Normalize sign.
        size_t offset = *it->tok == '+' ? 1 : 0;
        *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset);
        log_trace ("Integer: %s", *token_p);
        return T_INTEGER;
    }

    DOUBLE {
        // Normalize sign.
        size_t offset = *it->tok == '+' ? 1 : 0;
        *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset);
        log_trace ("Double: %s", *token_p);
        return T_DOUBLE;
    }

    DECIMAL {
        // Normalize sign.
        size_t offset = *it->tok == '+' ? 1 : 0;

        // Normalize trailing zeros in fractional part. The search for the
        // decimal point is bounded to the token, and trimming starts at the
        // last byte of the token rather than at the byte following it.
        size_t size = YYCURSOR - it->tok - offset;
        if (memchr (it->tok, '.', YYCURSOR - it->tok)) {
            while (size > 0 && it->tok[offset + size - 1] == '0') {
                size--;
            }
        }

        *token_p = uint8_ndup (it->tok + offset, size);
        log_trace ("Decimal: %s", *token_p);
        return T_DECIMAL;
    }

    BOOLEAN {
        *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok);
        log_trace ("Boolean: %s", *token_p);
        return T_BOOLEAN;
    }

    '(' { return T_LPAREN; }

    ')' { return T_RPAREN; }

    '[' { return T_LBRACKET; }

    ']' { return T_RBRACKET; }

    ':' { return T_COLON; }

    WS? ';' WS? { return T_SEMICOLON; }

    WS? ',' WS? { return T_COMMA; }

    WS? '.' WS? {
        log_trace ("End of statement #%u.", it->stmt);
        it->stmt++;
        return T_EOS;
    }

    '_:' { return T_BNODE_PFX; }

    '^^' { return T_DTYPE_MARKER; }

    '@base' { return T_BASE; }

    // NOTE(review): WS also matches COMMENT and is listed earlier, so this
    // rule looks unreachable — confirm with re2c warnings.
    COMMENT {
        log_trace ("Comment: `%s`", it->tok);
        goto loop;
    }

    * {
        log_warn (
                "Invalid token @ %lu: %s (\\x%x)",
                (unsigned long)(YYCURSOR - it->buf - 1), it->tok, *it->tok);
        return -1;
    }

    */
}


/** @brief Parse a Turtle document from a file handle into a new graph.
 *
 * Tokens are pulled from the lexer one at a time and fed to the generated
 * parser until T_EOF is seen or the lexer reports an invalid token.
 *
 * @param[in] fh Open input file handle.
 *
 * @param[out] gr_p Receives the new graph on success; set to NULL on entry
 *  and left NULL when a negative return code is produced.
 *
 * @param[out] ct If not NULL, receives the triple count kept in the parser
 *  state.
 *
 * @param[out] err_p Set to NULL on entry. On a lexer error it receives a
 *  message allocated with malloc(); it stays NULL if that allocation fails.
 *
 * @return LSUP_OK if at least one triple was counted; LSUP_NORESULT if none
 *  were; LSUP_VALUE_ERR on a lexer error; LSUP_MEM_ERR if the parser, its
 *  state, the graph or the graph iterator could not be created.
 */
LSUP_rc
LSUP_ttl_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
{
    *err_p = NULL;
    *gr_p = NULL;

    // Everything released at `finally` starts out NULL so that the cleanup
    // code can tell what has actually been acquired.
    LSUP_rc rc;
    void *parser = NULL;
    LSUP_TTLParserState *state = NULL;
    LSUP_Graph *gr = NULL;

    ParseIterator parse_it;
    parse_init (&parse_it, fh);

    parser = TTLParseAlloc (malloc);
    if (UNLIKELY (!parser)) {
        rc = LSUP_MEM_ERR;
        goto finally;
    }

    state = malloc (sizeof (*state));
    if (UNLIKELY (!state)) {
        rc = LSUP_MEM_ERR;
        goto finally;
    }
    state->base = NULL;
    state->ct = 0;
    state->it = NULL;
    state->nsm = LSUP_nsmap_new();
    // TODO add basic NS, critically xsd: and rdf:

    gr = LSUP_graph_new (
            LSUP_iriref_new (NULL, NULL), LSUP_STORE_HTABLE, NULL,
            state->nsm, 0);
    if (UNLIKELY (!gr)) {
        // NOTE(review): state->nsm is not released on this path; confirm who
        // owns it when graph creation fails.
        rc = LSUP_MEM_ERR;
        goto finally;
    }

    state->it = LSUP_graph_add_init (gr);
    if (UNLIKELY (!state->it)) {
        rc = LSUP_MEM_ERR;
        goto finally;
    }

    YYCTYPE *token;

#ifdef DEBUG
    TTLParseTrace (stdout, "TTL Parser > ");
#endif

    for (;;) {
        int ttype = lex (&parse_it, &token);

        if (ttype == -1) {
            rc = LSUP_VALUE_ERR;

            // At most 15 bytes of the offending input, always terminated.
            char snippet[16];
            snprintf (
                    snippet, sizeof (snippet), "%.15s",
                    (const char *)parse_it.tok);

            const char *err_start = "Parse error near token `";
            char err_info[64];
            snprintf (
                    err_info, sizeof (err_info),
                    "[...]' at line %u, character %ld.\n",
                    parse_it.line, (long)(parse_it.cur - parse_it.bol));

            size_t err_size =
                    strlen (err_start) + sizeof (snippet) + strlen (err_info);
            char *err_str = malloc (err_size);
            if (err_str) {
                snprintf (
                        err_str, err_size, "%s%s%s",
                        err_start, snippet, err_info);
            }
            *err_p = err_str;

            goto finally;
        }

        TTLParse (parser, ttype, token, state);

        if (ttype == T_EOF) break;
    }

    if (ct) *ct = state->ct;

    log_info ("Parsed %u triples.", state->ct);
    log_debug ("Graph size: %lu", LSUP_graph_size (gr));

    rc = state->ct > 0 ? LSUP_OK : LSUP_NORESULT;
    *gr_p = gr;

finally:
    if (parser) {
        // Only flush the parser when setup completed; otherwise its state is
        // missing or only half initialized.
        if (state && state->it) TTLParse (parser, 0, NULL, state);
        TTLParseFree (parser, free);
    }

    if (state) {
        if (state->it) LSUP_graph_add_done (state->it);
        free (state);
    }

    if (rc < 0 && gr) LSUP_graph_free (gr);

    return rc;
}