#include "volksdata/codec/parser_ttl.h"
#include "volksdata/codec/tokens_ttl.h"
//#include "volksdata/codec/parser_common.h"


/** BEGIN duplicate section
 * This section is meant to be kept identical in the NT and TTL lexers: any
 * change made here must be mirrored in the other lexer. The copy in
 * include/volksdata/codec/parser_common.h should be used, but some re2c tags
 * are not being parsed in that location.
 */

/** @brief TTL is UTF-8 encoded.
 *
 * @sa https://www.w3.org/TeamSubmission/turtle/#sec-grammar
 *
 * `char` should be considered to be UTF-8 throughout this library, however,
 * setting YYCTYPE to char generates case labels outside of the char range.
 */
#define YYCTYPE     uint8_t
#define YYCURSOR    it->cur
#define YYMARKER    it->mar
#define YYLIMIT     it->lim
#define YYFILL      fill(it) == 0


/** @brief Lexer state shared by the scanner and the buffer refill routine.
 *
 * In stream mode (`fh` set) `buf` is a heap buffer of `buf_size` bytes whose
 * last slot is reserved for the NUL sentinel. In string mode (`sh` set) `buf`
 * is NULL and the pointers address the caller's string directly.
 */
typedef struct {
    FILE *          fh;         ///< Input file handle.
    const char *    sh;         ///< Input string. Exclusive with fh.
    size_t          buf_size;   ///< Current allocation size of the buffer.
    YYCTYPE *       buf,        ///< Start of buffer.
            *       lim,        ///< Position after the last available
                                ///<   input character (YYLIMIT).
            *       cur,        ///< Next input character to be read (YYCURSOR)
            *       mar,        ///< Most recent match (YYMARKER)
            *       tok,        ///< Start of current token.
            *       bol;        ///< Address of the beginning of the
                                ///<   current line (for debugging).
    unsigned        line;       ///< Current line no. (for debugging).
    unsigned        ct;         ///< Number of statements parsed.
    bool            eof;        ///< if we have reached EOF.
    /*!stags:re2c format = "YYCTYPE *@@;"; */
} ParseIterator;


/** @brief Refill the lexer buffer from the input stream.
 *
 * Bytes that precede the current token are discarded by moving the remaining
 * data to the start of the buffer. If the current token already starts at the
 * beginning of the buffer (i.e. the lexeme is as large as the buffer), the
 * buffer is doubled instead and every pointer into it is rebased.
 *
 * @param[in,out] it Iterator to refill. Only meaningful in stream mode.
 *
 * @return 0 if the buffer was refilled; 1 if EOF had already been reached;
 *  -1 on allocation failure, in which case the existing buffer is left
 *  untouched.
 */
static int
fill (ParseIterator *it)
{
    if (it->eof) {
        return 1;
    }

    const size_t shift = (size_t)(it->tok - it->buf);

    if (shift > 0) {
        // Drop everything before the current token.
        // NOTE(review): `bol` is not moved along with the data, so the column
        // derived from it may be off after a shift; it is only used for
        // diagnostics.
        LOG_DEBUG("Shifting bytes: %zu", shift);
        memmove (it->buf, it->tok, (size_t)(it->lim - it->tok));
        it->lim -= shift;
        it->cur -= shift;
        it->mar -= shift;
        it->tok -= shift;
        /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
    } else {
        // The lexeme occupies the whole buffer: double the capacity.
        // A fresh allocation is used rather than realloc() so that all
        // pointers can be rebased while the old block is still valid, and so
        // that the old buffer survives an allocation failure.
        const size_t new_size = 2 * it->buf_size;
        YYCTYPE *new_buf = malloc (new_size);
        if (!new_buf) {
            log_error ("Memory allocation error.");
            return -1;
        }
        LOG_DEBUG("Growing lexer buffer to %zu bytes.", new_size);
        memcpy (new_buf, it->buf, (size_t)(it->lim - it->buf));
        it->lim = new_buf + (it->lim - it->buf);
        it->cur = new_buf + (it->cur - it->buf);
        it->mar = new_buf + (it->mar - it->buf);
        it->bol = new_buf + (it->bol - it->buf);
        it->tok = new_buf;
        /*!stags:re2c format = "if (it->@@) it->@@ = new_buf + (it->@@ - it->buf); "; */
        free (it->buf);
        it->buf = new_buf;
        it->buf_size = new_size;
    }

    // Free room up to, but excluding, the sentinel slot at the buffer end.
    const size_t avail = it->buf_size - 1 - (size_t)(it->lim - it->buf);
    it->lim += fread (it->lim, 1, avail, it->fh);
    it->lim[0] = 0;
    // A short read means that the stream is exhausted.
    it->eof |= it->lim < it->buf + it->buf_size - 1;

    return 0;
}


/** @brief Initialize parser.
 *
 * @param[in] it iterator handle to be initialized.
 *
 * @param[in] fh Open file handle to read from. This is exclusive with sh. If
 *  both fh and sh are provided, fh has precedence.
 *
 * @param[in] sh String to read from. This is exclusive with fh. A NULL string
 *  with a NULL file handle is treated as an empty string.
 *
 * If the stream buffer cannot be allocated, the error is logged and the
 * iterator is set up as an empty, already exhausted input.
 */
static void
parse_init (ParseIterator *it, FILE *fh, const char *sh)
{
    // Sentinel-only input used when the stream buffer cannot be allocated.
    // It is never written to, because fill() bails out once eof is set.
    static YYCTYPE empty_input[1] = {0};

    // NOTE(review): the re2c configurations in the two branches below are
    // processed at code generation time, not selected by this `if` at run
    // time. Confirm that the generated lexer behaves as intended for both.
    if (fh) {
        // Stream handling. It engages YYFILL and reads by chunks.
        /*!re2c
        re2c:yyfill:enable = 1;
        */
        it->sh = NULL;
        it->buf_size = CHUNK_SIZE;
        it->buf = malloc (it->buf_size);
        if (it->buf) {
            it->fh = fh;
            // Start with all pointers on the sentinel slot, so that the first
            // fill() discards the whole (still empty) buffer and reads a
            // full chunk.
            it->cur = it->mar = it->tok = it->lim
                = it->buf + it->buf_size - 1;
            it->bol = it->buf;
            it->eof = false;
        } else {
            log_error ("Error allocating lexer buffer.");
            it->fh = NULL;
            it->buf_size = 0;
            it->cur = it->mar = it->tok = it->lim = it->bol = empty_input;
            it->eof = true;
        }
    } else {
        // String handling. Uses the provided string as the buffer.
        /*!re2c
        re2c:yyfill:enable = 0;
        */
        if (!sh) sh = "";
        it->fh = NULL;
        it->sh = sh;
        it->buf_size = strlen (sh) + 1;
        it->buf = NULL;
        it->cur = it->tok = (YYCTYPE *)it->sh;
        it->lim = it->mar = it->cur + it->buf_size - 1;
        it->bol = it->cur;
        it->eof = true;
    }
    it->line = 1;
    it->ct = 0;
    /*!stags:re2c format = "it->@@ = NULL; "; */

    if (it->fh) fill (it);
}

/** END duplicate section */


/// Token payload: a byte string and its size.
typedef struct {
    YYCTYPE *   data;
    size_t      size;
} ParserToken;


// Parser interface. Required here to silence linters.
void *TTLParseAlloc(); void TTLParse(); void TTLParseFree(); #ifdef DEBUG void TTLParseTrace(); #endif static inline void newline (ParseIterator *it) { it->line ++; it->bol = YYCURSOR; LOG_TRACE("New line: #%u.", it->line); } // Lexer. static int lex (ParseIterator *it, YYCTYPE **token_p) { const YYCTYPE *pfx; /*!re2c re2c:eof = 0; re2c:flags:8 = 1; re2c:flags:tags = 1; re2c:tags:expression = "it->@@"; re2c:api:style = functions; re2c:define:YYFILL:naked = 1; // Character classes. EOL = [\n\r]; DIG = [0-9]; HEX = [\x30-\x39\x41-\x46]; CHAR_BASE = "\\u" HEX{4} | "\\U" HEX{8} | '\\' | [\U0000005D-\U0010FFFF]; CHARACTER = CHAR_BASE | [\x20-\x5B]; // Prefix start character. PSTART_CHAR = [a-zA-Z] | [\u00C0-\u00D6] | [\u00D8-\u00F6] | [\u00F8-\u02FF] | [\u0370-\u037D] | [\u037F-\u1FFF] | [\u200C-\u200D] | [\u2070-\u218F] | [\u2C00-\u2FEF] | [\u3001-\uD7FF] | [\uF900-\uFDCF] | [\uFDF0-\uFFFD] | [\U00010000-\U000EFFFF]; // Name start character. NSTART_CHAR = PSTART_CHAR | '_'; NAME_CHAR = NSTART_CHAR | [0-9\-\u00B7\u0300-\u036F\u203F-\u2040]; ECHAR = CHARACTER | ([\\] [tnr]); UCHAR = (CHAR_BASE | ([\x20-\x5B] \ [>])) | ([\\] [>]); SCHAR = (CHAR_BASE | ([\x20-\x5B] \ ["])) | ([\\] ["]); LCHAR = ECHAR | ([\\] ["]) | [\t\n\r]; // Constructs. COMMENT = '#' [^\n\r]*; WS = ([\t\x20] | EOL | COMMENT)+; INTEGER = [-+]? DIG+; EXPONENT = [eE] INTEGER; DOUBLE = [-+]? (DIG+ '.' DIG* EXPONENT | '.'? DIG+ EXPONENT); DECIMAL = [-+]? (DIG+ '.' DIG* | '.'? DIG+); */ loop: // Start new token. 
it->tok = it->cur; *token_p = NULL; /*!re2c * { log_warn ( "Invalid token @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } $ { LOG_TRACE("End of document."); return T_EOF; } EOL { newline (it); goto loop; } [\x22]{3} { goto lchar; } [\x22] { goto schar; } "true" | "false" { *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok); LOG_TRACE("Boolean: %s", *token_p); return T_BOOLEAN; } '<' UCHAR* '>' { *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 2); LOG_TRACE("URI data: %s", *token_p); return T_IRIREF; } '[' WS? ']' { return T_EMPTY_BNODE; } '@prefix' WS @pfx (PSTART_CHAR NAME_CHAR*)? ":" { *token_p = uint8_ndup (pfx, YYCURSOR - pfx - 1); LOG_TRACE("Prefix declaration: '%s'", *token_p); return T_PREFIX; } '@base' { LOG_TRACE("'@base' keyword."); return T_BASE; } (PSTART_CHAR NAME_CHAR*)? ":" (NSTART_CHAR NAME_CHAR*)? { *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok); LOG_TRACE("ID name: %s", *token_p); return T_QNAME; } '_:' NSTART_CHAR NAME_CHAR* { *token_p = uint8_ndup (it->tok + 2, YYCURSOR - it->tok - 2); LOG_TRACE("BNode name: %s", *token_p); return T_BNODE_ID; } COMMENT { LOG_TRACE("Comment: `%s`", it->tok); goto loop; } WS { uint8_t *ws = uint8_ndup (it->tok, YYCURSOR - it->tok); LOG_TRACE("Whitespace: '%s'", ws); // Count newlines in mixed whitespace. // That's not great because it scans through the whole whitespace again // but it's the simplest and safest. for (size_t i = 0; i < strlen ((char *)ws); i++) if (ws[i] == '\n' || ws[i] == '\r') newline (it); free (ws); return T_WS; } '@' [a-z]+ ('-' [a-zA-Z0-9]+)* { *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok - 1); LOG_TRACE("Lang tag: '%s'", *token_p); return T_LANGTAG; } INTEGER { // Normalize sign. size_t offset = *it->tok == '+' ? 1 : 0; *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset); LOG_TRACE("Integer: %s", *token_p); return T_INTEGER; } DOUBLE { // Normalize sign. size_t offset = *it->tok == '+' ? 
1 : 0; *token_p = uint8_ndup (it->tok + offset, YYCURSOR - it->tok - offset); LOG_TRACE("Integer: %s", *token_p); return T_DOUBLE; } DECIMAL { // Normalize sign. YYCTYPE offset = *it->tok == '+' ? 1 : 0; // Normalize trailing zeros in fractional part. size_t size = YYCURSOR - it->tok - offset; if (strchr ((char *)it->tok, '.')) for (YYCTYPE *i = YYCURSOR; *i == '0'; i--) size--; *token_p = uint8_ndup (it->tok + offset, size); LOG_TRACE("Integer: %s", *token_p); return T_DECIMAL; } '(' WS? { return T_LPAREN; } WS? ')' { return T_RPAREN; } '[' WS? { return T_LBRACKET; } WS? ']' { return T_RBRACKET; } ':' { return T_COLON; } WS? ';' WS? { LOG_TRACE("End of object list."); return T_SEMICOLON; } WS? ',' WS? { return T_COMMA; } WS? '.' { LOG_TRACE("End of statement #%u.", it->ct); it->ct++; return T_PERIOD; } '^^' { return T_DTYPE_MARKER; } "a" { LOG_TRACE("RDF type shorthand 'a'."); return T_RDF_TYPE; } */ schar: /*!re2c * { log_warn ( "Invalid token in string @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } $ { log_warn ("Unterminated string!"); return -1; } SCHAR { goto schar; } [\x22] { *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2); LOG_TRACE("String: %s", *token_p); return T_STRING; } */ lchar: /*!re2c $ { log_warn ("Unterminated long string!"); return -1; } LCHAR { goto lchar; } [\x22]{3} { *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6); LOG_TRACE("Long string: %s", it->tok); return T_STRING; } * { log_warn ( "Invalid token in long string @ %lu: %s (\\x%x)", YYCURSOR - it->buf - 1, it->tok, *it->tok); return -1; } */ } VOLK_rc VOLK_ttl_parse_doc ( FILE *fh, const char *sh, VOLK_Graph **gr_p, size_t *ct, char **err_p) { *err_p = NULL; *gr_p = NULL; if (!fh && !sh) { log_error ("Neither file handle nor string input provided."); return VOLK_VALUE_ERR; } VOLK_TTLParserState *state = malloc (sizeof (*state)); if (UNLIKELY (!state)) return VOLK_MEM_ERR; state->base = NULL; state->ct = 0; state->rc = 
VOLK_NORESULT; ParseIterator parse_it; parse_init (&parse_it, fh, sh); void *parser = TTLParseAlloc (malloc); // TODO add basic NS, critically xsd: and rdf: VOLK_Graph *gr = VOLK_graph_new (NULL, NULL); if (UNLIKELY (!gr)) return VOLK_MEM_ERR; state->it = VOLK_graph_add_init (gr); if (UNLIKELY (!state->it)) { VOLK_graph_free (gr); return VOLK_MEM_ERR; } YYCTYPE *token; #ifdef DEBUG TTLParseTrace (stdout, "TTL Parser > "); #endif for (;;) { int ttype = lex (&parse_it, &token); if (ttype == -1) { char err_token[16] = {'\0'}; strncpy (err_token, (const char *)parse_it.tok, 15); char *err_start = "Lexer error near token `"; char err_info [64]; sprintf( err_info, "[...]' at line %u, character %ld.\n", parse_it.line, parse_it.cur - parse_it.bol); size_t err_size = strlen (err_start) + 16 + strlen(err_info); char *err_str = malloc (err_size); sprintf (err_str, "%s%s%s", err_start, err_token, err_info); log_error (err_str); state->rc = VOLK_PARSE_ERR; *err_p = err_str; goto finally; } TTLParse (parser, ttype, token, state); if (ttype == T_EOF) break; }; if (ct) *ct = state->ct; log_info ("Parsed %u triples.", state->ct); LOG_DEBUG("Graph size: %lu", VOLK_graph_size (gr)); *gr_p = gr; finally: ; VOLK_rc rc = state->rc; LOG_TRACE("rc is %d", rc); TTLParseFree (parser, free); VOLK_graph_add_done (state->it); VOLK_term_free (state->base); free (state); if (rc < 0) VOLK_graph_free (gr); return rc; }