Browse Source

Duplicate all NT codec files for working with TTL.

Stefano Cossu 2 years ago
parent
commit
0cfac367a5
5 changed files with 726 additions and 1 deletions
  1. 1 1
      TODO.md
  2. 10 0
      include/codec_ttl.h
  3. 58 0
      src/codec/ttl_grammar.y
  4. 400 0
      src/codec/ttl_lexer.re
  5. 257 0
      src/codec_ttl.c

+ 1 - 1
TODO.md

@@ -24,7 +24,7 @@
 - *D* Relative IRIs
 - *D* Flexible store interface
 - *D* Transaction control
-- *P* Turtle serialization / deserialization
+- *W* Turtle serialization / deserialization
 - *P* Full UTF-8 support
 - *P* Extended tests
     - *P* C API

+ 10 - 0
include/codec_ttl.h

@@ -0,0 +1,10 @@
+#ifndef _LSUP_CODEC_NT_H
+#define _LSUP_CODEC_NT_H
+
+#include "codec_base.h"
+
+/** @brief N-Triples codec.
+ *
+ * NOTE(review): this header is added as include/codec_ttl.h but still uses
+ * the N-Triples include guard and declares `nt_codec`. Presumably both are to
+ * be renamed once the Turtle codec diverges from its N-Triples template;
+ * until then, including this header together with another one using the same
+ * guard makes the second inclusion a no-op.
+ */
+extern const LSUP_Codec nt_codec;
+
+#endif

+ 58 - 0
src/codec/ttl_grammar.y

@@ -0,0 +1,58 @@
+%include {
+
+/** @brief Lemon parser grammar for N-Triples.
+ *
+ * The `lemon' parser generator executable must be in your PATH:
+ * https://sqlite.org/src/doc/trunk/doc/lemon.html
+ *
+ * To generate the parser, run: `lemon ${FILE}'
+ */
+
+#include "graph.h"
+}
+
+
+// Every terminal carries a term pointer as its semantic value; token codes
+// are emitted with a `T_` prefix (e.g. T_IRIREF, T_EOL).
+%token_type { LSUP_Term * }
+%token_prefix "T_"
+
+// Non-terminal value types. The destructors free the value whenever the
+// parser discards the symbol.
+%type triple            { LSUP_Triple * }
+%destructor triple      { LSUP_triple_free ($$); }
+%type subject           { LSUP_Term * }
+%destructor subject     { LSUP_term_free ($$); }
+%type predicate         { LSUP_Term * }
+%destructor predicate   { LSUP_term_free ($$); }
+%type object            { LSUP_Term * }
+%destructor object      { LSUP_term_free ($$); }
+%default_type           { void * }
+
+// Graph iterator handed to every Parse() call and visible as `it` in the
+// rule actions below.
+%extra_argument         { LSUP_GraphIterator *it }
+
+
+// Rules.
+
+// A document is a sequence of triples followed by the EOF token.
+ntriplesDoc ::= triples EOF.
+
+// Each triple must be followed by at least one end-of-line token; a document
+// may also start with blank lines only.
+triples     ::= eol.
+triples     ::= triple eol.
+triples     ::= triples triple eol.
+
+// Build a triple from the three parsed terms and add it to the graph through
+// the iterator. White space is optional around every term.
+triple(A)   ::= ws subject(S) ws predicate(P) ws object(O) ws DOT. {
+
+                A = LSUP_triple_new (S, P, O);
+                LSUP_graph_add_iter (it, A);
+            }
+
+// Term rules have no action code.
+// NOTE(review): this presumably relies on Lemon passing the token value
+// through to the left-hand side unchanged -- confirm against the generator
+// version in use.
+subject     ::= IRIREF.
+subject     ::= BNODE.
+
+predicate   ::= IRIREF.
+
+object      ::= IRIREF.
+object      ::= BNODE.
+object      ::= LITERAL.
+
+// One or more consecutive end-of-line tokens.
+eol         ::= EOL.
+eol         ::= eol EOL.
+
+// Optional white space.
+ws          ::=.
+ws          ::= WS.

+ 400 - 0
src/codec/ttl_lexer.re

@@ -0,0 +1,400 @@
+#include "nt_grammar.h"
+#include "nt_parser.h"
+
+
+#define YYCTYPE     unsigned char
+#define YYCURSOR    it->cur
+#define YYMARKER    it->mar
+#define YYLIMIT     it->lim
+#define YYFILL      fill(it) == 0
+
+/**
+ * Max chunk size passed to scanner at each iteration.
+ */
+#ifdef LSUP_RDF_STREAM_CHUNK_SIZE
+#define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE
+#else
+#define CHUNK_SIZE 8192
+#endif
+
+
+/** @brief Scanner state shared between the lexer and its buffer refill.
+ *
+ * The pointer members all point inside `buf`; the YYCURSOR, YYMARKER and
+ * YYLIMIT macros map onto `cur`, `mar` and `lim` respectively.
+ */
+typedef struct {
+    FILE *          fh;                 // Input file handle.
+    YYCTYPE         buf[CHUNK_SIZE + 1];// Start of buffer.
+    YYCTYPE *       lim;                // Position after the last available
+                                        //   input character (YYLIMIT).
+    YYCTYPE *       cur;                // Next input character to be read
+                                        //   (YYCURSOR)
+    YYCTYPE *       mar;                // Most recent match (YYMARKER)
+    YYCTYPE *       tok;                // Start of current token.
+    YYCTYPE *       bol;                // Address of the beginning of the
+                                        //   current line (for debugging).
+    unsigned        line;               // Current line no. (for debugging).
+    unsigned        ct;                 // Number of parsed triples.
+    bool            eof;                // if we have reached EOF.
+    /*!stags:re2c format = "YYCTYPE *@@;"; */
+} ParseIterator;
+
+
+// TODO The opposite of this is in codec_nt.c. Find a better place for both.
+/** @brief Map the letter following a backslash to the character it encodes.
+ *
+ * @param[in] c Character found right after the backslash.
+ *
+ * @return The control character for `t`, `b`, `n`, `r` and `f`; any other
+ *  character is returned unchanged.
+ */
+static inline char unescape_char(const char c) {
+    if (c == 't') return '\t';
+    if (c == 'b') return '\b';
+    if (c == 'n') return '\n';
+    if (c == 'r') return '\r';
+    if (c == 'f') return '\f';
+
+    return c;
+}
+
+
+/** @brief Refill the scan buffer from the input file handle.
+ *
+ * Bytes preceding the start of the current token are discarded, the rest of
+ * the buffer is moved to the front, and the freed space is topped up with
+ * `fread`. All cursor pointers (and any non-NULL re2c tag variables) are
+ * rebased by the same amount, and a NUL sentinel is written at the new limit.
+ *
+ * @param[in,out] it Parse iterator to refill.
+ *
+ * @return 0 on success; 1 if the end of the input had already been reached;
+ *  2 if the current token starts at the very beginning of the buffer, so no
+ *  room can be freed. The YYFILL macro treats only 0 as success.
+ */
+static int fill(ParseIterator *it)
+{
+    if (it->eof) {
+        return 1;
+    }
+    const size_t shift = it->tok - it->buf;
+    if (shift < 1) {
+        return 2;
+    }
+    // size_t must be printed with %zu; %lu is only correct where size_t
+    // happens to be unsigned long.
+    log_debug ("Shifting bytes: %zu", shift);
+    memmove(it->buf, it->tok, it->lim - it->tok);
+    it->lim -= shift;
+    it->cur -= shift;
+    it->mar -= shift;
+    it->tok -= shift;
+    it->lim += fread(it->lim, 1, shift, it->fh);
+    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
+    it->lim[0] = 0;
+    // A short read means the input is exhausted.
+    it->eof |= it->lim < it->buf + CHUNK_SIZE;
+    return 0;
+}
+
+
+/** @brief Reset a parse iterator and load the first chunk of input.
+ *
+ * All cursors start at the end of the (still empty) buffer, so that the
+ * initial call to fill() treats the whole buffer as free space.
+ *
+ * @param[out] it Iterator to initialize.
+ * @param[in] fh Open file handle to read from.
+ */
+static void parse_init(ParseIterator *it, FILE *fh)
+{
+    YYCTYPE *const buf_end = it->buf + CHUNK_SIZE;
+
+    it->fh = fh;
+
+    it->lim = buf_end;
+    it->tok = buf_end;
+    it->mar = buf_end;
+    it->cur = buf_end;
+
+    it->bol = it->buf;
+    it->line = 1;
+    it->ct = 0;
+    it->eof = false;
+    /*!stags:re2c format = "it->@@ = NULL; "; */
+    fill (it);
+}
+
+
+/** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
+ *
+ * Other backslash sequences are resolved with unescape_char(); all remaining
+ * bytes are copied verbatim.
+ *
+ * @param[in] esc_str Escaped input. It does not need to be NUL-terminated.
+ * @param[in] size Number of bytes of `esc_str` to process.
+ *
+ * @return Newly allocated, NUL-terminated string that the caller must free;
+ *  NULL on allocation failure or if an 8-digit (\U) sequence is found, which
+ *  is not supported yet. Nothing is leaked on the failure paths.
+ */
+static YYCTYPE *unescape_unicode (const YYCTYPE *esc_str, size_t size)
+{
+    // The unescaped string is never longer than the escaped one.
+    YYCTYPE *uc_str = malloc (size + 1);
+    if (UNLIKELY (!uc_str)) return NULL;
+
+    size_t j = 0;
+    YYCTYPE tmp_chr[5];
+    for (size_t i = 0; i < size;) {
+        if (esc_str[i] == '\\') {
+            i++; // Skip over '\\'
+
+            // 4-hex sequence.
+            if (esc_str[i] == 'u') {
+                i ++; // Skip over 'u'
+
+                // Use tmp_chr to hold the hex string for the code point.
+                memcpy(tmp_chr, esc_str + i, sizeof (tmp_chr) - 1);
+                tmp_chr[4] = '\0';
+
+                uint32_t tmp_val = strtol ((char*)tmp_chr, NULL, 16);
+                log_debug ("tmp_val: %d", tmp_val);
+
+                // Reuse tmp_chr to hold the byte values for the code point.
+                int nbytes = utf8_encode (tmp_val, tmp_chr);
+
+                // Copy bytes into destination.
+                memcpy (uc_str + j, tmp_chr, nbytes);
+                log_debug ("UC byte value: %x %x", uc_str[j], uc_str[j + 1]);
+
+                j += nbytes;
+                i += 4;
+
+            // 8-hex sequence.
+            } else if (esc_str[i] == 'U') {
+                log_error ("UTF-16 sequence unescaping not yet implemented.");
+                // Release the partial result before bailing out.
+                free (uc_str);
+                return NULL; // TODO encode UTF-16
+
+            // Unescape other escaped characters.
+            } else uc_str[j++] = unescape_char(esc_str[i++]);
+        } else {
+            // Copy ASCII char verbatim.
+            uc_str[j++] = esc_str[i++];
+        }
+    }
+
+    // Shrink to the actual size. On failure the original block is still
+    // allocated and must be released here.
+    YYCTYPE *tmp = realloc (uc_str, j + 1);
+    if (UNLIKELY (!tmp)) {
+        free (uc_str);
+        return NULL;
+    }
+    uc_str = tmp;
+    uc_str[j] = '\0';
+
+    return uc_str;
+}
+
+
+// Parser interface.
+
+void *ParseAlloc();
+void Parse();
+void ParseFree();
+
+
+// Lexer.
+
+/** @brief Scan the next token from the input.
+ *
+ * Comments are logged and skipped; scanning then restarts from the following
+ * character.
+ *
+ * @param[in,out] it Parse iterator holding the buffer and cursor state.
+ * @param[out] term Set to NULL on entry. For IRIREF, LITERAL and BNODE
+ *  tokens it receives the result of the corresponding term constructor.
+ *
+ * @return One of the T_* token codes, or -1 if no rule matches the input.
+ */
+static int lex (ParseIterator *it, LSUP_Term **term)
+{
+    // re2c tag variables: end of the quoted literal, start of the datatype
+    // IRI, start of the language tag.
+    const YYCTYPE *lit_data_e, *dtype_s, *lang_s;
+
+loop:
+
+    it->tok = it->cur;
+
+    *term = NULL;
+
+    /*!re2c
+    re2c:eof = 0;
+    re2c:flags:8 = 1;
+    re2c:flags:tags = 1;
+    re2c:tags:expression = "it->@@";
+    re2c:api:style = functions;
+    re2c:define:YYFILL:naked = 1;
+
+
+    // For unresolved and partially resolved inconsistencies of the spec, see
+    // https://lists.w3.org/Archives/Public/public-rdf-comments/2017Jun/0000.html
+    _WS                 = [\x09\x20];
+    WS                  = _WS+;
+    EOL                 = [\x0D\x0A] (_WS | [\x0D\x0A])*;
+    DOT                 = [.];
+    HEX                 = [0-9A-Fa-f];
+    ECHAR               = [\\] [tbnrf"'\\];
+    UCHAR               = "\\u" HEX{4} | "\\U" HEX{8};
+    PN_CHARS_BASE       = [A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF];
+    PN_CHARS_U          = PN_CHARS_BASE | '_' | ':';
+    PN_CHARS            = PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040];
+    IRI_CHARS           = ([^\x00-\x20<>"{}|^`\\] | UCHAR)*;
+    LITERAL_QUOTE       = ["] ([^\x22\x5C\x0A\x0D] | ECHAR|UCHAR)* ["];
+    LANGTAG             = [@] [a-zA-Z]+ ("-" [a-zA-Z0-9]+)*;
+
+    IRIREF              = [<] IRI_CHARS [>];
+    LITERAL             = LITERAL_QUOTE @lit_data_e _WS* ("^^" _WS* @dtype_s IRIREF | @lang_s LANGTAG)?;
+    BNODE               = "_:" ((PN_CHARS_U | [0-9]) ((PN_CHARS | ".")* PN_CHARS)?);
+    COMMENT             = "#" .*;
+
+
+    EOL {
+        // An EOL token may span several blank lines, but the line counter
+        // is advanced only once per token.
+        it->line ++;
+        it->bol = YYCURSOR;
+        log_debug ("New line: #%u.", it->line);
+        return T_EOL;
+    }
+
+    $ {
+        log_debug ("End of buffer.");
+        return T_EOF;
+    }
+
+    IRIREF {
+        // Strip the enclosing angle brackets before unescaping.
+        YYCTYPE *data = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
+
+        log_debug ("URI data: %s", data);
+
+        *term = LSUP_iriref_new ((char*)data, NULL);
+        free (data);
+
+        return T_IRIREF;
+    }
+
+    LITERAL {
+        // Only unescape Unicode from data.
+        // lit_data_e points right after the closing quote, so subtracting 2
+        // leaves out both quotes.
+        size_t size = lit_data_e - it->tok - 2;
+        YYCTYPE *data = unescape_unicode (it->tok + 1, size);
+        log_trace ("Literal data: %s", data);
+
+        char *metadata = NULL;
+        const YYCTYPE *md_marker;
+        LSUP_TermType type = LSUP_TERM_LITERAL;
+
+        // In both branches `size` is the number of bytes copied starting
+        // after the marker character (`<` or `@`); the last copied byte is
+        // then overwritten with the terminator.
+        if (dtype_s) {
+            md_marker = dtype_s;
+            size = YYCURSOR - md_marker - 1;
+        } else if (lang_s) {
+            type = LSUP_TERM_LT_LITERAL;
+            md_marker = lang_s;
+            size = YYCURSOR - md_marker;
+        } else md_marker = NULL;
+
+        if (md_marker) {
+            metadata = malloc (size);
+            memcpy (metadata, md_marker + 1, size);
+            metadata [size - 1] = '\0';
+            log_trace ("metadata: %s", metadata);
+        }
+
+        if (type == LSUP_TERM_LITERAL) {
+            LSUP_Term *dtype;
+            dtype = (
+                metadata ? LSUP_iriref_new ((char *) metadata, NULL) : NULL);
+
+            *term = LSUP_literal_new ((char *) data, dtype);
+
+        } else *term = LSUP_lt_literal_new ((char *) data, (char *) metadata);
+
+        free (data);
+        free (metadata);
+
+        return T_LITERAL;
+    }
+
+    BNODE {
+        // Skip the leading "_:" prefix.
+        YYCTYPE *data = unescape_unicode (it->tok + 2, YYCURSOR - it->tok - 2);
+
+        log_debug ("BNode data: %s", data);
+
+        *term = LSUP_term_new (LSUP_TERM_BNODE, (char*)data, NULL);
+        free (data);
+
+        return T_BNODE;
+    }
+
+    DOT {
+        log_debug ("End of triple.");
+        // Triples are counted by their terminating dot.
+        it->ct ++;
+
+        return T_DOT;
+    }
+
+    WS {
+        log_debug ("Separator.");
+
+        return T_WS;
+    }
+
+    COMMENT {
+        // Copy the comment into a NUL-terminated buffer only for logging,
+        // then scan for the next token.
+        size_t size = YYCURSOR - it->tok + 1;
+        YYCTYPE *data = malloc (size);
+        memcpy (data, it->tok, size);
+        data [size - 1] = '\0';
+        log_debug ("Comment: `%s`", data);
+        free (data);
+
+        goto loop;
+    }
+
+    * {
+        log_debug (
+            "Invalid token @ %lu: %s (\\x%x)",
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
+
+        return -1;
+    }
+
+    */
+}
+
+
+/** @brief Parse a single term from its string representation.
+ *
+ * Only the first token of `rep` is scanned.
+ *
+ * @param[in] rep NUL-terminated string to parse.
+ * @param[in] map Namespace map. Currently unused.
+ * @param[out] term Receives the parsed term, or NULL on failure.
+ *
+ * @return LSUP_OK if the first token is an IRI reference, a literal or a
+ *  blank node; LSUP_VALUE_ERR if `rep` is NULL, empty, or starts with any
+ *  other token; LSUP_MEM_ERR if the in-memory stream cannot be opened.
+ */
+LSUP_rc
+LSUP_nt_parse_term (const char *rep, const LSUP_NSMap *map, LSUP_Term **term)
+{
+    (void) map;
+    *term = NULL;
+
+    // An empty buffer cannot hold a term, and some fmemopen()
+    // implementations reject a zero size outright.
+    if (!rep || rep[0] == '\0') return LSUP_VALUE_ERR;
+
+    FILE *fh = fmemopen ((void *)rep, strlen (rep), "r");
+    if (UNLIKELY (!fh)) return LSUP_MEM_ERR;
+
+    ParseIterator it;
+    parse_init (&it, fh);
+
+    int ttype = lex (&it, term);
+
+    fclose (fh);
+
+    switch (ttype) {
+        case T_IRIREF:
+        case T_LITERAL:
+        case T_BNODE:
+            return LSUP_OK;
+        default:
+            return LSUP_VALUE_ERR;
+    }
+}
+
+/** @brief Parse a whole document from a file handle into a new graph.
+ *
+ * @param[in] fh Open file handle to read from.
+ * @param[out] gr_p Receives the new graph when parsing completes; set to
+ *  NULL otherwise.
+ * @param[out] ct If not NULL, receives the number of triples scanned.
+ * @param[out] err_p Receives a newly allocated error message (to be freed by
+ *  the caller) if the lexer hits an invalid token; NULL otherwise.
+ *
+ * @return LSUP_OK if at least one triple was parsed; LSUP_NORESULT if none
+ *  was; LSUP_VALUE_ERR on an invalid token; LSUP_MEM_ERR if the parser, the
+ *  graph or the graph iterator cannot be allocated.
+ */
+LSUP_rc
+LSUP_nt_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
+{
+    *err_p = NULL;
+    *gr_p = NULL;
+
+    ParseIterator parse_it;
+    parse_init (&parse_it, fh);
+
+    void *parser = ParseAlloc (malloc);
+    if (UNLIKELY (!parser)) return LSUP_MEM_ERR;
+
+    LSUP_rc rc;
+
+    // The parser must be released on every early exit below.
+    LSUP_Graph *gr = LSUP_graph_new (
+            LSUP_iriref_new (NULL, NULL), LSUP_STORE_HTABLE, NULL, NULL, 0);
+    if (UNLIKELY (!gr)) {
+        ParseFree (parser, free);
+        return LSUP_MEM_ERR;
+    }
+
+    LSUP_GraphIterator *it = LSUP_graph_add_init (gr);
+    if (UNLIKELY (!it)) {
+        LSUP_graph_free (gr);
+        ParseFree (parser, free);
+        return LSUP_MEM_ERR;
+    }
+
+    LSUP_Term *term = NULL;
+
+    for (;;) {
+        int ttype = lex (&parse_it, &term);
+
+        if (ttype == -1) {
+            // Show at most 15 bytes of the offending input. The message is
+            // measured first so that the buffer always fits it.
+            static const char err_fmt[] =
+                "Parse error near token `%.15s[...]' at line %u, "
+                "character %td.\n";
+
+            int err_len = snprintf (
+                    NULL, 0, err_fmt, (const char *)parse_it.tok,
+                    parse_it.line, parse_it.cur - parse_it.bol);
+
+            char *err_str = err_len < 0 ? NULL : malloc ((size_t)err_len + 1);
+            if (err_str) snprintf (
+                    err_str, (size_t)err_len + 1, err_fmt,
+                    (const char *)parse_it.tok,
+                    parse_it.line, parse_it.cur - parse_it.bol);
+
+            rc = LSUP_VALUE_ERR;
+            *err_p = err_str;
+
+            goto finally;
+        }
+
+        Parse (parser, ttype, term, it);
+
+        if (ttype == T_EOF) break;
+    };
+
+    if (ct) *ct = parse_it.ct;
+
+    log_info ("Parsed %u triples.", parse_it.ct);
+    log_debug ("Graph size: %lu", LSUP_graph_size (gr));
+
+    rc = parse_it.ct > 0 ? LSUP_OK : LSUP_NORESULT;
+    *gr_p = gr;
+
+finally:
+    // Signal end of input to the parser before tearing it down.
+    Parse (parser, 0, NULL, it);
+    ParseFree (parser, free);
+
+    LSUP_graph_add_done (it);
+    LSUP_term_free (term);
+
+    if (rc < 0) LSUP_graph_free (gr);
+
+    return rc;
+}
+

+ 257 - 0
src/codec_ttl.c

@@ -0,0 +1,257 @@
+#include "codec_nt.h"
+#include "nt_parser.h"
+
+/** @brief List of characters to be escaped in serialized literals.
+ *
+ * @sa https://www.w3.org/TR/n-triples/#grammar-production-ECHAR
+ */
+#define LIT_ECHAR "\t\b\n\r\f\"\'\\"
+
+/** @brief Regex of characters to be escaped in serialized IRIs.
+ *
+ * @sa https://www.w3.org/TR/n-triples/#grammar-production-IRIREF
+ */
+#define IRI_ECHAR_PTN "[\x00-\x20<>\"\\{\\}\\|\\^`\\\\]"
+
+
+/* * * Static prototypes. * * */
+
+static LSUP_rc escape_lit (const char *in, char **out_p);
+
+
+/* * * Codec functions. * * */
+
+/** @brief Serialize a single term to its N-Triples representation.
+ *
+ * @param[in] term Term to serialize.
+ * @param[in] nsm Namespace map. Currently unused.
+ * @param[in,out] out_p Address of the output string. `*out_p` must be either
+ *  NULL or a heap pointer, typically the result of a previous call: it is
+ *  resized in place and overwritten. On an unsupported term type the buffer
+ *  is freed and `*out_p` is set to NULL. On any other error `*out_p` is left
+ *  untouched and still valid.
+ *
+ * @return LSUP_OK on success; LSUP_VALUE_ERR for an unsupported term type;
+ *  LSUP_MEM_ERR on allocation failure; LSUP_ERROR if the literal could not
+ *  be escaped.
+ */
+static LSUP_rc
+term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
+{
+    LSUP_rc rc = LSUP_OK;
+    // Reuse the caller's buffer directly: realloc (ptr, 0) has
+    // implementation-defined behavior and would leave *out_p dangling on the
+    // error returns below.
+    char *out = *out_p, *tmp, *escaped = NULL;
+    const char *metadata = NULL;
+    size_t buf_len;
+
+    switch (term->type) {
+        case LSUP_TERM_IRIREF:
+            buf_len = strlen (term->data) + 3; // Room for <> and terminator
+
+            tmp = realloc (out, buf_len);
+            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
+            out = tmp;
+
+            snprintf (out, buf_len, "<%s>", term->data);
+            break;
+
+        case LSUP_TERM_LITERAL:
+            // Calculate string length.
+            if (escape_lit (term->data, &escaped) != LSUP_OK)
+                return LSUP_ERROR;
+            buf_len = strlen (escaped) + 3; // Room for "" and terminator
+
+            // The default datatype is left implicit.
+            if (
+                term->datatype != NULL
+                && term->datatype != LSUP_default_datatype
+            ) {
+                metadata = term->datatype->data;
+                buf_len += strlen (metadata) + 4; // Room for ^^<>
+            }
+
+            tmp = realloc (out, buf_len);
+            if (UNLIKELY (!tmp)) {
+                free (escaped);
+                return LSUP_MEM_ERR;
+            }
+            out = tmp;
+
+            if (metadata)
+                snprintf (out, buf_len, "\"%s\"^^<%s>", escaped, metadata);
+            else
+                snprintf (out, buf_len, "\"%s\"", escaped);
+            free (escaped);
+            break;
+
+        case LSUP_TERM_LT_LITERAL:
+            // Calculate string length.
+            if (escape_lit (term->data, &escaped) != LSUP_OK)
+                return LSUP_ERROR;
+            buf_len = strlen (escaped) + 3; // Room for "" and terminator
+
+            if (term->lang != 0) {
+                metadata = term->lang;
+                buf_len += strlen (metadata) + 1; // Room for @
+            }
+
+            tmp = realloc (out, buf_len);
+            if (UNLIKELY (!tmp)) {
+                free (escaped);
+                return LSUP_MEM_ERR;
+            }
+            out = tmp;
+
+            if (metadata)
+                snprintf (out, buf_len, "\"%s\"@%s", escaped, metadata);
+            else
+                snprintf (out, buf_len, "\"%s\"", escaped);
+            free (escaped);
+            break;
+
+        case LSUP_TERM_BNODE:
+            buf_len = strlen (term->data) + 3; // Room for _: and terminator
+
+            tmp = realloc (out, buf_len);
+            if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
+            out = tmp;
+
+            snprintf (out, buf_len, "_:%s", term->data);
+            break;
+
+        default:
+            // Nothing to serialize: release the previous content.
+            free (out);
+            out = NULL;
+            rc = LSUP_VALUE_ERR;
+    }
+
+    *out_p = out;
+    return rc;
+}
+
+
+static LSUP_CodecIterator *
+gr_to_nt_init (const LSUP_Graph *gr);
+
+
+/** @brief Serialize the next triple of the graph as one N-Triples line.
+ *
+ * @param[in,out] it Codec iterator obtained from gr_to_nt_init().
+ * @param[in,out] res Address of the output string. `*res` must be NULL or a
+ *  heap pointer; it is resized and overwritten with the serialized line. If
+ *  the line cannot be allocated, the previous buffer is freed and `*res` is
+ *  set to NULL.
+ *
+ * @return LSUP_OK if a line was written; the return code of
+ *  LSUP_graph_iter_next() if it is not LSUP_OK; the return code of
+ *  term_to_nt() if a term cannot be serialized; LSUP_MEM_ERR on allocation
+ *  failure.
+ */
+static LSUP_rc
+gr_to_nt_iter (LSUP_CodecIterator *it, unsigned char **res) {
+    LSUP_rc rc = LSUP_graph_iter_next (it->gr_it, it->trp);
+    if (rc != LSUP_OK) goto finally;
+
+    // Stop at the first term that fails to serialize: its string may be NULL
+    // and must not reach strlen() below.
+    rc = term_to_nt (it->trp->s, it->nsm, &it->str_s);
+    if (rc != LSUP_OK) goto finally;
+    rc = term_to_nt (it->trp->p, it->nsm, &it->str_p);
+    if (rc != LSUP_OK) goto finally;
+    rc = term_to_nt (it->trp->o, it->nsm, &it->str_o);
+    if (rc != LSUP_OK) goto finally;
+
+    // 3 term separators + dot + newline + terminal = 6
+    unsigned char *tmp = realloc (
+            *res, strlen (it->str_s) + strlen (it->str_p)
+            + strlen (it->str_o) + 6);
+    if (UNLIKELY (!tmp)) {
+        // realloc() left the old block allocated: release it before
+        // dropping the only reference to it.
+        free (*res);
+        *res = NULL;
+        rc = LSUP_MEM_ERR;
+        goto finally;
+    }
+
+    sprintf ((char*)tmp, "%s %s %s .\n", it->str_s, it->str_p, it->str_o);
+    *res = tmp;
+
+    it->cur++;
+
+finally:
+    LSUP_term_free (it->trp->s); it->trp->s = NULL;
+    LSUP_term_free (it->trp->p); it->trp->p = NULL;
+    LSUP_term_free (it->trp->o); it->trp->o = NULL;
+
+    return rc;
+}
+
+
+/** @brief Release a codec iterator and everything it holds.
+ *
+ * @param[in] it Iterator obtained from gr_to_nt_init(). The graph iterator,
+ *  the scratch triple, all string buffers and the iterator itself are freed.
+ */
+static void
+gr_to_nt_done (LSUP_CodecIterator *it)
+{
+    // Per-term scratch strings and the output buffer.
+    free (it->str_o);
+    free (it->str_p);
+    free (it->str_s);
+    free (it->rep);
+
+    // Scratch triple and underlying graph iterator.
+    LSUP_triple_free (it->trp);
+    LSUP_graph_iter_free (it->gr_it);
+
+    free (it);
+}
+
+
+/** @brief Codec descriptor exposing the functions in this file.
+ *
+ * The encoding callbacks are the static functions defined in this file; the
+ * decoding callbacks are the LSUP_nt_parse_* functions.
+ *
+ * NOTE(review): this file is added as src/codec_ttl.c but the descriptor
+ * still registers itself as N-Triples -- presumably to be changed once the
+ * Turtle implementation diverges.
+ */
+const LSUP_Codec nt_codec = {
+    .name               = "N-Triples",
+    .mimetype           = "application/n-triples",
+    .extension          = "nt",
+
+    .encode_term        = term_to_nt,
+
+    .encode_graph_init  = gr_to_nt_init,
+    .encode_graph_iter  = gr_to_nt_iter,
+    .encode_graph_done  = gr_to_nt_done,
+
+    .decode_term        = LSUP_nt_parse_term,
+    .decode_graph       = LSUP_nt_parse_doc,
+};
+
+
+/* * * Other internal functions. * * */
+
+/** Map a control character to the letter used in its escape sequence.
+ *
+ *  Tab, backspace, newline, carriage return and form feed become `t`, `b`,
+ *  `n`, `r` and `f`; any other character is returned unchanged. The leading
+ *  backslash is to be added separately by the caller.
+ */
+static inline char replace_char(const char c) {
+    if (c == '\t') return 't';
+    if (c == '\b') return 'b';
+    if (c == '\n') return 'n';
+    if (c == '\r') return 'r';
+    if (c == '\f') return 'f';
+
+    return c;
+}
+
+
+/** @brief Create a codec iterator that walks all the triples of a graph.
+ *
+ * @param[in] gr Graph to serialize.
+ *
+ * @return New iterator, to be released with gr_to_nt_done().
+ */
+static LSUP_CodecIterator *
+gr_to_nt_init (const LSUP_Graph *gr)
+{
+    LSUP_CodecIterator *it;
+    MALLOC_GUARD (it, NULL);
+
+    // Output buffers are allocated lazily while iterating.
+    it->str_s = NULL;
+    it->str_p = NULL;
+    it->str_o = NULL;
+    it->rep = NULL;
+
+    it->codec = &nt_codec;
+    it->nsm = LSUP_graph_namespace (gr);
+    it->trp = TRP_DUMMY;
+
+    // An all-NULL pattern is looked up. The lookup writes into `cur`, which
+    // is then reset so that it starts from zero when iterating.
+    it->gr_it = LSUP_graph_lookup(gr, NULL, NULL, NULL, &it->cur);
+    it->cur = 0;
+
+    return it;
+}
+
+
+/** @brief Add escape character (backslash) to illegal literal characters.
+ *
+ * Every character listed in LIT_ECHAR is replaced by a backslash followed by
+ * the result of replace_char(); all other characters are copied verbatim.
+ *
+ * @param[in] in NUL-terminated string to escape.
+ * @param[out] out_p Receives a newly allocated, NUL-terminated escaped
+ *  string that the caller must free. Not modified on failure.
+ *
+ * @return LSUP_OK on success; LSUP_MEM_ERR on allocation failure.
+ */
+static LSUP_rc
+escape_lit (const char *in, char **out_p)
+{
+    // Measure the input once rather than on every loop iteration.
+    const size_t in_len = strlen (in);
+
+    // Each escaped character takes one extra byte for the backslash.
+    size_t esc_ct = 0;
+    for (size_t i = 0; i < in_len; i++) {
+        if (strchr (LIT_ECHAR, in[i])) esc_ct ++;
+    }
+
+    char *out = malloc (in_len + esc_ct + 1);
+    if (UNLIKELY (!out)) return LSUP_MEM_ERR;
+
+    size_t j = 0;
+    for (size_t i = 0; i < in_len; i++) {
+        if (strchr (LIT_ECHAR, in[i])) {
+            out[j++] = '\\';
+            out[j++] = replace_char (in[i]);
+        } else {
+            out[j++] = in[i];
+        }
+    }
+    out[j] = '\0';
+
+    *out_p = out;
+    return LSUP_OK;
+}