Pārlūkot izejas kodu

WIP ISO C; partly fixed codec; tests still failing.

scossu 1 nedēļu atpakaļ
vecāks
revīzija
656e53c4a5

+ 1 - 1
include/volksdata/codec.h

@@ -13,7 +13,7 @@
 #ifdef VOLK_RDF_STREAM_CHUNK_SIZE
 #define CHUNK_SIZE VOLK_RDF_STREAM_CHUNK_SIZE
 #else
-#define CHUNK_SIZE 8192
+#define CHUNK_SIZE 4096
 #endif
 
 

+ 116 - 0
include/volksdata/codec/parser_common.h

@@ -0,0 +1,116 @@
+#ifndef _VOLK_PARSER_COMMON_H
+#define _VOLK_PARSER_COMMON_H
+
+#include "volksdata/codec.h"
+
+
+/** @brief TTL is UTF-8 encoded.
+ *
+ * @sa https://www.w3.org/TeamSubmission/turtle/#sec-grammar
+ *
+ * `char` should be considered to be UTF-8 throughout this library, however,
+ * setting YYCTYPE to char generates case labels outside of the char range.
+ */
+#define YYCTYPE     uint8_t
+#define YYCURSOR    it->cur
+#define YYMARKER    it->mar
+#define YYLIMIT     it->lim
+#define YYFILL      fill(it) == 0
+
+
+typedef struct {
+    FILE          * fh;         ///< Input file handle. NULL with string input.
+    const char    * sh;         ///< Input string. Exclusive with fh.
+    size_t          buf_size;   ///< Current size of the buffer, in bytes.
+    YYCTYPE       * buf,        ///< Start of buffer. NULL with string input.
+                  * lim,        ///< Position after the last available
+                                ///<   input character (YYLIMIT).
+                  * cur,        ///< Next input character to be read (YYCURSOR)
+                  * mar,        ///< Most recent match (YYMARKER)
+                  * tok,        ///< Start of current token.
+                  * bol;        ///< Address of the beginning of the
+                                ///<   current line (for debugging).
+    unsigned        line;       ///< Current line no. (for debugging).
+    unsigned        ct;         ///< Number of statements parsed.
+    bool            eof;        ///< if we have reached EOF.
+    /*!stags:re2c format = "YYCTYPE *@@;"; */
+} ParseIterator;
+
+
+/** @brief Refill the stream buffer so that the lexer can keep reading.
+ *
+ * The bytes preceding the current token are discarded by moving the unread
+ * tail to the start of the buffer, and the freed space is filled from it->fh.
+ * If the current token already starts at the beginning of the buffer there is
+ * nothing to discard, so the buffer capacity is doubled instead.
+ *
+ * @param[in,out] it Iterator in stream mode (it->fh set, it->buf allocated).
+ *
+ * @return 0 if the buffer was refilled; 1 if EOF had already been reached;
+ *  -1 on allocation failure, in which case the iterator is left untouched.
+ */
+static int fill(ParseIterator *it)
+{
+    if (it->eof) {
+        return 1;
+    }
+    const size_t shift = it->tok - it->buf;     // Bytes that can be dropped.
+    const size_t used = it->lim - it->tok;      // Bytes that must be kept.
+
+    if (shift < 1) {
+        // The lexeme spans the whole buffer: double the capacity. A new block
+        // is allocated (instead of realloc) so that all the pointers can be
+        // rebased while the old block is still valid, and so that the old
+        // buffer is not lost if the allocation fails.
+        const size_t new_size = 2 * it->buf_size;
+        YYCTYPE *new_buf = malloc (new_size);
+        if (!new_buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        memcpy (new_buf, it->buf, used);
+        it->lim = new_buf + (it->lim - it->buf);
+        it->cur = new_buf + (it->cur - it->buf);
+        it->mar = new_buf + (it->mar - it->buf);
+        it->tok = new_buf + (it->tok - it->buf);
+        it->bol = new_buf + (it->bol - it->buf);
+        /*!stags:re2c format = "if (it->@@) it->@@ = new_buf + (it->@@ - it->buf); "; */
+        free (it->buf);
+        it->buf = new_buf;
+        it->buf_size = new_size;
+    } else {
+        LOG_DEBUG("Shifting bytes: %zu", shift);
+        memmove (it->buf, it->tok, used);
+        it->lim -= shift;
+        it->cur -= shift;
+        it->mar -= shift;
+        it->tok -= shift;
+        /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
+    }
+
+    // The last byte of the buffer is reserved for the NUL sentinel.
+    const size_t avail = (it->buf + it->buf_size - 1) - it->lim;
+    it->lim += fread (it->lim, 1, avail, it->fh);
+    it->lim[0] = 0;
+    // A short read means that the stream is exhausted. The test must use the
+    // current buffer size, which may have grown beyond CHUNK_SIZE.
+    it->eof |= it->lim < it->buf + it->buf_size - 1;
+    return 0;
+}
+
+
+/** @brief Initialize parser.
+ *
+ * With a file handle, a buffer of CHUNK_SIZE bytes is allocated and the first
+ * chunk is read into it; the caller releases it with free() on it->buf. With
+ * a string, the string itself is used as the buffer and it->buf is NULL.
+ *
+ * If the stream buffer cannot be allocated, the error is logged and the
+ * iterator is set up over an empty string, since this function cannot report
+ * a failure to its caller.
+ *
+ * @param[in] it iterator handle to be initialized.
+ *
+ * @param[in] fh Open file handle to read from. This is exclusive with sh. If
+ *  both fh and sh are provided, fh has precedence.
+ *
+ * @param[in] sh String to read from. This is exclusive with fh. NULL is
+ *  treated as an empty string.
+ */
+static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
+{
+    YYCTYPE *stream_buf = NULL;
+
+    if (fh) {
+        stream_buf = malloc (CHUNK_SIZE);
+        if (!stream_buf) {
+            log_error ("Error allocating lexer buffer.");
+            // fh has precedence over sh, so do not fall back to parsing sh.
+            sh = "";
+        }
+    }
+
+    if (stream_buf) {
+        // Stream handling. It engages YYFILL and reads by chunks.
+        /*!re2c
+        re2c:yyfill:enable = 1;
+        */
+        it->fh = fh;
+        it->sh = NULL;
+        it->buf_size = CHUNK_SIZE;
+        it->buf = stream_buf;
+        // All pointers start at the sentinel position, so that the first
+        // fill() discards the whole (still empty) buffer and reads a chunk.
+        it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
+        it->bol = it->buf;
+        it->eof = false;
+    } else {
+        // String handling. Uses the provided string as the buffer.
+        /*!re2c
+        re2c:yyfill:enable = 0;
+        */
+        if (!sh) sh = "";
+        it->fh = NULL;
+        it->sh = sh;
+        it->buf_size = strlen (sh) + 1;
+        it->buf = NULL;
+        it->cur = it->tok = (YYCTYPE*)it->sh;
+        it->lim = it->mar = it->cur + it->buf_size - 1;
+        // it->buf is NULL here: the line start is the start of the string.
+        it->bol = it->cur;
+        it->eof = true;
+    }
+    it->line = 1;
+    it->ct = 0;
+    /*!stags:re2c format = "it->@@ = NULL; "; */
+
+    if (it->fh) fill (it);
+}
+
+
+#endif // _VOLK_PARSER_COMMON_H

+ 1 - 1
src/codec/Makefile

@@ -48,7 +48,7 @@ $(BUILDDIR)/%_dbg.o: %.c
 
 # Parser C sources.
 parser_%.c: lexer_%.re grammar_%.c ../codec.c
-	$(LEXER) $< -o $@ -T --case-ranges
+	$(LEXER) $< -o $@ -T --case-ranges -W
 
 
 .PRECIOUS: grammar_%.c $(CODEC_INCLUDE_DIR)/tokens_%.h

+ 82 - 28
src/codec/lexer_nt.re

@@ -1,7 +1,21 @@
 #include "volksdata/codec/parser_nt.h"
 #include "volksdata/codec/tokens_nt.h"
+//#include "volksdata/codec/parser_common.h"
 
 
+/** BEGIN duplicate section
+ * This section is bit-by-bit identical in NT and TTL lexers. The copy in
+ * include/volksdata/codec/parser_common.h should be used, but some re2c tags
+ * are not being parsed in that location.
+ */
+
+/** @brief TTL is UTF-8 encoded.
+ *
+ * @sa https://www.w3.org/TeamSubmission/turtle/#sec-grammar
+ *
+ * `char` should be considered to be UTF-8 throughout this library, however,
+ * setting YYCTYPE to char generates case labels outside of the char range.
+ */
 #define YYCTYPE     uint8_t
 #define YYCURSOR    it->cur
 #define YYMARKER    it->mar
@@ -10,20 +24,20 @@
 
 
 typedef struct {
-    FILE           *fh;                 ///< Input file handle.
-    const char     *sh;                 ///< Input string. Exclusive with fh.
-    YYCTYPE         buf[CHUNK_SIZE],    ///< Start of buffer.
-                   *lim,                ///< Position after the last available
-                                        ///<   input character (YYLIMIT).
-                   *cur,                ///< Next input character to be read
-                                        ///<   (YYCURSOR)
-                   *mar,                ///< Most recent match (YYMARKER)
-                   *tok,                ///< Start of current token.
-                   *bol;                ///< Address of the beginning of the
-                                        ///<   current line (for debugging).
-    unsigned        line;               ///< Current line no. (for debugging).
-    unsigned        ct;                 ///< Number of parsed triples.
-    bool            eof;                ///< if we have reached EOF.
+    FILE          * fh;         ///< Input file handle.
+    const char    * sh;         ///< Input string. Exclusive with fh.
+    size_t          buf_size;   ///< Current size of the buffer, in bytes.
+    YYCTYPE       * buf,        ///< Start of buffer.
+                  * lim,        ///< Position after the last available
+                                ///<   input character (YYLIMIT).
+                  * cur,        ///< Next input character to be read (YYCURSOR)
+                  * mar,        ///< Most recent match (YYMARKER)
+                  * tok,        ///< Start of current token.
+                  * bol;        ///< Address of the beginning of the
+                                ///<   current line (for debugging).
+    unsigned        line;       ///< Current line no. (for debugging).
+    unsigned        ct;         ///< Number of statements parsed.
+    bool            eof;        ///< if we have reached EOF.
     /*!stags:re2c format = "YYCTYPE *@@;"; */
 } ParseIterator;
 
@@ -33,9 +47,17 @@ static int fill(ParseIterator *it)
     if (it->eof) {
         return 1;
     }
-    const size_t shift = it->tok - it->buf;
-    if (shift < 1) {
-        return 2;
+    size_t shift = it->tok - it->buf;
+
+    // If buffer is too small for the lexeme, double the capacity.
+    while (shift < 1) {
+        it->buf_size = 2 * it->buf_size;
+        it->buf = realloc (it->buf, it->buf_size);
+        if (!it->buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        shift = it->tok - it->buf;
     }
     LOG_DEBUG("Shifting bytes: %lu", shift);
     memmove (it->buf, it->tok, it->lim - it->tok);
@@ -43,9 +65,7 @@ static int fill(ParseIterator *it)
     it->cur -= shift;
     it->mar -= shift;
     it->tok -= shift;
-    if (it->fh) it->lim += fread (it->lim, 1, shift, it->fh);
-    // With a string handle, assume the whole input fits in CHUNK_SIZE.
-    else it->lim = memcpy (it->lim, it->sh, sizeof(it->buf));
+    it->lim += fread (it->lim, 1, shift, it->fh);
     /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
     it->lim[0] = 0;
     it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;
@@ -62,19 +82,44 @@ static int fill(ParseIterator *it)
  *
  * @param[in] sh String to read from. This is exclusive with fh.
  */
-static void parse_init(ParseIterator *it, FILE *fh, const char *sh)
+static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
 {
-    it->fh = fh;
-    it->sh = sh;
-    it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE - 1;
+    if(fh) {
+        // Stream handling. It engages YYFILL and reads by chunks.
+        /*!re2c
+        re2c:yyfill:enable = 1;
+        */
+        it->fh = fh;
+        it->sh = NULL;
+        it->buf_size = CHUNK_SIZE;
+        it->buf = malloc(it->buf_size);
+        if (!it->buf) log_error ("Error allocating lexer buffer.");
+        it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
+        it->bol = it->buf;
+        it->eof = 0;
+    } else {
+        // String handling. Uses the provided string as the buffer.
+        /*!re2c
+        re2c:yyfill:enable = 0;
+        */
+        it->fh = NULL;
+        it->sh = sh;
+        it->buf_size = strlen(sh) + 1;
+        it->buf = NULL;
+        it->cur = it->tok = (YYCTYPE*)it->sh;
+        it->lim = it->mar = it->cur + it->buf_size - 1;
+        it->bol = it->cur;
+        it->eof = 1;
+    }
     it->line = 1;
-    it->bol = it->buf;
     it->ct = 0;
-    it->eof = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
-    fill (it);
+
+    if (it->fh) fill (it);
 }
 
+/** END duplicate section */
+
 
 // Parser interface. Required here to silence linters.
 void *NTParseAlloc();
@@ -232,7 +277,7 @@ loop:
     }
 
     * {
-        LOG_DEBUG(
+        log_error (
             "Invalid token @ %lu: %s (\\x%x)",
             YYCURSOR - it->buf - 1, it->tok, *it->tok);
 
@@ -251,6 +296,8 @@ VOLK_nt_parse_term (const char *rep, VOLK_Term **term)
 
     int ttype = lex (&it, term);
 
+    free (it.buf);
+
     switch (ttype) {
         case T_IRIREF:
         case T_LITERAL:
@@ -268,6 +315,11 @@ VOLK_nt_parse_doc (
     *err_p = NULL;
     *gr_p = NULL;
 
+    if (!fh && !sh) {
+        log_error ("Neither file handle nor string input provided.");
+        return VOLK_VALUE_ERR;
+    }
+
     ParseIterator parse_it;
     parse_init (&parse_it, fh, sh);
 
@@ -332,6 +384,8 @@ finally: ;
     NTParse (parser, 0, NULL, it);
     NTParseFree (parser, free);
 
+    free (parse_it.buf);
+
     VOLK_graph_add_done (it);
     VOLK_term_free (term);
 

+ 93 - 43
src/codec/lexer_ttl.re

@@ -1,6 +1,12 @@
 #include "volksdata/codec/parser_ttl.h"
 #include "volksdata/codec/tokens_ttl.h"
+//#include "volksdata/codec/parser_common.h"
 
+/** BEGIN duplicate section
+ * This section is bit-by-bit identical in NT and TTL lexers. The copy in
+ * include/volksdata/codec/parser_common.h should be used, but some re2c tags
+ * are not being parsed in that location.
+ */
 
 /** @brief TTL is UTF-8 encoded.
  *
@@ -17,47 +23,48 @@
 
 
 typedef struct {
-    FILE          * fh;                 ///< Input file handle.
-    const char    * sh;                 ///< Input string. Exclusive with fh.
-    YYCTYPE         buf[CHUNK_SIZE],    ///< Start of buffer.
-                  * lim,                ///< Position after the last available
-                                        ///<   input character (YYLIMIT).
-                  * cur,                ///< Next input character to be read
-                                        ///<   (YYCURSOR)
-                  * mar,                ///< Most recent match (YYMARKER)
-                  * tok,                ///< Start of current token.
-                  * bol;                ///< Address of the beginning of the
-                                        ///<   current line (for debugging).
-    unsigned        line;               ///< Current line no. (for debugging).
-    unsigned        stmt;               ///< Current statement.
-    bool            eof;                ///< if we have reached EOF.
+    FILE          * fh;         ///< Input file handle.
+    const char    * sh;         ///< Input string. Exclusive with fh.
+    size_t          buf_size;   ///< Current size of the buffer, in bytes.
+    YYCTYPE       * buf,        ///< Start of buffer.
+                  * lim,        ///< Position after the last available
+                                ///<   input character (YYLIMIT).
+                  * cur,        ///< Next input character to be read (YYCURSOR)
+                  * mar,        ///< Most recent match (YYMARKER)
+                  * tok,        ///< Start of current token.
+                  * bol;        ///< Address of the beginning of the
+                                ///<   current line (for debugging).
+    unsigned        line;       ///< Current line no. (for debugging).
+    unsigned        ct;         ///< Number of statements parsed.
+    bool            eof;        ///< if we have reached EOF.
     /*!stags:re2c format = "YYCTYPE *@@;"; */
 } ParseIterator;
 
-typedef struct {
-    YYCTYPE *       data;
-    size_t          size;
-} ParserToken;
-
 
-static int fill (ParseIterator *it)
+static int fill(ParseIterator *it)
 {
     if (it->eof) {
         return 1;
     }
-    const size_t shift = it->tok - it->buf;
-    if (shift < 1) {
-        return 2;
+    size_t shift = it->tok - it->buf;
+
+    // If buffer is too small for the lexeme, double the capacity.
+    while (shift < 1) {
+        it->buf_size = 2 * it->buf_size;
+        it->buf = realloc (it->buf, it->buf_size);
+        if (!it->buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        shift = it->tok - it->buf;
     }
-    LOG_TRACE("Shifting bytes: %lu", shift);
-    memmove(it->buf, it->tok, it->lim - it->tok);
+    LOG_DEBUG("Shifting bytes: %lu", shift);
+    memmove (it->buf, it->tok, it->lim - it->tok);
     it->lim -= shift;
     it->cur -= shift;
     it->mar -= shift;
     it->tok -= shift;
-    if (it->fh) it->lim += fread (it->lim, 1, shift, it->fh);
-    // With a string handle, assume the whole input fits in CHUNK_SIZE.
-    else it->lim = memcpy (it->lim, it->sh, sizeof(it->buf));
+    it->lim += fread (it->lim, 1, shift, it->fh);
     /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
     it->lim[0] = 0;
     it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;
@@ -65,25 +72,58 @@ static int fill (ParseIterator *it)
 }
 
 
+/** @brief Initialize parser.
+ *
+ * @param[in] it iterator handle to be initialized.
+ *
+ * @param[in] fh Open file handle to read from. This is exclusive with sh. If
+ *  both fh and sh are provided, fh has precedence.
+ *
+ * @param[in] sh String to read from. This is exclusive with fh.
+ */
 static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
 {
-    it->fh = fh;
-    it->sh = sh;
-    it->cur = it->mar = it->tok = it->lim = it->buf + CHUNK_SIZE - 1;
+    if(fh) {
+        // Stream handling. It engages YYFILL and reads by chunks.
+        /*!re2c
+        re2c:yyfill:enable = 1;
+        */
+        it->fh = fh;
+        it->sh = NULL;
+        it->buf_size = CHUNK_SIZE;
+        it->buf = malloc(it->buf_size);
+        if (!it->buf) log_error ("Error allocating lexer buffer.");
+        it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
+        it->bol = it->buf;
+        it->eof = 0;
+    } else {
+        // String handling. Uses the provided string as the buffer.
+        /*!re2c
+        re2c:yyfill:enable = 0;
+        */
+        it->fh = NULL;
+        it->sh = sh;
+        it->buf_size = strlen(sh) + 1;
+        it->buf = NULL;
+        it->cur = it->tok = (YYCTYPE*)it->sh;
+        it->lim = it->mar = it->cur + it->buf_size - 1;
+        it->bol = it->cur;
+        it->eof = 1;
+    }
     it->line = 1;
-    it->stmt = 1;
-    it->bol = it->buf;
-    it->eof = 0;
+    it->ct = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
-    fill (it);
+
+    if (it->fh) fill (it);
 }
 
+/** END duplicate section */
 
-static inline void newline (ParseIterator *it) {
-    it->line ++;
-    it->bol = YYCURSOR;
-    LOG_TRACE("New line: #%u.", it->line);
-}
+
+typedef struct {
+    YYCTYPE *       data;
+    size_t          size;
+} ParserToken;
 
 
 // Parser interface. Required here to silence linters.
@@ -94,6 +134,13 @@ void TTLParseFree();
 void TTLParseTrace();
 #endif
 
+
+static inline void newline (ParseIterator *it) {
+    it->line ++;
+    it->bol = YYCURSOR;
+    LOG_TRACE("New line: #%u.", it->line);
+}
+
 // Lexer.
 
 static int lex (ParseIterator *it, YYCTYPE **token_p)
@@ -290,8 +337,8 @@ loop: // Start new token.
     WS? ',' WS? { return T_COMMA; }
 
     WS? '.' {
-        LOG_TRACE("End of statement #%u.", it->stmt);
-        it->stmt++;
+        LOG_TRACE("End of statement #%u.", it->ct);
+        it->ct++;
         return T_PERIOD;
     }
 
@@ -369,7 +416,10 @@ VOLK_ttl_parse_doc (
     *err_p = NULL;
     *gr_p = NULL;
 
-    if (!fh) return VOLK_VALUE_ERR;
+    if (!fh && !sh) {
+        log_error ("Neither file handle nor string input provided.");
+        return VOLK_VALUE_ERR;
+    }
 
     VOLK_TTLParserState *state = malloc (sizeof (*state));
     if (UNLIKELY (!state)) return VOLK_MEM_ERR;

Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 206 - 499
src/codec/parser_nt.c


Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 208 - 410
src/codec/parser_ttl.c


+ 14 - 10
src/core.c

@@ -88,27 +88,31 @@ finally:
 }
 
 
-char *strndup (const char *src, size_t max)
+char *
+strndup (const char *src, size_t max)
 {
     size_t len = strlen (src);
     if (len > max) len = max;
 
-    char *res = (char*)malloc (len + 1);
-    if (res) {
-        memcpy (res, src, len);
-        res[len] = '\0';
+    char *dup;
+    dup = malloc (len + 1);
+    if (dup) {
+        memcpy (dup, src, len);
+        dup[len] = '\0';
     }
 
-    return res;
+    return dup;
 }
 
 
-char *strdup (const char *src)
+char *
+strdup (const char *src)
 {
-   char *res = (char*)malloc (strlen (src) + 1);
-   if (res) strcpy(res, src);
+   char *dup;
+   dup = malloc (strlen (src) + 1);
+   if (dup) strcpy(dup, src);
 
-   return res;
+   return dup;
 }
 
 

Daži faili netika attēloti, jo izmaiņu fails ir pārāk liels