浏览代码

Fix issue with large token.

scossu 1 周之前
父节点
当前提交
568d6ec5c1
共有 9 个文件被更改,包括 1436 次插入750 次删除
  1. 7 4
      README.md
  2. 48 38
      include/volksdata/codec/parser_common.h
  3. 108 112
      src/codec/grammar_ttl.c
  4. 49 39
      src/codec/lexer_nt.re
  5. 49 38
      src/codec/lexer_ttl.re
  6. 520 172
      src/codec/parser_nt.c
  7. 635 340
      src/codec/parser_ttl.c
  8. 17 4
      test/test_codec_nt.c
  9. 3 3
      test/test_codec_ttl.c

+ 7 - 4
README.md

@@ -132,10 +132,13 @@ Builds and installs the library with debug symbols in `~/.local`.
 
 `VOLK_RDF_STREAM_CHUNK_SIZE`: Size of RDF decoding buffer, i.e., maximum size
 of a chunk of RDF data fed to the parser when decoding a RDF file into a graph.
-This should be larger than the maximum expected size of a single term in your
-RDF source. The default value is 8192, which is mildly conservative. If you
-experience parsing errors on decoding, and they happen to be on a term such a
-very long string literal, try recompiling the library with a larger value.
+This is set to 4 KB by default. The buffer is heap-allocated, so if an RDF term
+is larger than this size, the buffer is automatically enlarged for the duration
+of the decoding session. Systems with more memory may benefit from a larger buffer.
+
+`VOLK_TEST_LARGE`: Used with `make test`, `make memtest`, etc. to thoroughly
+test codecs on very large triple sets. Normally these tests are skipped to
+speed up development when no codec changes are involved.
 
 ## Embedding & linking
 

+ 48 - 38
include/volksdata/codec/parser_common.h

@@ -37,35 +37,7 @@ typedef struct {
 } ParseIterator;
 
 
-static int fill(ParseIterator *it)
-{
-    if (it->eof) {
-        return 1;
-    }
-    size_t shift = it->tok - it->buf;
-
-    // If buffer is too small for the lexeme, double the capacity.
-    while (shift < 1) {
-        it->buf_size = 2 * it->buf_size;
-        it->buf = realloc (it->buf, it->buf_size);
-        if (!it->buf) {
-            log_error ("Memory allocation error.");
-            return -1;
-        }
-        shift = it->tok - it->buf;
-    }
-    LOG_DEBUG("Shifting bytes: %lu", shift);
-    memmove (it->buf, it->tok, it->lim - it->tok);
-    it->lim -= shift;
-    it->cur -= shift;
-    it->mar -= shift;
-    it->tok -= shift;
-    it->lim += fread (it->lim, 1, shift, it->fh);
-    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
-    it->lim[0] = 0;
-    it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;
-    return 0;
-}
+static int fill(ParseIterator *it);
 
 
 /** @brief Initialize parser.
@@ -81,9 +53,6 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
 {
     if(fh) {
         // Stream handling. It engages YYFILL and reads by chunks.
-        /*!re2c
-        re2c:yyfill:enable = 1;
-        */
         it->fh = fh;
         it->sh = NULL;
         it->buf_size = CHUNK_SIZE;
@@ -91,12 +60,10 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         if (!it->buf) log_error ("Error allocating lexer buffer.");
         it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
         it->bol = it->buf;
-        it->eof = 0;
+        it->eof = false;
+        it->lim[0] = 0;
     } else {
         // String handling. Uses the provided string as the buffer.
-        /*!re2c
-        re2c:yyfill:enable = 0;
-        */
         it->fh = NULL;
         it->sh = sh;
         it->buf_size = strlen(sh) + 1;
@@ -104,13 +71,56 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         it->cur = it->tok = (YYCTYPE*)it->sh;
         it->lim = it->mar = it->cur + it->buf_size - 1;
         it->bol = it->cur;
-        it->eof = 1;
+        it->eof = true;
     }
     it->line = 1;
     it->ct = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
+}
 
-    if (it->fh) fill (it);
+
+int  // Refill the lexer buffer from it->fh. Returns 1 if it->eof was already set, -1 if realloc fails, 0 otherwise.
+fill(ParseIterator *it)
+{
+    log_debug ("Filling codec buffer @ %p.", it->buf);
+    if (it->eof) return 1;  // Nothing more to read.
+
+    size_t shift = it->tok - it->buf;  // Bytes in the buffer that precede the start of the current token.
+    size_t used = it->lim - it->tok;  // Bytes from the token start up to the limit; these are kept across the refill.
+
+    // If the token already starts at the buffer start (shift == 0) nothing can be discarded, so double the capacity instead.
+    if (shift < 1) {
+        YYCTYPE *old_buf = it->buf;
+        shift += it->buf_size;  // NOTE(review): from here on shift is only used by the stags directive below, which subtracts it from each tag; confirm that is the intended adjustment after a realloc.
+        it->buf_size *= 2;
+        LOG_DEBUG ("Reallocating buffer to %zu bytes.", it->buf_size);
+        it->buf = realloc (it->buf, it->buf_size);  // NOTE(review): the previous pointer is overwritten, so it is lost if realloc returns NULL.
+        if (!it->buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        // Rebase the cursor, token, limit and marker pointers onto the (possibly moved) buffer.
+        size_t reloc_off = it->buf - old_buf;  // NOTE(review): subtracts a pointer to the old allocation from the new one; confirm this is well-defined when realloc moves the block.
+        it->cur += reloc_off;
+        it->tok += reloc_off;
+        it->lim += reloc_off;
+        it->mar += reloc_off;
+    } else {
+        LOG_DEBUG("Shifting bytes: %zu", shift);
+        memmove (it->buf, it->tok, used);  // Move the bytes between tok and lim to the start of the buffer.
+        LOG_TRACE ("Limit offset before reading data: %zu", it->lim - it->tok);  // NOTE(review): the argument is a pointer difference printed with %zu; confirm the specifier.
+        it->lim -= shift;  // Slide every pointer back by the number of discarded bytes.
+        it->cur -= shift;
+        it->mar -= shift;
+        it->tok -= shift;
+    }
+    it->lim += fread (it->lim, 1, it->buf_size - used - 1, it->fh);  // Read into the free space after lim, leaving one byte for the terminator written below.
+    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
+    LOG_TRACE ("Cursor offset from last token: %zu", it->cur - it->tok);
+    LOG_TRACE ("Limit offset from last token: %zu", it->lim - it->tok);
+    it->lim[0] = 0;  // Zero-terminate the data at the limit.
+    it->eof = it->lim < it->buf + it->buf_size - 1;  // A read that did not fill the buffer marks end of input.
+    return 0;
 }
 
 

+ 108 - 112
src/codec/grammar_ttl.c

@@ -122,18 +122,18 @@ typedef union {
 #define TTLParseCTX_PARAM
 #define TTLParseCTX_FETCH
 #define TTLParseCTX_STORE
-#define YYNSTATE             31
+#define YYNSTATE             28
 #define YYNRULE              41
 #define YYNRULE_WITH_ACTION  27
 #define YYNTOKEN             25
-#define YY_MAX_SHIFT         30
-#define YY_MIN_SHIFTREDUCE   66
-#define YY_MAX_SHIFTREDUCE   106
-#define YY_ERROR_ACTION      107
-#define YY_ACCEPT_ACTION     108
-#define YY_NO_ACTION         109
-#define YY_MIN_REDUCE        110
-#define YY_MAX_REDUCE        150
+#define YY_MAX_SHIFT         27
+#define YY_MIN_SHIFTREDUCE   63
+#define YY_MAX_SHIFTREDUCE   103
+#define YY_ERROR_ACTION      104
+#define YY_ACCEPT_ACTION     105
+#define YY_NO_ACTION         106
+#define YY_MIN_REDUCE        107
+#define YY_MAX_REDUCE        147
 /************* End control #defines *******************************************/
 #define YY_NLOOKAHEAD ((int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0])))
 
@@ -200,60 +200,56 @@ typedef union {
 **  yy_default[]       Default action for each state.
 **
 *********** Begin parsing tables **********************************************/
-#define YY_ACTTAB_COUNT (124)
+#define YY_ACTTAB_COUNT (115)
 static const YYACTIONTYPE yy_action[] = {
- /*     0 */   108,    1,   68,    9,   19,   80,   81,   82,   83,   84,
- /*    10 */    92,   91,   19,   80,   81,   82,   83,   84,   92,   91,
- /*    20 */    85,   16,   17,   14,   87,   17,   11,   15,   85,   16,
- /*    30 */    15,   14,   88,   19,   80,   81,   82,   83,   84,   92,
- /*    40 */    91,   10,   21,  138,  138,  138,  138,   18,  123,   85,
- /*    50 */    16,    5,   14,  118,  119,  105,  119,   92,   91,   84,
- /*    60 */    92,   91,  105,   29,    2,   30,   99,   26,   20,   15,
- /*    70 */    85,   16,   15,   14,  134,  134,  134,  134,  134,   13,
- /*    80 */    22,  117,  117,  117,  117,  117,   23,  117,  117,  117,
- /*    90 */   117,  117,    3,  133,  133,  133,  133,  133,  116,  116,
- /*   100 */   116,  116,  116,   69,   78,    7,   92,   91,    4,    8,
- /*   110 */    86,   12,   92,   91,   76,    6,   67,   24,   66,   25,
- /*   120 */    76,   27,  137,   28,
+ /*     0 */   105,    1,   65,    9,   17,   77,   78,   79,   80,   81,
+ /*    10 */    89,   88,   17,   77,   78,   79,   80,   81,   89,   88,
+ /*    20 */    82,    7,   14,    2,   84,   14,   16,   13,   82,    7,
+ /*    30 */    13,    2,  120,   81,   89,   88,    5,   26,  102,   27,
+ /*    40 */    96,   23,   18,   13,   82,    7,   13,    2,  135,  135,
+ /*    50 */   135,  135,   15,  130,  130,  130,  130,  130,  115,  116,
+ /*    60 */     3,  116,   89,   88,    4,  131,  131,  131,  131,  131,
+ /*    70 */    11,   19,  114,  114,  114,  114,  114,   20,  114,  114,
+ /*    80 */   114,  114,  114,   66,  113,  113,  113,  113,  113,   89,
+ /*    90 */    88,  102,   89,   88,   10,   75,    8,   73,    6,   85,
+ /*   100 */    73,   64,   12,   22,   21,   63,   25,   24,  134,  106,
+ /*   110 */   106,  106,  106,  106,   83,
 };
 static const YYCODETYPE yy_lookahead[] = {
  /*     0 */    25,   26,    1,    2,    4,    5,    6,    7,    8,    9,
  /*    10 */    10,   11,    4,    5,    6,    7,    8,    9,   10,   11,
  /*    20 */    20,   21,   34,   23,   24,   37,   33,   34,   20,   21,
- /*    30 */    37,   23,   24,    4,    5,    6,    7,    8,    9,   10,
- /*    40 */    11,    2,   32,   27,   28,   29,   30,   31,   37,   20,
- /*    50 */    21,   32,   23,   37,   38,   16,   40,   10,   11,    9,
- /*    60 */    10,   11,   16,   13,   32,   15,   16,   17,   33,   34,
- /*    70 */    20,   21,   37,   23,   36,   37,   38,   39,   40,   41,
- /*    80 */    35,   36,   37,   38,   39,   40,   35,   36,   37,   38,
- /*    90 */    39,   40,   32,   36,   37,   38,   39,   40,   36,   37,
- /*   100 */    38,   39,   40,    1,   12,   32,   10,   11,   32,   32,
- /*   110 */    22,   19,   10,   11,   18,    3,    1,   11,    1,   16,
- /*   120 */    18,   11,    0,   16,   42,   42,   42,   42,   42,   42,
- /*   130 */    42,   42,   42,   42,   42,   42,   42,   42,   42,   42,
- /*   140 */    42,   42,   42,   42,   42,   42,   42,   42,   42,
+ /*    30 */    37,   23,   37,    9,   10,   11,   32,   13,   16,   15,
+ /*    40 */    16,   17,   33,   34,   20,   21,   37,   23,   27,   28,
+ /*    50 */    29,   30,   31,   36,   37,   38,   39,   40,   37,   38,
+ /*    60 */    32,   40,   10,   11,   32,   36,   37,   38,   39,   40,
+ /*    70 */    41,   35,   36,   37,   38,   39,   40,   35,   36,   37,
+ /*    80 */    38,   39,   40,    1,   36,   37,   38,   39,   40,   10,
+ /*    90 */    11,   16,   10,   11,    2,   12,   32,   18,    3,   24,
+ /*   100 */    18,    1,   19,   16,   11,    1,   16,   11,    0,   42,
+ /*   110 */    42,   42,   42,   42,   22,   42,   42,   42,   42,   42,
+ /*   120 */    42,   42,   42,   42,   42,   42,   42,   42,   42,   42,
+ /*   130 */    42,   42,   42,   42,   25,   25,   25,   25,   25,   25,
 };
-#define YY_SHIFT_COUNT    (30)
+#define YY_SHIFT_COUNT    (27)
 #define YY_SHIFT_MIN      (0)
-#define YY_SHIFT_MAX      (122)
+#define YY_SHIFT_MAX      (108)
 static const unsigned char yy_shift_ofst[] = {
- /*     0 */   124,   50,    0,   29,   29,    8,   29,   96,   96,  102,
- /*    10 */    96,   39,   47,   46,   46,   46,   46,   46,   46,   92,
- /*    20 */     1,   88,  112,  112,  115,  106,  103,  117,  110,  107,
- /*    30 */   122,
+ /*     0 */   115,   24,    0,    8,    8,    8,    8,   79,   79,   82,
+ /*    10 */    79,   75,   52,   22,   22,   22,   92,   83,    1,   95,
+ /*    20 */    95,  100,   93,   87,  104,   96,   90,  108,
 };
-#define YY_REDUCE_COUNT (18)
+#define YY_REDUCE_COUNT (15)
 #define YY_REDUCE_MIN   (-25)
-#define YY_REDUCE_MAX   (77)
+#define YY_REDUCE_MAX   (64)
 static const signed char yy_reduce_ofst[] = {
- /*     0 */   -25,   16,   38,   45,   51,   57,   62,   -7,   35,  -12,
- /*    10 */   -12,   10,   11,   19,   32,   60,   73,   76,   77,
+ /*     0 */   -25,   21,   29,   36,   42,   17,   48,   -7,    9,  -12,
+ /*    10 */   -12,    4,   -5,   28,   32,   64,
 };
 static const YYACTIONTYPE yy_default[] = {
- /*     0 */   139,  107,  107,  107,  107,  107,  107,  107,  107,  107,
- /*    10 */   107,  150,  107,  150,  150,  150,  150,  150,  150,  121,
- /*    20 */   107,  107,  114,  115,  107,  107,  107,  107,  107,  107,
- /*    30 */   107,
+ /*     0 */   136,  104,  104,  104,  104,  104,  104,  104,  104,  104,
+ /*    10 */   104,  147,  104,  147,  147,  147,  104,  118,  104,  111,
+ /*    20 */   112,  104,  104,  104,  104,  104,  104,  104,
 };
 /********** End of lemon-generated parsing tables *****************************/
 
@@ -430,9 +426,9 @@ static const char *const yyRuleName[] = {
  /*  17 */ "literal ::= BOOLEAN",
  /*  18 */ "blank ::= BNODE_ID",
  /*  19 */ "blank ::= ANON",
- /*  20 */ "blank ::= LBRACKET ows predObjList ows RBRACKET",
- /*  21 */ "blank ::= LPAREN ows RPAREN",
- /*  22 */ "collection ::= LPAREN ows itemList ows RPAREN",
+ /*  20 */ "blank ::= LBRACKET predObjList RBRACKET",
+ /*  21 */ "blank ::= LPAREN RPAREN",
+ /*  22 */ "collection ::= LPAREN itemList RPAREN",
  /*  23 */ "itemList ::= itemList ows object",
  /*  24 */ "itemList ::= object",
  /*  25 */ "resource ::= IRIREF",
@@ -604,7 +600,7 @@ static void yy_destructor(
 {
 #line 36 "grammar_ttl.y"
  (void) state; free ((yypminor->yy0)); 
-#line 632 "../../build/grammar_ttl.c"
+#line 628 "../../build/grammar_ttl.c"
 }
       break;
     case 31: /* subject */
@@ -617,14 +613,14 @@ static void yy_destructor(
 {
 #line 129 "grammar_ttl.y"
  VOLK_term_free ((yypminor->yy50)); 
-#line 645 "../../build/grammar_ttl.c"
+#line 641 "../../build/grammar_ttl.c"
 }
       break;
     case 33: /* predObjList */
 {
 #line 97 "grammar_ttl.y"
  VOLK_link_map_free ((yypminor->yy34)); 
-#line 652 "../../build/grammar_ttl.c"
+#line 648 "../../build/grammar_ttl.c"
 }
       break;
     case 35: /* objectList */
@@ -632,7 +628,7 @@ static void yy_destructor(
 {
 #line 117 "grammar_ttl.y"
  VOLK_term_set_free ((yypminor->yy22)); 
-#line 660 "../../build/grammar_ttl.c"
+#line 656 "../../build/grammar_ttl.c"
 }
       break;
 /********* End destructor definitions *****************************************/
@@ -854,7 +850,7 @@ static void yyStackOverflow(yyParser *yypParser){
 
     log_error ("Stack oveflow in TTL parsing.");
     state->rc = VOLK_MEM_ERR;
-#line 882 "../../build/grammar_ttl.c"
+#line 878 "../../build/grammar_ttl.c"
 /******** End %stack_overflow code ********************************************/
    TTLParseARG_STORE /* Suppress warning about unused %extra_argument var */
    TTLParseCTX_STORE
@@ -946,9 +942,9 @@ static const YYCODETYPE yyRuleInfoLhs[] = {
     39,  /* (17) literal ::= BOOLEAN */
     38,  /* (18) blank ::= BNODE_ID */
     38,  /* (19) blank ::= ANON */
-    38,  /* (20) blank ::= LBRACKET ows predObjList ows RBRACKET */
-    38,  /* (21) blank ::= LPAREN ows RPAREN */
-    40,  /* (22) collection ::= LPAREN ows itemList ows RPAREN */
+    38,  /* (20) blank ::= LBRACKET predObjList RBRACKET */
+    38,  /* (21) blank ::= LPAREN RPAREN */
+    40,  /* (22) collection ::= LPAREN itemList RPAREN */
     41,  /* (23) itemList ::= itemList ows object */
     41,  /* (24) itemList ::= object */
     37,  /* (25) resource ::= IRIREF */
@@ -992,9 +988,9 @@ static const signed char yyRuleInfoNRhs[] = {
    -1,  /* (17) literal ::= BOOLEAN */
    -1,  /* (18) blank ::= BNODE_ID */
    -1,  /* (19) blank ::= ANON */
-   -5,  /* (20) blank ::= LBRACKET ows predObjList ows RBRACKET */
-   -3,  /* (21) blank ::= LPAREN ows RPAREN */
-   -5,  /* (22) collection ::= LPAREN ows itemList ows RPAREN */
+   -3,  /* (20) blank ::= LBRACKET predObjList RBRACKET */
+   -2,  /* (21) blank ::= LPAREN RPAREN */
+   -3,  /* (22) collection ::= LPAREN itemList RPAREN */
    -3,  /* (23) itemList ::= itemList ows object */
    -1,  /* (24) itemList ::= object */
    -1,  /* (25) resource ::= IRIREF */
@@ -1062,7 +1058,7 @@ static YYACTIONTYPE yy_reduce(
                 free (yymsp[-3].minor.yy0);
                 free (yymsp[-1].minor.yy0);
             }
-#line 1090 "../../build/grammar_ttl.c"
+#line 1086 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,16,&yymsp[-2].minor);
   yy_destructor(yypParser,1,&yymsp[0].minor);
         break;
@@ -1075,7 +1071,7 @@ static YYACTIONTYPE yy_reduce(
 
                 free (yymsp[-1].minor.yy0);
             }
-#line 1103 "../../build/grammar_ttl.c"
+#line 1099 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,16,&yymsp[-2].minor);
   yy_destructor(yypParser,1,&yymsp[0].minor);
 }
@@ -1091,7 +1087,7 @@ static YYACTIONTYPE yy_reduce(
                 VOLK_term_free (yymsp[-3].minor.yy50);
                 VOLK_link_map_free (yymsp[-1].minor.yy34);
             }
-#line 1119 "../../build/grammar_ttl.c"
+#line 1115 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,1,&yymsp[0].minor);
         break;
       case 3: /* triples ::= subject ows predObjList SEMICOLON PERIOD */
@@ -1105,7 +1101,7 @@ static YYACTIONTYPE yy_reduce(
                 VOLK_term_free (yymsp[-4].minor.yy50);
                 VOLK_link_map_free (yymsp[-2].minor.yy34);
             }
-#line 1133 "../../build/grammar_ttl.c"
+#line 1129 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,2,&yymsp[-1].minor);
   yy_destructor(yypParser,1,&yymsp[0].minor);
         break;
@@ -1124,7 +1120,7 @@ static YYACTIONTYPE yy_reduce(
                 VOLK_link_map_add (yylhsminor.yy34, yymsp[-2].minor.yy50, yymsp[0].minor.yy22);
                 if (s != state->lms) VOLK_term_free (s);
             }
-#line 1152 "../../build/grammar_ttl.c"
+#line 1148 "../../build/grammar_ttl.c"
   yymsp[-2].minor.yy34 = yylhsminor.yy34;
         break;
       case 5: /* predObjList ::= predObjList SEMICOLON predicate ows objectList */
@@ -1133,7 +1129,7 @@ static YYACTIONTYPE yy_reduce(
                 VOLK_link_map_add (yymsp[-4].minor.yy34, yymsp[-2].minor.yy50, yymsp[0].minor.yy22);
                 yylhsminor.yy34 = yymsp[-4].minor.yy34;
             }
-#line 1161 "../../build/grammar_ttl.c"
+#line 1157 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,2,&yymsp[-3].minor);
   yymsp[-4].minor.yy34 = yylhsminor.yy34;
         break;
@@ -1144,7 +1140,7 @@ static YYACTIONTYPE yy_reduce(
                     VOLK_term_free (yymsp[0].minor.yy50);
                 yylhsminor.yy22 = yymsp[-2].minor.yy22;
             }
-#line 1172 "../../build/grammar_ttl.c"
+#line 1168 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,3,&yymsp[-1].minor);
   yymsp[-2].minor.yy22 = yylhsminor.yy22;
         break;
@@ -1154,20 +1150,20 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy22 = VOLK_term_set_new();
                 VOLK_term_set_add (yylhsminor.yy22, yymsp[0].minor.yy50, NULL);
             }
-#line 1182 "../../build/grammar_ttl.c"
+#line 1178 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy22 = yylhsminor.yy22;
         break;
       case 8: /* subject ::= resource */
       case 9: /* subject ::= blank */ yytestcase(yyruleno==9);
 #line 130 "grammar_ttl.y"
 { state->lms = yymsp[0].minor.yy50; }
-#line 1189 "../../build/grammar_ttl.c"
+#line 1185 "../../build/grammar_ttl.c"
         break;
       case 10: /* predicate ::= RDF_TYPE */
 {  yy_destructor(yypParser,18,&yymsp[0].minor);
 #line 136 "grammar_ttl.y"
 { yymsp[0].minor.yy50 = VOLK_iriref_new_ns ("rdf:type"); }
-#line 1195 "../../build/grammar_ttl.c"
+#line 1191 "../../build/grammar_ttl.c"
 }
         break;
       case 11: /* literal ::= STRING */
@@ -1177,7 +1173,7 @@ static YYACTIONTYPE yy_reduce(
                 LOG_TRACE("Created plain literal: \"%s\"", yylhsminor.yy50->data);
                 free (yymsp[0].minor.yy0);
             }
-#line 1205 "../../build/grammar_ttl.c"
+#line 1201 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 12: /* literal ::= STRING LANGTAG */
@@ -1188,7 +1184,7 @@ static YYACTIONTYPE yy_reduce(
                 free (yymsp[-1].minor.yy0);
                 free (yymsp[0].minor.yy0);
             }
-#line 1216 "../../build/grammar_ttl.c"
+#line 1212 "../../build/grammar_ttl.c"
   yymsp[-1].minor.yy50 = yylhsminor.yy50;
         break;
       case 13: /* literal ::= STRING DTYPE_MARKER resource */
@@ -1200,7 +1196,7 @@ static YYACTIONTYPE yy_reduce(
                         yylhsminor.yy50->data, yylhsminor.yy50->datatype);
                 free (yymsp[-2].minor.yy0);
             }
-#line 1228 "../../build/grammar_ttl.c"
+#line 1224 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,19,&yymsp[-1].minor);
   yymsp[-2].minor.yy50 = yylhsminor.yy50;
         break;
@@ -1210,7 +1206,7 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy50 = VOLK_literal_new (yymsp[0].minor.yy0, VOLK_iriref_new_ns ("xsd:integer"));
                 free (yymsp[0].minor.yy0);
             }
-#line 1238 "../../build/grammar_ttl.c"
+#line 1234 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 15: /* literal ::= DOUBLE */
@@ -1219,7 +1215,7 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy50 = VOLK_literal_new (yymsp[0].minor.yy0, VOLK_iriref_new_ns ("xsd:double"));
                 free (yymsp[0].minor.yy0);
             }
-#line 1247 "../../build/grammar_ttl.c"
+#line 1243 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 16: /* literal ::= DECIMAL */
@@ -1228,7 +1224,7 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy50 = VOLK_literal_new (yymsp[0].minor.yy0, VOLK_iriref_new_ns ("xsd:decimal"));
                 free (yymsp[0].minor.yy0);
             }
-#line 1256 "../../build/grammar_ttl.c"
+#line 1252 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 17: /* literal ::= BOOLEAN */
@@ -1237,7 +1233,7 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy50 = VOLK_literal_new (yymsp[0].minor.yy0, VOLK_iriref_new_ns ("xsd:boolean"));
                 free (yymsp[0].minor.yy0);
             }
-#line 1265 "../../build/grammar_ttl.c"
+#line 1261 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 18: /* blank ::= BNODE_ID */
@@ -1247,7 +1243,7 @@ static YYACTIONTYPE yy_reduce(
                 LOG_TRACE("Created blank node: _:%s", yylhsminor.yy50->data);
                 free (yymsp[0].minor.yy0);
             }
-#line 1275 "../../build/grammar_ttl.c"
+#line 1271 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 19: /* blank ::= ANON */
@@ -1258,44 +1254,44 @@ static YYACTIONTYPE yy_reduce(
                 yymsp[0].minor.yy50 = VOLK_bnode_new (NULL);
                 LOG_TRACE("Created empty list BN: _:%s", yymsp[0].minor.yy50->data);
             }
-#line 1286 "../../build/grammar_ttl.c"
+#line 1282 "../../build/grammar_ttl.c"
 }
         break;
-      case 20: /* blank ::= LBRACKET ows predObjList ows RBRACKET */
-{  yy_destructor(yypParser,21,&yymsp[-4].minor);
+      case 20: /* blank ::= LBRACKET predObjList RBRACKET */
+{  yy_destructor(yypParser,21,&yymsp[-2].minor);
 #line 193 "grammar_ttl.y"
 {
                 LOG_TRACE ("Found BNode with data.");
-                yymsp[-4].minor.yy50 = VOLK_bnode_new (NULL);
-                state->lms = yymsp[-4].minor.yy50;
-                state->ct += VOLK_graph_add_link_map (state->it, yymsp[-2].minor.yy34);
-                LOG_TRACE("Created list BN: _:%s", yymsp[-4].minor.yy50->data);
+                yymsp[-2].minor.yy50 = VOLK_bnode_new (NULL);
+                state->lms = yymsp[-2].minor.yy50;
+                state->ct += VOLK_graph_add_link_map (state->it, yymsp[-1].minor.yy34);
+                LOG_TRACE("Created list BN: _:%s", yymsp[-2].minor.yy50->data);
 
-                VOLK_link_map_free (yymsp[-2].minor.yy34);
+                VOLK_link_map_free (yymsp[-1].minor.yy34);
             }
-#line 1301 "../../build/grammar_ttl.c"
+#line 1297 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,22,&yymsp[0].minor);
 }
         break;
-      case 21: /* blank ::= LPAREN ows RPAREN */
-{  yy_destructor(yypParser,23,&yymsp[-2].minor);
+      case 21: /* blank ::= LPAREN RPAREN */
+{  yy_destructor(yypParser,23,&yymsp[-1].minor);
 #line 203 "grammar_ttl.y"
 {
-                yymsp[-2].minor.yy50 = VOLK_iriref_new_ns ("rdf:nil");
-                LOG_TRACE("Created list terminator: %s", yymsp[-2].minor.yy50->data);
+                yymsp[-1].minor.yy50 = VOLK_iriref_new_ns ("rdf:nil");
+                LOG_TRACE("Created list terminator: %s", yymsp[-1].minor.yy50->data);
             }
-#line 1312 "../../build/grammar_ttl.c"
+#line 1308 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,24,&yymsp[0].minor);
 }
         break;
-      case 22: /* collection ::= LPAREN ows itemList ows RPAREN */
-{  yy_destructor(yypParser,23,&yymsp[-4].minor);
+      case 22: /* collection ::= LPAREN itemList RPAREN */
+{  yy_destructor(yypParser,23,&yymsp[-2].minor);
 #line 212 "grammar_ttl.y"
 {
-                yymsp[-4].minor.yy50 = VOLK_bnode_add_collection (state->it, yymsp[-2].minor.yy22);
-                VOLK_term_set_free (yymsp[-2].minor.yy22);
+                yymsp[-2].minor.yy50 = VOLK_bnode_add_collection (state->it, yymsp[-1].minor.yy22);
+                VOLK_term_set_free (yymsp[-1].minor.yy22);
             }
-#line 1323 "../../build/grammar_ttl.c"
+#line 1319 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,24,&yymsp[0].minor);
 }
         break;
@@ -1306,7 +1302,7 @@ static YYACTIONTYPE yy_reduce(
                     VOLK_term_free (yymsp[0].minor.yy50);
                 yylhsminor.yy22 = yymsp[-2].minor.yy22;
             }
-#line 1334 "../../build/grammar_ttl.c"
+#line 1330 "../../build/grammar_ttl.c"
   yymsp[-2].minor.yy22 = yylhsminor.yy22;
         break;
       case 24: /* itemList ::= object */
@@ -1315,7 +1311,7 @@ static YYACTIONTYPE yy_reduce(
                 yylhsminor.yy22 = VOLK_term_set_new ();
                 VOLK_term_set_add (yylhsminor.yy22, yymsp[0].minor.yy50, NULL);
             }
-#line 1343 "../../build/grammar_ttl.c"
+#line 1339 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy22 = yylhsminor.yy22;
         break;
       case 25: /* resource ::= IRIREF */
@@ -1331,7 +1327,7 @@ static YYACTIONTYPE yy_reduce(
                 }
                 LOG_TRACE("Created IRI: <%s>", yylhsminor.yy50->data);
             }
-#line 1359 "../../build/grammar_ttl.c"
+#line 1355 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 26: /* resource ::= QNAME */
@@ -1341,14 +1337,14 @@ static YYACTIONTYPE yy_reduce(
                 LOG_TRACE("Created IRI: %s", yylhsminor.yy50->data);
                 free (yymsp[0].minor.yy0);
             }
-#line 1369 "../../build/grammar_ttl.c"
+#line 1365 "../../build/grammar_ttl.c"
   yymsp[0].minor.yy50 = yylhsminor.yy50;
         break;
       case 27: /* turtleDoc ::= statements EOF */
 #line 54 "grammar_ttl.y"
 {
 }
-#line 1376 "../../build/grammar_ttl.c"
+#line 1372 "../../build/grammar_ttl.c"
   yy_destructor(yypParser,15,&yymsp[0].minor);
         break;
       case 33: /* statement ::= WS */
@@ -1357,7 +1353,7 @@ static YYACTIONTYPE yy_reduce(
 #line 61 "grammar_ttl.y"
 {
 }
-#line 1385 "../../build/grammar_ttl.c"
+#line 1381 "../../build/grammar_ttl.c"
 }
         break;
       case 34: /* predicate ::= resource */
@@ -1366,7 +1362,7 @@ static YYACTIONTYPE yy_reduce(
 #line 135 "grammar_ttl.y"
 {
 }
-#line 1394 "../../build/grammar_ttl.c"
+#line 1390 "../../build/grammar_ttl.c"
 }
         break;
       case 36: /* object ::= blank */
@@ -1374,7 +1370,7 @@ static YYACTIONTYPE yy_reduce(
 #line 141 "grammar_ttl.y"
 {
 }
-#line 1402 "../../build/grammar_ttl.c"
+#line 1398 "../../build/grammar_ttl.c"
 }
         break;
       case 37: /* object ::= literal */
@@ -1382,7 +1378,7 @@ static YYACTIONTYPE yy_reduce(
 #line 142 "grammar_ttl.y"
 {
 }
-#line 1410 "../../build/grammar_ttl.c"
+#line 1406 "../../build/grammar_ttl.c"
 }
         break;
       case 38: /* blank ::= collection */
@@ -1390,7 +1386,7 @@ static YYACTIONTYPE yy_reduce(
 #line 202 "grammar_ttl.y"
 {
 }
-#line 1418 "../../build/grammar_ttl.c"
+#line 1414 "../../build/grammar_ttl.c"
 }
         break;
       default:
@@ -1445,7 +1441,7 @@ static void yy_parse_failed(
 
     log_error ("TTL parse error. Cannot continue.");
     state->rc = VOLK_PARSE_ERR;
-#line 1473 "../../build/grammar_ttl.c"
+#line 1469 "../../build/grammar_ttl.c"
 /************ End %parse_failure code *****************************************/
   TTLParseARG_STORE /* Suppress warning about unused %extra_argument variable */
   TTLParseCTX_STORE
@@ -1468,7 +1464,7 @@ static void yy_syntax_error(
 
     // Fail immediately on first error.
     yy_parse_failed (yypParser);
-#line 1496 "../../build/grammar_ttl.c"
+#line 1492 "../../build/grammar_ttl.c"
 /************ End %syntax_error code ******************************************/
   TTLParseARG_STORE /* Suppress warning about unused %extra_argument variable */
   TTLParseCTX_STORE

+ 49 - 39
src/codec/lexer_nt.re

@@ -42,35 +42,7 @@ typedef struct {
 } ParseIterator;
 
 
-static int fill(ParseIterator *it)
-{
-    if (it->eof) {
-        return 1;
-    }
-    size_t shift = it->tok - it->buf;
-
-    // If buffer is too small for the lexeme, double the capacity.
-    while (shift < 1) {
-        it->buf_size = 2 * it->buf_size;
-        it->buf = realloc (it->buf, it->buf_size);
-        if (!it->buf) {
-            log_error ("Memory allocation error.");
-            return -1;
-        }
-        shift = it->tok - it->buf;
-    }
-    LOG_DEBUG("Shifting bytes: %lu", shift);
-    memmove (it->buf, it->tok, it->lim - it->tok);
-    it->lim -= shift;
-    it->cur -= shift;
-    it->mar -= shift;
-    it->tok -= shift;
-    it->lim += fread (it->lim, 1, shift, it->fh);
-    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
-    it->lim[0] = 0;
-    it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;
-    return 0;
-}
+static int fill(ParseIterator *it);
 
 
 /** @brief Initialize parser.
@@ -86,9 +58,6 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
 {
     if(fh) {
         // Stream handling. It engages YYFILL and reads by chunks.
-        /*!re2c
-        re2c:yyfill:enable = 1;
-        */
         it->fh = fh;
         it->sh = NULL;
         it->buf_size = CHUNK_SIZE;
@@ -96,12 +65,10 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         if (!it->buf) log_error ("Error allocating lexer buffer.");
         it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
         it->bol = it->buf;
-        it->eof = 0;
+        it->eof = false;
+        it->lim[0] = 0;
     } else {
         // String handling. Uses the provided string as the buffer.
-        /*!re2c
-        re2c:yyfill:enable = 0;
-        */
         it->fh = NULL;
         it->sh = sh;
         it->buf_size = strlen(sh) + 1;
@@ -109,13 +76,56 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         it->cur = it->tok = (YYCTYPE*)it->sh;
         it->lim = it->mar = it->cur + it->buf_size - 1;
         it->bol = it->cur;
-        it->eof = 1;
+        it->eof = true;
     }
     it->line = 1;
     it->ct = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
+}
 
-    if (it->fh) fill (it);
+
+int  // Refill the lexer buffer from it->fh. Returns 1 if it->eof was already set, -1 if realloc fails, 0 otherwise.
+fill(ParseIterator *it)
+{
+    log_debug ("Filling codec buffer @ %p.", it->buf);
+    if (it->eof) return 1;  // Nothing more to read.
+
+    size_t shift = it->tok - it->buf;  // Bytes in the buffer that precede the start of the current token.
+    size_t used = it->lim - it->tok;  // Bytes from the token start up to the limit; these are kept across the refill.
+
+    // If the token already starts at the buffer start (shift == 0) nothing can be discarded, so double the capacity instead.
+    if (shift < 1) {
+        YYCTYPE *old_buf = it->buf;
+        shift += it->buf_size;  // NOTE(review): from here on shift is only used by the stags directive below, which subtracts it from each tag; confirm that is the intended adjustment after a realloc.
+        it->buf_size *= 2;
+        LOG_DEBUG ("Reallocating buffer to %zu bytes.", it->buf_size);
+        it->buf = realloc (it->buf, it->buf_size);  // NOTE(review): the previous pointer is overwritten, so it is lost if realloc returns NULL.
+        if (!it->buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        // Rebase the cursor, token, limit and marker pointers onto the (possibly moved) buffer.
+        size_t reloc_off = it->buf - old_buf;  // NOTE(review): subtracts a pointer to the old allocation from the new one; confirm this is well-defined when realloc moves the block.
+        it->cur += reloc_off;
+        it->tok += reloc_off;
+        it->lim += reloc_off;
+        it->mar += reloc_off;
+    } else {
+        LOG_DEBUG("Shifting bytes: %zu", shift);
+        memmove (it->buf, it->tok, used);  // Move the bytes between tok and lim to the start of the buffer.
+        LOG_TRACE ("Limit offset before reading data: %zu", it->lim - it->tok);  // NOTE(review): the argument is a pointer difference printed with %zu; confirm the specifier.
+        it->lim -= shift;  // Slide every pointer back by the number of discarded bytes.
+        it->cur -= shift;
+        it->mar -= shift;
+        it->tok -= shift;
+    }
+    it->lim += fread (it->lim, 1, it->buf_size - used - 1, it->fh);  // Read into the free space after lim, leaving one byte for the terminator written below.
+    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
+    LOG_TRACE ("Cursor offset from last token: %zu", it->cur - it->tok);
+    LOG_TRACE ("Limit offset from last token: %zu", it->lim - it->tok);
+    it->lim[0] = 0;  // Zero-terminate the data at the limit.
+    it->eof = it->lim < it->buf + it->buf_size - 1;  // A read that did not fill the buffer marks end of input.
+    return 0;
 }
 
 /** END duplicate section */
@@ -278,7 +288,7 @@ loop:
 
     * {
         log_error (
-            "Invalid token @ %lu: %s (\\x%x)",
+            // The offset is a pointer difference (ptrdiff_t): use %td.
+            "Invalid token @ %td: %s (\\x%x)",
             YYCURSOR - it->buf - 1, it->tok, *it->tok);
 
         return -1;

+ 49 - 38
src/codec/lexer_ttl.re

@@ -41,35 +41,7 @@ typedef struct {
 } ParseIterator;
 
 
-static int fill(ParseIterator *it)
-{
-    if (it->eof) {
-        return 1;
-    }
-    size_t shift = it->tok - it->buf;
-
-    // If buffer is too small for the lexeme, double the capacity.
-    while (shift < 1) {
-        it->buf_size = 2 * it->buf_size;
-        it->buf = realloc (it->buf, it->buf_size);
-        if (!it->buf) {
-            log_error ("Memory allocation error.");
-            return -1;
-        }
-        shift = it->tok - it->buf;
-    }
-    LOG_DEBUG("Shifting bytes: %lu", shift);
-    memmove (it->buf, it->tok, it->lim - it->tok);
-    it->lim -= shift;
-    it->cur -= shift;
-    it->mar -= shift;
-    it->tok -= shift;
-    it->lim += fread (it->lim, 1, shift, it->fh);
-    /*!stags:re2c format = "if (it->@@) it->@@ -= shift; "; */
-    it->lim[0] = 0;
-    it->eof |= it->lim < it->buf + CHUNK_SIZE - 1;
-    return 0;
-}
+static int fill(ParseIterator *it);
 
 
 /** @brief Initialize parser.
@@ -85,9 +57,6 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
 {
     if(fh) {
         // Stream handling. It engages YYFILL and reads by chunks.
-        /*!re2c
-        re2c:yyfill:enable = 1;
-        */
         it->fh = fh;
         it->sh = NULL;
         it->buf_size = CHUNK_SIZE;
@@ -95,12 +64,10 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         if (!it->buf) log_error ("Error allocating lexer buffer.");
         it->cur = it->mar = it->tok = it->lim = it->buf + it->buf_size - 1;
         it->bol = it->buf;
-        it->eof = 0;
+        it->eof = false;
+        it->lim[0] = 0;
     } else {
         // String handling. Uses the provided string as the buffer.
-        /*!re2c
-        re2c:yyfill:enable = 0;
-        */
         it->fh = NULL;
         it->sh = sh;
         it->buf_size = strlen(sh) + 1;
@@ -108,13 +75,56 @@ static void parse_init (ParseIterator *it, FILE *fh, const char *sh)
         it->cur = it->tok = (YYCTYPE*)it->sh;
         it->lim = it->mar = it->cur + it->buf_size - 1;
         it->bol = it->cur;
-        it->eof = 1;
+        it->eof = true;
     }
     it->line = 1;
     it->ct = 0;
     /*!stags:re2c format = "it->@@ = NULL; "; */
+}
 
-    if (it->fh) fill (it);
+
+/** @brief Refill the lexer buffer from the file stream.
+ *
+ * Bytes preceding the current token are discarded by moving the live region
+ * [tok, lim) to the start of the buffer. If the token already begins at the
+ * start of the buffer (i.e. the lexeme does not fit), the buffer capacity is
+ * doubled instead. The free space is then filled from `it->fh` and a NUL
+ * sentinel is written at `it->lim`.
+ *
+ * On allocation failure the iterator is left untouched, so the previous
+ * buffer remains valid and owned by the iterator.
+ *
+ * @param[in,out] it Parse iterator owning the buffer and its cursors.
+ *
+ * @return 0 on success; 1 if the stream was already exhausted; -1 on
+ *  allocation failure.
+ */
+int
+fill(ParseIterator *it)
+{
+    log_debug ("Filling codec buffer @ %p.", (void *)it->buf);
+    if (it->eof) return 1;
+
+    YYCTYPE *old_buf = it->buf, *old_tok = it->tok, *new_buf = it->buf;
+    size_t shift = old_tok - old_buf;
+    size_t used = it->lim - old_tok;
+
+    if (shift < 1) {
+        // The lexeme occupies the whole buffer: double the capacity. A fresh
+        // block is allocated (rather than realloc'ing in place) so that the
+        // old block stays alive while every pointer is rebased below.
+        size_t new_size = it->buf_size * 2;
+        LOG_DEBUG ("Reallocating buffer to %zu bytes.", new_size);
+        new_buf = malloc (new_size);
+        if (!new_buf) {
+            log_error ("Memory allocation error.");
+            return -1;
+        }
+        memcpy (new_buf, old_tok, used);
+        it->buf_size = new_size;
+    } else {
+        LOG_DEBUG("Shifting bytes: %zu", shift);
+        memmove (old_buf, old_tok, used);
+    }
+    LOG_TRACE ("Limit offset before reading data: %zu", used);
+
+    // Rebase all cursors relative to the token start, which now sits at the
+    // beginning of the (possibly new) buffer. This covers both branches.
+    it->cur = new_buf + (it->cur - old_tok);
+    it->mar = new_buf + (it->mar - old_tok);
+    it->lim = new_buf + used;
+    it->tok = new_buf;
+    /*!stags:re2c format = "if (it->@@) it->@@ = new_buf + (it->@@ - old_tok); "; */
+    if (new_buf != old_buf) {
+        // Keep the line-start pointer inside the live allocation.
+        it->bol = new_buf + (it->bol - old_buf);
+        free (old_buf);
+        it->buf = new_buf;
+    }
+
+    // One byte is always reserved for the NUL sentinel.
+    it->lim += fread (it->lim, 1, it->buf_size - used - 1, it->fh);
+    if (ferror (it->fh)) log_error ("Error reading from RDF stream.");
+    LOG_TRACE ("Cursor offset from last token: %td", it->cur - it->tok);
+    LOG_TRACE ("Limit offset from last token: %td", it->lim - it->tok);
+    it->lim[0] = 0;
+    // A short read means that the stream is exhausted.
+    it->eof = it->lim < it->buf + it->buf_size - 1;
+    return 0;
 }
 
 /** END duplicate section */
@@ -149,6 +159,7 @@ static int lex (ParseIterator *it, YYCTYPE **token_p)
 
     /*!re2c
 
+    re2c:yyfill:enable = 1;
     re2c:eof = 0;
     re2c:flags:8 = 1;
     re2c:flags:tags = 1;

文件差异内容过多而无法显示
+ 520 - 172
src/codec/parser_nt.c


文件差异内容过多而无法显示
+ 635 - 340
src/codec/parser_ttl.c


+ 17 - 4
test/test_codec_nt.c

@@ -269,7 +269,7 @@ test_decode_nt_file()
 }
 
 
-#define LARGE_LIT_SIZE CHUNK_SIZE * 2 + 1  // More than 2  buffer pages.
+// Parenthesized so that the expansion is safe inside larger expressions.
+#define LARGE_LIT_SIZE (CHUNK_SIZE * 2 + 2)  // More than 2 buffer pages.
 int
 test_decode_large_lit_file()
 {
@@ -279,9 +279,13 @@ test_decode_large_lit_file()
     const char *fpath = "/tmp/test_large_lit.nt";
     FILE *fh = fopen (fpath, "w");
 
-    fprintf (fh, "<urn:s:1> <urn:p:1> \"");
+    // Build the literal in memory so it can be compared after decoding.
+    char *large_lit = malloc(LARGE_LIT_SIZE + 1);
     for (unsigned i = 0; i < LARGE_LIT_SIZE; i++)
-        fputc (rand() % 25 + 65, fh);  // A-Z
+        large_lit[i] = rand() % 26 + 'A';  // A-Z
+    large_lit[LARGE_LIT_SIZE] = '\0';
+
+    fprintf (fh, "<urn:s:1> <urn:p:1> \"");
+    // Write the payload verbatim; never pass data as a format string.
+    fputs (large_lit, fh);
     fprintf(fh, "\" .\n");
     fclose(fh);
 
@@ -291,9 +295,18 @@ test_decode_large_lit_file()
     EXPECT_INT_EQ (VOLK_graph_size (gr), 1);
     EXPECT_INT_EQ (ct, 1);
 
+    VOLK_GraphIterator *it = VOLK_graph_lookup (gr, NULL, NULL, NULL, NULL);
+    VOLK_Triple *spo;
+    EXPECT_PASS (VOLK_graph_iter_next (it, &spo));
+    VOLK_graph_iter_free (it);
+
+    EXPECT_STR_EQ (spo->o->data, large_lit);
+
     VOLK_graph_free (gr);
+    VOLK_triple_free (spo);
     fclose(fh);
     unlink (fpath);
+    free (large_lit);
 
     return 0;
 }
@@ -334,7 +347,7 @@ int codec_nt_tests()
     RUN (test_decode_nt_term);
     RUN (test_decode_nt_graph);
     RUN (test_decode_nt_file);
-    //RUN (test_decode_large_lit_file);  // FIXME large literals still not working.
+    RUN (test_decode_large_lit_file);
     RUN (test_decode_nt_bad_graph);
 
     free_terms (terms);

+ 3 - 3
test/test_codec_ttl.c

@@ -59,9 +59,9 @@ test_w3c_pos()
     char ch;
 
     for (int i = 0; i <= W3C_POS_TEST_CT; i++) {
-#if 1
-        // Tests 14÷16 with 10K triples is quite long. Skip them temporarily.
-        // TODO use a switch based on env var.
+#ifndef VOLK_TEST_LARGE
+        // Tests 14÷16 with 10K triples is quite long. Skip them unless
+        // explicitly requested.
         if (i > 12 && i <17) continue;
 #endif
         size_t nt_ct = 0;

部分文件因为文件数量过多而无法显示