Selaa lähdekoodia

Split long string parsing into 2 separate automata.

Stefano Cossu 2 vuotta sitten
vanhempi
commit
4b7a4e2094
2 muutettua tiedostoa jossa 52 lisäystä ja 24 poistoa
  1. 2 1
      Makefile
  2. 50 23
      src/codec/lexer_ttl.re

+ 2 - 1
Makefile

@@ -157,7 +157,8 @@ debug_install: install debug ## Install standard and debug libraries.
 
 .PHONY: clean ## Clean up artifacts, including language parsers.
 clean:
-	rm -f src/*.[aod] ./*[aod] src/codec/*[aod]
+	rm -f src/*.[aod] ./*.[aod] src/codec/*.[aod]
+	rm -rf build/*
 	rm -f include/codec/grammar_*.h
 	rm -f src/codec/{grammar,parser}_*.c
 

+ 50 - 23
src/codec/lexer_ttl.re

@@ -85,14 +85,8 @@ void TTLParseTrace();
 
 static int lex (ParseIterator *it, YYCTYPE **token_p)
 {
-
-loop:
-
-    it->tok = it->cur;
-
-    *token_p = NULL;
-
     /*!re2c
+
     re2c:eof = 0;
     re2c:flags:8 = 1;
     re2c:flags:tags = 1;
@@ -132,16 +126,23 @@ loop:
     IRIREF          = '<' REL_IRI '>';
     NAME            = NSTART_CHAR NAME_CHAR*;
     PFX             = PSTART_CHAR NAME_CHAR* ':';
-    LSTRING         = [\x22]{3} LCHAR*? [\x22]{3};
+    //LSTRING         = [\x22]{3} LCHAR* [\x22]{3};
     STRING          = [\x22] SCHAR* [\x22];
     LANGTAG         = '@' LANGUAGE;
     DOUBLE          = ('-' | '+') ? ([0-9]+ '.' [0-9]* EXPONENT
                     | '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT);
     DECIMAL         = ('-' | '+')?
                     ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ );
-    BOOLEAN         = 'true' | 'false';
-    //RDF_TYPE        = 'a' / WS;
 
+     */
+
+loop: // Start new token.
+    it->tok = it->cur;
+
+    *token_p = NULL;
+
+cont: // Continue token parsing.
+    /*!re2c
 
     $ {
         log_trace ("End of document.");
@@ -160,6 +161,13 @@ loop:
         return T_PREFIX;
     }
 
+    "true" | "false" {
+        *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok);
+        log_trace ("Boolean: %s", *token_p);
+
+        return T_BOOLEAN;
+    }
+
     IRIREF {
         *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
         log_trace ("URI data: %s", *token_p);
@@ -186,12 +194,7 @@ loop:
         return T_PFX;
     }
 
-    LSTRING {
-        *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6);
-        log_trace ("Long string: %s", it->tok);
-
-        return T_STRING;
-    }
+    [\x22]{3} { goto lchar; }
 
     STRING {
         *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
@@ -247,13 +250,6 @@ loop:
         return T_DECIMAL;
     }
 
-    BOOLEAN {
-        *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok);
-        log_trace ("Boolean: %s", *token_p);
-
-        return T_BOOLEAN;
-    }
-
     '(' { return T_LPAREN; }
 
     ')' { return T_RPAREN; }
@@ -294,6 +290,37 @@ loop:
     }
 
     */
+
+lchar:
+    /*!re2c
+
+    * {
+        log_warn (
+            "Invalid token @ %lu: %s (\\x%x)",
+            YYCURSOR - it->buf - 1, it->tok, *it->tok);
+
+        return -1;
+    }
+
+    $ {
+        log_warn ("Unterminated long string!");
+
+        return -1;
+    }
+
+    LCHAR {
+        log_trace ("Continue long string token: %s", it->tok);
+
+        goto lchar;
+    }
+
+    [\x22]{3} {
+        *token_p = unescape_unicode (it->tok + 3, YYCURSOR - it->tok - 6);
+        log_trace ("Long string: %s", it->tok);
+
+        return T_STRING;
+    }
+     */
 }