Browse Source

Reduce number of variables in parser.

Stefano Cossu 2 years ago
parent
commit
9036a54e1d
2 changed files with 13 additions and 19 deletions
  1. +4 -2
      src/codec.c
  2. +9 -17
      src/codec/lexer_ttl.re

+ 4 - 2
src/codec.c

@@ -33,7 +33,7 @@ uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size)
             tmp_chr[esc_len] = '\0';
 
             uint32_t tmp_val = strtol ((char *) tmp_chr, NULL, 16);
-            log_debug ("tmp_val: %d", tmp_val);
+            //log_debug ("tmp_val: %d", tmp_val);
 
             // Reuse tmp_chr to hold the byte values for the code point.
             int cp_len = utf8_encode (tmp_val, tmp_chr);
@@ -44,6 +44,8 @@ uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size)
 
             // Copy bytes into destination.
             memcpy (data + len, tmp_chr, cp_len);
+#if 0
+            // This can generate a LOT of output.
             if (esc_len == 4)
                 log_trace ("UC byte value: %2x %2x", data[len], data[len + 1]);
             else
@@ -51,7 +53,7 @@ uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size)
                     "UC byte value: %2x %2x %2x %2x",
                     data[len], data[len + 1], data[len + 2], data[len + 3]
                 );
-
+#endif
             len += cp_len;
             i += esc_len;
         } else {

+ 9 - 17
src/codec/lexer_ttl.re

@@ -96,8 +96,8 @@ static int lex (ParseIterator *it, YYCTYPE **token_p)
 
 
     // Character classes.
-    EOL             = [\x0A\x0D];
-    NCWS            = [\x09\x20] | EOL;
+    EOL             = [\n\r];
+    NCWS            = [\t\x20] | EOL;
     HEX             = [\x30-\x39\x41-\x46];
     CHAR_BASE       = "\\u" HEX{4} | "\\U" HEX{8} | '\\'
                     | [\U0000005D-\U0010FFFF];
@@ -117,18 +117,10 @@ static int lex (ParseIterator *it, YYCTYPE **token_p)
     LCHAR           = ECHAR | ([\\] ["]) | [\t\n\r];
 
     // Constructs.
-    COMMENT         = '#' ( [^\x0A\x0D] )*;
+    COMMENT         = '#' ( [^\n\r] )*;
     WS              = NCWS+ | COMMENT;
     INTEGER         = ('-' | '+')? [0-9]+;
     EXPONENT        = [eE] INTEGER;
-    LANGUAGE        = [a-z]+ ('-' [a-z0-9]+)*;
-    REL_IRI         = UCHAR*;
-    IRIREF          = '<' REL_IRI '>';
-    NAME            = NSTART_CHAR NAME_CHAR*;
-    PFX             = PSTART_CHAR NAME_CHAR* ':';
-    //LSTRING         = [\x22]{3} LCHAR* [\x22]{3};
-    STRING          = [\x22] SCHAR* [\x22];
-    LANGTAG         = '@' LANGUAGE;
     DOUBLE          = ('-' | '+') ? ([0-9]+ '.' [0-9]* EXPONENT
                     | '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT);
     DECIMAL         = ('-' | '+')?
@@ -168,7 +160,7 @@ cont: // Continue token parsing.
         return T_BOOLEAN;
     }
 
-    IRIREF {
+    '<' UCHAR* '>' {
         *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
         log_trace ("URI data: %s", *token_p);
 
@@ -180,14 +172,14 @@ cont: // Continue token parsing.
         return T_RDF_TYPE;
     }
 
-    NAME {
+    NSTART_CHAR NAME_CHAR* {
         *token_p = unescape_unicode (it->tok, YYCURSOR - it->tok);
-        log_trace ("name: %s", *token_p);
+        log_trace ("ID name: %s", *token_p);
 
         return T_IDNAME;
     }
 
-    PFX {
+    PSTART_CHAR NAME_CHAR* ':' {
         *token_p = uint8_ndup (it->tok, YYCURSOR - it->tok - 1);
         log_trace ("Prefix name: '%s'", *token_p);
 
@@ -196,7 +188,7 @@ cont: // Continue token parsing.
 
     [\x22]{3} { goto lchar; }
 
-    STRING {
+    [\x22] SCHAR* [\x22] {
         *token_p = unescape_unicode (it->tok + 1, YYCURSOR - it->tok - 2);
         log_trace ("Long string: %s", *token_p);
 
@@ -208,7 +200,7 @@ cont: // Continue token parsing.
         return T_WS;
     }
 
-    LANGTAG {
+    '@' [a-z]+ ('-' [a-z0-9]+)* {
         *token_p = uint8_ndup (it->tok + 1, YYCURSOR - it->tok);
         log_trace ("Lang tag: %s", *token_p);