Browse Source

WIP regex overhaul.

scossu 2 weeks ago
parent
commit
cb7d76cf3a
3 changed files with 130 additions and 6 deletions
  1. 2 2
      Makefile
  2. 1 1
      src/core.c
  3. 127 3
      src/term.c

+ 2 - 2
Makefile

@@ -210,7 +210,7 @@ memcheck:
 memtest: bin/test memcheck ## Run a test suite using Valgrind. Output to separate file.
 
 
-# Performance test application. Essentially the profiling code without debug.
+# Profiling application.
 bin/profile: debug profile.c
 	$(CC) $(CFLAGS) -g -DTESTING $(LDFLAGS) -llsuprdf_dbg \
 		profile.c -o bin/profile
@@ -229,7 +229,7 @@ perftest: bin/perftest ## Run a performance test by creating, inserting and look
 .PHONY: profile
 profile: bin/profile ## Run a profiling session. Output can be inspected with KCachegrind.
 	LSUP_MDB_MAPSIZE=800000 valgrind --tool=callgrind \
-		--callgrind-out-file="$(CALLGRIND_DUMP)" bin/perftest 1000
+		--callgrind-out-file="$(CALLGRIND_DUMP)" bin/profile 1000
 	@echo "Profile dump written at $(CALLGRIND_DUMP). Open it with "\
 		"qcachegrind, kcachegrind, etc."
 

+ 1 - 1
src/core.c

@@ -1,4 +1,4 @@
-//#define _XOPEN_SOURCE 500
+#define _XOPEN_SOURCE 500
 #include <errno.h>
 #include <ftw.h>
 #include <string.h>

+ 127 - 3
src/term.c

@@ -16,12 +16,15 @@
  * Data structures.
  */
 
+/// Sub-string coordinates for IRI matches.
+typedef size_t match_coord_t[2];
+
 /// Matching sub-patterns for IRI parts.
 struct iri_info_t {
     LSUP_NSMap *        nsm;        ///< NSM handle for prefixed IRI.
-    regmatch_t          prefix;     ///< Matching group #1.
-    regmatch_t          path;       ///< Matching group #5.
-    regmatch_t          frag;       ///< Matching group #10.
+    match_coord_t       prefix;     ///< Matching group #1.
+    match_coord_t       path;       ///< Matching group #5.
+    match_coord_t       frag;       ///< Matching group #9.
 };
 
 
@@ -881,6 +884,127 @@ term_init (
 }
 
 
+/**
+ * @brief Scan an IRI string and parse IRI parts.
+ *
+ * Experimental replacement of a regex engine for better performance.
+ *
+ * Adapted from https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
+ * with two changes: an extra outer group (#1) wraps scheme and authority,
+ * and the scheme group (#2) includes the trailing colon.
+ *
+ * Reference regex and group numbering:
+ * ^(([^:/?#]+:)?(//([^/?#]*))?)([^?#]*)(\?([^#]*))?(#(.*))?
+ *  12           3  4           5       6  7        8 9
+ *
+ * Capturing groups, using http://example.org/123/456/?query=blah#frag as
+ * an example:
+ *
+ * #0: Full parsed URI (not recorded)
+ * #1: Domain prefix (http://example.org)
+ * #2: Protocol (http:)
+ * #3: Authority with leading slashes (//example.org)
+ * #4: Authority (example.org)
+ * #5: Path, excluding query and fragment (/123/456/)
+ * #6: Query with leading '?' (not recorded)
+ * #7: Query (query=blah)
+ * #8: Fragment with leading '#' (not recorded)
+ * #9: Fragment (frag)
+ *
+ * Every group is optional or may match the empty string, so any
+ * NUL-terminated input is accepted. A group that did not take part in the
+ * match is reported with length 0.
+ *
+ * Example: for "urn:isbn:123", #1 and #2 are both "urn:", #3 and #4 are
+ * absent, and #5 is "isbn:123".
+ *
+ * @param[in] iri_str NUL-terminated IRI string to scan. Not modified.
+ *
+ * @param[out] iri_info If not NULL, its `prefix`, `path` and `frag` members
+ *  receive the {offset, length} pairs of groups #1, #5 and #9 respectively;
+ *  offsets are relative to `iri_str`. Other members are left untouched.
+ *
+ * @return LSUP_OK in all cases.
+ */
+static LSUP_rc
+parse_iri (char *iri_str, LSUP_IRIInfo *iri_info) {
+    char *cur = iri_str;    // Scan cursor.
+    size_t len;             // Length of the run just scanned.
+
+    // coord[n] holds {offset from iri_str, length} for capture group #n.
+    // Zero-initialized, so groups that are never set read as {0, 0}.
+    size_t coord[10][2] = {{0}};
+
+    // #2: ([^:/?#]+:)?
+    // Only consume the run if it is non-empty and closed by a colon;
+    // otherwise there is no scheme and the run belongs to the path (#5).
+    len = strcspn (cur, ":/?#");
+    if (len > 0 && cur[len] == ':') {
+        coord[2][1] = len + 1;
+        cur += len + 1;
+    }
+
+    // #3: (//([^/?#]*))?
+    if (cur[0] == '/' && cur[1] == '/') {
+        // #4: ([^/?#]*) starts right after the two slashes.
+        // "//" alone is a match with an empty authority (e.g. file:///tmp),
+        // so #3 is recorded even when #4 has length 0.
+        len = strcspn (cur + 2, "/?#");
+        coord[3][0] = cur - iri_str;
+        coord[3][1] = len + 2;
+        coord[4][0] = coord[3][0] + 2;
+        coord[4][1] = len;
+        cur += len + 2;
+    }
+
+    // #1: (([^:/?#]+:)?(//([^/?#]*))?)
+    // Everything consumed so far, i.e. scheme plus authority. Its offset is
+    // always 0, which is what coord was initialized to.
+    coord[1][1] = cur - iri_str;
+
+    // #5: ([^?#]*)
+    // Always matches, possibly with length 0 (e.g. "http://example.org").
+    len = strcspn (cur, "?#");
+    coord[5][0] = cur - iri_str;
+    coord[5][1] = len;
+    cur += len;
+
+    // #6: (\?([^#]*))?
+    if (*cur == '?') {
+        // #7: ([^#]*)
+        // Skip storing '?'. Group #6 (query including '?') is ignored.
+        cur++;
+        len = strcspn (cur, "#");
+        coord[7][0] = cur - iri_str;
+        coord[7][1] = len;
+        cur += len;
+    }
+
+    // #8: (#(.*))?
+    if (*cur == '#') {
+        // #9: (.*)
+        // Skip storing '#'. The fragment is the remainder of the string.
+        cur++;
+        coord[9][0] = cur - iri_str;
+        coord[9][1] = strlen (cur);
+    }
+
+    // Hand the groups of interest over to the caller.
+    // NOTE(review): this assumes LSUP_IRIInfo is struct iri_info_t and that
+    // match_coord_t is read as {offset, length} - confirm with the consumers.
+    if (iri_info) {
+        iri_info->prefix[0] = coord[1][0];
+        iri_info->prefix[1] = coord[1][1];
+        iri_info->path[0] = coord[5][0];
+        iri_info->path[1] = coord[5][1];
+        iri_info->frag[0] = coord[9][0];
+        iri_info->frag[1] = coord[9][1];
+    }
+
+    return LSUP_OK;
+}
+
+
 /*
  * Extern inline functions.
  */