Browse Source

Replace regex with ad-hoc IRIRef scanning.

scossu 1 week ago
parent
commit
5843ba995b
4 changed files with 133 additions and 153 deletions
  1. 0 8
      include/environment.h
  2. 10 5
      include/term.h
  3. 1 24
      src/environment.c
  4. 122 116
      src/term.c

+ 0 - 8
include/environment.h

@@ -15,14 +15,6 @@
 #include "term.h"
 
 
-/** @brief Whether the environment is already initialized.
- *
- * @TODO Check if the default NS was inserted; this would be slower but more
- * accurate.
- */
-#define LSUP_IS_INIT (LSUP_term_cache != NULL)
-
-
 /*
  * External variables.
  */

+ 10 - 5
include/term.h

@@ -2,7 +2,6 @@
 #define _LSUP_TERM_H
 
 #include <assert.h>
-#include <regex.h>
 
 #include "buffer.h"
 #include "namespace.h"
@@ -33,6 +32,8 @@
  *
  * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:` and
  * the path (#4) is `s:0`.
+ *
+ * TODO Remove. Superseded by ad-hoc scanning (see static parse_iri in term.c).
  */
 #define LSUP_URI_REGEX_STR \
     "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
@@ -90,6 +91,14 @@ typedef struct term_t {
     ((term)->type == LSUP_TERM_LITERAL || (term)->type == LSUP_TERM_LT_LITERAL)
 
 
+/** @brief Whether the environment is already initialized.
+ *
+ * @TODO Check if the default NS was inserted; this would be slower but more
+ * accurate.
+ */
+#define LSUP_IS_INIT (LSUP_term_cache != NULL)
+
+
 /** @brief RDF triple.
  *
  * This represents a complete RDF statement. Triple terms can be accessed
@@ -143,10 +152,6 @@ typedef struct hashmap LSUP_TermSet;
  */
 extern uint32_t LSUP_default_dtype_key;
 
-/** @brief URI validation pattern, compiled in #LSUP_init().
- */
-extern regex_t *LSUP_uri_ptn;
-
 /** @brief Default literal data type URI.
  *
  * Literal terms created with undefined data type will have it set to this

+ 1 - 24
src/environment.c

@@ -13,7 +13,6 @@
 /*
  * External variables.
  */
-LSUP_TermSet *LSUP_term_cache = NULL;
 LSUP_NSMap *LSUP_default_nsm = NULL;
 LSUP_Term *LSUP_default_ctx = NULL;
 LSUP_Buffer *LSUP_default_ctx_buf = NULL;
@@ -40,26 +39,6 @@ LSUP_init (void)
 #endif
     log_set_level (loglevel);
 
-    // URI validation pattern.
-    MALLOC_GUARD (LSUP_uri_ptn, LSUP_MEM_ERR);
-
-#if 0 // Re-activate in case a change in the URI regex results in an error.
-    int regex_rc = regcomp (LSUP_uri_ptn, LSUP_URI_REGEX_STR, REG_EXTENDED);
-    if (regex_rc != 0) {
-        char err_msg[128];
-        size_t err_msg_sz = regerror (regex_rc, LSUP_uri_ptn, err_msg, 128);
-        log_error (
-                "Error compiling regular expression pattern: %s.",
-                err_msg);
-        return LSUP_ERROR;
-    }
-#else
-    if (regcomp (LSUP_uri_ptn, LSUP_URI_REGEX_STR, REG_EXTENDED) != 0) {
-        log_error ("Error compiling regular expression pattern.");
-        return LSUP_ERROR;
-    }
-#endif
-
     // Default namespace map.
     LSUP_default_nsm = LSUP_nsmap_new();
     if (UNLIKELY (!LSUP_default_nsm)) return LSUP_ERROR;
@@ -75,6 +54,7 @@ LSUP_init (void)
     if (UNLIKELY (!LSUP_default_ctx_buf)) return LSUP_ERROR;
 
     // Initialize term cache.
+    // This is the indicator for LSUP_IS_INIT.
     LSUP_term_cache = LSUP_term_set_new();
     if (UNLIKELY (!LSUP_term_cache)) return LSUP_MEM_ERR;
 
@@ -98,9 +78,6 @@ LSUP_done (void)
 {
     if (!LSUP_IS_INIT) return;
 
-    regfree (LSUP_uri_ptn);
-    free (LSUP_uri_ptn);
-
     // Free default NS map and context.
     LSUP_buffer_free (LSUP_default_ctx_buf);
     LSUP_term_free (LSUP_default_ctx);

+ 122 - 116
src/term.c

@@ -16,15 +16,19 @@
  * Data structures.
  */
 
-/// Sub-string coordinates for IRI matches.
-typedef size_t match_coord_t[2];
+/// Sub-match coordinates in IRI parsing results.
+typedef struct match_coord_t {
+    size_t              offset;     ///< Offset of match from start of string.
+    size_t              size;       ///< Length of match.
+} MatchCoord;
+
 
 /// Matching sub-patterns for IRI parts.
 struct iri_info_t {
     LSUP_NSMap *        nsm;        ///< NSM handle for prefixed IRI.
-    match_coord_t       prefix;     ///< Matching group #1.
-    match_coord_t       path;       ///< Matching group #5.
-    match_coord_t       frag;       ///< Matching group #10.
+    MatchCoord          prefix;     ///< URI prefix (scheme + authority).
+    MatchCoord          path;       ///< URI path (including fragment).
+    MatchCoord          frag;       ///< URI fragment.
 };
 
 
@@ -75,8 +79,8 @@ typedef struct link_map {
  */
 
 uint32_t LSUP_default_dtype_key = 0;
-regex_t *LSUP_uri_ptn;
 LSUP_Term *LSUP_default_datatype = NULL;
+LSUP_TermSet *LSUP_term_cache = NULL;
 
 
 /*
@@ -149,6 +153,9 @@ link_map_free_fn (void *item)
 }
 
 
+static LSUP_rc parse_iri (char *iri, MatchCoord coords[]);
+
+
  /*
  * Term API.
  */
@@ -395,12 +402,11 @@ LSUP_iriref_prefix (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->prefix.rm_so == -1) return NULL;
-
-    size_t len = iri->iri_info->prefix.rm_eo - iri->iri_info->prefix.rm_so;
-    if (len == 0) return NULL;
+    if (iri->iri_info->prefix.size == 0) return NULL;
 
-    return strndup (iri->data + iri->iri_info->prefix.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->prefix.offset,
+            iri->iri_info->prefix.size);
 }
 
 
@@ -412,12 +418,11 @@ LSUP_iriref_path (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->path.rm_so == -1) return NULL;
+    if (iri->iri_info->path.size == 0) return NULL;
 
-    size_t len = iri->iri_info->path.rm_eo - iri->iri_info->path.rm_so;
-    if (len == 0) return NULL;
-
-    return strndup (iri->data + iri->iri_info->path.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->path.offset,
+            iri->iri_info->path.size);
 }
 
 
@@ -429,11 +434,11 @@ LSUP_iriref_frag (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->frag.rm_so == -1) return NULL;
-
-    size_t len = iri->iri_info->frag.rm_eo - iri->iri_info->frag.rm_so;
+    if (iri->iri_info->frag.size == 0) return NULL;
 
-    return strndup (iri->data + iri->iri_info->frag.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->frag.offset,
+            iri->iri_info->frag.size);
 }
 
 
@@ -743,7 +748,7 @@ term_init (
         LSUP_Term *term, LSUP_TermType type,
         const char *data, void *metadata)
 {
-    if (UNLIKELY (!LSUP_uri_ptn)) {
+    if (UNLIKELY (!LSUP_IS_INIT)) {
         log_error ("Environment not initialized. Did you call LSUP_init()?");
         return LSUP_ERROR;
     }
@@ -781,9 +786,9 @@ term_init (
             }
 
             // Capture interesting IRI parts.
-            regmatch_t matches[11];
-            if (UNLIKELY (regexec (LSUP_uri_ptn, fquri, 11, matches, 0) != 0)) {
-                fprintf (stderr, "Error matching URI pattern.\n");
+            MatchCoord matches[7] = {};  // Initialize all to 0.
+            if (UNLIKELY (parse_iri (fquri, matches) != LSUP_OK)) {
+                log_error ("Error matching URI pattern.");
 
                 return LSUP_VALUE_ERR;
             }
@@ -792,8 +797,8 @@ term_init (
             MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
 
             term->iri_info->prefix = matches[1];
-            term->iri_info->path = matches[5];
-            term->iri_info->frag = matches[10];
+            term->iri_info->path = matches[4];
+            term->iri_info->frag = matches[6];
             term->iri_info->nsm = metadata;
         }
 
@@ -816,12 +821,12 @@ term_init (
                 MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
 
                 // Allocate IRI match patterns manually.
-                term->iri_info->prefix.rm_so = 0;
-                term->iri_info->prefix.rm_eo = 4;
-                term->iri_info->path.rm_so = 4;
-                term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
-                term->iri_info->frag.rm_so = -1;
-                term->iri_info->frag.rm_eo = -1;
+                term->iri_info->prefix.offset = 0;
+                term->iri_info->prefix.size = 4;
+                term->iri_info->path.offset = 4;
+                term->iri_info->path.size = UUIDSTR_SIZE + 6;
+                term->iri_info->frag.offset = 0;
+                term->iri_info->frag.size = 0;
                 term->iri_info->nsm = NULL;
 
             } else term->data = strdup (uuid_str);
@@ -889,118 +894,119 @@ term_init (
  *
  * Experimental replacement of a regex engine for better performance.
  *
- * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
+ * Slightly adapted from regex on
+ * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
+ * parts of the IRI.
  *
  * Reference regex and group numbering:
- * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
- *  12            3  4          5       6  7        8 9
+ * ^((?:([^:/?#]+):)?(?://([^/?#]*))?)((?:[^?#]*)(?:\?([^#]*))?(?:#(.*))?)
+ *  1   2                 3           4               5            6
  *
  * Capturing groups:
  *
- * scheme    = $2
- * authority = $4
- * path      = $5
- * query     = $7
- * fragment  = $9
- *
  * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
- * #1: Domain prefix (http://example.org)
- * #2: Protocol (http:)
- * #4: Authority (example.org)
- * #5: Path, excluding query and fragment (/123/456/)
- * #7: Query (query=blah)
- * #9: Fragment (frag)
+ * #1: Prefix (http://example.org)
+ * #2: Scheme (http)
+ * #3: Authority (example.org)
+ * #4: Path, including query and fragment (/123/456/?query=blah#frag)
+ * #5: Query (query=blah)
+ * #6: Fragment (frag)
+ *
+ *
+ * @param[in] iri_str IRI string to parse.
+ *
+ * @param[out] coord Coordinates to be stored. This must be a pre-allocated
+ * array of at least 7 elements.
+ *
+ * The `offset` member of each element stores the position of a match relative
+ * to the start of the string, and the `size` member stores the length of the
+ * match. A size of 0 indicates no match.
  */
 static LSUP_rc
-parse_iri (char *iri_str, LSUP_IRIInfo *iri_info) {
+parse_iri (char *iri_str, MatchCoord coord[]) {
     char *cur = iri_str;
-    size_t
-        coord[2][10] = {0},  // Store capture group relative position & length
-        tmp[2] = {0};  // Temporary storage for capture groups
-
-    // #1: (([^:/?#]+:)?(//([^/?#]*))?)
-    while (*cur != '\0') {
-        // #2: ([^:/?#]+:)?
-        tmp[0] = cur - iri_str;
-        while (
-                *cur != ':' && *cur != '/' && *cur != '?'
-                && *cur != '#' && *cur != '\0') {
-            tmp[1]++;
-            cur++;
-        }
-        if (tmp[1] > 0 && *cur == ':') {
-            // Got capture group #2. Store it.
-            tmp[1]++;
-            coord[3][0] = tmp[0];
-            coord[3][1] = tmp[1];
+    size_t iri_len = strlen (iri_str);
+    MatchCoord tmp = {};  // Temporary storage for capture groups
+
+    // #2: ([^:/?#]+)
+    while (
+            *cur != ':' && *cur != '/' && *cur != '?'
+            && *cur != '#' && *cur != '\0') {
+        tmp.size++;
+        cur++;
+    }
+
+    // Non-capturing: (?([^:/?#]+):)?
+    if (tmp.size > 0 && *(++cur) == ':') {
+        // Got capture groups #2 and #3. Store them.
+        tmp.size++;
+        coord[3].offset = tmp.offset;
+        coord[3].size = tmp.size - 1;
+    }
+
+    // Non-capturing: (?//([^/?#]*))?
+    if (*(cur + 1) == '/' && *(cur + 2) == '/') {
+        cur++;
+        tmp.offset = cur - iri_str;
+        tmp.size = 2;
+        cur += 2;
+
+        // #3: ([^/?#]*)
+        while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
+            tmp.size++;
             cur++;
         }
-        // #3: (//([^/?#]*))?
-        if (*cur == '/' && *(cur + 1) == '/') {
-            tmp[0] = cur - iri_str;
-            tmp[1] = 2;
-            cur += 2;
-
-            // #4: ([^/?#]*)
-            while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
-                // Continue recording length for #3, coordinates for #4 can
-                // be inferred.
-                tmp[1]++;
-                cur++;
-            }
 
-            if (tmp[1] > 2) {
-                // Got capture groups #3 and #4. Store them.
-                coord[3][0] = tmp[0];
-                coord[3][1] = tmp[1];
-                coord[4][0] = tmp[0] + 2;
-                coord[4][1] = tmp[1] -2;
-            }
-        }
+        // Maybe got capture group #3.
+        coord[3].offset = tmp.offset + 2;
+        coord[3].size = tmp.size -2;
     }
 
-    // #5: ([^?#]*)
-    tmp[0] = cur - iri_str;
-    tmp[1] = 0;
+    // Capture group 1 and advance cursor.
+    coord[1].offset = 0;
+    coord[1].size = cur++ - iri_str;
+
+    // Non-capturing: (?[^?#]*)
+    tmp.offset = cur - iri_str;
+    tmp.size = 0;
     while (*cur != '?' && *cur != '#' && *cur != '\0') {
-        tmp[1]++;
+        tmp.size++;
         cur++;
     }
-    if (tmp[1] > 0) {
-        // Got capture group #5. Store it.
-        coord[5][0] = tmp[0];
-        coord[5][1] = tmp[1];
-    }
 
-    // #6: (\?([^#]*))
-    if (*cur == '?') {
-        // Advance cursor by one and skip storing '?'.
-        tmp[0] = ++cur - iri_str;
-        tmp[1] = 0;
+    if (tmp.size > 0) {
+        coord[4].offset = tmp.offset;
+        coord[4].size = iri_str + iri_len - cur;
+
+    } else return LSUP_NORESULT;  // This group is the only mandatory match.
 
-        // 7: ([^#]*)
+    // Non-capturing: (?\?([^#]*))
+    if (*(++cur) == '?') {
+        // 5: ([^#]*)
+        tmp.offset = ++cur - iri_str;
+        tmp.size = 0;
         while (*cur != '#' && *cur != '\0') {
-            tmp[1]++;
+            tmp.size++;
             cur++;
         }
 
-        if (tmp[1] > 0) {
-            // Got capture group #7. Store it.
-            // Group #6 (query including '?') is ignored.
-            coord[7][0] = tmp[0];
-            coord[7][1] = tmp[1];
+        if (tmp.size > 0) {
+            // Got capture group #5.
+            coord[5].offset = tmp.offset;
+            coord[5].size = tmp.size;
         }
     }
 
-    // 8: (#(.*))
-    if (*cur == '#') {
-        // 9: (.*)
-        // Skip storing '#'.
-        tmp[0] = ++cur - iri_str;
-        // Store the length of the remaining string.
-        tmp[1] = strlen(iri_str) + iri_str - cur;
+    // Non-capturing: (?#(.*))?
+    if (*(++cur) == '#') {
+        // #6: (.*)
+        coord[6].offset = ++cur - iri_str;
+        coord[6].size = iri_str + iri_len - cur;
     }
 
+    coord[0].offset = 0;
+    coord[0].size = iri_len;
+
     return LSUP_OK;
 }