|
@@ -16,15 +16,19 @@
|
|
* Data structures.
|
|
* Data structures.
|
|
*/
|
|
*/
|
|
|
|
|
|
-/// Sub-string coordinates for IRI matches.
|
|
|
|
-typedef size_t match_coord_t[2];
|
|
|
|
|
|
+/// Sub-match coordinates in IRI parsing results.
|
|
|
|
+typedef struct match_coord_t {
|
|
|
|
+ size_t offset; ///< Offset of match from start of string.
|
|
|
|
+ size_t size; ///< Length of match.
|
|
|
|
+} MatchCoord;
|
|
|
|
+
|
|
|
|
|
|
/// Matching sub-patterns for IRI parts.
|
|
/// Matching sub-patterns for IRI parts.
|
|
struct iri_info_t {
|
|
struct iri_info_t {
|
|
LSUP_NSMap * nsm; ///< NSM handle for prefixed IRI.
|
|
LSUP_NSMap * nsm; ///< NSM handle for prefixed IRI.
|
|
- match_coord_t prefix; ///< Matching group #1.
|
|
|
|
- match_coord_t path; ///< Matching group #5.
|
|
|
|
- match_coord_t frag; ///< Matching group #10.
|
|
|
|
|
|
+ MatchCoord prefix; ///< URI prefix (scheme + authority).
|
|
|
|
+ MatchCoord path; ///< URI path (including fragment).
|
|
|
|
+ MatchCoord frag; ///< URI fragment.
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
@@ -75,8 +79,8 @@ typedef struct link_map {
|
|
*/
|
|
*/
|
|
|
|
|
|
uint32_t LSUP_default_dtype_key = 0;
|
|
uint32_t LSUP_default_dtype_key = 0;
|
|
-regex_t *LSUP_uri_ptn;
|
|
|
|
LSUP_Term *LSUP_default_datatype = NULL;
|
|
LSUP_Term *LSUP_default_datatype = NULL;
|
|
|
|
+LSUP_TermSet *LSUP_term_cache = NULL;
|
|
|
|
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -149,6 +153,9 @@ link_map_free_fn (void *item)
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
+static LSUP_rc parse_iri (char *iri, MatchCoord coords[]);
|
|
|
|
+
|
|
|
|
+
|
|
/*
|
|
/*
|
|
* Term API.
|
|
* Term API.
|
|
*/
|
|
*/
|
|
@@ -395,12 +402,11 @@ LSUP_iriref_prefix (const LSUP_Term *iri)
|
|
return NULL;
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
|
|
- if (iri->iri_info->prefix.rm_so == -1) return NULL;
|
|
|
|
-
|
|
|
|
- size_t len = iri->iri_info->prefix.rm_eo - iri->iri_info->prefix.rm_so;
|
|
|
|
- if (len == 0) return NULL;
|
|
|
|
|
|
+ if (iri->iri_info->prefix.size == 0) return NULL;
|
|
|
|
|
|
- return strndup (iri->data + iri->iri_info->prefix.rm_so, len);
|
|
|
|
|
|
+ return strndup (
|
|
|
|
+ iri->data + iri->iri_info->prefix.offset,
|
|
|
|
+ iri->iri_info->prefix.size);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -412,12 +418,11 @@ LSUP_iriref_path (const LSUP_Term *iri)
|
|
return NULL;
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
|
|
- if (iri->iri_info->path.rm_so == -1) return NULL;
|
|
|
|
|
|
+ if (iri->iri_info->path.size == 0) return NULL;
|
|
|
|
|
|
- size_t len = iri->iri_info->path.rm_eo - iri->iri_info->path.rm_so;
|
|
|
|
- if (len == 0) return NULL;
|
|
|
|
-
|
|
|
|
- return strndup (iri->data + iri->iri_info->path.rm_so, len);
|
|
|
|
|
|
+ return strndup (
|
|
|
|
+ iri->data + iri->iri_info->path.offset,
|
|
|
|
+ iri->iri_info->path.size);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -429,11 +434,11 @@ LSUP_iriref_frag (const LSUP_Term *iri)
|
|
return NULL;
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
|
|
- if (iri->iri_info->frag.rm_so == -1) return NULL;
|
|
|
|
-
|
|
|
|
- size_t len = iri->iri_info->frag.rm_eo - iri->iri_info->frag.rm_so;
|
|
|
|
|
|
+ if (iri->iri_info->frag.size == 0) return NULL;
|
|
|
|
|
|
- return strndup (iri->data + iri->iri_info->frag.rm_so, len);
|
|
|
|
|
|
+ return strndup (
|
|
|
|
+ iri->data + iri->iri_info->frag.offset,
|
|
|
|
+ iri->iri_info->frag.size);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -743,7 +748,7 @@ term_init (
|
|
LSUP_Term *term, LSUP_TermType type,
|
|
LSUP_Term *term, LSUP_TermType type,
|
|
const char *data, void *metadata)
|
|
const char *data, void *metadata)
|
|
{
|
|
{
|
|
- if (UNLIKELY (!LSUP_uri_ptn)) {
|
|
|
|
|
|
+ if (UNLIKELY (!LSUP_IS_INIT)) {
|
|
log_error ("Environment not initialized. Did you call LSUP_init()?");
|
|
log_error ("Environment not initialized. Did you call LSUP_init()?");
|
|
return LSUP_ERROR;
|
|
return LSUP_ERROR;
|
|
}
|
|
}
|
|
@@ -781,9 +786,9 @@ term_init (
|
|
}
|
|
}
|
|
|
|
|
|
// Capture interesting IRI parts.
|
|
// Capture interesting IRI parts.
|
|
- regmatch_t matches[11];
|
|
|
|
- if (UNLIKELY (regexec (LSUP_uri_ptn, fquri, 11, matches, 0) != 0)) {
|
|
|
|
- fprintf (stderr, "Error matching URI pattern.\n");
|
|
|
|
|
|
+ MatchCoord matches[7] = {}; // Initialize all to 0.
|
|
|
|
+ if (UNLIKELY (parse_iri (fquri, matches) != LSUP_OK)) {
|
|
|
|
+ log_error ("Error matching URI pattern.");
|
|
|
|
|
|
return LSUP_VALUE_ERR;
|
|
return LSUP_VALUE_ERR;
|
|
}
|
|
}
|
|
@@ -792,8 +797,8 @@ term_init (
|
|
MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
|
|
MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
|
|
|
|
|
|
term->iri_info->prefix = matches[1];
|
|
term->iri_info->prefix = matches[1];
|
|
- term->iri_info->path = matches[5];
|
|
|
|
- term->iri_info->frag = matches[10];
|
|
|
|
|
|
+ term->iri_info->path = matches[4];
|
|
|
|
+ term->iri_info->frag = matches[6];
|
|
term->iri_info->nsm = metadata;
|
|
term->iri_info->nsm = metadata;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -816,12 +821,12 @@ term_init (
|
|
MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
|
|
MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
|
|
|
|
|
|
// Allocate IRI match patterns manually.
|
|
// Allocate IRI match patterns manually.
|
|
- term->iri_info->prefix.rm_so = 0;
|
|
|
|
- term->iri_info->prefix.rm_eo = 4;
|
|
|
|
- term->iri_info->path.rm_so = 4;
|
|
|
|
- term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
|
|
|
|
- term->iri_info->frag.rm_so = -1;
|
|
|
|
- term->iri_info->frag.rm_eo = -1;
|
|
|
|
|
|
+ term->iri_info->prefix.offset = 0;
|
|
|
|
+ term->iri_info->prefix.size = 4;
|
|
|
|
+ term->iri_info->path.offset = 4;
|
|
|
|
+ term->iri_info->path.size = UUIDSTR_SIZE + 6;
|
|
|
|
+ term->iri_info->frag.offset = 0;
|
|
|
|
+ term->iri_info->frag.size = 0;
|
|
term->iri_info->nsm = NULL;
|
|
term->iri_info->nsm = NULL;
|
|
|
|
|
|
} else term->data = strdup (uuid_str);
|
|
} else term->data = strdup (uuid_str);
|
|
@@ -889,118 +894,119 @@ term_init (
|
|
*
|
|
*
|
|
* Experimental replacement of a regex engine for better performance.
|
|
* Experimental replacement of a regex engine for better performance.
|
|
*
|
|
*
|
|
- * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
|
|
|
|
|
|
+ * Slightly adapted from regex on
|
|
|
|
+ * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
|
|
|
|
+ * parts of the IRI.
|
|
*
|
|
*
|
|
* Reference regex and group numbering:
|
|
* Reference regex and group numbering:
|
|
- * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
|
|
|
|
- * 12 3 4 5 6 7 8 9
|
|
|
|
|
|
+ * ^((?:([^:/?#]+):)?(?://([^/?#]*))?)((?:[^?#]*)(?:\?([^#]*))?(?:#(.*))?)
|
|
|
|
+ * 1 2 3 4 5 6
|
|
*
|
|
*
|
|
* Capturing groups:
|
|
* Capturing groups:
|
|
*
|
|
*
|
|
- * scheme = $2
|
|
|
|
- * authority = $4
|
|
|
|
- * path = $5
|
|
|
|
- * query = $7
|
|
|
|
- * fragment = $9
|
|
|
|
- *
|
|
|
|
* #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
|
|
* #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
|
|
- * #1: Domain prefix (http://example.org)
|
|
|
|
- * #2: Protocol (http:)
|
|
|
|
- * #4: Authority (example.org)
|
|
|
|
- * #5: Path, excluding query and fragment (/123/456/)
|
|
|
|
- * #7: Query (query=blah)
|
|
|
|
- * #9: Fragment (frag)
|
|
|
|
|
|
+ * #1: Prefix (http://example.org)
|
|
|
|
+ * #2: Scheme (http)
|
|
|
|
+ * #3: Authority (example.org)
|
|
|
|
+ * #4: Path, including query and fragment (/123/456/?query=blah#frag)
|
|
|
|
+ * #5: Query (query=blah)
|
|
|
|
+ * #6: Fragment (frag)
|
|
|
|
+ *
|
|
|
|
+ *
|
|
|
|
+ * @param iri_str[in] IRI string to parse.
|
|
|
|
+ *
|
|
|
|
+ * @param coord[out] Coordinates to be stored. This must be a
|
|
|
|
+ * pre-allocated array of at least 7 elements.
|
|
|
|
+ *
|
|
|
|
+ * The `offset` member of each element stores the position of a match relative
|
|
|
|
+ * to the start of the string, and `size` stores its length. A size of 0 indicates
|
|
|
|
+ * no match.
|
|
*/
|
|
*/
|
|
static LSUP_rc
|
|
static LSUP_rc
|
|
-parse_iri (char *iri_str, LSUP_IRIInfo *iri_info) {
|
|
|
|
|
|
+parse_iri (char *iri_str, MatchCoord coord[]) {
|
|
char *cur = iri_str;
|
|
char *cur = iri_str;
|
|
- size_t
|
|
|
|
- coord[2][10] = {0}, // Store capture group relative position & length
|
|
|
|
- tmp[2] = {0}; // Temporary storage for capture groups
|
|
|
|
-
|
|
|
|
- // #1: (([^:/?#]+:)?(//([^/?#]*))?)
|
|
|
|
- while (*cur != '\0') {
|
|
|
|
- // #2: ([^:/?#]+:)?
|
|
|
|
- tmp[0] = cur - iri_str;
|
|
|
|
- while (
|
|
|
|
- *cur != ':' && *cur != '/' && *cur != '?'
|
|
|
|
- && *cur != '#' && *cur != '\0') {
|
|
|
|
- tmp[1]++;
|
|
|
|
- cur++;
|
|
|
|
- }
|
|
|
|
- if (tmp[1] > 0 && *cur == ':') {
|
|
|
|
- // Got capture group #2. Store it.
|
|
|
|
- tmp[1]++;
|
|
|
|
- coord[3][0] = tmp[0];
|
|
|
|
- coord[3][1] = tmp[1];
|
|
|
|
|
|
+ size_t iri_len = strlen (iri_str);
|
|
|
|
+ MatchCoord tmp = {}; // Temporary storage for capture groups
|
|
|
|
+
|
|
|
|
+ // #2: ([^:/?#]+)
|
|
|
|
+ while (
|
|
|
|
+ *cur != ':' && *cur != '/' && *cur != '?'
|
|
|
|
+ && *cur != '#' && *cur != '\0') {
|
|
|
|
+ tmp.size++;
|
|
|
|
+ cur++;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Non-capturing: (?:([^:/?#]+):)?
|
|
|
|
+ if (tmp.size > 0 && *(++cur) == ':') {
|
|
|
|
+ // Got capture groups #2 and #3. Store them.
|
|
|
|
+ tmp.size++;
|
|
|
|
+ coord[3].offset = tmp.offset;
|
|
|
|
+ coord[3].size = tmp.size - 1;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // Non-capturing: (?://([^/?#]*))?
|
|
|
|
+ if (*(cur + 1) == '/' && *(cur + 2) == '/') {
|
|
|
|
+ cur++;
|
|
|
|
+ tmp.offset = cur - iri_str;
|
|
|
|
+ tmp.size = 2;
|
|
|
|
+ cur += 2;
|
|
|
|
+
|
|
|
|
+ // #3: ([^/?#]*)
|
|
|
|
+ while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
|
|
|
|
+ tmp.size++;
|
|
cur++;
|
|
cur++;
|
|
}
|
|
}
|
|
- // #3: (//([^/?#]*))?
|
|
|
|
- if (*cur == '/' && *(cur + 1) == '/') {
|
|
|
|
- tmp[0] = cur - iri_str;
|
|
|
|
- tmp[1] = 2;
|
|
|
|
- cur += 2;
|
|
|
|
-
|
|
|
|
- // #4: ([^/?#]*)
|
|
|
|
- while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
|
|
|
|
- // Continue recording length for #3, coordinates for #4 can
|
|
|
|
- // be inferred.
|
|
|
|
- tmp[1]++;
|
|
|
|
- cur++;
|
|
|
|
- }
|
|
|
|
|
|
|
|
- if (tmp[1] > 2) {
|
|
|
|
- // Got capture groups #3 and #4. Store them.
|
|
|
|
- coord[3][0] = tmp[0];
|
|
|
|
- coord[3][1] = tmp[1];
|
|
|
|
- coord[4][0] = tmp[0] + 2;
|
|
|
|
- coord[4][1] = tmp[1] -2;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ // Maybe got capture group #3.
|
|
|
|
+ coord[3].offset = tmp.offset + 2;
|
|
|
|
+ coord[3].size = tmp.size -2;
|
|
}
|
|
}
|
|
|
|
|
|
- // #5: ([^?#]*)
|
|
|
|
- tmp[0] = cur - iri_str;
|
|
|
|
- tmp[1] = 0;
|
|
|
|
|
|
+ // Capture group 1 and advance cursor.
|
|
|
|
+ coord[1].offset = 0;
|
|
|
|
+ coord[1].size = cur++ - iri_str;
|
|
|
|
+
|
|
|
|
+ // Non-capturing: (?:[^?#]*)
|
|
|
|
+ tmp.offset = cur - iri_str;
|
|
|
|
+ tmp.size = 0;
|
|
while (*cur != '?' && *cur != '#' && *cur != '\0') {
|
|
while (*cur != '?' && *cur != '#' && *cur != '\0') {
|
|
- tmp[1]++;
|
|
|
|
|
|
+ tmp.size++;
|
|
cur++;
|
|
cur++;
|
|
}
|
|
}
|
|
- if (tmp[1] > 0) {
|
|
|
|
- // Got capture group #5. Store it.
|
|
|
|
- coord[5][0] = tmp[0];
|
|
|
|
- coord[5][1] = tmp[1];
|
|
|
|
- }
|
|
|
|
|
|
|
|
- // #6: (\?([^#]*))
|
|
|
|
- if (*cur == '?') {
|
|
|
|
- // Advance cursor by one and skip storing '?'.
|
|
|
|
- tmp[0] = ++cur - iri_str;
|
|
|
|
- tmp[1] = 0;
|
|
|
|
|
|
+ if (tmp.size > 0) {
|
|
|
|
+ coord[4].offset = tmp.offset;
|
|
|
|
+ coord[4].size = iri_str + iri_len - cur;
|
|
|
|
+
|
|
|
|
+ } else return LSUP_NORESULT; // This group is the only mandatory match.
|
|
|
|
|
|
- // 7: ([^#]*)
|
|
|
|
|
|
+ // Non-capturing: (?:\?([^#]*))?
|
|
|
|
+ if (*(++cur) == '?') {
|
|
|
|
+ // #5: ([^#]*)
|
|
|
|
+ tmp.offset = ++cur - iri_str;
|
|
|
|
+ tmp.size = 0;
|
|
while (*cur != '#' && *cur != '\0') {
|
|
while (*cur != '#' && *cur != '\0') {
|
|
- tmp[1]++;
|
|
|
|
|
|
+ tmp.size++;
|
|
cur++;
|
|
cur++;
|
|
}
|
|
}
|
|
|
|
|
|
- if (tmp[1] > 0) {
|
|
|
|
- // Got capture group #7. Store it.
|
|
|
|
- // Group #6 (query including '?') is ignored.
|
|
|
|
- coord[7][0] = tmp[0];
|
|
|
|
- coord[7][1] = tmp[1];
|
|
|
|
|
|
+ if (tmp.size > 0) {
|
|
|
|
+ // Got capture group #5.
|
|
|
|
+ coord[5].offset = tmp.offset;
|
|
|
|
+ coord[5].size = tmp.size;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
- // 8: (#(.*))
|
|
|
|
- if (*cur == '#') {
|
|
|
|
- // 9: (.*)
|
|
|
|
- // Skip storing '#'.
|
|
|
|
- tmp[0] = ++cur - iri_str;
|
|
|
|
- // Store the length of the remaining string.
|
|
|
|
- tmp[1] = strlen(iri_str) + iri_str - cur;
|
|
|
|
|
|
+ // Non-capturing: (?:#(.*))?
|
|
|
|
+ if (*(++cur) == '#') {
|
|
|
|
+ // #6: (.*)
|
|
|
|
+ coord[6].offset = ++cur - iri_str;
|
|
|
|
+ coord[6].size = iri_str + iri_len - cur;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ coord[0].offset = 0;
|
|
|
|
+ coord[0].size = iri_len;
|
|
|
|
+
|
|
return LSUP_OK;
|
|
return LSUP_OK;
|
|
}
|
|
}
|
|
|
|
|