Browse Source

Merge branch 'clang' of scossu/lsup_rdf into master

scossu 1 week ago
parent
commit
887d272b2d
18 changed files with 347 additions and 175 deletions
  1. 18 0
      Dockerfile
  2. 25 16
      Makefile
  3. 1 1
      cpython/py_graph.h
  4. 30 33
      docs/dev/deps.dot
  5. BIN
      docs/dev/deps.pdf
  6. 0 8
      include/environment.h
  7. 5 26
      include/term.h
  8. 12 16
      profile.c
  9. 7 6
      src/codec/Makefile
  10. 1 1
      src/codec/codec_ttl.c
  11. 1 1
      src/codec/lexer_nt.re
  12. 1 1
      src/codec/lexer_ttl.re
  13. 1 0
      src/core.c
  14. 7 25
      src/environment.c
  15. 1 1
      src/store_mdb.c
  16. 175 37
      src/term.c
  17. 3 3
      test/assets/triples.h
  18. 59 0
      test/test_term.c

+ 18 - 0
Dockerfile

@@ -0,0 +1,18 @@
+FROM --platform=linux/amd64 archlinux:base-devel
+
+# Sync, upgrade and install in one layer so the package DB is never stale.
+RUN pacman -Syu --noconfirm git xxhash lmdb re2c valgrind gdb graphviz
+
+WORKDIR /opt/lsup/lsup_rdf/src
+RUN git clone --recurse-submodules https://git.knowledgetx.com/scossu/lsup_rdf.git
+WORKDIR lsup_rdf
+
+RUN useradd --create-home -G wheel lsup
+RUN echo "%wheel ALL=(ALL:ALL) NOPASSWD:ALL" >| /etc/sudoers.d/wheel
+RUN chown -R lsup:lsup .
+
+USER lsup
+
+ENV LD_LIBRARY_PATH=/opt/local/lsup
+
+CMD ["/usr/bin/bash"]

+ 25 - 16
Makefile

@@ -9,34 +9,36 @@ AR = ar
 ## Paths.
 
 PREFIX ?= /usr/local
-bindir ::= $(PREFIX)/bin
-libdir ::= $(PREFIX)/lib
+bindir := $(PREFIX)/bin
+libdir := $(PREFIX)/lib
 includedir = $(PREFIX)/include/lsup
 outdir = ./bin
 VALGRIND_DUMP = /tmp/lsup_valgrind.log
 CALLGRIND_DUMP = /tmp/lsup_callgrind.out
 MASSIF_DUMP = /tmp/lsup_massif.out
 
-INCLUDE_BASE ::= . -Iinclude -Iext/tpl/src -Iext/hashmap -Iext/log/src
-INCLUDE ::= -I$(INCLUDE_BASE)
-_CFLAGS ::= -std=gnu11 -Wall -fPIC -MMD $(INCLUDE)
+INCLUDE_BASE := . -Iinclude -Iext/tpl/src -Iext/hashmap -Iext/log/src
+INCLUDE := -I$(INCLUDE_BASE)
+_CFLAGS := -std=gnu11 -Wall -fPIC -MMD $(INCLUDE)
 CFLAGS = $(_CFLAGS) -O3
 DBG_CFLAGS = $(_CFLAGS) -Itest -O0 -g3 -DDEBUG
+#$(info CFLAGS: $(CFLAGS))
+#$(info DBG_CFLAGS: $(DBG_CFLAGS))
 # NOTE: -luuid is a Linux system library. Other OS's might need a different
 # link or a non-system library built.
-LDFLAGS ::= -L. -L$(outdir) -L$(libdir) -llmdb -lxxhash -luuid
+LDFLAGS := -L$(libdir) -L$(outdir) -L. -llmdb -lxxhash -luuid
 
 PARSER = bin/lemon
 LEMON_SRC = ext/sqlite/tool/lemon.c
 CODEC_DIR = src/codec
 
 # External sources compiled in core object.
-EXT_SRC ::= $(wildcard ext/log/src/*.c) \
+EXT_SRC := $(wildcard ext/log/src/*.c) \
 	  	  $(wildcard ext/hashmap/*.c) \
 	  	  $(wildcard ext/tpl/src/*.c)
 
 # External headers of libraries compiled in core.
-EXT_H ::= $(wildcard ext/log/src/*.h) \
+EXT_H := $(wildcard ext/log/src/*.h) \
 	  	$(wildcard ext/tpl/src/*.h) \
 	  	$(wildcard ext/hashmap/*.h)
 
@@ -44,11 +46,11 @@ LSUP_SRC = $(wildcard src/*.c)
 SRC = $(EXT_SRC) $(LSUP_SRC)
 TEST_SRC = $(wildcard test/*.c) test.c
 
-EXT_OBJ ::= $(EXT_SRC:.c=.o)
+EXT_OBJ := $(EXT_SRC:.c=.o)
 # TODO This is extremely convoluted, simplify if possible.
-CODEC_SRC ::= $(wildcard $(CODEC_DIR)/codec_*.c)
-CODEC_REL_SRC ::= $(CODEC_SRC:$(CODEC_DIR)/%=%)
-ALL_CODEC_REL_SRC ::= $(CODEC_REL_SRC) $(CODEC_REL_SRC:codec_%=parser_%) \
+CODEC_SRC := $(wildcard $(CODEC_DIR)/codec_*.c)
+CODEC_REL_SRC := $(CODEC_SRC:$(CODEC_DIR)/%=%)
+ALL_CODEC_REL_SRC := $(CODEC_REL_SRC) $(CODEC_REL_SRC:codec_%=parser_%) \
 			$(CODEC_REL_SRC:codec_%=grammar_%)
 CODEC_SRC = $(ALL_CODEC_REL_SRC:%=$(CODEC_DIR)/%)
 CODEC_OBJ = $(CODEC_SRC:.c=.o)
@@ -63,6 +65,13 @@ DYN_DBG_LIB = $(outdir)/liblsuprdf_dbg.so
 LIBS = $(STATIC_LIB) $(DYN_LIB)
 DBG_LIBS = $(STATIC_DBG_LIB) $(DYN_DBG_LIB)
 
+# Library lister: ldd where available (Linux), otherwise otool -L (macOS).
+ifeq ($(strip $(shell command -v ldd 2>/dev/null)),)
+LDD := otool -L
+else
+LDD := ldd
+endif
+
 # For visual dep graph.
 DEPS := $(shell echo "${INCLUDE_BASE}" | sed -e 's/ -I/,/g'),include/codec
 DOCS = docs
@@ -170,13 +179,13 @@ bin/test: debug $(TEST_SRC)
 
 .PHONY: test
 test: bin/test ## Run a test suite.
-	@echo "Using libraries: "; ldd bin/test
+	@echo "Using libraries: "; $(LDD) bin/test
 	exec bin/test
 
 
 .PHONY: gdb_test
 gdb_test: bin/test ## Run a test suite within gdb.
-	@echo "Using libraries: "; ldd bin/test
+	@echo "Using libraries: "; $(LDD) bin/test
 	exec gdb bin/test
 
 
@@ -201,7 +210,7 @@ memcheck:
 memtest: bin/test memcheck ## Run a test suite using Valgrind. Output to separate file.
 
 
-# Performance test application. Essentially the profiling code without debug.
+# Profiling application.
 bin/profile: debug profile.c
 	$(CC) $(CFLAGS) -g -DTESTING $(LDFLAGS) -llsuprdf_dbg \
 		profile.c -o bin/profile
@@ -220,7 +229,7 @@ perftest: bin/perftest ## Run a performance test by creating, inserting and look
 .PHONY: profile
 profile: bin/profile ## Run a profiling session. Output can be inspected with KCachegrind.
 	LSUP_MDB_MAPSIZE=800000 valgrind --tool=callgrind \
-		--callgrind-out-file="$(CALLGRIND_DUMP)" bin/perftest 1000
+		--callgrind-out-file="$(CALLGRIND_DUMP)" bin/profile 1000
 	@echo "Profile dump written at $(CALLGRIND_DUMP). Open it with "\
 		"qcachegrind, kcachegrind, etc."
 

+ 1 - 1
cpython/py_graph.h

@@ -167,7 +167,7 @@ Graph_init (GraphObject *self, PyObject *args, PyObject *kwargs)
     }
 
     // TODO Move store creation fn and handle into a separate module.
-    LSUP_store *store = LSUP_store_new (store_type, NULL, 0);
+    LSUP_Store *store = LSUP_store_new (store_type, NULL, 0);
     if (sif->setup_fn) {
         if (sif->setup_fn(NULL, false) < LSUP_OK) {
             PyErr_SetString (

+ 30 - 33
docs/dev/deps.dot

@@ -5,48 +5,45 @@ digraph "source tree" {
     fontsize="16";
     fontname="Helvetica";
 	clusterrank="local";
+	"py_lsup_rdf" -> "py_graph"
 	"term" -> "namespace"
-	"profile" -> "lsup_rdf"
+	"store" -> "store_mdb"
 	"store_htable" -> "buffer"
-	"grammar_ttl" -> "codec"
-	"graph" -> "environment"
-	"lsup_rdf" -> "codec_ttl"
-	"store_interface" -> "environment"
-	"lsup_rdf" -> "codec_nt"
 	"namespace" -> "hashmap"
-	"core" -> "lmdb"
-	"environment" -> "term"
-	"py_triple" -> "py_term"
-	"core" -> "xxhash"
-	"store_mdb" -> "buffer"
+	"profile" -> "lsup_rdf"
+	"py_graph" -> "codec_ttl"
+	"store_mdb" -> "store_interface"
+	"py_term" -> "py_namespace"
+	"grammar_nt" -> "codec"
 	"py_namespace" -> "namespace"
-	"store_htable" -> "store_interface"
 	"term" -> "buffer"
+	"parser_nt" -> "codec"
+	"term" -> "tpl"
+	"store_mdb" -> "buffer"
+	"environment" -> "bootstrap"
+	"py_graph" -> "graph"
+	"lsup_rdf" -> "codec_ttl"
 	"graph" -> "store"
-	"py_lsup_rdf" -> "py_graph"
-	"store_htable" -> "hashmap"
+	"parser_nt" -> "tokens_nt"
+	"graph" -> "environment"
 	"py_term" -> "term"
-	"core" -> "log"
-	"parser_ttl" -> "codec"
-	"py_graph" -> "codec_ttl"
-	"store" -> "store_htable"
-	"codec_nt" -> "parser_nt"
+	"codec_ttl" -> "parser_ttl"
+	"store_htable" -> "hashmap"
+	"grammar_ttl" -> "codec"
 	"buffer" -> "core"
-	"py_term" -> "py_namespace"
-	"store_mdb" -> "store_interface"
-	"parser_nt" -> "tokens_nt"
-	"store" -> "store_mdb"
-	"parser_nt" -> "codec"
-	"parser_ttl" -> "tokens_ttl"
 	"graph" -> "term"
-	"environment" -> "bootstrap"
-	"store_mdb" -> "lmdb"
-	"term" -> "tpl"
-	"codec" -> "graph"
-	"codec_ttl" -> "parser_ttl"
-	"py_graph" -> "graph"
-	"py_graph" -> "py_triple"
-	"grammar_nt" -> "codec"
+	"lsup_rdf" -> "codec_nt"
 	"py_graph" -> "codec_nt"
+	"core" -> "log"
+	"py_triple" -> "py_term"
+	"py_graph" -> "py_triple"
+	"codec_nt" -> "parser_nt"
+	"codec" -> "graph"
+	"store" -> "store_htable"
+	"store_interface" -> "environment"
+	"parser_ttl" -> "codec"
+	"store_htable" -> "store_interface"
 	"namespace" -> "core"
+	"environment" -> "term"
+	"parser_ttl" -> "tokens_ttl"
 }

BIN
docs/dev/deps.pdf


+ 0 - 8
include/environment.h

@@ -15,14 +15,6 @@
 #include "term.h"
 
 
-/** @brief Whether the environment is already initialized.
- *
- * @TODO Check if the default NS was inserted; this would be slower but more
- * accurate.
- */
-#define LSUP_IS_INIT (LSUP_term_cache != NULL)
-
-
 /*
  * External variables.
  */

+ 5 - 26
include/term.h

@@ -2,7 +2,6 @@
 #define _LSUP_TERM_H
 
 #include <assert.h>
-#include <regex.h>
 
 #include "buffer.h"
 #include "namespace.h"
@@ -16,27 +15,6 @@
 #define DEFAULT_DTYPE       "http://www.w3.org/2001/XMLSchema#string"
 #define DEFAULT_DTYPE_NS    "xsd:string"
 
-/** @brief URI parsing regular expression.
- *
- * Based on RFC3986 (see https://tools.ietf.org/html/rfc3986#appendix-B) and
- * modified for use in this application. Relevant matching groups are the
- * following, for a sample URI `http://example.org/123/456/?query=blah#frag`:
- *
- * #0:  Full parsed URI (http://example.org/123/456/?query=blah#frag)
- * #1:  Domain prefix (http://example.org)
- * #2:  Protocol (http:)
- * #4:  Authority (example.org)
- * #5:  Path relative to domain (/123/456/?query=blah#frag)
- * #6:  Path, excluding query and fragment (/123/456/)
- * #8:  Query (query=blah)
- * #10: Fragment (frag)
- *
- * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:` and
- * the path (#4) is `s:0`.
- */
-#define LSUP_URI_REGEX_STR \
-    "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
-
 
 /*
  * Data types.
@@ -90,6 +68,11 @@ typedef struct term_t {
     ((term)->type == LSUP_TERM_LITERAL || (term)->type == LSUP_TERM_LT_LITERAL)
 
 
+/** @brief Whether the environment is already initialized.
+ */
+#define LSUP_IS_INIT (LSUP_default_datatype != NULL)
+
+
 /** @brief RDF triple.
  *
  * This represents a complete RDF statement. Triple terms can be accessed
@@ -143,10 +126,6 @@ typedef struct hashmap LSUP_TermSet;
  */
 extern uint32_t LSUP_default_dtype_key;
 
-/** @brief URI validation pattern, compiled in #LSUP_init().
- */
-extern regex_t *LSUP_uri_ptn;
-
 /** @brief Default literal data type URI.
  *
  * Literal terms created with undefined data type will have it set to this

+ 12 - 16
profile.c

@@ -3,11 +3,11 @@
 
 #define NT 100000
 
-static LSUP_Triple *
-generate_triples(size_t nt)
+static LSUP_Triple **
+generate_triples (size_t nt)
 {
-    LSUP_Triple *trp;
-    trp = malloc((nt  + 1) * sizeof(LSUP_Triple));
+    LSUP_Triple **trp;
+    trp = malloc((nt  + 1) * sizeof(*trp));
     if (!trp) exit (-1);
 
     for (size_t i = 0; i < nt; i++) {
@@ -16,11 +16,11 @@ generate_triples(size_t nt)
         sprintf(sstr, "urn:s:%lu", i % (nt / 100));
         sprintf(pstr, "urn:p:%lu", i % (nt / 1000));
         sprintf(ostr, "urn:o:%lu", i);
-        LSUP_triple_init(
-                trp + i, LSUP_iriref_new (sstr, NULL),
-                LSUP_iriref_new (pstr, NULL), LSUP_iriref_new (ostr, NULL));
+        trp[i] = LSUP_triple_new(
+                LSUP_iriref_new (sstr, NULL),
+                LSUP_iriref_new (pstr, NULL),
+                LSUP_iriref_new (ostr, NULL));
     }
-    LSUP_triple_init (trp + nt, NULL, NULL, NULL);
     log_info ("Triples generated.");
 
     return trp;
@@ -46,14 +46,13 @@ int main(int argc, char *argv[])
     log_info ("Generating %lu triples.", nt);
     start = clock();
 
-    LSUP_Triple *trp = generate_triples(nt);
+    LSUP_Triple **trp = generate_triples(nt);
     tc1 = clock();
     wallclock = (double) (tc1 - start) / CLOCKS_PER_SEC;
     log_info ("Time elapsed: %lf s", wallclock);
 
     log_info ("Inserting triples.");
-    LSUP_Graph *gr = LSUP_graph_new (
-            LSUP_iriref_new (NULL, NULL), LSUP_STORE_MDB, NULL, NULL, nt);
+    LSUP_Graph *gr = LSUP_graph_new (NULL, NULL, NULL);
     if (!gr) {
         log_error ("Error creating graph!");
         return -1;
@@ -64,11 +63,8 @@ int main(int argc, char *argv[])
     if (rc != LSUP_OK) log_warn ("Graph loading interrupted: %d.", rc);
     else log_info ("Graph populated with %lu triples.", ct);
 
-    for (size_t i = 0; i < nt; i++) {
-        LSUP_term_free (trp[i].s);
-        LSUP_term_free (trp[i].p);
-        LSUP_term_free (trp[i].o);
-    }
+    for (size_t i = 0; i < nt; i++)
+        LSUP_triple_free (trp[i]);
     free (trp);
 
     tc2 = clock();

+ 7 - 6
src/codec/Makefile

@@ -7,15 +7,16 @@ LEMON_SRC_DIR = $(BASEDIR)/ext/sqlite/tool
 INCLUDE_DIR = $(BASEDIR)/include
 CODEC_INCLUDE_DIR = $(INCLUDE_DIR)/codec
 
-CODEC_SRC ::= $(wildcard codec_*.c)
-PARSER_SRC ::= $(CODEC_SRC:codec_%=parser_%)
-CODEC_OBJ ::= $(CODEC_SRC:.c=.o)
-PARSER_OBJ ::= $(CODEC_OBJ:codec_%=parser_%)
-GRAMMAR_OBJ ::= $(CODEC_OBJ:codec_%=grammar_%)
+CODEC_SRC := $(wildcard codec_*.c)
+PARSER_SRC := $(CODEC_SRC:codec_%=parser_%)
+CODEC_OBJ := $(CODEC_SRC:.c=.o)
+PARSER_OBJ := $(CODEC_OBJ:codec_%=parser_%)
+GRAMMAR_OBJ := $(CODEC_OBJ:codec_%=grammar_%)
 OBJ = $(GRAMMAR_OBJ) $(PARSER_OBJ) $(CODEC_OBJ)
 DBG_OBJ = $(OBJ:%.o=%_dbg.o)
+#$(info DBG_OBJ: $(DBG_OBJ))  # Debug aid; enable locally only.
 
-INCLUDE ::= -I$(INCLUDE_DIR) -I../../ext/tpl/src -I../../ext/hashmap \
+INCLUDE := -I$(INCLUDE_DIR) -I../../ext/tpl/src -I../../ext/hashmap \
 	-I../../ext/log/src
 CFLAGS = -std=gnu11 -Wall -fPIC -MMD $(INCLUDE)
 DBG_CFLAGS = -I../../test -O0 -g3 -DDEBUG

+ 1 - 1
src/codec/codec_ttl.c

@@ -26,7 +26,7 @@ static LSUP_rc
 term_to_ttl (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
 {
     LSUP_rc rc;
-    char *tmp = NULL, *out;
+    char *tmp = NULL, *out = NULL;
     char *metadata = NULL;
     size_t buf_len;
 

+ 1 - 1
src/codec/lexer_nt.re

@@ -316,7 +316,7 @@ LSUP_nt_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
     rc = parse_it.ct > 0 ? LSUP_OK : LSUP_NORESULT;
     *gr_p = gr;
 
-finally:
+finally: ;
     NTParse (parser, 0, NULL, it);
     NTParseFree (parser, free);
 

+ 1 - 1
src/codec/lexer_ttl.re

@@ -428,7 +428,7 @@ LSUP_ttl_parse_doc (FILE *fh, LSUP_Graph **gr_p, size_t *ct, char **err_p)
 
     *gr_p = gr;
 
-finally:
+finally: ;
     LSUP_rc rc = state->rc;
     log_trace ("rc is %d", rc);
 

+ 1 - 0
src/core.c

@@ -1,6 +1,7 @@
 #define _XOPEN_SOURCE 500
 #include <errno.h>
 #include <ftw.h>
+#include <string.h>
 #include "core.h"
 #include "lmdb.h"
 

+ 7 - 25
src/environment.c

@@ -13,7 +13,6 @@
 /*
  * External variables.
  */
-LSUP_TermSet *LSUP_term_cache = NULL;
 LSUP_NSMap *LSUP_default_nsm = NULL;
 LSUP_Term *LSUP_default_ctx = NULL;
 LSUP_Buffer *LSUP_default_ctx_buf = NULL;
@@ -40,26 +39,6 @@ LSUP_init (void)
 #endif
     log_set_level (loglevel);
 
-    // URI validation pattern.
-    MALLOC_GUARD (LSUP_uri_ptn, LSUP_MEM_ERR);
-
-#if 0 // Re-activate in case a change in the URI regex results in an error.
-    int regex_rc = regcomp (LSUP_uri_ptn, LSUP_URI_REGEX_STR, REG_EXTENDED);
-    if (regex_rc != 0) {
-        char err_msg[128];
-        size_t err_msg_sz = regerror (regex_rc, LSUP_uri_ptn, err_msg, 128);
-        log_error (
-                "Error compiling regular expression pattern: %s.",
-                err_msg);
-        return LSUP_ERROR;
-    }
-#else
-    if (regcomp (LSUP_uri_ptn, LSUP_URI_REGEX_STR, REG_EXTENDED) != 0) {
-        log_error ("Error compiling regular expression pattern.");
-        return LSUP_ERROR;
-    }
-#endif
-
     // Default namespace map.
     LSUP_default_nsm = LSUP_nsmap_new();
     if (UNLIKELY (!LSUP_default_nsm)) return LSUP_ERROR;
@@ -81,11 +60,15 @@ LSUP_init (void)
     // Create and cache default literal datatype key.
     // This will be done only once in the program, so no need to check for
     // duplicates.
+    // This is the last operation that can fail in this function, and it is
+    // the indicator for LSUP_IS_INIT.
     LSUP_default_datatype = LSUP_iriref_new (DEFAULT_DTYPE, NULL);
     LSUP_rc rc = LSUP_term_set_add (
             LSUP_term_cache, LSUP_default_datatype, NULL);
     PRCCK (rc);
 
+    log_info ("LSUP environment initialized.");
+
     // Set automatic teardown TODO Is this a good idea?
     atexit (LSUP_done);
 
@@ -98,9 +81,6 @@ LSUP_done (void)
 {
     if (!LSUP_IS_INIT) return;
 
-    regfree (LSUP_uri_ptn);
-    free (LSUP_uri_ptn);
-
     // Free default NS map and context.
     LSUP_buffer_free (LSUP_default_ctx_buf);
     LSUP_term_free (LSUP_default_ctx);
@@ -108,7 +88,9 @@ LSUP_done (void)
 
     // Free ID cache, including default literal datatype.
     hashmap_free (LSUP_term_cache);
-    LSUP_term_cache = NULL; // This causes LSUP_IS_INIT to return false.
+    LSUP_default_datatype = NULL; // This causes LSUP_IS_INIT to return false.
+
+    log_info ("LSUP environment torn down.");
 }
 
 

+ 1 - 1
src/store_mdb.c

@@ -406,6 +406,7 @@ mdbstore_new (const char *id, size_t _unused)
     CALLOC_GUARD (store, NULL);
 
     RCNL (mdb_env_create (&store->env));
+    MDB_txn *txn = NULL;
 
     // Set map size.
     size_t mapsize;
@@ -420,7 +421,6 @@ mdbstore_new (const char *id, size_t _unused)
     CHECK (mdb_env_open (store->env, path, 0, ENV_FILE_MODE), fail);
 
     // Assign DB handles to store->dbi.
-    MDB_txn *txn = NULL;
     CHECK (mdb_txn_begin (store->env, NULL, 0, &txn), fail);
     for (int i = 0; i < N_DB; i++)
         CHECK (mdb_dbi_open (

+ 175 - 37
src/term.c

@@ -16,12 +16,19 @@
  * Data structures.
  */
 
+/// Sub-match coordinates in IRI parsing results.
+typedef struct match_coord_t {
+    size_t              offset;     ///< Offset of match from start of string.
+    size_t              size;       ///< Length of match.
+} MatchCoord;
+
+
 /// Matching sub-patterns for IRI parts.
 struct iri_info_t {
     LSUP_NSMap *        nsm;        ///< NSM handle for prefixed IRI.
-    regmatch_t          prefix;     ///< Matching group #1.
-    regmatch_t          path;       ///< Matching group #5.
-    regmatch_t          frag;       ///< Matching group #10.
+    MatchCoord          prefix;     ///< URI prefix (scheme + authority).
+    MatchCoord          path;       ///< URI path (including fragment).
+    MatchCoord          frag;       ///< URI fragment.
 };
 
 
@@ -72,8 +79,8 @@ typedef struct link_map {
  */
 
 uint32_t LSUP_default_dtype_key = 0;
-regex_t *LSUP_uri_ptn;
 LSUP_Term *LSUP_default_datatype = NULL;
+LSUP_TermSet *LSUP_term_cache = NULL;
 
 
 /*
@@ -83,6 +90,10 @@ LSUP_Term *LSUP_default_datatype = NULL;
 // Characters not allowed in a URI string.
 static const char *invalid_uri_chars = "<>\" {}|\\^`";
 
+/// Minimum valid type code.
+static const LSUP_TermType MIN_VALID_TYPE = LSUP_TERM_IRIREF;
+/// Maximum valid type code. Change this if adding to enum LSUP_TermType.
+static const LSUP_TermType MAX_VALID_TYPE = LSUP_TERM_BNODE;
 
 /*
  * Static prototypes.
@@ -146,6 +157,9 @@ link_map_free_fn (void *item)
 }
 
 
+static LSUP_rc parse_iri (char *iri, MatchCoord coords[]);
+
+
  /*
  * Term API.
  */
@@ -235,9 +249,11 @@ LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
         return NULL;
     }
 
-    char *data, *pfx = LSUP_iriref_prefix (iri);
+    char
+        *data,
+        *pfx = LSUP_iriref_prefix (iri);
 
-    if (pfx) data = iri->data;
+    if (strlen (pfx) > 0) data = iri->data;
 
     else if (iri->data[0] == '/') {
         free (pfx);
@@ -392,12 +408,11 @@ LSUP_iriref_prefix (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->prefix.rm_so == -1) return NULL;
-
-    size_t len = iri->iri_info->prefix.rm_eo - iri->iri_info->prefix.rm_so;
-    if (len == 0) return NULL;
+    // if (iri->iri_info->prefix.size == 0) return NULL;
 
-    return strndup (iri->data + iri->iri_info->prefix.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->prefix.offset,
+            iri->iri_info->prefix.size);
 }
 
 
@@ -409,12 +424,11 @@ LSUP_iriref_path (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->path.rm_so == -1) return NULL;
-
-    size_t len = iri->iri_info->path.rm_eo - iri->iri_info->path.rm_so;
-    if (len == 0) return NULL;
+    // if (iri->iri_info->path.size == 0) return NULL;
 
-    return strndup (iri->data + iri->iri_info->path.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->path.offset,
+            iri->iri_info->path.size);
 }
 
 
@@ -426,11 +440,11 @@ LSUP_iriref_frag (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->frag.rm_so == -1) return NULL;
+    // if (iri->iri_info->frag.size == 0) return NULL;
 
-    size_t len = iri->iri_info->frag.rm_eo - iri->iri_info->frag.rm_so;
-
-    return strndup (iri->data + iri->iri_info->frag.rm_so, len);
+    return strndup (
+            iri->data + iri->iri_info->frag.offset,
+            iri->iri_info->frag.size);
 }
 
 
@@ -654,7 +668,7 @@ LSUP_link_map_add (
         KeyedTerm *ins;
         MALLOC_GUARD (ins, LSUP_MEM_ERR);
         memcpy (ins, &entry_s, sizeof (entry_s));
-        Link link = {.term=ins, tset=tset};
+        Link link = {.term=ins, .tset=tset};
         hashmap_set (cmap->links, &link);
     }
 
@@ -740,12 +754,8 @@ term_init (
         LSUP_Term *term, LSUP_TermType type,
         const char *data, void *metadata)
 {
-    if (UNLIKELY (!LSUP_uri_ptn)) {
-        log_error ("Environment not initialized. Did you call LSUP_init()?");
-        return LSUP_ERROR;
-    }
     // This can never be LSUP_TERM_UNDEFINED.
-    if (type == LSUP_TERM_UNDEFINED) {
+    if (type < MIN_VALID_TYPE || type > MAX_VALID_TYPE) {
         log_error ("%d is not a valid term type.", type);
         return LSUP_VALUE_ERR;
     }
@@ -778,9 +788,9 @@ term_init (
             }
 
             // Capture interesting IRI parts.
-            regmatch_t matches[11];
-            if (UNLIKELY (regexec (LSUP_uri_ptn, fquri, 11, matches, 0) != 0)) {
-                fprintf (stderr, "Error matching URI pattern.\n");
+            MatchCoord matches[7] = {};  // Initialize all to 0.
+            if (UNLIKELY (parse_iri (fquri, matches) != LSUP_OK)) {
+                log_error ("Error matching URI pattern.");
 
                 return LSUP_VALUE_ERR;
             }
@@ -789,8 +799,8 @@ term_init (
             MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
 
             term->iri_info->prefix = matches[1];
-            term->iri_info->path = matches[5];
-            term->iri_info->frag = matches[10];
+            term->iri_info->path = matches[4];
+            term->iri_info->frag = matches[6];
             term->iri_info->nsm = metadata;
         }
 
@@ -813,12 +823,12 @@ term_init (
                 MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
 
                 // Allocate IRI match patterns manually.
-                term->iri_info->prefix.rm_so = 0;
-                term->iri_info->prefix.rm_eo = 4;
-                term->iri_info->path.rm_so = 4;
-                term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
-                term->iri_info->frag.rm_so = -1;
-                term->iri_info->frag.rm_eo = -1;
+                term->iri_info->prefix.offset = 0;
+                term->iri_info->prefix.size = 4;
+                term->iri_info->path.offset = 4;
+                term->iri_info->path.size = UUIDSTR_SIZE + 6;
+                term->iri_info->frag.offset = 0;
+                term->iri_info->frag.size = 0;
                 term->iri_info->nsm = NULL;
 
             } else term->data = strdup (uuid_str);
@@ -881,6 +891,134 @@ term_init (
 }
 
 
+/**
+ * @brief Scan an IRI string and record the position of its parts.
+ *
+ * Experimental replacement of a regex engine for better performance.
+ *
+ * Slightly adapted from regex on
+ * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
+ * parts of the IRI.
+ *
+ * Reference regex and group numbering (here `(?...)` marks a non-capturing
+ * group):
+ * ^((?([^:/?#]+):)?(?//([^/?#]*))?)((?[^?#]*)(?\?([^#]*))?(?#(.*))?)
+ *  1  2                3           4             5           6
+ *
+ * Capturing groups:
+ *
+ * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
+ * #1: Prefix (http://example.org)
+ * #2: Scheme (http)
+ * #3: Authority (example.org)
+ * #4: Path, including query and fragment (/123/456/?query=blah#frag)
+ * #5: Query (query=blah)
+ * #6: Fragment (frag)
+ *
+ * Groups #0, #1 and #4 are always written. Groups #2, #3, #5 and #6 are
+ * written only when the corresponding part is found, so the caller must
+ * zero-initialize the array beforehand.
+ *
+ * @param[in] iri_str IRI string to parse. It must be NUL-terminated.
+ *
+ * @param[out] coord Coordinates to be stored. This must be a pre-allocated
+ * array of at least 7 elements, indexed by the group numbers above.
+ *
+ * Each element stores the offset of a match from the start of the string and
+ * the length of the match. A length of 0 indicates no match.
+ *
+ * @return LSUP_OK. No other code is returned by the current implementation.
+ */
+static LSUP_rc
+parse_iri (char *iri_str, MatchCoord coord[]) {
+    char *cur = iri_str;
+    size_t iri_len = strlen (iri_str);
+    MatchCoord tmp = {};  // Temporary storage for capture groups
+
+    // Zeroing here is redundant if only called by term_init.
+    // memset (coord, 0, sizeof(*coord));
+
+    //log_debug ("Parsing IRI: %s", iri_str);
+    // #2: ([^:/?#]+)
+    while (
+            *cur != ':' && *cur != '/' && *cur != '?'
+            && *cur != '#' && *cur != '\0') {
+        tmp.size++;
+        cur++;
+    }
+
+    // Non-capturing: (?([^:/?#]+):)?
+    if (tmp.size > 0 && *cur == ':') {
+        // Got capture group #2 (scheme). Store it.
+        coord[2].offset = 0;
+        coord[2].size = tmp.size;
+        cur++;
+        //log_debug ("Group #2: %lu, %lu", coord[2].offset, coord[2].size);
+    } else cur = iri_str;  // Backtrack if no match.
+
+    // Non-capturing: (?//([^/?#]*))?
+    if (*cur == '/' && *(cur + 1) == '/') {
+        cur += 2;
+        tmp.offset = cur - iri_str;
+        tmp.size = 0;
+
+        // #3: ([^/?#]*)
+        while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
+            tmp.size++;
+            cur++;
+        }
+        coord[3].offset = tmp.offset;
+        coord[3].size = tmp.size;
+        //log_debug ("Group #3: %lu, %lu", coord[3].offset, coord[3].size);
+    }
+
+    // Capture group #1: everything consumed so far (scheme + authority).
+    coord[1].offset = 0;
+    coord[1].size = cur - iri_str;
+    //log_debug ("Group #1: %lu, %lu", coord[1].offset, coord[1].size);
+
+    tmp.offset = cur - iri_str;
+    tmp.size = 0;
+
+    coord[4].offset = tmp.offset;
+    coord[4].size = iri_len - tmp.offset;  // Extends to the end of the string.
+    //log_debug ("Group #4: %lu, %lu", coord[4].offset, coord[4].size);
+
+    // Non-capturing: (?[^?#]*)
+    while (*cur != '?' && *cur != '#' && *cur != '\0') {
+        tmp.size++;
+        cur++;
+    }
+
+    // Non-capturing: (?\?([^#]*))?
+    if (*cur == '?') {
+        // #5: ([^#]*)
+        tmp.offset = ++cur - iri_str;
+        tmp.size = 0;
+        while (*cur != '#' && *cur != '\0') {
+            tmp.size++;
+            cur++;
+        }
+
+        if (tmp.size > 0) {
+            // Got capture group #5.
+            coord[5].offset = tmp.offset;
+            coord[5].size = tmp.size;
+            //log_debug ("Group #5: %lu, %lu", coord[5].offset, coord[5].size);
+        }
+    }
+
+    // Non-capturing: (?#(.*))?
+    if (*cur == '#') {
+        // #6: (.*)
+        coord[6].offset = ++cur - iri_str;
+        coord[6].size = iri_str + iri_len - cur;
+        //log_debug ("Group #6: %lu, %lu", coord[6].offset, coord[6].size);
+    }
+
+    coord[0].offset = 0;
+    coord[0].size = iri_len;
+    //log_debug ("Full match: %lu, %lu", coord[0].offset, coord[0].size);
+
+    return LSUP_OK;
+}
+
+
 /*
  * Extern inline functions.
  */

+ 3 - 3
test/assets/triples.h

@@ -1,5 +1,5 @@
-#ifndef _TEST_ASSETS_H
-#define _TEST_ASSETS_H
+#ifndef _TEST_ASSETS_TRIPLES_H
+#define _TEST_ASSETS_TRIPLES_H
 
 #include "term.h"
 
@@ -71,5 +71,5 @@ void free_triples (LSUP_Triple **trp)
 
     free(trp);
 }
-#endif  /* _TEST_ASSETS_H */
+#endif  /* _TEST_ASSETS_TRIPLES_H */
 

+ 59 - 0
test/test_term.c

@@ -63,6 +63,64 @@ static int test_iriref()
 }
 
 
+/// Check that IRI prefix, path and fragment are extracted as expected.
+static int test_iriref_parts()
+{
+    // Columns: input IRI, expected prefix, expected path, expected fragment.
+    // An empty string is expected where a part is absent.
+    char *data[][4] = {
+        {"http://example.org", "http://example.org", "", ""},
+        {"http://example.org/", "http://example.org", "/", ""},
+        {"http://example.org?option", "http://example.org", "?option", ""},
+        {"http://example.org/?option", "http://example.org", "/?option", ""},
+        {
+                "http://example.org#anchor",
+                "http://example.org", "#anchor", "anchor"},
+        {
+                "http://example.org/#anchor",
+                "http://example.org", "/#anchor", "anchor"},
+        {
+                "http://example.org/?option#anchor",
+                "http://example.org", "/?option#anchor", "anchor"},
+        {
+                "http://example.org?option#anchor",
+                "http://example.org", "?option#anchor", "anchor"},
+        {
+                "http://hanzi.edu/漢魏六朝隋碑誌索引/53?option#anchor",
+                "http://hanzi.edu", "/漢魏六朝隋碑誌索引/53?option#anchor",
+                "anchor"},
+        {"ftp:///", "ftp://", "/", ""},
+        {
+                "file:///usr/local/lib/liblsuprdf.so",
+                "file://", "/usr/local/lib/liblsuprdf.so", ""},
+        {"/", "", "/", ""},
+        {"/tmp", "", "/tmp", ""},
+        {"./tmp", "", "./tmp", ""},
+        {"tmp/test.nt", "", "tmp/test.nt", ""},
+        {"", "", "", ""},
+        {"#hello", "", "#hello", "hello"},
+        {
+                "urn:uuid:950404b6-0e4f-4e21-8267-c8c00e83563b",
+                "urn:", "uuid:950404b6-0e4f-4e21-8267-c8c00e83563b", ""}
+    };
+    // Row count derived from the table, so adding a case needs no other edit.
+    size_t data_ct = sizeof (data) / sizeof (data[0]);
+
+    for (size_t i = 0; i < data_ct; i++) {
+        LSUP_Term *iri = LSUP_iriref_new(data[i][0], NULL);
+        char
+            *pfx = LSUP_iriref_prefix (iri),
+            *path = LSUP_iriref_path (iri),
+            *frag = LSUP_iriref_frag (iri);
+
+        EXPECT_STR_EQ (pfx, data[i][1]);
+        EXPECT_STR_EQ (path, data[i][2]);
+        EXPECT_STR_EQ (frag, data[i][3]);
+
+        // The accessors return heap copies that the caller must release.
+        free (pfx);
+        free (path);
+        free (frag);
+        LSUP_term_free (iri);
+    }
+
+    return 0;
+}
+
 static int test_iriref_abs_rel()
 {
     LSUP_NSMap *nsm1 = LSUP_nsmap_new();
@@ -294,6 +352,7 @@ static int test_term_to_key()
 
 int term_tests() {
     RUN (test_iriref);
+    RUN (test_iriref_parts);
     RUN (test_iriref_abs_rel);
     RUN (test_literal);
     RUN (test_term_copy);