Browse Source

Complete IRI parsing.

scossu 1 week ago
parent
commit
dded6705b6
8 changed files with 142 additions and 103 deletions
  1. 4 4
      Makefile
  2. 30 33
      docs/dev/deps.dot
  3. BIN
      docs/dev/deps.pdf
  4. 1 27
      include/term.h
  5. 7 2
      src/environment.c
  6. 42 34
      src/term.c
  7. 3 3
      test/assets/triples.h
  8. 55 0
      test/test_term.c

+ 4 - 4
Makefile

@@ -26,7 +26,7 @@ DBG_CFLAGS = $(_CFLAGS) -Itest -O0 -g3 -DDEBUG
 #$(info DBG_CFLAGS: $(DBG_CFLAGS))
 # NOTE: -luuid is a Linux system library. Other OS's might need a different
 # link or a non-system library built.
-LDFLAGS := -L. -L$(outdir) -L$(libdir) -llmdb -lxxhash -luuid
+LDFLAGS := -L$(libdir) -L$(outdir) -L. -llmdb -lxxhash -luuid
 
 PARSER = bin/lemon
 LEMON_SRC = ext/sqlite/tool/lemon.c
@@ -66,10 +66,10 @@ LIBS = $(STATIC_LIB) $(DYN_LIB)
 DBG_LIBS = $(STATIC_DBG_LIB) $(DYN_DBG_LIB)
 
 # LDD for Linux, otool -L for OSX.
-ifdef $(shell which ldd 2> /dev/null)
-LDD := ldd
-else
+ifeq (, $(shell which ldd))
 LDD := otool -L
+else
+LDD := ldd
 endif
 
 # For visual dep graph.

+ 30 - 33
docs/dev/deps.dot

@@ -5,48 +5,45 @@ digraph "source tree" {
     fontsize="16";
     fontname="Helvetica";
 	clusterrank="local";
+	"py_lsup_rdf" -> "py_graph"
 	"term" -> "namespace"
-	"profile" -> "lsup_rdf"
+	"store" -> "store_mdb"
 	"store_htable" -> "buffer"
-	"grammar_ttl" -> "codec"
-	"graph" -> "environment"
-	"lsup_rdf" -> "codec_ttl"
-	"store_interface" -> "environment"
-	"lsup_rdf" -> "codec_nt"
 	"namespace" -> "hashmap"
-	"core" -> "lmdb"
-	"environment" -> "term"
-	"py_triple" -> "py_term"
-	"core" -> "xxhash"
-	"store_mdb" -> "buffer"
+	"profile" -> "lsup_rdf"
+	"py_graph" -> "codec_ttl"
+	"store_mdb" -> "store_interface"
+	"py_term" -> "py_namespace"
+	"grammar_nt" -> "codec"
 	"py_namespace" -> "namespace"
-	"store_htable" -> "store_interface"
 	"term" -> "buffer"
+	"parser_nt" -> "codec"
+	"term" -> "tpl"
+	"store_mdb" -> "buffer"
+	"environment" -> "bootstrap"
+	"py_graph" -> "graph"
+	"lsup_rdf" -> "codec_ttl"
 	"graph" -> "store"
-	"py_lsup_rdf" -> "py_graph"
-	"store_htable" -> "hashmap"
+	"parser_nt" -> "tokens_nt"
+	"graph" -> "environment"
 	"py_term" -> "term"
-	"core" -> "log"
-	"parser_ttl" -> "codec"
-	"py_graph" -> "codec_ttl"
-	"store" -> "store_htable"
-	"codec_nt" -> "parser_nt"
+	"codec_ttl" -> "parser_ttl"
+	"store_htable" -> "hashmap"
+	"grammar_ttl" -> "codec"
 	"buffer" -> "core"
-	"py_term" -> "py_namespace"
-	"store_mdb" -> "store_interface"
-	"parser_nt" -> "tokens_nt"
-	"store" -> "store_mdb"
-	"parser_nt" -> "codec"
-	"parser_ttl" -> "tokens_ttl"
 	"graph" -> "term"
-	"environment" -> "bootstrap"
-	"store_mdb" -> "lmdb"
-	"term" -> "tpl"
-	"codec" -> "graph"
-	"codec_ttl" -> "parser_ttl"
-	"py_graph" -> "graph"
-	"py_graph" -> "py_triple"
-	"grammar_nt" -> "codec"
+	"lsup_rdf" -> "codec_nt"
 	"py_graph" -> "codec_nt"
+	"core" -> "log"
+	"py_triple" -> "py_term"
+	"py_graph" -> "py_triple"
+	"codec_nt" -> "parser_nt"
+	"codec" -> "graph"
+	"store" -> "store_htable"
+	"store_interface" -> "environment"
+	"parser_ttl" -> "codec"
+	"store_htable" -> "store_interface"
 	"namespace" -> "core"
+	"environment" -> "term"
+	"parser_ttl" -> "tokens_ttl"
 }

BIN
docs/dev/deps.pdf


+ 1 - 27
include/term.h

@@ -15,29 +15,6 @@
 #define DEFAULT_DTYPE       "http://www.w3.org/2001/XMLSchema#string"
 #define DEFAULT_DTYPE_NS    "xsd:string"
 
-/** @brief URI parsing regular expression.
- *
- * Based on RFC3986 (see https://tools.ietf.org/html/rfc3986#appendix-B) and
- * modified for use in this application. Relevant matching groups are the
- * following, for a sample URI `http://example.org/123/456/?query=blah#frag`:
- *
- * #0:  Full parsed URI (http://example.org/123/456/?query=blah#frag)
- * #1:  Domain prefix (http://example.org)
- * #2:  Protocol (http:)
- * #4:  Authority (example.org)
- * #5:  Path relative to domain (/123/456/?query=blah#frag)
- * #6:  Path, excluding query and fragment (/123/456/)
- * #8:  Query (query=blah)
- * #10: Fragment (frag)
- *
- * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:` and
- * the path (#4) is `s:0`.
- *
- * TODO Remove. Superseded by ad-hoc scanning (see static parse_ini in term.c.)
- */
-#define LSUP_URI_REGEX_STR \
-    "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
-
 
 /*
  * Data types.
@@ -92,11 +69,8 @@ typedef struct term_t {
 
 
 /** @brief Whether the environment is already initialized.
- *
- * @TODO Check if the default NS was inserted; this would be slower but more
- * accurate.
  */
-#define LSUP_IS_INIT (LSUP_term_cache != NULL)
+#define LSUP_IS_INIT (LSUP_default_datatype != NULL)
 
 
 /** @brief RDF triple.

+ 7 - 2
src/environment.c

@@ -54,18 +54,21 @@ LSUP_init (void)
     if (UNLIKELY (!LSUP_default_ctx_buf)) return LSUP_ERROR;
 
     // Initialize term cache.
-    // This is the indicator for LSUP_IS_INIT.
     LSUP_term_cache = LSUP_term_set_new();
     if (UNLIKELY (!LSUP_term_cache)) return LSUP_MEM_ERR;
 
     // Create and cache default literal datatype key.
     // This will be done only once in the program, so no need to check for
     // duplicates.
+    // This is the last operation that can fail in this function, and it is
+    // the indicator for LSUP_IS_INIT.
     LSUP_default_datatype = LSUP_iriref_new (DEFAULT_DTYPE, NULL);
     LSUP_rc rc = LSUP_term_set_add (
             LSUP_term_cache, LSUP_default_datatype, NULL);
     PRCCK (rc);
 
+    log_info ("LSUP environment initialized.");
+
     // Set automatic teardown TODO Is this a good idea?
     atexit (LSUP_done);
 
@@ -85,7 +88,9 @@ LSUP_done (void)
 
     // Free ID cache, including default literal datatype.
     hashmap_free (LSUP_term_cache);
-    LSUP_term_cache = NULL; // This causes LSUP_IS_INIT to return false.
+    LSUP_default_datatype = NULL; // This causes LSUP_IS_INIT to return false.
+
+    log_info ("LSUP environment torn down.");
 }
 
 

+ 42 - 34
src/term.c

@@ -90,6 +90,10 @@ LSUP_TermSet *LSUP_term_cache = NULL;
 // Characters not allowed in a URI string.
 static const char *invalid_uri_chars = "<>\" {}|\\^`";
 
+/// Minimum valid type code.
+static const LSUP_TermType MIN_VALID_TYPE = LSUP_TERM_IRIREF;
+/// Maximum valid type code. Change this if adding to enum LSUP_TermType.
+static const LSUP_TermType MAX_VALID_TYPE = LSUP_TERM_BNODE;
 
 /*
  * Static prototypes.
@@ -245,9 +249,11 @@ LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
         return NULL;
     }
 
-    char *data, *pfx = LSUP_iriref_prefix (iri);
+    char
+        *data,
+        *pfx = LSUP_iriref_prefix (iri);
 
-    if (pfx) data = iri->data;
+    if (strlen (pfx) > 0) data = iri->data;
 
     else if (iri->data[0] == '/') {
         free (pfx);
@@ -402,7 +408,7 @@ LSUP_iriref_prefix (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->prefix.size == 0) return NULL;
+    // if (iri->iri_info->prefix.size == 0) return NULL;
 
     return strndup (
             iri->data + iri->iri_info->prefix.offset,
@@ -418,7 +424,7 @@ LSUP_iriref_path (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->path.size == 0) return NULL;
+    // if (iri->iri_info->path.size == 0) return NULL;
 
     return strndup (
             iri->data + iri->iri_info->path.offset,
@@ -434,7 +440,7 @@ LSUP_iriref_frag (const LSUP_Term *iri)
         return NULL;
     }
 
-    if (iri->iri_info->frag.size == 0) return NULL;
+    // if (iri->iri_info->frag.size == 0) return NULL;
 
     return strndup (
             iri->data + iri->iri_info->frag.offset,
@@ -748,12 +754,8 @@ term_init (
         LSUP_Term *term, LSUP_TermType type,
         const char *data, void *metadata)
 {
-    if (UNLIKELY (!LSUP_IS_INIT)) {
-        log_error ("Environment not initialized. Did you call LSUP_init()?");
-        return LSUP_ERROR;
-    }
     // This can never be LSUP_TERM_UNDEFINED.
-    if (type == LSUP_TERM_UNDEFINED) {
+    if (type < MIN_VALID_TYPE || type > MAX_VALID_TYPE) {
         log_error ("%d is not a valid term type.", type);
         return LSUP_VALUE_ERR;
     }
@@ -928,6 +930,10 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
     size_t iri_len = strlen (iri_str);
     MatchCoord tmp = {};  // Temporary storage for capture groups
 
+    // Redundant if only called by term_init.
+    // memset (coord, 0, sizeof(*coord));
+
+    log_debug ("Parsing IRI: %s", iri_str);
     // #2: ([^:/?#]+)
     while (
             *cur != ':' && *cur != '/' && *cur != '?'
@@ -937,51 +943,50 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
     }
 
     // Non-capturing: (?([^:/?#]+):)?
-    if (tmp.size > 0 && *(++cur) == ':') {
+    if (tmp.size > 0 && *cur == ':') {
         // Got capture groups #2 and #3. Store them.
-        tmp.size++;
-        coord[3].offset = tmp.offset;
-        coord[3].size = tmp.size - 1;
-    }
+        coord[2].offset = 0;
+        coord[2].size = tmp.size;
+        cur++;
+        log_debug ("Group #2: %lu, %lu", coord[2].offset, coord[2].size);
+    } else cur = iri_str;  // Backtrack if no match.
 
     // Non-capturing: (?//([^/?#]*))?
-    if (*(cur + 1) == '/' && *(cur + 2) == '/') {
-        cur++;
-        tmp.offset = cur - iri_str;
-        tmp.size = 2;
+    if (*cur == '/' && *(cur + 1) == '/') {
         cur += 2;
+        tmp.offset = cur - iri_str;
+        tmp.size = 0;
 
         // #3: ([^/?#]*)
         while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
             tmp.size++;
             cur++;
         }
-
-        // Maybe got capture group #5.
-        coord[3].offset = tmp.offset + 2;
-        coord[3].size = tmp.size -2;
+        coord[3].offset = tmp.offset;
+        coord[3].size = tmp.size;
+        log_debug ("Group #3: %lu, %lu", coord[3].offset, coord[3].size);
     }
 
-    // Capture group 1 and advance cursor.
+    // Capture group 1.
     coord[1].offset = 0;
-    coord[1].size = cur++ - iri_str;
+    coord[1].size = cur - iri_str;
+    log_debug ("Group #1: %lu, %lu", coord[1].offset, coord[1].size);
 
-    // Non-capturing: (?[^?#]*)
     tmp.offset = cur - iri_str;
     tmp.size = 0;
+
+    coord[4].offset = tmp.offset;
+    coord[4].size = iri_len - tmp.offset;
+    log_debug ("Group #4: %lu, %lu", coord[4].offset, coord[4].size);
+
+    // Non-capturing: (?[^?#]*)
     while (*cur != '?' && *cur != '#' && *cur != '\0') {
         tmp.size++;
         cur++;
     }
 
-    if (tmp.size > 0) {
-        coord[4].offset = tmp.offset;
-        coord[4].size = iri_str + iri_len - cur;
-
-    } else return LSUP_NORESULT;  // This group is the only mandatory match.
-
     // Non-capturing: (?\?([^#]*))
-    if (*(++cur) == '?') {
+    if (*cur == '?') {
         // 5: ([^#]*)
         tmp.offset = ++cur - iri_str;
         tmp.size = 0;
@@ -994,18 +999,21 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
             // Got capture group #5.
             coord[5].offset = tmp.offset;
             coord[5].size = tmp.size;
+            log_debug ("Group #5: %lu, %lu", coord[5].offset, coord[5].size);
         }
     }
 
     // Non-capturing: (?#(.*))?
-    if (*(++cur) == '#') {
+    if (*cur == '#') {
         // #6: (.*)
         coord[6].offset = ++cur - iri_str;
         coord[6].size = iri_str + iri_len - cur;
+        log_debug ("Group #6: %lu, %lu", coord[6].offset, coord[6].size);
     }
 
     coord[0].offset = 0;
     coord[0].size = iri_len;
+    log_debug ("Full match: %lu, %lu", coord[0].offset, coord[0].size);
 
     return LSUP_OK;
 }

+ 3 - 3
test/assets/triples.h

@@ -1,5 +1,5 @@
-#ifndef _TEST_ASSETS_H
-#define _TEST_ASSETS_H
+#ifndef _TEST_ASSETS_TRIPLES_H
+#define _TEST_ASSETS_TRIPLES_H
 
 #include "term.h"
 
@@ -71,5 +71,5 @@ void free_triples (LSUP_Triple **trp)
 
     free(trp);
 }
-#endif  /* _TEST_ASSETS_H */
+#endif  /* _TEST_ASSETS_TRIPLES_H */
 

+ 55 - 0
test/test_term.c

@@ -63,6 +63,60 @@ static int test_iriref()
 }
 
 
+static int test_iriref_parts()  // Table-driven check of the prefix, path and fragment accessors on IRI refs.
+{
+    char *data[17][4] = {  // Row layout: {input IRI, expected prefix, expected path, expected fragment}.
+        {"http://example.org", "http://example.org", "", ""},
+        {"http://example.org/", "http://example.org", "/", ""},
+        {"http://example.org?option", "http://example.org", "?option", ""},
+        {"http://example.org/?option", "http://example.org", "/?option", ""},
+        {  // From here on, note that the expected path column keeps the query and the fragment.
+                "http://example.org#anchor",
+                "http://example.org", "#anchor", "anchor"},
+        {
+                "http://example.org/#anchor",
+                "http://example.org", "/#anchor", "anchor"},
+        {
+                "http://example.org/?option#anchor",
+                "http://example.org", "/?option#anchor", "anchor"},
+        {
+                "http://example.org?option#anchor",
+                "http://example.org", "?option#anchor", "anchor"},
+        {"ftp:///", "ftp://", "/", ""},
+        {
+                "file:///usr/local/lib/liblsuprdf.so",
+                "file://", "/usr/local/lib/liblsuprdf.so", ""},
+        {"/", "", "/", ""},
+        {"/tmp", "", "/tmp", ""},
+        {"./tmp", "", "./tmp", ""},
+        {"tmp/test.nt", "", "tmp/test.nt", ""},
+        {"", "", "", ""},  // Empty input: every part is expected to be an empty string, not NULL.
+        {"#hello", "", "#hello", "hello"},
+        {  // No authority part: the expected prefix is only the scheme and its colon.
+                "urn:uuid:950404b6-0e4f-4e21-8267-c8c00e83563b",
+                "urn:", "uuid:950404b6-0e4f-4e21-8267-c8c00e83563b", ""}
+    };
+
+    for (size_t i = 0; i < 17; i++) {  // 17 is the row count of data; keep both in sync when adding cases.
+        LSUP_Term *iri = LSUP_iriref_new(data[i][0], NULL);  // NOTE(review): result is used unchecked.
+        char  // Strings returned by the accessors; this test owns them and frees them below.
+            *pfx = LSUP_iriref_prefix (iri),
+            *path = LSUP_iriref_path (iri),
+            *frag = LSUP_iriref_frag (iri);
+
+        EXPECT_STR_EQ (pfx, data[i][1]);  // Column 1: expected prefix.
+        EXPECT_STR_EQ (path, data[i][2]);  // Column 2: expected path.
+        EXPECT_STR_EQ (frag, data[i][3]);  // Column 3: expected fragment.
+
+        free (pfx);
+        free (path);
+        free (frag);
+        LSUP_term_free (iri);  // Release the term created for this row.
+    }
+
+    return 0;  // Presumably signals success to the RUN harness - confirm against the test macros.
+}
+
 static int test_iriref_abs_rel()
 {
     LSUP_NSMap *nsm1 = LSUP_nsmap_new();
@@ -294,6 +348,7 @@ static int test_term_to_key()
 
 int term_tests() {
     RUN (test_iriref);
+    RUN (test_iriref_parts);
     RUN (test_iriref_abs_rel);
     RUN (test_literal);
     RUN (test_term_copy);