Forráskód Böngészése

Term reform:

* Rewrite term serialization & deserialization functions.
* Use own serialization format instead of TPL.
* Make VOLK_parse_iri public.
* Add metadata union member in term struct.
scossu 3 hete
szülő
commit
140f1a0ccb
4 módosított fájl, 195 hozzáadás és 172 törlés
  1. 9 1
      include/volksdata/core.h
  2. 55 6
      include/volksdata/term.h
  3. 129 165
      src/term.c
  4. 2 0
      test/test_graph.c

+ 9 - 1
include/volksdata/core.h

@@ -295,7 +295,7 @@ VOLK_strerror (VOLK_rc rc);
     }                                                               \
 } while (0);
 
-/// Jump to `marker` if `exp` returns a negative value (skip warnings).
+/// Log error and jump to `marker` if `exp` is NULL.
 #define NLCHECK(exp, marker) do {                                   \
     if (UNLIKELY ((exp) == NULL)) {                                 \
         log_error ("*** PREMATURE EXIT due to NULL result.");       \
@@ -347,6 +347,14 @@ VOLK_strerror (VOLK_rc rc);
     }                                                               \
 } while (0);
 
+/// Log error and return NULL if `exp` is NULL.
+#define NLNL(exp) do {                                              \
+    if (UNLIKELY ((exp) == NULL)) {                                 \
+        log_error ("*** PREMATURE EXIT due to NULL result.");       \
+        return NULL;                                                \
+    }                                                               \
+} while (0);
+
 /// Allocate one pointer with malloc and return rc if it fails.
 #define MALLOC_GUARD(var, rc) do {                                  \
     (var) = malloc (sizeof *(var));                                 \

+ 55 - 6
include/volksdata/term.h

@@ -20,7 +20,6 @@
 #define DEFAULT_DTYPE       "http://www.w3.org/2001/XMLSchema#string"
 #define DEFAULT_DTYPE_NS    "xsd:string"
 
-
 /*
  * Data types.
  */
@@ -39,20 +38,34 @@ typedef enum {
     VOLK_TERM_BNODE,        ///< Blank node.
 } VOLK_TermType;
 
-/// Opaque IRI information.
-typedef struct iri_info_t VOLK_IRIInfo;
+/// Match coordinates in IRI parsing results.
+typedef struct match_coord_t {
+    unsigned int        offset;     ///< Offset of match from start of string.
+    unsigned int        size;       ///< Length of match.
+} MatchCoord;
+
+/// Matching sub-patterns for IRI parts.
+typedef struct iri_info_t {
+    MatchCoord          prefix;     ///< Prefix (http://example.org).
+    MatchCoord          scheme;     ///< Scheme (http).
+    MatchCoord          auth;       ///< Authority (example.org).
+    MatchCoord          path;       ///< Path, including query and fragment
+                                    ///< (/123/456/?query=blah#frag).
+    MatchCoord          query;      ///< Query (query=blah).
+    MatchCoord          frag;       ///< Fragment (frag).
+} VOLK_IRIInfo;
 
 /// Opaque iterator for link maps.
 typedef struct link_map_iter VOLK_LinkMapIterator;
 
 /// RDF term.
 typedef struct term_t {
-    char *              data;       ///< URI, literal value, or BNode label.
+    char              * data;       ///< URI, literal value, or BNode label.
     union {
         struct term_t * datatype;   ///< Data type IRI for VOLK_TERM_LITERAL.
         VOLK_LangTag    lang;       ///< Lang tag for VOLK_TERM_LT_LITERAL.
-        VOLK_Key        bnode_id;   ///< BNode ID for comparison & skolemization.
-        VOLK_IRIInfo *  iri_info;   ///< IRI information structure.
+        VOLK_Key        bnode_id;   ///< BN ID for comparison & skolemization.
+        void          * metadata;   ///< Generic metadata pointer.
     };
     VOLK_TermType       type;       ///< Term type.
 } VOLK_Term;
@@ -256,6 +269,42 @@ VOLK_literal_new (const char *data, VOLK_Term *datatype)
 { return VOLK_term_new (VOLK_TERM_LITERAL, data, datatype); }
 
 
+/**
+ * @brief scan an IRI string and parse IRI parts.
+ *
+ * Replacement of a regex engine for better performance.
+ *
+ * Slightly adapted from regex on
+ * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
+ * parts of the IRI.
+ *
+ * Reference regex and group numbering:
+ * ^((?:([^:/?#]+):)?(?://([^/?#]*))?)((?:[^?#]*)(?:\?([^#]*))?(?:#(.*))?)
+ *  1   2                 3           4               5            6
+ *
+ * Capturing groups:
+ *
+ * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
+ * #1: Prefix (http://example.org)
+ * #2: Scheme (http)
+ * #3: Authority (example.org)
+ * #4: Path, including query and fragment (/123/456/?query=blah#frag)
+ * #5: Query (query=blah)
+ * #6: Fragment (frag)
+ *
+ *
+ * @param[in] iri_str IRI string to parse.
+ *
+ * @param[out] iri_info Coordinates to be stored. Zeroed before parsing.
+ *
+ * The `offset` member of each `MatchCoord` stores the position of a match
+ * from the start of the string, and the `size` member stores its length. A
+ * size of 0 indicates no match.
+ */
+VOLK_rc
+VOLK_parse_iri (char *iri_str, VOLK_IRIInfo *iri_info);
+
+
 /** @brief Shortcut to create a language-tagged literal term.
  *
  * Must be freed with #VOLK_term_free.

+ 129 - 165
src/term.c

@@ -1,36 +1,16 @@
-#include "tpl.h"
-
 #include "volksdata/term.h"
 
-/** @brief tpl packing format for a term.
- *
- * The pack elements are: 1. term type (char); 2. data (string); 3. void* type
- * metadata, cast to 8-byte unsigned.
- */
-#define TERM_PACK_FMT "csU"
-
 #define MAX_VALID_TERM_TYPE     VOLK_TERM_BNODE /* For type validation. */
+/// Bits to distinguish language-tagged literals.
+#define LIT_NONE 0
+#define LIT_NOLT 1
+#define LIT_LT 2
 
 
 /*
  * Data structures.
  */
 
-/// Sub-match coordinates in IRI parsing results.
-typedef struct match_coord_t {
-    size_t              offset;     ///< Offset of match from start of string.
-    size_t              size;       ///< Length of match.
-} MatchCoord;
-
-
-/// Matching sub-patterns for IRI parts.
-struct iri_info_t {
-    MatchCoord          prefix;     ///< URI prefix (scheme + authority).
-    MatchCoord          path;       ///< URI path (including fragment).
-    MatchCoord          frag;       ///< URI fragment.
-};
-
-
 /// Key-term pair in term set.
 typedef struct keyed_term {
     VOLK_Key            key;        ///< Key (hash) of the term.
@@ -158,9 +138,6 @@ link_map_free_fn (void *item)
 }
 
 
-static VOLK_rc parse_iri (char *iri, MatchCoord coords[]);
-
-
  /*
  * Term API.
  */
@@ -196,39 +173,46 @@ VOLK_term_copy (const VOLK_Term *src)
 }
 
 
+/// See notes in VOLK_term_serialize function body for format info.
 VOLK_Term *
 VOLK_term_new_from_buffer (const VOLK_Buffer *sterm)
 {
     if (UNLIKELY (!sterm)) return NULL;
 
-    VOLK_Term *term = NULL;
-    VOLK_TermType type = VOLK_TERM_UNDEFINED;
-    char *data = NULL;
-    void *metadata;
-
-    tpl_node *tn;
-
-    tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
-    if (UNLIKELY (!tn)) goto finally;
+    VOLK_TermType type;
+    char *data;
+    void *metadata = NULL;
 
-    if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
-        log_error ("Error loading serialized term.");
-        goto finally;
-    }
-    if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
-        log_error ("Error unpacking serialized term.");
-        goto finally;
+    // Copy term type.
+    size_t cplen = sizeof (type);
+    char *cpcur = (char *)sterm->addr;
+    memcpy (&type, cpcur, cplen);
+
+    // Copy term data.
+    cpcur += cplen;
+    cplen = strlen (cpcur) + 1;
+    data = malloc (cplen);
+    NLNL (data);
+    memcpy (data, cpcur, cplen);
+
+    // If applicable, create term metadata.
+    cpcur += cplen;
+    if (type == VOLK_TERM_LITERAL) {
+        if (strlen(cpcur) > 0)
+            NLNL (metadata = (void *) VOLK_iriref_new (cpcur));
+    } else if (type == VOLK_TERM_LT_LITERAL) {
+        cplen = sizeof (VOLK_LangTag);
+        metadata = malloc (cplen);
+        NLNL (metadata);
+        memcpy (metadata, cpcur, cplen);
     }
 
-    if (type == VOLK_TERM_LT_LITERAL)
-        term = VOLK_lt_literal_new (data, (char *)&metadata);
-    else term = VOLK_term_new (type, data, metadata);
+    VOLK_Term *ret = VOLK_term_new (type, data, metadata);
 
-finally:
-    tpl_free (tn);
     free (data);
+    if (type == VOLK_TERM_LT_LITERAL) free (metadata);
 
-    return term;
+    return ret;
 }
 
 
@@ -297,45 +281,75 @@ VOLK_iriref_new_rel (const VOLK_Term *root, const VOLK_Term *iri)
 VOLK_Buffer *
 VOLK_term_serialize (const VOLK_Term *term)
 {
-    /*
+    /* The serialized data are a byte string (unsigned char *) formatted in
+     * the following way:
+     *
+     * - term->type, copied as sizeof (VOLK_TermType) raw bytes
+     * - (char *) NUL-delimited term->data
+     * - serialized metadata as byte string (literal terms only)
+     *
+     * The first field has a fixed size and the second is NUL-delimited. The
+     * third is a NUL-delimited string for VOLK_TERM_LITERAL and a fixed-size
+     * VOLK_LangTag for VOLK_TERM_LT_LITERAL, so all fields are identifiable.
+     *
+     * Metadata are:
+     *
+     * - For VOLK_TERM_IRIREF, no data. IRI info is calculated on demand.
+     * - For VOLK_TERM_LITERAL, the fully-qualified data type URI as a
+     *   `NUL`-delimited string, or a zero-length string for the default data
+     *   type (`xsd:string`).
+     * - For VOLK_TERM_LT_LITERAL, the language tag as a raw `VOLK_LangTag`.
+     * - For VOLK_TERM_BNODE, no data. Skolemization ID is calculated on
+     *   deserialization.
+     *
      * In serializing a term, the fact that two terms of different types may
      * be semantically identical must be taken into account. Specifically, a
-     * namespace-prefixed IRI ref is identical to its fully qualified version,
-     * and a VOLK_TERM_LT_LITERAL with no language tag is identical to a
+     * VOLK_TERM_LT_LITERAL with no language tag is identical to a
      * VOLK_TERM_LITERAL of xsd:string type, made up of the same string. Such
      * terms must have identical serializations.
      */
 
     if (UNLIKELY (!term)) return NULL;
 
-    VOLK_Term *tmp_term;
-    void *metadata = NULL;
-
-    if (term->type == VOLK_TERM_LT_LITERAL) {
-        // For LT literals with empty lang tag, convert to a normal xsd:string.
-        if (strlen (term->lang) == 0)
-            tmp_term = VOLK_literal_new (term->data, NULL);
-        else {
-            tmp_term = VOLK_lt_literal_new (term->data, (char *) term->lang);
-            memcpy (&metadata, tmp_term->lang, sizeof (metadata));
-        }
-    } else if (term->type == VOLK_TERM_LITERAL) {
-        tmp_term = VOLK_term_new (term->type, term->data, term->datatype);
-        metadata = tmp_term->datatype;
-    } else tmp_term = VOLK_term_copy (term);
-
     VOLK_Buffer *sterm;
     CALLOC_GUARD (sterm, NULL);
 
-    //LOG_TRACE("Effective term being serialized: %s", tmp_term->data);
-    int rc = tpl_jot (
-            TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
-            &tmp_term->type, &tmp_term->data, &metadata);
-    VOLK_term_free (tmp_term);
+    sterm->size = sizeof(VOLK_TermType) + strlen(term->data) + 1;
+    NLNL (sterm->addr = malloc (sterm->size));
 
-    if (rc != 0) {
-        VOLK_buffer_free (sterm);
-        return NULL;
+    // Copy term type.
+    size_t offset = 0;
+    size_t cplen = sizeof(term->type);
+    memcpy (sterm->addr, &term->type, cplen);
+
+    // Copy term data.
+    offset += cplen;
+    cplen = strlen (term->data) + 1;
+    memcpy (sterm->addr + offset, term->data, cplen);
+
+    // If applicable, copy literal metadata.
+    offset += cplen;
+    // Copy data type URI string or lang tag.
+    if (term->type == VOLK_TERM_LITERAL) {
+        // Non-language-tagged term.
+        // Don't store default datatype (xsd:string).
+        if (term->datatype == VOLK_default_datatype) {
+            NLNL (sterm->addr = realloc (sterm->addr, ++sterm->size));
+            sterm->addr[offset] = '\0';
+        }
+        else {
+            cplen = strlen (term->datatype->data) + 1;
+            sterm->size += cplen;
+            NLNL (sterm->addr = realloc (sterm->addr, sterm->size));
+            memcpy (
+                sterm->addr + offset, term->datatype->data, cplen);
+        }
+    } else if (term->type == VOLK_TERM_LT_LITERAL) {
+        // Language-tagged term.
+        cplen = sizeof (VOLK_LangTag);
+        sterm->size += cplen;
+        NLNL (sterm->addr = realloc (sterm->addr, sterm->size));
+        memcpy (sterm->addr + offset, &term->lang, sizeof (VOLK_LangTag));
     }
 
     return sterm;
@@ -363,8 +377,13 @@ VOLK_term_free (VOLK_Term *term)
 {
     if (UNLIKELY (!term)) return;
 
-    if (term->type == VOLK_TERM_IRIREF) free (term->iri_info);
     free (term->data);
+    /*
+    if (
+            term->type == VOLK_TERM_LITERAL &&
+            term->datatype != VOLK_default_datatype)
+        free (term->datatype);
+    */
     free (term);
 }
 
@@ -378,10 +397,11 @@ VOLK_iriref_prefix (const VOLK_Term *iri)
     }
 
     // if (iri->iri_info->prefix.size == 0) return NULL;
+    VOLK_IRIInfo iri_info;
+    RCNL (VOLK_parse_iri (iri->data, &iri_info));
 
     return strndup (
-            iri->data + iri->iri_info->prefix.offset,
-            iri->iri_info->prefix.size);
+            iri->data + iri_info.prefix.offset, iri_info.prefix.size);
 }
 
 
@@ -394,10 +414,10 @@ VOLK_iriref_path (const VOLK_Term *iri)
     }
 
     // if (iri->iri_info->path.size == 0) return NULL;
+    VOLK_IRIInfo iri_info;
+    RCNL (VOLK_parse_iri (iri->data, &iri_info));
 
-    return strndup (
-            iri->data + iri->iri_info->path.offset,
-            iri->iri_info->path.size);
+    return strndup (iri->data + iri_info.path.offset, iri_info.path.size);
 }
 
 
@@ -410,10 +430,10 @@ VOLK_iriref_frag (const VOLK_Term *iri)
     }
 
     // if (iri->iri_info->frag.size == 0) return NULL;
+    VOLK_IRIInfo iri_info;
+    RCNL (VOLK_parse_iri (iri->data, &iri_info));
 
-    return strndup (
-            iri->data + iri->iri_info->frag.offset,
-            iri->iri_info->frag.size);
+    return strndup (iri->data + iri_info.frag.offset, iri_info.frag.size);
 }
 
 
@@ -769,24 +789,11 @@ term_init (
                 log_warn (
                         "Characters %s are not valid in a URI. Got: %s\n",
                         invalid_uri_chars, fquri);
-#if 0
+#if 1
                 // TODO This causes W3C TTL test #29 to fail. Remove?
                 return VOLK_VALUE_ERR;
 #endif
             }
-
-            // Capture interesting IRI parts.
-            MatchCoord matches[7] = {};  // Initialize all to 0.
-            if (UNLIKELY (parse_iri (fquri, matches) != VOLK_OK)) {
-                log_error ("Error matching URI pattern.");
-
-                return VOLK_VALUE_ERR;
-            }
-            MALLOC_GUARD (term->iri_info, VOLK_MEM_ERR);
-
-            term->iri_info->prefix = matches[1];
-            term->iri_info->path = matches[4];
-            term->iri_info->frag = matches[6];
         }
 
         term->data = strdup (data);
@@ -803,18 +810,7 @@ term_init (
             if (type == VOLK_TERM_IRIREF) {
                 term->data = malloc (UUID4_URN_SIZE);
                 snprintf (
-                        term->data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);
-
-                MALLOC_GUARD (term->iri_info, VOLK_MEM_ERR);
-
-                // Allocate IRI match patterns manually.
-                term->iri_info->prefix.offset = 0;
-                term->iri_info->prefix.size = 4;
-                term->iri_info->path.offset = 4;
-                term->iri_info->path.size = UUIDSTR_SIZE + 6;
-                term->iri_info->frag.offset = 0;
-                term->iri_info->frag.size = 0;
-
+                        term->data, UUID4_URN_SIZE, "urn:uuid:%s", uuid_str);
             } else term->data = strdup (uuid_str);
         } else {
             log_error ("No data provided for term.");
@@ -876,48 +872,13 @@ term_init (
 }
 
 
-/**
- * @brief scan an IRI string and parse IRI parts.
- *
- * Experimental replacement of a regex engine for better performance.
- *
- * Slightly adapted from regex on
- * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
- * parts of the IRI.
- *
- * Reference regex and group numbering:
- * ^((?([^:/?#]+):)?(?//([^/?#]*))?)((?[^?#]*)(?\?([^#]*))?(?#(.*))?)
- *  1  2                3           4             5           6
- *
- * Capturing groups:
- *
- * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
- * #1: Prefix (http://example.org)
- * #2: Scheme (http)
- * #3: Authority (example.org)
- * #4: Path, including query and fragment (/123/456/?query=blah#frag)
- * #5: Query (query=blah)
- * #6: Fragment (frag)
- *
- *
- * @param iri_str[in] IRI string to parse.
- *
- * @param match_coord_t[out] coord Coordinates to be stored. This must be a
- * pre-allocated array of at least 7 elements.
- *
- * The first size_t of each element stores the relative position of a match,
- * and the second one stores the length of the match. A length of 0 indicates
- * no match.
- */
-static VOLK_rc
-parse_iri (char *iri_str, MatchCoord coord[]) {
+VOLK_rc
+VOLK_parse_iri (char *iri_str, VOLK_IRIInfo *iri_info) {
     char *cur = iri_str;
     size_t iri_len = strlen (iri_str);
     MatchCoord tmp = {};  // Temporary storage for capture groups
 
-    // Redundant if only called by term_init.
-    // memset (coord, 0, sizeof(*coord));
-
+    memset (iri_info, 0, sizeof (*iri_info));
     //LOG_DEBUG("Parsing IRI: %s", iri_str);
     // #2: ([^:/?#]+)
     while (
@@ -930,8 +891,8 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
     // Non-capturing: (?([^:/?#]+):)?
     if (tmp.size > 0 && *cur == ':') {
         // Got capture groups #2 and #3. Store them.
-        coord[2].offset = 0;
-        coord[2].size = tmp.size;
+        iri_info->scheme.offset = 0;
+        iri_info->scheme.size = tmp.size;
         cur++;
         //LOG_DEBUG("Group #2: %lu, %lu", coord[2].offset, coord[2].size);
     } else cur = iri_str;  // Backtrack if no match.
@@ -947,21 +908,21 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
             tmp.size++;
             cur++;
         }
-        coord[3].offset = tmp.offset;
-        coord[3].size = tmp.size;
+        iri_info->auth.offset = tmp.offset;
+        iri_info->auth.size = tmp.size;
         //LOG_DEBUG("Group #3: %lu, %lu", coord[3].offset, coord[3].size);
     }
 
     // Capture group 1.
-    coord[1].offset = 0;
-    coord[1].size = cur - iri_str;
+    iri_info->prefix.offset = 0;
+    iri_info->prefix.size = cur - iri_str;
     //LOG_DEBUG("Group #1: %lu, %lu", coord[1].offset, coord[1].size);
 
     tmp.offset = cur - iri_str;
     tmp.size = 0;
 
-    coord[4].offset = tmp.offset;
-    coord[4].size = iri_len - tmp.offset;
+    iri_info->path.offset = tmp.offset;
+    iri_info->path.size = iri_len - tmp.offset;
     //LOG_DEBUG("Group #4: %lu, %lu", coord[4].offset, coord[4].size);
 
     // Non-capturing: (?[^?#]*)
@@ -982,8 +943,8 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
 
         if (tmp.size > 0) {
             // Got capture group #5.
-            coord[5].offset = tmp.offset;
-            coord[5].size = tmp.size;
+            iri_info->query.offset = tmp.offset;
+            iri_info->query.size = tmp.size;
             //LOG_DEBUG("Group #5: %lu, %lu", coord[5].offset, coord[5].size);
         }
     }
@@ -991,19 +952,22 @@ parse_iri (char *iri_str, MatchCoord coord[]) {
     // Non-capturing: (?#(.*))?
     if (*cur == '#') {
         // #6: (.*)
-        coord[6].offset = ++cur - iri_str;
-        coord[6].size = iri_str + iri_len - cur;
+        iri_info->frag.offset = ++cur - iri_str;
+        iri_info->frag.size = iri_str + iri_len - cur;
         //LOG_DEBUG("Group #6: %lu, %lu", coord[6].offset, coord[6].size);
     }
 
-    coord[0].offset = 0;
-    coord[0].size = iri_len;
-    //LOG_DEBUG("Full match: %lu, %lu", coord[0].offset, coord[0].size);
+    /* TODO add error cases.
+    if (UNLIKELY (rc != VOLK_OK)) {
+        log_error ("Error matching URI pattern.");
+
+        return VOLK_VALUE_ERR;
+    }
+    */
 
     return VOLK_OK;
 }
 
-
 /*
  * Extern inline functions.
  */

+ 2 - 0
test/test_graph.c

@@ -170,7 +170,9 @@ _graph_get (VOLK_StoreType type)
         *gr4 = VOLK_graph_get (store, VOLK_graph_uri (gr2), &ct4);
 
     EXPECT_INT_EQ (VOLK_graph_size (gr3), VOLK_graph_size (gr1));
+    EXPECT_INT_EQ (VOLK_graph_size (gr3), ct3);
     EXPECT_INT_EQ (VOLK_graph_size (gr4), VOLK_graph_size (gr2));
+    EXPECT_INT_EQ (VOLK_graph_size (gr4), ct4);
 
     ASSERT (!VOLK_graph_equals (gr1, gr2), "Graphs 1 and 2 are equal!");
     ASSERT (!VOLK_graph_equals (gr3, gr4), "Graphs 3 and 4 are equal!");