#include "tpl.h"
#include "term.h"

/** @brief tpl packing format for a term.
 *
 * The pack elements are: 1. term type (char); 2. data (string); 3. void* type
 * metadata, cast to 8-byte unsigned.
 */
#define TERM_PACK_FMT "csU"

#define MAX_VALID_TERM_TYPE LSUP_TERM_BNODE /* For type validation. */


/*
 * Data structures.
 */

/// Matching sub-patterns for IRI parts.
struct iri_info_t {
    LSUP_NSMap *        nsm;        ///< NSM handle for prefixed IRI.
    regmatch_t          prefix;     ///< Matching group #1.
    regmatch_t          path;       ///< Matching group #5.
    regmatch_t          frag;       ///< Matching group #10.
};

/// Key-term pair in term set.
typedef struct keyed_term {
    LSUP_Key            key;        ///< Key (hash) of the term.
    LSUP_Term *         term;       ///< Term handle.
} KeyedTerm;

/** @brief Single link between a term and a term set.
 *
 * This link is not qualified and may not be used by itself. It belongs
 * in a #LSUP_LinkMap which qualifies all links of the same type.
 */
typedef struct link {
    KeyedTerm *         term;       ///< Linked term.
    LSUP_TermSet *      tset;       ///< Term set linked to the term.
} Link;

/// Opaque link map iterator.
struct link_map_iter {
    const LSUP_LinkMap *map;        ///< Link map to iterate.
    size_t              i;          ///< Linking term loop cursor.
    size_t              j;          ///< Term set loop cursor.
    LSUP_Term *         ext;        ///< External link to look for connections.
    Link *              link;       ///< Current link being retrieved.
};

/*
 * A link map is thus nested:
 *
 * - A link map contains a hash map of Link instances (link).
 * - Each Link contains a KeyedTerm (term) and a TermSet (tset).
 * - Each term set is a hash map of KeyedTerm instances.
 * - Each KeyedTerm contains a Term and its hash.
 */
typedef struct link_map {
    LSUP_LinkType       type;       ///< Link type.
    struct hashmap *    links;      ///< Map of #Link instances.
} LSUP_LinkMap;


/*
 * External variables.
 */

uint32_t LSUP_default_dtype_key = 0;
regex_t *LSUP_uri_ptn;
LSUP_Term *LSUP_default_datatype = NULL;


/*
 * Static variables.
 */

// Characters not allowed in a URI string.
static const char *invalid_uri_chars = "<>\" {}|\\^`";


/*
 * Static prototypes.
 */

static LSUP_rc
term_init (
        LSUP_Term *term, LSUP_TermType type,
        const char *data, void *metadata);


/*
 * Static helpers.
 */

/** @brief Three-way comparison of two keys.
 *
 * Subtracting two keys and narrowing the difference to int (as the compare
 * callbacks previously did) can yield 0 for two distinct keys, which makes
 * the hash map treat different terms as the same entry.
 *
 * @return -1, 0 or 1 if a is respectively less than, equal to, or greater
 *  than b.
 */
static inline int
key_cmp (LSUP_Key a, LSUP_Key b)
{ return (a > b) - (a < b); }


/** @brief Duplicate the part of an IRI string delimited by a regex match.
 *
 * @param[in] data String the match offsets are applied to.
 * @param[in] match Match offsets; rm_so == -1 means "no match".
 * @param[in] keep_empty If false, a zero-length match yields NULL; if true,
 *  it yields an empty string.
 *
 * @return Newly allocated substring that the caller must free, or NULL.
 */
static char *
iri_part_dup (const char *data, const regmatch_t *match, bool keep_empty)
{
    if (match->rm_so == -1) return NULL;

    size_t len = match->rm_eo - match->rm_so;
    if (len == 0 && !keep_empty) return NULL;

    return strndup (data + match->rm_so, len);
}


/*
 * Term set callbacks.
 */

/// Hash callback: the pre-computed term key is the hash; seeds are unused.
static uint64_t
tset_hash_fn (
        const void *item, uint64_t seed0, uint64_t seed1)
{
    (void) seed0;
    (void) seed1;

    return ((const KeyedTerm *) item)->key;
}


/// Compare callback: two entries are the same if their keys are equal.
static int
tset_cmp_fn (const void *a, const void *b, void *udata)
{
    (void) udata;

    return key_cmp (
            ((const KeyedTerm *) a)->key, ((const KeyedTerm *) b)->key);
}


/// Free callback: the term set owns the terms it contains.
static void
tset_free_fn (void *item)
{ LSUP_term_free (((KeyedTerm *) item)->term); }


/*
 * Link map callbacks.
 */

/// Hash callback: the key of the linking term is the hash; seeds are unused.
static uint64_t
link_map_hash_fn (
        const void *item, uint64_t seed0, uint64_t seed1)
{
    (void) seed0;
    (void) seed1;

    return ((const Link *)item)->term->key;
}


/// Compare callback: two links are the same if their linking term keys match.
static int
link_map_cmp_fn (const void *a, const void *b, void *udata)
{
    (void) udata;

    return key_cmp (
            ((const Link *)a)->term->key, ((const Link *)b)->term->key);
}


/// Free callback: release the linking term, its keyed wrapper and term set.
static void
link_map_free_fn (void *item)
{
    Link *link = item;
    LSUP_term_free (link->term->term);
    free (link->term);
    LSUP_term_set_free (link->tset);
}


/*
 * Term API.
 */

/** @brief Allocate and initialize a new term.
 *
 * A term of type LSUP_TERM_UNDEFINED is returned zeroed with only the type
 * set. Any other type goes through term_init().
 *
 * @return New term handle, or NULL on allocation or initialization failure.
 */
LSUP_Term *
LSUP_term_new (
        LSUP_TermType type, const char *data, void *metadata)
{
    LSUP_Term *term;
    CALLOC_GUARD (term, NULL);

    // If undefined, just set the type.
    if (type == LSUP_TERM_UNDEFINED) term->type = type;

    else if (UNLIKELY (term_init (
                    term, type, data, metadata) != LSUP_OK)) {
        // term_init releases whatever it allocated before failing.
        free (term);
        return NULL;
    }

    return term;
}


/** @brief Create a new term with the same type, data and metadata as src.
 *
 * @return New term handle, or NULL if src is NULL or creation fails.
 */
LSUP_Term *
LSUP_term_copy (const LSUP_Term *src)
{
    if (UNLIKELY (!src)) return NULL;

    void *metadata = NULL;

    if (LSUP_IS_IRI (src))
        metadata = (void *) LSUP_iriref_nsm (src);
    else if (src->type == LSUP_TERM_LITERAL)
        metadata = (void *) src->datatype;
    else if (src->type == LSUP_TERM_LT_LITERAL)
        metadata = (void *) src->lang;

    return LSUP_term_new (src->type, src->data, metadata);
}


/** @brief Build a term from a buffer produced by #LSUP_term_serialize.
 *
 * @return New term handle, or NULL if sterm is NULL or cannot be unpacked.
 */
LSUP_Term *
LSUP_term_new_from_buffer (const LSUP_Buffer *sterm)
{
    if (UNLIKELY (!sterm)) return NULL;

    LSUP_Term *term = NULL;
    LSUP_TermType type = LSUP_TERM_UNDEFINED;
    char *data = NULL;
    void *metadata = NULL;

    tpl_node *tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
    // Nothing has been allocated yet; do not hand a NULL node to tpl_free.
    if (UNLIKELY (!tn)) return NULL;

    if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
        log_error ("Error loading serialized term.");
        goto finally;
    }
    if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
        log_error ("Error unpacking serialized term.");
        goto finally;
    }

    if (type == LSUP_TERM_LT_LITERAL)
        // The lang tag characters are stored in the metadata bytes themselves
        // (see LSUP_term_serialize), hence the address-of.
        term = LSUP_lt_literal_new (data, (char *)&metadata);
    else term = LSUP_term_new (type, data, metadata);

finally:
    tpl_free (tn);
    free (data);

    return term;
}


/** @brief Build an IRI from iri, resolved against root.
 *
 * - If iri has a prefix part, its data is used as is.
 * - If iri starts with a slash, it is appended to the prefix part of root.
 * - Otherwise, it is appended to the full root string.
 *
 * @return New IRI term, or NULL if either argument is not an IRI, if root
 *  has no prefix when one is needed, or on allocation failure.
 */
LSUP_Term *
LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
{
    if (! LSUP_IS_IRI (iri)) {
        log_error ("Provided path is not an IRI.");
        return NULL;
    }
    if (! LSUP_IS_IRI (root)) {
        log_error ("Provided root is not an IRI.");
        return NULL;
    }

    LSUP_Term *ret = NULL;
    char *joined = NULL;    // Heap buffer, only used when composing a new IRI.
    const char *data;
    char *pfx = LSUP_iriref_prefix (iri);

    if (pfx) data = iri->data;

    else {
        const char *base;

        if (iri->data[0] == '/') {
            pfx = LSUP_iriref_prefix (root);
            if (UNLIKELY (!pfx)) {
                log_error ("Provided root has no prefix to resolve against.");
                return NULL;
            }
            base = pfx;
        } else base = root->data;

        size_t size = strlen (base) + strlen (iri->data) + 1;
        joined = malloc (size);
        if (UNLIKELY (!joined)) goto finally;

        snprintf (joined, size, "%s%s", base, iri->data);
        data = joined;
    }

    ret = LSUP_iriref_new (data, NULL);

finally:
    free (joined);
    free (pfx);

    return ret;
}


/** @brief Build an IRI from iri with the leading root string stripped.
 *
 * If iri does not start with the root string, the new IRI has the same data
 * as iri.
 *
 * @return New IRI term, or NULL if either argument is not an IRI.
 */
LSUP_Term *
LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri)
{
    if (! LSUP_IS_IRI (iri)) {
        log_error ("Provided path is not an IRI.");
        return NULL;
    }
    if (! LSUP_IS_IRI (root)) {
        log_error ("Provided root is not an IRI.");
        return NULL;
    }

    size_t root_len = strlen (root->data);
    size_t offset = (
        strncmp (iri->data, root->data, root_len) == 0 ? root_len : 0);

    return LSUP_iriref_new (iri->data + offset, LSUP_iriref_nsm (iri));
}


/** @brief Serialize a term into a newly allocated buffer.
 *
 * @return New buffer handle, or NULL if term is NULL, the term cannot be
 *  normalized, or on allocation or packing failure.
 */
LSUP_Buffer *
LSUP_term_serialize (const LSUP_Term *term)
{
    /*
     * In serializing a term, the fact that two terms of different types may
     * be semantically identical must be taken into account. Specifically, a
     * namespace-prefixed IRI ref is identical to its fully qualified version,
     * and a LSUP_TERM_LT_LITERAL with no language tag is identical to a
     * LSUP_TERM_LITERAL of xsd:string type, made up of the same string. Such
     * terms must have identical serializations.
     */

    if (UNLIKELY (!term)) return NULL;

    LSUP_Term *tmp_term;
    void *metadata = NULL;

    if (term->type == LSUP_TERM_NS_IRIREF) {
        // For IRI refs, simply serialize the FQ version of the term.
        char *fq_uri;
        if (LSUP_nsmap_normalize_uri (
            term->iri_info->nsm, term->data, &fq_uri
        ) != LSUP_OK) return NULL;

        tmp_term = LSUP_iriref_new (fq_uri, NULL);
        free (fq_uri);

    } else if (term->type == LSUP_TERM_LT_LITERAL) {
        // For LT literals with empty lang tag, convert to a normal xsd:string.
        if (strlen (term->lang) == 0)
            tmp_term = LSUP_literal_new (term->data, NULL);
        else tmp_term = LSUP_lt_literal_new (term->data, (char *) term->lang);

    } else tmp_term = LSUP_term_new (
            term->type, term->data, (void *) term->datatype);
    // "datatype" can be anything here since it's cast to void *.

    // Any of the constructors above may fail.
    if (UNLIKELY (!tmp_term)) return NULL;

    // metadata field is ignored for IRI ref.
    if (tmp_term->type == LSUP_TERM_LITERAL)
        metadata = tmp_term->datatype;
    else if (tmp_term->type == LSUP_TERM_LT_LITERAL)
        // The lang tag characters are packed into the metadata bytes.
        memcpy (&metadata, tmp_term->lang, sizeof (metadata));

    // Zero the buffer so that it can be safely freed if packing fails before
    // the address is set.
    LSUP_Buffer *sterm = calloc (1, sizeof (*sterm));
    if (UNLIKELY (!sterm)) {
        LSUP_term_free (tmp_term);
        return NULL;
    }

    //log_trace ("Effective term being serialized: %s", tmp_term->data);
    int rc = tpl_jot (
            TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
            &tmp_term->type, &tmp_term->data, &metadata);

    LSUP_term_free (tmp_term);

    if (rc != 0) {
        LSUP_buffer_free (sterm);
        return NULL;
    }

    return sterm;
}


/** @brief Hash a term by hashing its serialized form.
 *
 * A NULL term is hashed as BUF_DUMMY.
 *
 * NOTE(review): if serialization fails, a NULL buffer is passed to
 * LSUP_buffer_hash and LSUP_buffer_free — confirm both accept NULL.
 */
LSUP_Key
LSUP_term_hash (const LSUP_Term *term)
{
    LSUP_Buffer *buf;

    if (UNLIKELY (!term)) buf = BUF_DUMMY;
    else buf = LSUP_term_serialize (term);

    LSUP_Key key = LSUP_buffer_hash (buf);

    LSUP_buffer_free (buf);

    return key;
}


/** @brief Free a term, its data string and, for IRIs, its IRI info.
 *
 * A NULL term is ignored. The data type of a literal is not freed here.
 */
void
LSUP_term_free (LSUP_Term *term)
{
    if (UNLIKELY (!term)) return;

    if (LSUP_IS_IRI (term)) free (term->iri_info);
    free (term->data);
    free (term);
}


/** @brief Namespace map handle stored with an IRI term.
 *
 * @return The stored handle (may be NULL), or NULL if the term is not an IRI.
 */
LSUP_NSMap *
LSUP_iriref_nsm (const LSUP_Term *iri)
{
    if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
        log_error ("Term is not a IRI ref type.");
        return NULL;
    }

    return iri->iri_info->nsm;
}


/** @brief Prefix part of an IRI (matching group #1).
 *
 * @return Newly allocated string that the caller must free, or NULL if the
 *  term is not an IRI or the prefix is missing or empty.
 */
char *
LSUP_iriref_prefix (const LSUP_Term *iri)
{
    if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
        log_error ("Term is not a IRI ref type.");
        return NULL;
    }

    return iri_part_dup (iri->data, &iri->iri_info->prefix, false);
}


/** @brief Path part of an IRI (matching group #5).
 *
 * @return Newly allocated string that the caller must free, or NULL if the
 *  term is not an IRI or the path is missing or empty.
 */
char *
LSUP_iriref_path (const LSUP_Term *iri)
{
    if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
        log_error ("Term is not a IRI ref type.");
        return NULL;
    }

    return iri_part_dup (iri->data, &iri->iri_info->path, false);
}


/** @brief Fragment part of an IRI (matching group #10).
 *
 * Unlike prefix and path, a matched but empty fragment is returned as an
 * empty string rather than NULL.
 *
 * @return Newly allocated string that the caller must free, or NULL if the
 *  term is not an IRI or has no fragment.
 */
char *
LSUP_iriref_frag (const LSUP_Term *iri)
{
    if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
        log_error ("Term is not a IRI ref type.");
        return NULL;
    }

    return iri_part_dup (iri->data, &iri->iri_info->frag, true);
}


/*
 * Triple API.
 */

/** @brief Allocate a triple referencing the given terms.
 *
 * The terms are not copied.
 *
 * @return New triple handle, or NULL on failure.
 */
LSUP_Triple *
LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
{
    LSUP_Triple *spo = malloc (sizeof (*spo));
    if (!spo) return NULL;

    if (UNLIKELY (LSUP_triple_init (spo, s, p, o))) {
        free (spo);
        return NULL;
    }

    return spo;
}


/** @brief Allocate a triple and deserialize its terms from a buffer triple.
 *
 * Failure to deserialize an individual term is not reported: the
 * corresponding member is left NULL.
 *
 * @return New triple handle, or NULL if the triple cannot be allocated.
 */
LSUP_Triple *
LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo)
{
    LSUP_Triple *spo = malloc (sizeof (*spo));
    if (!spo) return NULL;

    spo->s = LSUP_term_new_from_buffer (sspo->s);
    spo->p = LSUP_term_new_from_buffer (sspo->p);
    spo->o = LSUP_term_new_from_buffer (sspo->o);

    return spo;
}


/** @brief Allocate a buffer triple and serialize each term of spo into it.
 *
 * Failure to serialize an individual term is not reported: the
 * corresponding member is left NULL.
 *
 * @return New buffer triple handle, or NULL if it cannot be allocated.
 */
LSUP_BufferTriple *
LSUP_triple_serialize (const LSUP_Triple *spo)
{
    LSUP_BufferTriple *sspo = malloc (sizeof (*sspo));
    if (!sspo) return NULL;

    sspo->s = LSUP_term_serialize (spo->s);
    sspo->p = LSUP_term_serialize (spo->p);
    sspo->o = LSUP_term_serialize (spo->o);

    return sspo;
}


/** @brief Point the members of an existing triple to the given terms.
 *
 * No validation is currently performed on the term types.
 *
 * @return LSUP_OK.
 */
LSUP_rc
LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
{
    /* FIXME TRP_DUMMY is a problem here.
    if (! LSUP_IS_IRI (s) && s->type != LSUP_TERM_BNODE) {
        log_error ("Subject is not of a valid term type: %d", s->type);
        return LSUP_VALUE_ERR;
    }

    if (! LSUP_IS_IRI (p)) {
        log_error ("Predicate is not of a valid term type: %d", p->type);
        return LSUP_VALUE_ERR;
    }
    */

    spo->s = s;
    spo->p = p;
    spo->o = o;

    return LSUP_OK;
}


/** @brief Free the terms of a triple, but not the triple itself.
 *
 * A NULL triple is ignored.
 */
void
LSUP_triple_done (LSUP_Triple *spo)
{
    if (UNLIKELY (!spo)) return;

    LSUP_term_free (spo->s);
    LSUP_term_free (spo->p);
    LSUP_term_free (spo->o);
}


/** @brief Free the terms of a triple and the triple itself.
 *
 * A NULL triple is ignored.
 */
void
LSUP_triple_free (LSUP_Triple *spo)
{
    if (UNLIKELY (!spo)) return;

    LSUP_triple_done (spo);
    free (spo);
}


/*
 * Multi-add functions.
 */

/** @brief Create an empty term set.
 *
 * @return New term set handle, or NULL on allocation failure.
 */
LSUP_TermSet *
LSUP_term_set_new (void)
{
    // Capacity of 4 is an arbitrary guess.
    LSUP_TermSet *ts = hashmap_new (
            sizeof (KeyedTerm), 4, LSUP_HASH_SEED, 0,
            tset_hash_fn, tset_cmp_fn, tset_free_fn, NULL);

    // The map handle must be checked before it can be queried.
    if (UNLIKELY (!ts)) return NULL;
    if (UNLIKELY (hashmap_oom (ts))) {
        hashmap_free (ts);
        return NULL;
    }

    return ts;
}


/** @brief Add a term to a term set, unless one with the same key exists.
 *
 * On LSUP_OK the set references the term (and frees it when the set is
 * freed). On LSUP_NOACTION the passed term is left untouched.
 *
 * @param[out] existing If not NULL and an equivalent term is already in the
 *  set, it is set to the handle of that term.
 *
 * @return LSUP_OK if the term was added; LSUP_NOACTION if an equivalent term
 *  was already present; LSUP_MEM_ERR on allocation failure.
 */
LSUP_rc
LSUP_term_set_add (LSUP_TermSet *ts, LSUP_Term *term, LSUP_Term **existing)
{
    KeyedTerm entry_s = {.key=LSUP_term_hash (term), .term=term};

    KeyedTerm *ex = hashmap_get (ts, &entry_s);
    if (ex) {
        if (existing) *existing = ex->term;
        return LSUP_NOACTION;
    }

    hashmap_set (ts, &entry_s);
    if (hashmap_oom (ts)) return LSUP_MEM_ERR;

    return LSUP_OK;
}


/** @brief Look up a term in a term set by its key.
 *
 * @return Handle of the term in the set, or NULL if not found.
 */
const LSUP_Term *
LSUP_term_set_get (LSUP_TermSet *ts, LSUP_Key key)
{
    KeyedTerm *entry = hashmap_get (ts, &(KeyedTerm){.key=key});
    if (entry) log_trace ("ID found for key %lx: %s", key, entry->term->data);
    else log_trace ("No ID found for key %lx.", key);

    return (entry) ? entry->term : NULL;
}


/** @brief Advance a term set cursor and retrieve the next term.
 *
 * @param[in,out] i Cursor; start at 0.
 * @param[out] term If not NULL, set to the term handle retrieved.
 *
 * @return LSUP_OK if a term was retrieved; LSUP_END if there are no more.
 */
LSUP_rc
LSUP_term_set_next (LSUP_TermSet *ts, size_t *i, LSUP_Term **term)
{
    KeyedTerm *kt = NULL;
    if (!hashmap_iter (ts, i, (void **)&kt)) return LSUP_END;

    if (term) *term = kt->term;

    return LSUP_OK;
}


/// Free a term set and, through its free callback, the terms it contains.
void
LSUP_term_set_free (LSUP_TermSet *ts)
{ hashmap_free (ts); }


/** @brief Create an empty link map of the given type.
 *
 * @return New link map handle, or NULL on allocation failure.
 */
LSUP_LinkMap *
LSUP_link_map_new (LSUP_LinkType type)
{
    LSUP_LinkMap *cm;
    MALLOC_GUARD (cm, NULL);
    cm->type = type;
    cm->links = hashmap_new (
            sizeof (Link), 0, LSUP_HASH_SEED, 0,
            link_map_hash_fn, link_map_cmp_fn, link_map_free_fn, NULL);

    // Do not hand out a link map without a usable hash map.
    if (UNLIKELY (!cm->links)) {
        free (cm);
        return NULL;
    }

    return cm;
}


/** @brief Free a link map and, through its free callback, all its links.
 *
 * A NULL link map is ignored.
 */
void
LSUP_link_map_free (LSUP_LinkMap *cm)
{
    if (UNLIKELY (!cm)) return;

    hashmap_free (cm->links);
    free (cm);
}


/// Type of the links in a link map.
LSUP_LinkType
LSUP_link_map_type (const LSUP_LinkMap *map)
{ return map->type; }


/** @brief Add a linking term and its term set to a link map.
 *
 * If a link with an equivalent linking term exists, the terms in tset are
 * moved one by one into the existing term set (duplicates are freed), and
 * the passed linking term is freed. Otherwise, the term and the term set are
 * inserted as a new link and the map references both.
 *
 * NOTE(review): in the merge case tset itself is not freed here, and its
 * terms have been either moved or freed; freeing it with
 * #LSUP_term_set_free afterwards would free those terms again — confirm how
 * callers dispose of it.
 *
 * @return LSUP_OK on success; LSUP_MEM_ERR if a new link cannot be
 *  allocated or inserted, in which case term and tset are left untouched.
 */
// TODO Memory error handling when merging into an existing term set.
LSUP_rc
LSUP_link_map_add (
        LSUP_LinkMap *cmap, LSUP_Term *term, LSUP_TermSet *tset)
{
    // Keyed term to look up the link term and insert it, if necessary.
    KeyedTerm entry_s = {.key=LSUP_term_hash (term), .term=term};

    Link *ex = hashmap_get (cmap->links, &(Link){.term=&entry_s});
    if (ex) {
        // Add terms one by one to the existing term set.
        log_trace (
                "Linking term %s exists. Adding individual terms.",
                ex->term->term->data);
        size_t i = 0;
        KeyedTerm *kt;
        while (hashmap_iter (tset, &i, (void **)&kt)) {
            log_trace (
                    "Adding term %s to link %s",
                    kt->term->data, ex->term->term->data);
            if (hashmap_get (ex->tset, kt))
                // Term already exist, free the new one and move on.
                LSUP_term_free (kt->term);
            else
                // Insert KeyedTerm, the term set now owns the underlying term.
                hashmap_set (ex->tset, kt);
        }
        // Free link term that hasn't been used.
        LSUP_term_free (term);
    } else {
        // Add the new term and the termset wholesale.
        log_trace ("Adding new linking term %s.", term->data);
        // Allocate inserted member on heap, it will be owned by the map.
        KeyedTerm *ins;
        MALLOC_GUARD (ins, LSUP_MEM_ERR);
        memcpy (ins, &entry_s, sizeof (entry_s));
        Link link = {.term=ins, .tset=tset};
        hashmap_set (cmap->links, &link);
        if (UNLIKELY (hashmap_oom (cmap->links))) {
            // The link was not stored: only release the wrapper made here.
            free (ins);
            return LSUP_MEM_ERR;
        }
    }

    return LSUP_OK;
}


/** @brief Create an iterator over a link map.
 *
 * @param[in] ext Term used as the external end of the links when generating
 *  triples with #LSUP_link_map_triples. It is only referenced, not copied.
 *
 * @return New iterator handle, or NULL on allocation failure.
 */
LSUP_LinkMapIterator *
LSUP_link_map_iter_new (const LSUP_LinkMap *lmap, LSUP_Term *ext)
{
    LSUP_LinkMapIterator *it;
    CALLOC_GUARD (it, NULL);
    it->map = lmap;
    it->ext = ext;

    return it;
}


/// Free a link map iterator. The map and external term are not touched.
void
LSUP_link_map_iter_free (LSUP_LinkMapIterator *it)
{ free (it); }


/** @brief Advance a link map iterator to the next link.
 *
 * @param[out] lt If not NULL, set to the linking term handle.
 * @param[out] ts If not NULL, set to the term set handle of the link.
 *
 * @return LSUP_OK if a link was retrieved; LSUP_END if there are no more.
 */
LSUP_rc
LSUP_link_map_next (
        LSUP_LinkMapIterator *it, LSUP_Term **lt, LSUP_TermSet **ts)
{
    if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
        return LSUP_END;

    if (lt) *lt = it->link->term->term;
    if (ts) *ts = it->link->tset;

    return LSUP_OK;
}


/** @brief Yield the next triple described by a link map.
 *
 * Each call fills spo with handles of terms held by the iterator and the
 * link map; nothing is copied. The position of the external term, linking
 * term and term set member depends on the link map type:
 *
 * - LSUP_LINK_INBOUND: s = set member, p = linking term, o = external term.
 * - LSUP_LINK_OUTBOUND: s = external term, p = linking term, o = set member.
 * - otherwise (edge): s = linking term, p = external term, o = set member.
 *
 * @return LSUP_OK if a triple was yielded; LSUP_END if there are no more.
 */
// TODO dismantle if the only triple generator is for the graph.
LSUP_rc
LSUP_link_map_triples (
        LSUP_LinkMapIterator *it, LSUP_Triple *spo)
{
    // Assign external (related) term.
    if (it->map->type == LSUP_LINK_INBOUND)
        spo->o = it->ext;
    else if (it->map->type == LSUP_LINK_OUTBOUND)
        spo->s = it->ext;
    else spo->p = it->ext;

    KeyedTerm *kt;

    // If we are already handling a link, continue the internal loop.
    if (it->link) goto int_loop;

ext_loop:
    // Advance external counter and start new internal loop.
    it->j = 0;
    if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
        return LSUP_END;

int_loop:
    // If end of the term set is reached, start with a new linking term.
    if (!hashmap_iter (it->link->tset, &it->j, (void **)&kt)) goto ext_loop;

    // Continue pulling from term set.
    // Assign linking term.
    if (it->map->type == LSUP_LINK_EDGE) spo->s = it->link->term->term;
    else spo->p = it->link->term->term;

    // Assign term in term set.
    if (it->map->type == LSUP_LINK_INBOUND) spo->s = kt->term;
    else spo->o = kt->term;

    return LSUP_OK;
}


/*
 * Static functions.
 */

/** @brief Initialize the members of an allocated term.
 *
 * On failure, everything allocated by this function is released, so the
 * caller only has to dispose of the term structure itself.
 *
 * @param[in] term Term to initialize.
 * @param[in] type Term type. It cannot be LSUP_TERM_UNDEFINED.
 * @param[in] data Term data; it is copied. If NULL, a random UUID is
 *  generated for IRI ref and blank node types; it is an error for others.
 * @param[in] metadata Namespace map for IRIs, data type term for literals,
 *  language tag string for language-tagged literals.
 *
 * @return LSUP_OK on success; LSUP_ERROR if the environment is not set up;
 *  LSUP_VALUE_ERR on invalid input; LSUP_MEM_ERR on allocation failure.
 */
static LSUP_rc
term_init (
        LSUP_Term *term, LSUP_TermType type,
        const char *data, void *metadata)
{
    if (UNLIKELY (!LSUP_uri_ptn)) {
        log_error ("Environment not initialized. Did you call LSUP_init()?");
        return LSUP_ERROR;
    }
    // This can never be LSUP_TERM_UNDEFINED.
    if (type == LSUP_TERM_UNDEFINED) {
        log_error ("%d is not a valid term type.", type);
        return LSUP_VALUE_ERR;
    }

    LSUP_rc rc = LSUP_OK;
    // Allocations made by this call, tracked so that they can be undone.
    char *new_data = NULL;
    void *new_info = NULL;

    term->type = type;

    if (data) {
        // Validate IRI.
        if (LSUP_IS_IRI (term)) {
            char *fquri;

            // Find fully qualified IRI to parse.
            if (term->type == LSUP_TERM_NS_IRIREF) {
                if (LSUP_nsmap_normalize_uri (metadata, data, &fquri) < 0) {
                    log_error ("Error normalizing IRI data.");

                    return LSUP_VALUE_ERR;
                }
                log_debug ("Fully qualified IRI: %s", fquri);
            } else fquri = (char *) data;

            if (strpbrk (fquri, invalid_uri_chars) != NULL) {
                log_warn (
                        "Characters %s are not valid in a URI. Got: %s\n",
                        invalid_uri_chars, fquri);
#if 0
                // TODO This causes W3C TTL test #29 to fail. Remove?
                return LSUP_VALUE_ERR;
#endif
            }

            // Capture interesting IRI parts.
            // NOTE(review): for a namespace-prefixed IRI the offsets are
            // relative to the fully qualified string, while the term stores
            // the prefixed string — confirm the part accessors are meant to
            // be used with such terms.
            regmatch_t matches[11];
            int match_rc = regexec (
                    LSUP_uri_ptn, fquri,
                    sizeof (matches) / sizeof (*matches), matches, 0);

            // The normalized string is no longer needed, match or not.
            if (term->type == LSUP_TERM_NS_IRIREF) free (fquri);

            if (UNLIKELY (match_rc != 0)) {
                log_error ("Error matching URI pattern.");

                return LSUP_VALUE_ERR;
            }

            new_info = malloc (sizeof (*term->iri_info));
            if (UNLIKELY (!new_info)) return LSUP_MEM_ERR;
            term->iri_info = new_info;

            term->iri_info->prefix = matches[1];
            term->iri_info->path = matches[5];
            term->iri_info->frag = matches[10];
            term->iri_info->nsm = metadata;
        }

        new_data = strdup (data);
        if (UNLIKELY (!new_data)) {
            rc = LSUP_MEM_ERR;
            goto fail;
        }

    } else {
        // No data. Make up a random UUID or URI if allowed.
        if (type == LSUP_TERM_IRIREF || type == LSUP_TERM_BNODE) {
            uuid_t uuid;
            uuid_generate_random (uuid);

            uuid_str_t uuid_str;
            uuid_unparse_lower (uuid, uuid_str);

            if (type == LSUP_TERM_IRIREF) {
                new_data = malloc (UUID4_URN_SIZE);
                if (UNLIKELY (!new_data)) {
                    rc = LSUP_MEM_ERR;
                    goto fail;
                }
                snprintf (
                        new_data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);

                new_info = malloc (sizeof (*term->iri_info));
                if (UNLIKELY (!new_info)) {
                    rc = LSUP_MEM_ERR;
                    goto fail;
                }
                term->iri_info = new_info;

                // Allocate IRI match patterns manually.
                term->iri_info->prefix.rm_so = 0;
                term->iri_info->prefix.rm_eo = 4;
                term->iri_info->path.rm_so = 4;
                term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
                term->iri_info->frag.rm_so = -1;
                term->iri_info->frag.rm_eo = -1;
                term->iri_info->nsm = NULL;

            } else {
                new_data = strdup (uuid_str);
                if (UNLIKELY (!new_data)) {
                    rc = LSUP_MEM_ERR;
                    goto fail;
                }
            }
        } else {
            log_error ("No data provided for term.");
            return LSUP_VALUE_ERR;
        }
    }

    term->data = new_data;

    if (term->type == LSUP_TERM_LT_LITERAL) {
        if (!metadata) {
            log_warn ("Lang tag is NULL. Creating a non-tagged literal.");
            term->type = LSUP_TERM_LITERAL;
        } else {
            char *lang_str = (char *) metadata;
            log_trace("Lang string: '%s'", lang_str);
            // Lang tags that do not fit, terminator included, are truncated.
            // strncpy is used on purpose: it zero-fills the remainder of the
            // tag, whose bytes are copied as a whole when serializing.
            strncpy(term->lang, lang_str, sizeof (term->lang) - 1);
            // Terminate before measuring the string.
            term->lang[sizeof (term->lang) - 1] = '\0';
            if (strlen (term->lang) < 1) {
                log_error ("Lang tag cannot be an empty string.");
                rc = LSUP_VALUE_ERR;
                goto fail;
            }
        }
    }

    if (term->type == LSUP_TERM_LITERAL) {
        term->datatype = metadata;
        if (! term->datatype) term->datatype = LSUP_default_datatype;
        if (UNLIKELY (! term->datatype)) {
            log_error ("No data type provided and no default data type set.");
            rc = LSUP_ERROR;
            goto fail;
        }
        log_trace ("Storing data type: %s", term->datatype->data);

        if (! LSUP_IS_IRI (term->datatype)) {
            log_error (
                    "Literal data type is not an IRI: %s",
                    term->datatype->data);

            rc = LSUP_VALUE_ERR;
            goto fail;
        }

        LSUP_Term *ex = NULL;
        LSUP_term_set_add (LSUP_term_cache, term->datatype, &ex);
        if (ex && ex != term->datatype) {
            // Replace datatype handle with the one in term cache, and free
            // the new one.
            if (term->datatype != LSUP_default_datatype)
                LSUP_term_free (term->datatype);
            term->datatype = ex;
        }
        //log_trace ("Datatype address: %p", term->datatype);
        log_trace ("Datatype hash: %lx", LSUP_term_hash (term->datatype));

    } else if (term->type == LSUP_TERM_BNODE) {
        // TODO This is not usable for global skolemization.
        term->bnode_id = LSUP_HASH (
                term->data, strlen (term->data) + 1, LSUP_HASH_SEED);
    }

    return LSUP_OK;

fail:
    // Undo only what was allocated in this call.
    free (new_data);
    free (new_info);
    term->data = NULL;

    return rc;
}


/*
 * Extern inline functions.
 */

LSUP_Key LSUP_term_hash (const LSUP_Term *term);
LSUP_Term *LSUP_iriref_new (const char *data, LSUP_NSMap *nsm);
LSUP_Term *LSUP_literal_new (const char *data, LSUP_Term *datatype);
LSUP_Term *LSUP_lt_literal_new (const char *data, char *lang);
LSUP_Term *LSUP_bnode_new (const char *data);
bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2);
LSUP_Term *LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n);
LSUP_Key LSUP_triple_hash (const LSUP_Triple *trp);