Browse Source

Implement bulk term handling functions.

Stefano Cossu 2 years ago
parent
commit
8ce24926f3
8 changed files with 619 additions and 261 deletions
  1. 2 2
      TODO.md
  2. 4 0
      include/environment.h
  3. 56 4
      include/graph.h
  4. 147 79
      include/term.h
  5. 28 35
      src/codec/grammar_ttl.y
  6. 9 29
      src/environment.c
  7. 134 38
      src/graph.c
  8. 239 74
      src/term.c

+ 2 - 2
TODO.md

@@ -27,6 +27,8 @@
 - *W* Turtle serialization / deserialization
   - *D* TTL decoder
   - *W* TTL encoder
+- *W* Enhanced graph operations
+    - *W* Extract unique terms and 2-term tuples
 - *P* Full UTF-8 support
 - *P* Extended tests
     - *P* C API
@@ -38,8 +40,6 @@
 
 - Graph checksum and semantic equality
 - Term and triple validation
-- Enhanced graph operations
-    - Extract unique terms and 2-term tuples
 - NQ codec
 - TriG codec
 

+ 4 - 0
include/environment.h

@@ -23,6 +23,10 @@
 #define LSUP_IS_INIT (LSUP_term_cache != NULL)
 
 
+/*
+ * External variables.
+ */
+
 extern LSUP_NSMap *LSUP_default_nsm; /// Default namespace prefix map.
 extern LSUP_Term *LSUP_default_ctx; /// Default context.
 extern LSUP_Buffer *LSUP_default_ctx_buf; /// Serialized default context.

+ 56 - 4
include/graph.h

@@ -320,6 +320,58 @@ void
 LSUP_graph_iter_free (LSUP_GraphIterator *it);
 
 
+/** @brief Get term pairs connected to a term in a graph.
+ *
+ * This returns a #LSUP_LinkMap extracted from a graph for a given term. The
+ * map can generate triples using #LSUP_link_map_triples().
+ *
+ * Depending on the type requested (`LSUP_LINK_*`), the term can be leveraged
+ * as a subject, predicate, or object.
+ *
+ * @param[in] gr Graph to extract the connection list from.
+ *
+ * @param[in] t Term to query for connections.
+ *
+ * @param[in] type Type of connections to look up.
+ *
+ * @return Link map for the requested term. It should be freed with
+ *  #LSUP_link_map_free().
+ */
+LSUP_LinkMap *
+LSUP_graph_connections (
+        LSUP_Graph *gr, LSUP_Term *t, LSUP_LinkType type);
+
+
+/** @brief Get a list of terms related to a term pair in a graph.
+ *
+ * @param[in] gr Graph to extract terms from.
+ *
+ * @param[in] t1 First term.
+ *
+ * @param[in] t1_pos Position of the first term in the triples to look up.
+ *
+ * @param[in] t2 Second term.
+ *
+ * @param[in] t2_pos Position of the second term in the triples to look up.
+ *
+ * @return Term set of results.
+ */
+LSUP_TermSet *
+LSUP_graph_term_set (
+        LSUP_Graph *gr, LSUP_Term *t1, LSUP_TriplePos t1_pos,
+        LSUP_Term *t2, LSUP_TriplePos t2_pos);
+
+
+/** @brief Get all unique subjects, predicates, or objects in a graph.
+ *
+ * @param[in] gr Graph handle.
+ *
+ * @param[in] pos Position in the triples of the terms to look for.
+ */
+LSUP_TermSet *
+LSUP_graph_unique_terms (LSUP_Graph *gr, LSUP_TriplePos pos);
+
+
 /** @brief Add triples for a term and related connection list to a graph.
  *
  * The connection list can be of inbound, outbound, or edge type; depending on
@@ -330,13 +382,13 @@ LSUP_graph_iter_free (LSUP_GraphIterator *it);
  *
  * @param[in] t Term to be associated with the collection list.
  *
- * @param[in] cl Connection list.
+ * @param[in] cl Link map.
  *
  * @return Number of triples parsed on success, or <0 (LSUP_*_ERR) on error.
  */
 size_t
-LSUP_spo_list_add_triples (
-        LSUP_GraphIterator *it, LSUP_Term *t, const LSUP_ConnectionList *cl);
+LSUP_graph_add_link_map (
+        LSUP_GraphIterator *it, LSUP_Term *t, LSUP_LinkMap *cl);
 
 
 /** @brief Add triples for an anonymous collection to a graph.
@@ -351,6 +403,6 @@ LSUP_spo_list_add_triples (
  * @return Blank node representing the first list item.
  */
 LSUP_Term *
-LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol);
+LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_TermSet *ts);
 
 #endif

+ 147 - 79
include/term.h

@@ -61,6 +61,8 @@ typedef enum {
  */
 typedef struct iri_info_t LSUP_IRIInfo;
 
+typedef struct link_map_iter LSUP_LinkMapIterator;
+
 /// RDF term.
 typedef struct term_t {
     char *              data;       // URI, literal value, or BNode label.
@@ -85,65 +87,54 @@ typedef struct term_t {
     ((term)->type == LSUP_TERM_LITERAL || (term)->type == LSUP_TERM_LT_LITERAL)
 
 
+/** @brief RDF triple.
+ *
+ * This represents a complete RDF statement. Triple terms can be accessed
+ * directly via the `s`, `p`, `o` members or sequentially via
+ * #LSUP_triple_pos().
+ */
 typedef struct triple_t {
-    LSUP_Term *s;
-    LSUP_Term *p;
-    LSUP_Term *o;
+    LSUP_Term *s;                   ///< Subject.
+    LSUP_Term *p;                   ///< Predicate.
+    LSUP_Term *o;                   ///< Object.
 } LSUP_Triple;
 
 
-/** @brief Key-term pair.
- */
-typedef struct term_cache_entry_t {
-    LSUP_Key            key;        // Key (hash) of the term.
-    LSUP_Term *         term;       // Term handle.
-} LSUP_KeyedTerm;
-
-
-/// Connection type.
+/// Link type.
 typedef enum {
-    LSUP_CONN_INBOUND,              ///< Inbound connection (sp).
-    LSUP_CONN_OUTBOUND,             ///< Outbound connection (po).
-    LSUP_CONN_EDGE,                 ///< Edge connection (so).
-} LSUP_ConnectionType;
+    LSUP_LINK_INBOUND,              ///< Inbound link (sp).
+    LSUP_LINK_OUTBOUND,             ///< Outbound link (po).
+    LSUP_LINK_EDGE,                 ///< Edge link (so).
+} LSUP_LinkType;
 
 
-/** @brief Connection list.
- *
- * A list of predicates and related lists of terms, that can be used to list
- * inbound or outbound connections to a node.
+/** @brief The immediate neighborhood of terms connected to a term.
  *
- * Each term in the NUL-terminated `p` list represent a term which is
- * paired with a list of terms in the `tl` list. The index of each term in this
- * list corresponds to the same index of a term list in `tl`.
- *
- * If the type of the connection list is `LSUP_CONN_INBOUND`, the term list
- * represent subjects and a term that is associated with the connection list is
- * the related object; if `LSUP_CONN_OUTBOUND`, the term list represents
- * objects, and a term that is associated with the connection list represents
- * the subject. If `LSUP_CONN_EDGE`, the members of the connection list
- * represent subjects and objects, and the associated term is the predicate.
+ * This is a hash map whose each term is related to a set of one or more other
+ * terms. The hash map is inside an opaque handle and is manipulated via the
+ * `LSUP_link_map_*` functions.
  *
+ * If the type of the link map is `LSUP_LINK_INBOUND`, the map keys
+ * represent predicates and the sets related to them are the subjects, and the
+ * term associated to the link map is the object; if
+ * `LSUP_LINK_OUTBOUND`, the keys represent predicates, the related sets
+ * objects, and the associated term is the subject. If `LSUP_LINK_EDGE`, the
+ * keys represent subjects and the related sets objects, and the associated
+ * term is the predicate.
  */
-typedef struct {
-    LSUP_ConnectionType type;       ///< Inbound or outbound connection.
-    LSUP_Term **    t;              ///< NUL-terminated array of term handles.
-    LSUP_Term ***   tl;             /**<
-                                      * NUL-terminated array of
-                                      * NUL-terminated arrays of term handles.
-                                      */
-} LSUP_ConnectionList;
+typedef struct link_map LSUP_LinkMap;
 
 
-/*
- * Extern variables.
+/** @brief A set of unique terms.
+ *
+ * This is used to bulk-add terms to a link map.
  */
+typedef struct hashmap LSUP_TermSet;
 
-/** @brief Global term cache.
- *
- * Stores frequently used terms, e.g. data type URIs.
+
+/*
+ * External variables.
  */
-extern struct hashmap *LSUP_term_cache;
 
 /** @brief Compiled hash of default literal data type.
  */
@@ -160,6 +151,12 @@ extern regex_t *LSUP_uri_ptn;
  */
 extern LSUP_Term *LSUP_default_datatype;
 
+/** @brief Global term cache.
+ *
+ * Stores frequently used terms, e.g. data type URIs.
+ */
+extern LSUP_TermSet *LSUP_term_cache;
+
 
 /*
  * API functions.
@@ -492,83 +489,154 @@ LSUP_triple_hash (const LSUP_Triple *trp)
 }
 
 
-/** @brief Add an identifier to the term cache.
+/** @brief Create a new term set.
  *
- * @param[in] key Hash of the inserted term.
+ * @return New empty term set.
+ */
+LSUP_TermSet *
+LSUP_term_set_new (void);
+
+
+/** @brief Free a term set.
  *
- * @param[in] term Term to insert. A copy of the term is stored in the cache,
- *  which is freed on application teardown.
+ * @param[in] ts Term set handle.
+ */
+void
+LSUP_term_set_free (LSUP_TermSet *ts);
+
+
+/** @brief Add term to a term set.
+ *
+ * If the same term is already in the set, it is not replaced, and the existing
+ * term's handle is made available in the `existing` variable. In this case,
+ * the caller may want to free the passed term which has not been added.
+ *
+ * @param[in] ts Term set to be added to.
+ *
+ * @param[in] term Term to be added to the set. The term set will take
+ * ownership of the term and free it when it's freed with
+ * #LSUP_term_set_free()—only if the return code is LSUP_OK.
+ *
+ * @param[out] existing If not NULL, and if the term being added is a
+ * duplicate, this variable will be populated with the existing term handle.
+ *
+ * @return LSUP_OK on success; LSUP_NOACTION if the term is duplicate;
+ *  LSUP_MEM_ERR on memory error. Note: if not LSUP_OK, the caller is in charge
+ *  of freeing the `term` handle.
  */
 LSUP_rc
-LSUP_tcache_add (const LSUP_Key key, const LSUP_Term *term);
+LSUP_term_set_add (LSUP_TermSet *ts, LSUP_Term *term, LSUP_Term **existing);
+
 
-/** @brief Get an identifier from the cache.
+/** @brief Get a term from a term set.
+ *
+ * @param[in] ts Term set handle.
  *
  * @param[in] key Key for the queried term.
  *
- * @return The retrieved term if found, or NULL. The string must not be
+ * @return The retrieved term if found, or NULL. The term must not be
  *  modified or freed.
  */
 const LSUP_Term *
-LSUP_tcache_get (LSUP_Key key);
+LSUP_term_set_get (LSUP_TermSet *ts, LSUP_Key key);
 
 
-/** @brief Add term to a term list.
+/** @brief Iterate through a term set.
  *
- * @param[in] tl Array of term handles to be added to. The handle must be NUL-
- *  terminated. On success, this handle will be reallocated and the new address
- *  returned, so the passed handle should no longer be used. On failure, it
- *  remains unchanged and may be reused.
+ * @param[in] ts Term set handle.
  *
- * @param[in] t Term to be added to the list. The object list will take
- *  ownership of the term.
+ * @param[in,out] i Iterator to be initially set to 0.
  *
- * @return Reallocated list on success; NULL on failure.
+ * @param[out] term Pointer to be populated with the next term on success. It
+ *  may be NULL.
+ *
+ * @return LSUP_OK if the next term was retrieved; LSUP_END if the end of the
+ *  set has been reached.
  */
-LSUP_Term **
-LSUP_term_list_add (LSUP_Term **tl, LSUP_Term *t);
+LSUP_rc
+LSUP_term_set_next (LSUP_TermSet *ts, size_t *i, LSUP_Term **term);
 
 
-/** @brief New connection list.
+/** @brief New link map.
  *
  * The initial state of the returned list is: `{t: [NULL], tl: [NULL]}`
  *
- * Predicates and term lists can be added with #LSUP_conn_list_add, and terms
+ * Terms and term sets can be added with #LSUP_link_map_add, and terms
  * can be added to a term set with #LSUP_term_set_add.
  *
  * @return a new empty predicate-object list.
  */
-LSUP_ConnectionList *
-LSUP_conn_list_new (LSUP_ConnectionType type);
+LSUP_LinkMap *
+LSUP_link_map_new (LSUP_LinkType type);
 
 
-/** @brief Free a predicate-object list.
+/** @brief Free a link map.
  *
  * All arrays and term handles are recursively freed.
  *
- * @param[in] pol Predicate-object list handle obtained with
- *  #LSUP_conn_list_new().
+ * @param[in] pol link map handle obtained with #LSUP_link_map_new().
  */
 void
-LSUP_conn_list_free (LSUP_ConnectionList *pol);
+LSUP_link_map_free (LSUP_LinkMap *pol);
+
 
+/// Return the link map type.
+LSUP_LinkType
+LSUP_link_map_type (const LSUP_LinkMap *map);
 
-/** @brief Add a term - term list pair to a connection list.
+
+/** @brief Add a term - term set pair to a link map.
+ *
+ * If there is already a term set for the given term, items from the added term
+ * set are added to the existing one (if not duplicated). Otherwise, the term
+ * set handle is linked to the new term.
+ *
+ * In any case, the caller should not directly use the term and term set after
+ * passing them to this function.
  *
- * @param[in] cl Connection list handle obtained with
- *  #LSUP_conn_list_new().
+ * @param[in] cmap Link map handle obtained with #LSUP_link_map_new().
  *
  *  @param[in] t Term to be associated with the given object list. The
- *   connection list structure takes ownership of the term.
+ *   link map structure takes ownership of the term.
  *
- *  @param[in] o NULL-terminated array of object term handles to be associated
- *   with the given predicate. The connection list structire takes ownership of
- *   the whole term array.
+ * @param[in] tset Term set to be associated with the given term. The link
+ *  map structure takes ownership of the term set and the terms in it.
  *
  * @return LSUP_OK on success; LSUP_MEM_ERR on allocation error.
  */
 LSUP_rc
-LSUP_conn_list_add (
-        LSUP_ConnectionList *cl, LSUP_Term *t, LSUP_Term **tl);
+LSUP_link_map_add (
+        LSUP_LinkMap *cmap, LSUP_Term *term, LSUP_TermSet *tset);
+
+
+/** @brief Create a new iterator to loop through a link map.
+ *
+ * @param[in] lmap Map handle to iterate.
+ */
+LSUP_LinkMapIterator *
+LSUP_link_map_iter_new (const LSUP_LinkMap *lmap);
+
+
+/** @brief Iterate over a link map and generate triples.
+ *
+ * Calling this function repeatedly builds triples for all the linked terms and
+ * term sets in the map, based on a given related term.
+ *
+ * @param[in] it Link map iterator handle, obtained with
+ *  #LSUP_link_map_iter_new().
+ *
+ * @param[in] term Term to relate to the link map.
+ *
+ * @param[in,out] spo Result triple. The triple handle must be pre-allocated
+ *  (it may be TRP_DUMMY) and calls to this function will set its members
+ *  to term handles owned by the link map. If rc != LSUP_OK, the contents are
+ *  undefined.
+ *
+ * @return LSUP_OK if a new triple was yielded; LSUP_END if the end of the loop
+ *  has been reached; <0 on error.
+ */
+LSUP_rc
+LSUP_link_map_triples (
+        LSUP_LinkMapIterator *it, LSUP_Term *term, LSUP_Triple *spo);
 
 #endif

+ 28 - 35
src/codec/grammar_ttl.y

@@ -79,48 +79,45 @@ base        ::= BASE WS IRIREF(D) PERIOD . {
             }
 
 triples 	::= subject(S) ows predObjList(L) PERIOD . {
-                size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
+                size_t ct = LSUP_graph_add_link_map (state->it, S, L);
                 state->ct += ct;
                 state->rc = LSUP_OK;
                 log_trace ("Added %lu triples.", ct);
 
                 LSUP_term_free (S);
-                LSUP_conn_list_free (L);
+                LSUP_link_map_free (L);
             }
 triples 	::= subject(S) ows predObjList(L) SEMICOLON PERIOD . [PERIOD] {
-                size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
+                size_t ct = LSUP_graph_add_link_map (state->it, S, L);
                 state->ct += ct;
                 state->rc = LSUP_OK;
                 log_trace ("Added %lu triples.", ct);
 
                 LSUP_term_free (S);
-                LSUP_conn_list_free (L);
+                LSUP_link_map_free (L);
             }
 
-%type predObjList       { LSUP_ConnectionList * }
-%destructor predObjList { LSUP_conn_list_free ($$); }
+%type predObjList       { LSUP_LinkMap * }
+%destructor predObjList { LSUP_link_map_free ($$); }
 predObjList(A) ::= predicate(P) ows objectList(O) . [SEMICOLON] {
-                A = LSUP_conn_list_new(LSUP_CONN_OUTBOUND);
-                LSUP_conn_list_add (A, P, O);
+                A = LSUP_link_map_new (LSUP_LINK_OUTBOUND);
+                LSUP_link_map_add (A, P, O);
             }
 predObjList(A) ::= predObjList(L) SEMICOLON predicate(P) ows objectList(O) . {
-                LSUP_conn_list_add (L, P, O);
+                LSUP_link_map_add (L, P, O);
                 A = L;
             }
 
-%type objectList { LSUP_Term ** }
-%destructor objectList {
-    log_debug ("Freeing object list.");
-    size_t i = 0;
-    while ($$[i]) LSUP_term_free ($$[i++]);
-    free ($$);
-}
+%type objectList { LSUP_TermSet * }
+%destructor objectList { LSUP_term_set_free ($$); }
 objectList(A) ::= objectList(L) COMMA object(O) . {
-                A = LSUP_term_list_add (L, O);
+                if (LSUP_term_set_add (L, O, NULL) == LSUP_NOACTION)
+                    LSUP_term_free (O);
+                A = L;
             }
 objectList(A) ::= object(O) . [IRIREF] {
-                A = calloc (sizeof (*A), 2);
-                A[0] = O;
+                A = LSUP_term_set_new();
+                LSUP_term_set_add (A, O, NULL);
             }
 
 %type subject { LSUP_Term * }
@@ -219,10 +216,10 @@ blank(A)    ::= LBRACKET RBRACKET . [BNODE_ID] {
             }
 blank(A)    ::= LBRACKET predObjList(L) RBRACKET . [BNODE_ID] {
                 A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
-                state->ct += LSUP_spo_list_add_triples (state->it, A, L);
+                state->ct += LSUP_graph_add_link_map (state->it, A, L);
                 log_trace ("Created list BN: _:%s", A->data);
 
-                LSUP_conn_list_free (L);
+                LSUP_link_map_free (L);
             }
 blank       ::= collection . [BNODE_ID]
 blank(A)    ::= LPAREN RPAREN . [BNODE_ID] {
@@ -239,23 +236,19 @@ blank(A)    ::= LPAREN RPAREN . [BNODE_ID] {
 // Collection triples are added here to the graph.
 collection(A) ::= LPAREN itemList(L) RPAREN . {
                 A = LSUP_bnode_add_collection (state->it, L);
-                // Free the item list.
-                size_t i = 0;
-                while (L[i]) LSUP_term_free (L[i++]);
-                free (L);
+                LSUP_term_set_free (L);
             }
 
-%type itemList { LSUP_Term ** }
-%destructor itemList {
-    log_debug ("Freeing item list.");
-    size_t i = 0;
-    while ($$[i]) LSUP_term_free ($$[i++]);
-    free ($$);
-}
-itemList(A) ::= itemList(L) ows object(O) . { A = LSUP_term_list_add (L, O); }
+%type itemList { LSUP_TermSet * }
+%destructor itemList { LSUP_term_set_free ($$); }
+itemList(A) ::= itemList(L) ows object(O) . {
+                if (LSUP_term_set_add (L, O, NULL) == LSUP_NOACTION)
+                    LSUP_term_free (O);
+                A = L;
+            }
 itemList(A) ::= object(O) . {
-                A = calloc (sizeof (*A), 2);
-                A[0] = O;
+                A = LSUP_term_set_new ();
+                LSUP_term_set_add (A, O, NULL);
             }
 
 %type resource { LSUP_Term * }

+ 9 - 29
src/environment.c

@@ -13,35 +13,12 @@
 /*
  * External variables.
  */
+LSUP_TermSet *LSUP_term_cache = NULL;
 LSUP_NSMap *LSUP_default_nsm = NULL;
 LSUP_Term *LSUP_default_ctx = NULL;
 LSUP_Buffer *LSUP_default_ctx_buf = NULL;
 
 
-/*
- * Hashmap callbacks.
- */
-
-static uint64_t
-term_cache_hash_fn (
-        const void *item, uint64_t seed0, uint64_t seed1)
-{ return ((const LSUP_KeyedTerm *) item)->key; }
-
-
-static int
-term_cache_cmp_fn (const void *a, const void *b, void *udata)
-{
-    return 
-            ((const LSUP_KeyedTerm *) a)->key -
-            ((const LSUP_KeyedTerm *) b)->key;
-}
-
-
-static void
-term_cache_free_fn (void *item)
-{ LSUP_term_free (((LSUP_KeyedTerm *) item)->term); }
-
-
 /*
  * API
  */
@@ -98,13 +75,16 @@ LSUP_init (void)
     if (UNLIKELY (!LSUP_default_ctx_buf)) return LSUP_ERROR;
 
     // Initialize term cache.
-    LSUP_term_cache = hashmap_new (
-            sizeof(LSUP_KeyedTerm), 0, LSUP_HASH_SEED, 0,
-            term_cache_hash_fn, term_cache_cmp_fn, term_cache_free_fn, NULL);
+    LSUP_term_cache = LSUP_term_set_new();
+    if (UNLIKELY (!LSUP_term_cache)) return LSUP_MEM_ERR;
+
     // Create and cache default literal datatype key.
+    // This will be done only once in the program, so no need to check for
+    // duplicates.
     LSUP_default_datatype = LSUP_iriref_new (DEFAULT_DTYPE, NULL);
-    uint32_t dtype_hash = LSUP_term_hash (LSUP_default_datatype );
-    LSUP_tcache_add (dtype_hash, LSUP_default_datatype);
+    LSUP_rc rc = LSUP_term_set_add (
+            LSUP_term_cache, LSUP_default_datatype, NULL);
+    PRCCK (rc);
 
     // Set automatic teardown TODO Is this a good idea?
     atexit (LSUP_done);

+ 134 - 38
src/graph.c

@@ -249,6 +249,9 @@ LSUP_graph_add_init (LSUP_Graph *gr)
 LSUP_rc
 LSUP_graph_add_iter (LSUP_GraphIterator *it, const LSUP_Triple *spo)
 {
+    log_trace (
+            "Adding triple: {%s, %s, %s}",
+            spo->s->data, spo->p->data, spo->o->data);
     LSUP_BufferTriple *sspo = LSUP_triple_serialize (spo);
     if (UNLIKELY (!sspo)) return LSUP_MEM_ERR;
     const LSUP_StoreInt *sif = it->graph->store->sif;
@@ -501,45 +504,132 @@ LSUP_graph_abort (LSUP_Graph *gr)
 }
 
 
-size_t
-LSUP_spo_list_add_triples (
-        LSUP_GraphIterator *it, LSUP_Term *t, const LSUP_ConnectionList *cl)
+LSUP_LinkMap *
+LSUP_graph_connections (
+        LSUP_Graph *gr, LSUP_Term *t, LSUP_LinkType type)
 {
-    size_t ct = 0;
-    if (!t) {
-        log_error ("Related term is NULL!");
-        return LSUP_VALUE_ERR;
+    LSUP_Term
+        *s = NULL,
+        *p = NULL,
+        *o = NULL;
+
+    // Position of passed term and link terms, respectively.
+    LSUP_TriplePos pos1, pos2;
+
+    if (type == LSUP_LINK_INBOUND) {
+        o = t;
+        pos1 = TRP_POS_O;
+        pos2 = TRP_POS_P;
+    } else if (type == LSUP_LINK_OUTBOUND) {
+        s = t;
+        pos1 = TRP_POS_S;
+        pos2 = TRP_POS_P;
+    } else if (type == LSUP_LINK_EDGE) {
+        p = t;
+        pos1 = TRP_POS_P;
+        pos2 = TRP_POS_S;
+    } else {
+        // Very unlikely.
+        log_error ("Invalid connection type: %d", type);
+        return NULL;
     }
-    if (!cl->p) {
-        log_error ("Predicate is NULL!");
-        return LSUP_VALUE_ERR;
+
+    LSUP_GraphIterator *it = LSUP_graph_lookup (gr, s, p, o, NULL);
+
+    LSUP_LinkMap *ret = LSUP_link_map_new (type);
+    LSUP_BufferTriple *sspo = BTRP_DUMMY;
+    // Gather all linking terms in a set first.
+    LSUP_TermSet *lts = LSUP_term_set_new();
+    while (graph_iter_next_buffer (it, sspo) != LSUP_END) {
+        LSUP_Term
+            *ex = NULL,
+            *ins = LSUP_term_new_from_buffer (LSUP_btriple_pos (sspo, pos2));
+        LSUP_term_set_add (lts, ins, &ex);
+
+        if (ex) LSUP_term_free (ins);
     }
-    if (!cl->tl) {
-        log_error ("Term list is NULL!");
-        return LSUP_VALUE_ERR;
+    LSUP_graph_iter_free(it);
+
+    size_t i = 0;
+    LSUP_Term *lt;
+    while (LSUP_term_set_next (lts, &i, &lt) != LSUP_END) {
+        LSUP_link_map_add (
+                ret, lt, LSUP_graph_term_set (gr, t, pos1, lt, pos2));
     }
 
-    LSUP_Triple *spo = LSUP_triple_new (NULL, NULL, NULL);
-    if (cl->type == LSUP_CONN_INBOUND)
-        spo->o = t;
-    else if (cl->type == LSUP_CONN_OUTBOUND)
-        spo->s = t;
-    else spo->p = t;
-
-    for (size_t i = 0; cl->p[i]; i++) {
-        if (cl->type == LSUP_CONN_EDGE) spo->s = cl->p[i];
-        else spo->p = cl->p[i];
-
-        for (size_t j = 0; cl->tl[i][j]; j++) {
-            if (cl->type == LSUP_CONN_INBOUND)
-                spo->s = cl->tl[i][j];
-            else
-                spo->o = cl->tl[i][j];
-
-            LSUP_rc rc = LSUP_graph_add_iter (it, spo);
-            if (rc >= 0) ct++;
-            PRCCK (rc);
-        }
+    return ret;
+}
+
+
+LSUP_TermSet *
+LSUP_graph_term_set (
+        LSUP_Graph *gr, LSUP_Term *t1, LSUP_TriplePos t1_pos,
+        LSUP_Term *t2, LSUP_TriplePos t2_pos)
+{
+    if (t1_pos == t2_pos) {
+        log_error ("Term 1 and 2 positions cannot be the same!");
+        return NULL;
+    }
+
+    LSUP_Term *spo_l[3] = {NULL};
+    spo_l[t1_pos] = t1;
+    spo_l[t2_pos] = t2;
+    LSUP_TriplePos rpos;  // Position of term to be added to results.
+    for (int i = 0; i < 3; i++)
+        if (t1_pos != i && t2_pos != i) rpos = i;
+
+    LSUP_GraphIterator *it = LSUP_graph_lookup (
+            gr, spo_l[0], spo_l[1], spo_l[2], NULL);
+
+    LSUP_TermSet *ts = LSUP_term_set_new();
+    LSUP_BufferTriple *sspo = BTRP_DUMMY;
+    while (graph_iter_next_buffer (it, sspo) != LSUP_END) {
+        // There cannot be duplicates in a 2-bound lookup.
+        LSUP_term_set_add (
+                ts,
+                LSUP_term_new_from_buffer (LSUP_btriple_pos (sspo, rpos)),
+                NULL);
+    }
+    LSUP_graph_iter_free (it);
+
+    return ts;
+}
+
+
+LSUP_TermSet *
+LSUP_graph_unique_terms (LSUP_Graph *gr, LSUP_TriplePos pos)
+{
+    // TODO We should use spo indices for stores that have them...
+    LSUP_GraphIterator *it = LSUP_graph_lookup (gr, NULL, NULL, NULL, NULL);
+
+    LSUP_BufferTriple *sspo = BTRP_DUMMY;
+    LSUP_TermSet *ts = LSUP_term_set_new();
+    while (graph_iter_next_buffer (it, sspo) != LSUP_END) {
+        LSUP_Term
+            *ex = NULL,
+            *ins = LSUP_term_new_from_buffer (LSUP_btriple_pos (sspo, pos));
+        LSUP_term_set_add (ts, ins, &ex);
+
+        if (ex) LSUP_term_free (ins);
+    }
+    LSUP_graph_iter_free(it);
+
+    return ts;
+}
+
+
+size_t
+LSUP_graph_add_link_map (
+        LSUP_GraphIterator *it, LSUP_Term *t, LSUP_LinkMap *lmap)
+{
+    LSUP_Triple *spo = TRP_DUMMY;
+    size_t ct = 0;
+    LSUP_LinkMapIterator *lmit = LSUP_link_map_iter_new (lmap);
+
+    while (LSUP_link_map_triples (lmit, t, spo) != LSUP_END) {
+        LSUP_rc rc = LSUP_graph_add_iter (it, spo);
+        if (rc >= 0) ct++;
+        PRCCK (rc);
     }
     free (spo);
 
@@ -548,7 +638,7 @@ LSUP_spo_list_add_triples (
 
 
 LSUP_Term *
-LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol)
+LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_TermSet *ts)
 {
     LSUP_NSMap *nsm = LSUP_graph_namespace (LSUP_graph_iter_graph (it));
     LSUP_Term
@@ -560,16 +650,22 @@ LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol)
 
     LSUP_Triple *spo = TRP_DUMMY;
     link = s;
-    for (size_t i = 0; ol[i]; i++) {
+    size_t i = 0;
+    LSUP_Term *t;
+    while (LSUP_term_set_next (ts, &i, &t) != LSUP_END) {
         spo->s = link;
         spo->p = rdf_first;
-        spo->o = ol[i];
+        spo->o = t;
         PRCNL (LSUP_graph_add_iter (it, spo));
 
         spo->p = rdf_rest;
+        size_t save_i = i; // Save iterator position to restore it after peek.
         spo->o = (
-                ol[i + 1] ? LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL)
+                // Peek into the next result.
+                LSUP_term_set_next (ts, &i, NULL) != LSUP_END ?
+                LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL)
                 : rdf_nil);
+        i = save_i; // Restore the iterator that advanced when peeking.
 
         PRCNL (LSUP_graph_add_iter (it, spo));
 

+ 239 - 74
src/term.c

@@ -24,11 +24,51 @@ struct iri_info_t {
 };
 
 
+/// Key-term pair in term set.
+typedef struct keyed_term {
+    LSUP_Key            key;        ///< Key (hash) of the term.
+    LSUP_Term *         term;       ///< Term handle.
+} KeyedTerm;
+
+
+/** @brief Single link between a term and a term set.
+ *
+ * This link is not qualified and may not be used by itself. It belongs
+ * in a #LSUP_LinkMap which qualifies all links of the same type.
+ */
+typedef struct link {
+    KeyedTerm *         term;       ///< Linked term.
+    LSUP_TermSet *      tset;       ///< Term set linked to the term.
+} Link;
+
+
+/// Opaque link map iterator.
+struct link_map_iter {
+    const LSUP_LinkMap *map;        ///< Link map to iterate.
+    size_t              i;          ///< External loop cursor.
+    size_t              j;          ///< Internal loop cursor.
+    Link *              link;       ///< Current link being retrieved.
+};
+
+
+/*
+ * A link map is thus nested:
+ *
+ * - A link map contains a hash map of Link instances (link).
+ * - Each Link contains a KeyedTerm (term) and a TermSet (tset).
+ * - Each term set is a hash map of KeyedTerm instances.
+ * - Each KeyedTerm contains a Term and its hash.
+ */
+typedef struct link_map {
+    LSUP_LinkType       type;       ///< Link type.
+    struct hashmap *    links;       ///< Map of #Link instances.
+} LSUP_LinkMap;
+
+
 /*
- * Extern variables.
+ * External variables.
  */
 
-struct hashmap *LSUP_term_cache = NULL;
 uint32_t LSUP_default_dtype_key = 0;
 regex_t *LSUP_uri_ptn;
 LSUP_Term *LSUP_default_datatype = NULL;
@@ -52,6 +92,58 @@ term_init (
 
 
 /*
+ * Term set callbacks.
+ */
+
+static uint64_t
+tset_hash_fn (
+        const void *item, uint64_t seed0, uint64_t seed1)
+{ return ((const KeyedTerm *) item)->key; }
+
+
+static int
+tset_cmp_fn (const void *a, const void *b, void *udata)
+{
+    return 
+            ((const KeyedTerm *) a)->key -
+            ((const KeyedTerm *) b)->key;
+}
+
+
+static void
+tset_free_fn (void *item)
+{ LSUP_term_free (((KeyedTerm *) item)->term); }
+
+
+/*
+ * Link map callbacks.
+ */
+
+static uint64_t
+link_map_hash_fn (
+        const void *item, uint64_t seed0, uint64_t seed1)
+{ return ((const Link *)item)->term->key; }
+
+
+static int
+link_map_cmp_fn (const void *a, const void *b, void *udata)
+{
+    return 
+            ((const Link *)a)->term->key -
+            ((const Link *)b)->term->key;
+}
+
+
+static void
+link_map_free_fn (void *item)
+{
+    Link *link = item;
+    LSUP_term_free (link->term->term);
+    LSUP_term_set_free (link->tset);
+}
+
+
+ /*
  * Term API.
  */
 
@@ -431,25 +523,46 @@ LSUP_triple_free (LSUP_Triple *spo)
 }
 
 
+/*
+ * Multi-add functions.
+ */
+
+LSUP_TermSet *
+LSUP_term_set_new ()
+{
+    // Capacity of 8 is an arbitrary guess.
+    LSUP_TermSet *ts = hashmap_new (
+            sizeof (KeyedTerm), 8, LSUP_HASH_SEED, 0,
+            tset_hash_fn, tset_cmp_fn, tset_free_fn, NULL);
+    if (UNLIKELY (hashmap_oom (ts))) return NULL;
+
+    return ts;
+}
+
+
 LSUP_rc
-LSUP_tcache_add (const LSUP_Key key, const LSUP_Term *term)
+LSUP_term_set_add (LSUP_TermSet *ts, LSUP_Term *term, LSUP_Term **existing)
 {
-    LSUP_KeyedTerm entry_s = {.key=key, .term=(LSUP_Term *)term};
+    LSUP_Hash key = LSUP_term_hash (term);
+    KeyedTerm entry_s = {.key=key, .term=term};
 
-    // Many calls will likely attempt inserting duplicates after the first one.
-    if (LIKELY (hashmap_get (LSUP_term_cache, &entry_s))) return LSUP_NOACTION;
+    KeyedTerm *ex = hashmap_get (ts, &entry_s);
+    if (ex) {
+        if (existing) *existing = ex->term;
+        return LSUP_NOACTION;
+    }
 
-    hashmap_set (LSUP_term_cache, &entry_s);
+    hashmap_set (ts, &entry_s);
+    if (hashmap_oom (ts)) return LSUP_MEM_ERR;
 
     return LSUP_OK;
 }
 
 
 const LSUP_Term *
-LSUP_tcache_get (LSUP_Key key)
+LSUP_term_set_get (LSUP_TermSet *ts, LSUP_Key key)
 {
-    LSUP_KeyedTerm *entry = hashmap_get (
-            LSUP_term_cache, &(LSUP_KeyedTerm){.key=key});
+    KeyedTerm *entry = hashmap_get (ts, &(KeyedTerm){.key=key});
     if (entry) log_trace ("ID found for key %lx: %s", key, entry->term->data);
     else log_trace ("No ID found for key %lx.", key);
 
@@ -457,85 +570,137 @@ LSUP_tcache_get (LSUP_Key key)
 }
 
 
-/*
- * Multi-add functions.
- */
-
-LSUP_Term **
-LSUP_term_list_add (LSUP_Term **tl, LSUP_Term *t)
+/** @brief Get the next term from a term set.
+ *
+ * @param[in] ts Term set to iterate over.
+ *
+ * @param[in,out] i Iteration cursor, passed to hashmap_iter(), which advances
+ *  it. NOTE(review): presumably it must be initialized to 0 before the first
+ *  call -- confirm against the hashmap library documentation.
+ *
+ * @param[out] term If not NULL, it is set to the term handle stored in the
+ *  entry just visited.
+ *
+ * @return LSUP_OK if an entry was retrieved; LSUP_END if hashmap_iter()
+ *  found no further entries.
+ */
+LSUP_rc
+LSUP_term_set_next (LSUP_TermSet *ts, size_t *i, LSUP_Term **term)
 {
-    size_t i = 0;
-    while (tl[i++]); // Count includes sentinel.
-    LSUP_Term **ret = realloc (tl, sizeof (*tl) * (i + 1));
-    if (!ret) return NULL;
+    KeyedTerm *kt = NULL;
+    if (!hashmap_iter (ts, i, (void **)&kt)) return LSUP_END;
 
-    ret[i - 1] = t;
-    ret[i] = NULL;
+    if (term) *term = kt->term;
 
-    return ret;
+    return LSUP_OK;
 }
 
 
-LSUP_ConnectionList *
-LSUP_conn_list_new (LSUP_ConnectionType type)
+/** @brief Free a term set.
+ *
+ * @param[in] ts Term set handle. It is passed as is to hashmap_free().
+ */
+void
+LSUP_term_set_free (LSUP_TermSet *ts)
+{
+    hashmap_free (ts);
+}
+
+
+/** @brief Create a new, empty link map.
+ *
+ * @param[in] type Link type stored in the map and later used by
+ *  #LSUP_link_map_triples() to decide how triples are assembled.
+ *
+ * @return New link map handle, or NULL on allocation failure. It should be
+ *  freed with #LSUP_link_map_free().
+ */
+LSUP_LinkMap *
+LSUP_link_map_new (LSUP_LinkType type)
 {
-    LSUP_ConnectionList *cl;
-    MALLOC_GUARD (cl, NULL);
-    cl->type = type;
-    // Set sentinels.
-    CALLOC_GUARD (cl->t, NULL);
-    CALLOC_GUARD (cl->tl, NULL);
-
-    return cl;
+    LSUP_LinkMap *cm;
+    MALLOC_GUARD (cm, NULL);
+    cm->type = type;
+    cm->links = hashmap_new (
+            sizeof (Link), 0, LSUP_HASH_SEED, 0,
+            link_map_hash_fn, link_map_cmp_fn, link_map_free_fn, NULL);
+    // hashmap_new() returns NULL on allocation failure. Do not hand out a
+    // half-built map whose `links` member cannot be used.
+    if (UNLIKELY (!cm->links)) {
+        free (cm);
+        return NULL;
+    }
+
+    return cm;
 }
 
 
 void
-LSUP_conn_list_free (LSUP_ConnectionList *cl)
+LSUP_link_map_free (LSUP_LinkMap *cm)
 {
-    log_debug ("Freeing predicate object list.");
-    for (size_t i = 0; cl->t[i]; i++) {
-        // Free individual predicate handles.
-        LSUP_term_free (cl->t[i]);
-    }
-    // Free pred list.
-    free (cl->t);
+    hashmap_free (cm->links);
+    free (cm);
+}
+
 
-    for (size_t i = 0; cl->tl[i]; i++) {
-        for (size_t j = 0; cl->tl[i][j]; j++) {
-            // Free individual term handles.
-            LSUP_term_free (cl->tl[i][j]);
+/** @brief Get the link type of a link map.
+ *
+ * @param[in] map Link map handle.
+ *
+ * @return The type stored in the map.
+ */
+LSUP_LinkType
+LSUP_link_map_type (const LSUP_LinkMap *map)
+{
+    return map->type;
+}
+
+
+/** @brief Add a linking term and its term set to a link map.
+ *
+ * If the linking term is not in the map yet, a new link is created from
+ * `term` and `tset`. Otherwise, the terms in `tset` are merged one by one
+ * into the term set of the existing link: terms whose key is already present
+ * are freed, the others are inserted; `term` itself is then freed.
+ *
+ * TODO Memory error handling when merging into an existing link.
+ *
+ * @param[in] cmap Link map to add to.
+ *
+ * @param[in] term Linking term.
+ *
+ * @param[in] tset Term set related to the linking term.
+ *
+ * @return LSUP_OK on success; LSUP_MEM_ERR if a new link could not be
+ *  allocated or inserted. In the latter case `term` and `tset` are not
+ *  touched.
+ */
+LSUP_rc
+LSUP_link_map_add (
+        LSUP_LinkMap *cmap, LSUP_Term *term, LSUP_TermSet *tset)
+{
+    // Keyed term to look up the link term and insert it, if necessary.
+    KeyedTerm entry_s = {.key=LSUP_term_hash (term), .term=term};
+
+    Link *ex = hashmap_get (cmap->links, &(Link){.term=&entry_s});
+    if (ex) {
+        // Add terms one by one to the existing term set.
+        log_trace (
+                "Linking term %s exists. Adding individual terms.",
+                ex->term->term->data);
+        size_t i = 0;
+        KeyedTerm *kt;
+        while (hashmap_iter (tset, &i, (void **)&kt)) {
+            log_trace (
+                    "Adding term %s to link %s",
+                    kt->term->data, ex->term->term->data);
+            if (hashmap_get (ex->tset, kt))
+                // Term already exist, free the new one and move on.
+                LSUP_term_free (kt->term);
+            else
+                // Insert KeyedTerm, the term set now owns the underlying term.
+                hashmap_set (ex->tset, kt);
         }
-        // Free object list.
-        free (cl->tl[i]);
+        // NOTE(review): the `tset` container itself is not released here;
+        // freeing it with LSUP_term_set_free() would presumably also free
+        // the terms that were just transferred -- confirm intended ownership.
+
+        // Free link term that hasn't been used.
+        LSUP_term_free (term);
+    } else {
+        // Add the new term and the termset wholesale.
+        log_trace ("Adding new linking term %s.", term->data);
+        // Allocate inserted member on heap, it will be owned by the map.
+        KeyedTerm *ins;
+        MALLOC_GUARD (ins, LSUP_MEM_ERR);
+        memcpy (ins, &entry_s, sizeof (entry_s));
+        // Both members are set by designator; without the leading dot,
+        // `tset=tset` is a self-assignment used as a positional initializer.
+        Link link = {.term=ins, .tset=tset};
+        hashmap_set (cmap->links, &link);
+        if (UNLIKELY (hashmap_oom (cmap->links))) {
+            // The link was not stored: release the wrapper allocated above.
+            free (ins);
+            return LSUP_MEM_ERR;
+        }
     }
-    // Free list of object lists.
-    free (cl->tl);
 
-    free (cl);
+    return LSUP_OK;
+}
+
+
+/** @brief Create an iterator over a link map.
+ *
+ * The iterator is allocated with CALLOC_GUARD and only its `map` member is
+ * set explicitly.
+ *
+ * @param[in] lmap Link map to iterate over.
+ *
+ * @return New iterator handle, or NULL if the allocation guard fails.
+ */
+LSUP_LinkMapIterator *
+LSUP_link_map_iter_new (const LSUP_LinkMap *lmap)
+{
+    LSUP_LinkMapIterator *iter;
+    CALLOC_GUARD (iter, NULL);
+
+    iter->map = lmap;
+    return iter;
 }
 
 
+/** @brief Yield the next triple that a link map generates for a term.
+ *
+ * Each call fills `spo` with one combination of the external term, a linking
+ * term of the map, and a term from that link's term set. The position of
+ * each of them in the triple depends on the map type:
+ *
+ * - LSUP_LINK_INBOUND: s = set term, p = linking term, o = external term.
+ * - LSUP_LINK_OUTBOUND: s = external term, p = linking term, o = set term.
+ * - any other type (edge): s = linking term, p = external term, o = set term.
+ *
+ * The iterator keeps the current link and both cursors between calls, so
+ * that repeated calls walk every term set of every link.
+ *
+ * @param[in] it Link map iterator.
+ *
+ * @param[in] term External term that the links relate to.
+ *
+ * @param[out] spo Triple whose members are pointed to `term` and to the
+ *  handles stored in the map; no term is copied here.
+ *
+ * @return LSUP_OK if a triple was yielded; LSUP_END when no links are left.
+ *  In the latter case only the member for the external term has been set.
+ */
 LSUP_rc
-LSUP_conn_list_add (LSUP_ConnectionList *cl, LSUP_Term *t, LSUP_Term **tl)
+LSUP_link_map_triples (
+        LSUP_LinkMapIterator *it, LSUP_Term *term, LSUP_Triple *spo)
 {
-    size_t i;
-
-    i = 0;
-    while (cl->t[i++]);  // Count includes sentinel.
-    LSUP_Term **tmp_t = realloc (cl->t, sizeof (*cl->t) * (i + 1));
-    if (!tmp_t) return LSUP_MEM_ERR;
-    tmp_t[i - 1] = t;
-    tmp_t[i] = NULL;
-    cl->t = tmp_t;
-
-    i = 0;
-    while (cl->tl[i++]);
-    LSUP_Term ***tmp_tl = realloc (cl->tl, sizeof (*cl->tl) * (i + 1));
-    if (!tmp_tl) return LSUP_MEM_ERR;
-    tmp_tl[i - 1] = tl;
-    tmp_tl[i] = NULL;
-    cl->tl = tmp_tl;
+    // Assign external (related) term.
+    if (it->map->type == LSUP_LINK_INBOUND)
+        spo->o = term;
+    else if (it->map->type == LSUP_LINK_OUTBOUND)
+        spo->s = term;
+    else spo->p = term;
+
+    KeyedTerm *kt;
+
+    // If we are already handling a link, continue the internal loop.
+    if (it->link) goto int_loop;
+ext_loop:
+    // Advance external counter and start new internal loop.
+    it->j = 0;
+    if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
+        return LSUP_END;
+int_loop:
+    // If end of the term set is reached, start with a new linking term.
+    if (!hashmap_iter (it->link->tset, &it->j, (void **)&kt)) goto ext_loop;
+
+    // Continue pulling from term set.
+    // Assign linking term.
+    if (it->map->type == LSUP_LINK_EDGE) spo->s = it->link->term->term;
+    else spo->p = it->link->term->term;
+
+    // Assign term in term set.
+    if (it->map->type == LSUP_LINK_INBOUND) spo->s = kt->term;
+    else spo->o = kt->term;
 
     return LSUP_OK;
 }
@@ -670,14 +835,14 @@ term_init (
             return LSUP_VALUE_ERR;
         }
 
-        uint32_t dtype_hash = LSUP_term_hash (term->datatype);
-
-        const LSUP_Term *tmp = LSUP_tcache_get (dtype_hash);
-        if (!tmp) LSUP_tcache_add (dtype_hash, term->datatype);
-        else if (term->datatype != tmp) {
+        LSUP_Term *ex = NULL;
+        LSUP_term_set_add (LSUP_term_cache, term->datatype, &ex);
+        if (ex && ex != term->datatype) {
+            // Replace datatype handle with the one in term cache, and free
+            // the new one.
             if (term->datatype != LSUP_default_datatype)
                 LSUP_term_free (term->datatype);
-            term->datatype = (LSUP_Term *)tmp;
+            term->datatype = ex;
         }
 
         //log_trace ("Datatype address: %p", term->datatype);