term.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019
  1. #include "tpl.h"
  2. #include "volksdata/term.h"
  3. /** @brief tpl packing format for a term.
  4. *
  5. * The pack elements are: 1. term type (char); 2. data (string); 3. void* type
  6. * metadata, cast to 8-byte unsigned.
  7. */
  8. #define TERM_PACK_FMT "csU"
  9. #define MAX_VALID_TERM_TYPE VOLK_TERM_BNODE /* For type validation. */
  10. /*
  11. * Data structures.
  12. */
  13. /// Sub-match coordinates in IRI parsing results.
  14. typedef struct match_coord_t {
  15. size_t offset; ///< Offset of match from start of string.
  16. size_t size; ///< Length of match.
  17. } MatchCoord;
  18. /// Matching sub-patterns for IRI parts.
  19. struct iri_info_t {
  20. MatchCoord prefix; ///< URI prefix (scheme + authority).
  21. MatchCoord path; ///< URI path (including fragment).
  22. MatchCoord frag; ///< URI fragment.
  23. };
  24. /// Key-term pair in term set.
  25. typedef struct keyed_term {
  26. VOLK_Key key; ///< Key (hash) of the term.
  27. VOLK_Term * term; ///< Term handle.
  28. } KeyedTerm;
  29. /** @brief Single link between a term and a term set.
  30. *
  31. * This link is not qualified and must not be used by itself. It belongs
  32. * in a #VOLK_LinkMap which qualifies all links of the same type.
  33. */
  34. typedef struct link {
  35. KeyedTerm * term; ///< Linked term.
  36. VOLK_TermSet * tset; ///< Term set linked to the term.
  37. } Link;
  38. /// Opaque link map iterator.
  39. struct link_map_iter {
  40. const VOLK_LinkMap *map; ///< Link map to iterate.
  41. size_t i; ///< Linking term loop cursor.
  42. size_t j; ///< Term set loop cursor.
  43. const Link * link; ///< Current link being retrieved.
  44. };
  45. /*
  46. * A link map is thus nested:
  47. *
  48. * - A link map contains a hash map of Link instances (link).
  49. * - It also contains the single term that the other terms are related to
  50. * (linked_t).
  51. * - Each Link contains a KeyedTerm (term) and a TermSet (tset).
  52. * - Each term set is a hash map of KeyedTerm instances.
  53. * - Each KeyedTerm contains a Term and its hash.
  54. */
  55. typedef struct link_map {
  56. VOLK_LinkType type; ///< Link type.
  57. VOLK_Term *linked_t; ///< Linked term.
  58. struct hashmap *links; ///< Map of #Link instances.
  59. } VOLK_LinkMap;
  60. /*
  61. * External variables.
  62. */
  63. uint32_t VOLK_default_dtype_key = 0;
  64. VOLK_Term *VOLK_default_datatype = NULL;
  65. VOLK_TermSet *VOLK_term_cache = NULL;
  66. /*
  67. * Static variables.
  68. */
  69. // Characters not allowed in a URI string.
  70. static const char *invalid_uri_chars = "<>\" {}|\\^`";
  71. /// Minimum valid type code.
  72. static const VOLK_TermType MIN_VALID_TYPE = VOLK_TERM_IRIREF;
  73. /// Maximum valid type code. Change this if adding to enum VOLK_TermType.
  74. static const VOLK_TermType MAX_VALID_TYPE = VOLK_TERM_BNODE;
  75. /*
  76. * Static prototypes.
  77. */
  78. static VOLK_rc
  79. term_init (
  80. VOLK_Term *term, VOLK_TermType type, const char *data, void *metadata);
  81. /*
  82. * Term set callbacks.
  83. */
  84. static uint64_t
  85. tset_hash_fn (
  86. const void *item, uint64_t seed0, uint64_t seed1)
  87. { return ((const KeyedTerm *) item)->key; }
  88. static int
  89. tset_cmp_fn (const void *a, const void *b, void *udata)
  90. {
  91. return
  92. ((const KeyedTerm *) a)->key -
  93. ((const KeyedTerm *) b)->key;
  94. }
  95. static void
  96. tset_free_fn (void *item)
  97. { VOLK_term_free (((KeyedTerm *) item)->term); }
  98. /*
  99. * Link map callbacks.
  100. */
  101. static uint64_t
  102. link_map_hash_fn (
  103. const void *item, uint64_t seed0, uint64_t seed1)
  104. { return ((const Link *)item)->term->key; }
  105. static int
  106. link_map_cmp_fn (const void *a, const void *b, void *udata)
  107. {
  108. return
  109. ((const Link *)a)->term->key -
  110. ((const Link *)b)->term->key;
  111. }
  112. static void
  113. link_map_free_fn (void *item)
  114. {
  115. Link *link = item;
  116. VOLK_term_free (link->term->term);
  117. free (link->term);
  118. VOLK_term_set_free (link->tset);
  119. }
  120. static VOLK_rc parse_iri (char *iri, MatchCoord coords[]);
  121. /*
  122. * Term API.
  123. */
  124. VOLK_Term *
  125. VOLK_term_new (
  126. VOLK_TermType type, const char *data, void *metadata)
  127. {
  128. VOLK_Term *term;
  129. CALLOC_GUARD (term, NULL);
  130. if (UNLIKELY (term_init (
  131. term, type, data, metadata) != VOLK_OK)) {
  132. free (term);
  133. return NULL;
  134. }
  135. return term;
  136. }
  137. VOLK_Term *
  138. VOLK_term_copy (const VOLK_Term *src)
  139. {
  140. void *metadata = NULL;
  141. if (src->type == VOLK_TERM_LITERAL)
  142. metadata = (void *) src->datatype;
  143. else if (src->type == VOLK_TERM_LT_LITERAL)
  144. metadata = (void *) src->lang;
  145. return VOLK_term_new (src->type, src->data, metadata);
  146. }
  147. VOLK_Term *
  148. VOLK_term_new_from_buffer (const VOLK_Buffer *sterm)
  149. {
  150. if (UNLIKELY (!sterm)) return NULL;
  151. VOLK_Term *term = NULL;
  152. VOLK_TermType type = VOLK_TERM_UNDEFINED;
  153. char *data = NULL;
  154. void *metadata;
  155. tpl_node *tn;
  156. tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
  157. if (UNLIKELY (!tn)) goto finally;
  158. if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
  159. log_error ("Error loading serialized term.");
  160. goto finally;
  161. }
  162. if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
  163. log_error ("Error unpacking serialized term.");
  164. goto finally;
  165. }
  166. if (type == VOLK_TERM_LT_LITERAL)
  167. term = VOLK_lt_literal_new (data, (char *)&metadata);
  168. else term = VOLK_term_new (type, data, metadata);
  169. finally:
  170. tpl_free (tn);
  171. free (data);
  172. return term;
  173. }
  174. VOLK_Term *
  175. VOLK_iriref_new_abs (const VOLK_Term *root, const VOLK_Term *iri)
  176. {
  177. if (iri->type != VOLK_TERM_IRIREF) {
  178. log_error ("Provided path is not an IRI.");
  179. return NULL;
  180. }
  181. if (root->type != VOLK_TERM_IRIREF) {
  182. log_error ("Provided root is not an IRI.");
  183. return NULL;
  184. }
  185. char
  186. *data,
  187. *pfx = VOLK_iriref_prefix (iri);
  188. if (strlen (pfx) > 0) data = iri->data;
  189. else if (iri->data[0] == '/') {
  190. free (pfx);
  191. pfx = VOLK_iriref_prefix (root);
  192. data = malloc (strlen (iri->data) + strlen (pfx) + 1);
  193. if (!data) return NULL;
  194. sprintf (data, "%s%s", pfx, iri->data);
  195. } else {
  196. data = malloc (strlen (iri->data) + strlen (root->data) + 1);
  197. if (!data) return NULL;
  198. sprintf (data, "%s%s", root->data, iri->data);
  199. }
  200. free (pfx);
  201. VOLK_Term *ret = VOLK_iriref_new (data);
  202. if (data != iri->data) free (data);
  203. return ret;
  204. }
  205. VOLK_Term *
  206. VOLK_iriref_new_rel (const VOLK_Term *root, const VOLK_Term *iri)
  207. {
  208. if (iri->type != VOLK_TERM_IRIREF) {
  209. log_error ("Provided path is not an IRI.");
  210. return NULL;
  211. }
  212. if (root->type != VOLK_TERM_IRIREF) {
  213. log_error ("Provided root is not an IRI.");
  214. return NULL;
  215. }
  216. size_t offset = (
  217. strstr (iri->data, root->data) == iri->data ?
  218. strlen (root->data) : 0);
  219. return VOLK_iriref_new (iri->data + offset);
  220. }
  221. VOLK_Buffer *
  222. VOLK_term_serialize (const VOLK_Term *term)
  223. {
  224. /*
  225. * In serializing a term, the fact that two terms of different types may
  226. * be semantically identical must be taken into account. Specifically, a
  227. * namespace-prefixed IRI ref is identical to its fully qualified version,
  228. * and a VOLK_TERM_LT_LITERAL with no language tag is identical to a
  229. * VOLK_TERM_LITERAL of xsd:string type, made up of the same string. Such
  230. * terms must have identical serializations.
  231. */
  232. if (UNLIKELY (!term)) return NULL;
  233. VOLK_Term *tmp_term;
  234. void *metadata = NULL;
  235. if (term->type == VOLK_TERM_LT_LITERAL) {
  236. // For LT literals with empty lang tag, convert to a normal xsd:string.
  237. if (strlen (term->lang) == 0)
  238. tmp_term = VOLK_literal_new (term->data, NULL);
  239. else {
  240. tmp_term = VOLK_lt_literal_new (term->data, (char *) term->lang);
  241. memcpy (&metadata, tmp_term->lang, sizeof (metadata));
  242. }
  243. } else if (term->type == VOLK_TERM_LITERAL) {
  244. tmp_term = VOLK_term_new (term->type, term->data, term->datatype);
  245. metadata = tmp_term->datatype;
  246. } else tmp_term = VOLK_term_copy (term);
  247. VOLK_Buffer *sterm;
  248. CALLOC_GUARD (sterm, NULL);
  249. //LOG_TRACE("Effective term being serialized: %s", tmp_term->data);
  250. int rc = tpl_jot (
  251. TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
  252. &tmp_term->type, &tmp_term->data, &metadata);
  253. VOLK_term_free (tmp_term);
  254. if (rc != 0) {
  255. VOLK_buffer_free (sterm);
  256. return NULL;
  257. }
  258. return sterm;
  259. }
  260. VOLK_Key
  261. VOLK_term_hash (const VOLK_Term *term)
  262. {
  263. VOLK_Buffer *buf;
  264. if (UNLIKELY (!term)) buf = BUF_DUMMY;
  265. else buf = VOLK_term_serialize (term);
  266. VOLK_Key key = VOLK_buffer_hash (buf);
  267. VOLK_buffer_free (buf);
  268. return key;
  269. }
  270. void
  271. VOLK_term_free (VOLK_Term *term)
  272. {
  273. if (UNLIKELY (!term)) return;
  274. if (term->type == VOLK_TERM_IRIREF) free (term->iri_info);
  275. free (term->data);
  276. free (term);
  277. }
  278. char *
  279. VOLK_iriref_prefix (const VOLK_Term *iri)
  280. {
  281. if (iri->type != VOLK_TERM_IRIREF) {
  282. log_error ("Term is not a IRI ref type.");
  283. return NULL;
  284. }
  285. // if (iri->iri_info->prefix.size == 0) return NULL;
  286. return strndup (
  287. iri->data + iri->iri_info->prefix.offset,
  288. iri->iri_info->prefix.size);
  289. }
  290. char *
  291. VOLK_iriref_path (const VOLK_Term *iri)
  292. {
  293. if (iri->type != VOLK_TERM_IRIREF) {
  294. log_error ("Term is not a IRI ref type.");
  295. return NULL;
  296. }
  297. // if (iri->iri_info->path.size == 0) return NULL;
  298. return strndup (
  299. iri->data + iri->iri_info->path.offset,
  300. iri->iri_info->path.size);
  301. }
  302. char *
  303. VOLK_iriref_frag (const VOLK_Term *iri)
  304. {
  305. if (iri->type != VOLK_TERM_IRIREF) {
  306. log_error ("Term is not a IRI ref type.");
  307. return NULL;
  308. }
  309. // if (iri->iri_info->frag.size == 0) return NULL;
  310. return strndup (
  311. iri->data + iri->iri_info->frag.offset,
  312. iri->iri_info->frag.size);
  313. }
  314. /*
  315. * Triple API.
  316. */
  317. VOLK_Triple *
  318. VOLK_triple_new(VOLK_Term *s, VOLK_Term *p, VOLK_Term *o)
  319. {
  320. VOLK_Triple *spo = malloc (sizeof (*spo));
  321. if (!spo) return NULL;
  322. if (UNLIKELY (VOLK_triple_init (spo, s, p, o))) {
  323. free (spo);
  324. return NULL;
  325. }
  326. return spo;
  327. }
  328. VOLK_Triple *
  329. VOLK_triple_new_from_btriple (const VOLK_BufferTriple *sspo)
  330. {
  331. VOLK_Triple *spo = malloc (sizeof (*spo));
  332. if (!spo) return NULL;
  333. spo->s = VOLK_term_new_from_buffer (sspo->s);
  334. spo->p = VOLK_term_new_from_buffer (sspo->p);
  335. spo->o = VOLK_term_new_from_buffer (sspo->o);
  336. return spo;
  337. }
  338. VOLK_BufferTriple *
  339. VOLK_triple_serialize (const VOLK_Triple *spo)
  340. {
  341. VOLK_BufferTriple *sspo = malloc (sizeof (*sspo));
  342. if (!sspo) return NULL;
  343. sspo->s = VOLK_term_serialize (spo->s);
  344. sspo->p = VOLK_term_serialize (spo->p);
  345. sspo->o = VOLK_term_serialize (spo->o);
  346. return sspo;
  347. }
  348. VOLK_rc
  349. VOLK_triple_init (VOLK_Triple *spo, VOLK_Term *s, VOLK_Term *p, VOLK_Term *o)
  350. {
  351. /* FIXME TRP_DUMMY is a problem here.
  352. if (! VOLK_IS_IRI (s) && s->type != VOLK_TERM_BNODE) {
  353. log_error ("Subject is not of a valid term type: %d", s->type);
  354. return VOLK_VALUE_ERR;
  355. }
  356. if (! VOLK_IS_IRI (p)) {
  357. log_error ("Predicate is not of a valid term type: %d", p->type);
  358. return VOLK_VALUE_ERR;
  359. }
  360. */
  361. spo->s = s;
  362. spo->p = p;
  363. spo->o = o;
  364. return VOLK_OK;
  365. }
  366. void
  367. VOLK_triple_done (VOLK_Triple *spo)
  368. {
  369. if (UNLIKELY (!spo)) return;
  370. VOLK_term_free (spo->s);
  371. VOLK_term_free (spo->p);
  372. VOLK_term_free (spo->o);
  373. }
  374. void
  375. VOLK_triple_free (VOLK_Triple *spo)
  376. {
  377. if (UNLIKELY (!spo)) return;
  378. VOLK_term_free (spo->s);
  379. VOLK_term_free (spo->p);
  380. VOLK_term_free (spo->o);
  381. free (spo);
  382. }
  383. /*
  384. * Multi-add functions.
  385. */
  386. VOLK_TermSet *
  387. VOLK_term_set_new ()
  388. {
  389. // Capacity of 4 is an arbitrary guess.
  390. VOLK_TermSet *ts = hashmap_new (
  391. sizeof (KeyedTerm), 4, VOLK_HASH_SEED, 0,
  392. tset_hash_fn, tset_cmp_fn, tset_free_fn, NULL);
  393. if (UNLIKELY (hashmap_oom (ts))) return NULL;
  394. return ts;
  395. }
  396. VOLK_rc
  397. VOLK_term_set_add (VOLK_TermSet *ts, VOLK_Term *term, VOLK_Term **existing)
  398. {
  399. VOLK_Hash key = VOLK_term_hash (term);
  400. KeyedTerm entry_s = {.key=key, .term=term};
  401. const KeyedTerm *ex = hashmap_get (ts, &entry_s);
  402. if (ex) {
  403. if (existing) *existing = ex->term;
  404. return VOLK_NOACTION;
  405. }
  406. hashmap_set (ts, &entry_s);
  407. if (hashmap_oom (ts)) return VOLK_MEM_ERR;
  408. return VOLK_OK;
  409. }
  410. const VOLK_Term *
  411. VOLK_term_set_get (VOLK_TermSet *ts, VOLK_Key key)
  412. {
  413. const KeyedTerm *entry = hashmap_get (ts, &(KeyedTerm){.key=key});
  414. if (entry) LOG_TRACE("ID found for key %lx: %s", key, entry->term->data);
  415. else LOG_TRACE("No ID found for key %lx.", key);
  416. return (entry) ? entry->term : NULL;
  417. }
  418. VOLK_rc
  419. VOLK_term_set_next (VOLK_TermSet *ts, size_t *i, VOLK_Term **term)
  420. {
  421. KeyedTerm *kt = NULL;
  422. if (!hashmap_iter (ts, i, (void **)&kt)) return VOLK_END;
  423. if (term) *term = kt->term;
  424. return VOLK_OK;
  425. }
  426. void
  427. VOLK_term_set_free (VOLK_TermSet *ts)
  428. {
  429. if (UNLIKELY (!ts)) return;
  430. hashmap_free (ts);
  431. }
  432. size_t
  433. VOLK_term_set_size (VOLK_TermSet *ts)
  434. { return hashmap_count (ts); }
  435. VOLK_LinkMap *
  436. VOLK_link_map_new (const VOLK_Term *linked_term, VOLK_LinkType type)
  437. {
  438. VOLK_LinkMap *lm;
  439. MALLOC_GUARD (lm, NULL);
  440. lm->type = type;
  441. lm->links = hashmap_new (
  442. sizeof (Link), 0, VOLK_HASH_SEED, 0,
  443. link_map_hash_fn, link_map_cmp_fn, link_map_free_fn, NULL);
  444. if (!linked_term) {
  445. log_error ("term must not be NULL.");
  446. free (lm);
  447. return NULL;
  448. }
  449. lm->linked_t = VOLK_term_copy (linked_term);
  450. return lm;
  451. }
  452. void
  453. VOLK_link_map_free (VOLK_LinkMap *lm)
  454. {
  455. hashmap_free (lm->links);
  456. VOLK_term_free (lm->linked_t);
  457. free (lm);
  458. }
  459. VOLK_LinkType
  460. VOLK_link_map_type (const VOLK_LinkMap *map)
  461. { return map->type; }
  462. // TODO Memory error handling.
  463. VOLK_rc
  464. VOLK_link_map_add (
  465. VOLK_LinkMap *lmap, VOLK_Term *term, VOLK_TermSet *tset)
  466. {
  467. // Keyed term to look up the link term and insert it, if necessary.
  468. KeyedTerm entry_s = {.key=VOLK_term_hash (term), .term=term};
  469. const Link *ex = hashmap_get (lmap->links, &(Link){.term=&entry_s});
  470. if (ex) {
  471. // Add terms one by one to the existing term set.
  472. LOG_TRACE(
  473. "Linking term %s exists. Adding individual terms.",
  474. ex->term->term->data);
  475. size_t i = 0;
  476. KeyedTerm *kt;
  477. while (hashmap_iter (tset, &i, (void **)&kt)) {
  478. LOG_TRACE(
  479. "Adding term %s to link %s",
  480. kt->term->data, ex->term->term->data);
  481. if (hashmap_get (ex->tset, kt))
  482. // Term already exist, free the new one and move on.
  483. VOLK_term_free (kt->term);
  484. else
  485. // Insert KeyedTerm, the term set now owns the underlying term.
  486. hashmap_set (ex->tset, kt);
  487. }
  488. // Free link term that hasn't been used.
  489. VOLK_term_free (term);
  490. } else {
  491. // Add the new term and the termset wholesale.
  492. LOG_TRACE("Adding new linking term %s.", term->data);
  493. // Allocate inserted member on heap, it will be owned by the map.
  494. KeyedTerm *ins;
  495. MALLOC_GUARD (ins, VOLK_MEM_ERR);
  496. memcpy (ins, &entry_s, sizeof (entry_s));
  497. Link link = {.term=ins, .tset=tset};
  498. hashmap_set (lmap->links, &link);
  499. }
  500. return VOLK_OK;
  501. }
  502. VOLK_LinkMapIterator *
  503. VOLK_link_map_iter_new (const VOLK_LinkMap *lmap)
  504. {
  505. VOLK_LinkMapIterator *it;
  506. CALLOC_GUARD (it, NULL);
  507. it->map = lmap;
  508. return it;
  509. }
  510. // This leaves the link and link map references intact.
  511. void
  512. VOLK_link_map_iter_free (VOLK_LinkMapIterator *it) { free (it); }
  513. VOLK_rc
  514. VOLK_link_map_next (
  515. VOLK_LinkMapIterator *it, VOLK_Term **lt, VOLK_TermSet **ts)
  516. {
  517. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  518. return VOLK_END;
  519. *lt = it->link->term->term;
  520. *ts = it->link->tset;
  521. return VOLK_OK;
  522. }
  523. // TODO dismantle if the only triple generator is for the graph.
  524. VOLK_rc
  525. VOLK_link_map_triples (
  526. VOLK_LinkMapIterator *it, VOLK_Triple *spo)
  527. {
  528. // Assign external (related) term.
  529. if (it->map->type == VOLK_LINK_INBOUND)
  530. spo->o = it->map->linked_t;
  531. else if (it->map->type == VOLK_LINK_OUTBOUND)
  532. spo->s = it->map->linked_t;
  533. else spo->p = it->map->linked_t;
  534. KeyedTerm *kt;
  535. // If we are already handling a link, continue the internal loop.
  536. if (it->link) goto int_loop;
  537. ext_loop:
  538. // Advance external counter and start new internal loop.
  539. it->j = 0;
  540. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  541. return VOLK_END;
  542. int_loop:
  543. // If end of the term set is reached, start with a new linking term.
  544. if (!hashmap_iter (it->link->tset, &it->j, (void **)&kt)) goto ext_loop;
  545. // Continue pulling from term set.
  546. // Assign linking term.
  547. if (it->map->type == VOLK_LINK_EDGE) spo->s = it->link->term->term;
  548. else spo->p = it->link->term->term;
  549. // Assign term in term set.
  550. if (it->map->type == VOLK_LINK_INBOUND) spo->s = kt->term;
  551. else spo->o = kt->term;
  552. return VOLK_OK;
  553. }
  554. /*
  555. * Static functions.
  556. */
  557. static VOLK_rc
  558. term_init (
  559. VOLK_Term *term, VOLK_TermType type,
  560. const char *data, void *metadata)
  561. {
  562. // Exit early if environment is not initialized.
  563. // EXCEPT for IRIRef which is used inside of VOLK_init().
  564. if (!VOLK_IS_INIT && type != VOLK_TERM_IRIREF)
  565. return VOLK_ENV_ERR;
  566. // Undefined type. Make quick work of it.
  567. if (type == VOLK_TERM_UNDEFINED) {
  568. term->type = type;
  569. if (data) {
  570. term->data = malloc (strlen (data) + 1);
  571. if (UNLIKELY (!term->data)) return VOLK_MEM_ERR;
  572. strcpy (term->data, data);
  573. }
  574. return VOLK_OK;
  575. }
  576. if (type < MIN_VALID_TYPE || type > MAX_VALID_TYPE) {
  577. log_error ("%d is not a valid term type.", type);
  578. return VOLK_VALUE_ERR;
  579. }
  580. term->type = type;
  581. if (data) {
  582. // Validate IRI.
  583. if (term->type == VOLK_TERM_IRIREF) {
  584. char *fquri = (char *) data;
  585. if (strpbrk (fquri, invalid_uri_chars) != NULL) {
  586. log_warn (
  587. "Characters %s are not valid in a URI. Got: %s\n",
  588. invalid_uri_chars, fquri);
  589. #if 0
  590. // TODO This causes W3C TTL test #29 to fail. Remove?
  591. return VOLK_VALUE_ERR;
  592. #endif
  593. }
  594. // Capture interesting IRI parts.
  595. MatchCoord matches[7] = {}; // Initialize all to 0.
  596. if (UNLIKELY (parse_iri (fquri, matches) != VOLK_OK)) {
  597. log_error ("Error matching URI pattern.");
  598. return VOLK_VALUE_ERR;
  599. }
  600. MALLOC_GUARD (term->iri_info, VOLK_MEM_ERR);
  601. term->iri_info->prefix = matches[1];
  602. term->iri_info->path = matches[4];
  603. term->iri_info->frag = matches[6];
  604. }
  605. term->data = strdup (data);
  606. } else {
  607. // No data. Make up a random UUID or URI if allowed.
  608. if (type == VOLK_TERM_IRIREF || type == VOLK_TERM_BNODE) {
  609. uuid_t uuid;
  610. uuid_generate_random (uuid);
  611. uuid_str_t uuid_str;
  612. uuid_unparse_lower (uuid, uuid_str);
  613. if (type == VOLK_TERM_IRIREF) {
  614. term->data = malloc (UUID4_URN_SIZE);
  615. snprintf (
  616. term->data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);
  617. MALLOC_GUARD (term->iri_info, VOLK_MEM_ERR);
  618. // Allocate IRI match patterns manually.
  619. term->iri_info->prefix.offset = 0;
  620. term->iri_info->prefix.size = 4;
  621. term->iri_info->path.offset = 4;
  622. term->iri_info->path.size = UUIDSTR_SIZE + 6;
  623. term->iri_info->frag.offset = 0;
  624. term->iri_info->frag.size = 0;
  625. } else term->data = strdup (uuid_str);
  626. } else {
  627. log_error ("No data provided for term.");
  628. return VOLK_VALUE_ERR;
  629. }
  630. }
  631. if (term->type == VOLK_TERM_LT_LITERAL) {
  632. if (!metadata) {
  633. log_warn ("Lang tag is NULL. Creating a non-tagged literal.");
  634. term->type = VOLK_TERM_LITERAL;
  635. } else {
  636. // FIXME metadata should be const all across.
  637. char *lang_str = (char *) metadata;
  638. LOG_TRACE("Lang string: '%s'", lang_str);
  639. // Lang tags longer than 7 characters will be truncated.
  640. strncpy(term->lang, lang_str, sizeof (term->lang) - 1);
  641. if (strlen (term->lang) < 1) {
  642. log_error ("Lang tag cannot be an empty string.");
  643. return VOLK_VALUE_ERR;
  644. }
  645. term->lang[7] = '\0';
  646. }
  647. }
  648. if (term->type == VOLK_TERM_LITERAL) {
  649. term->datatype = metadata;
  650. if (! term->datatype) term->datatype = VOLK_default_datatype;
  651. LOG_TRACE("Storing data type: %s", term->datatype->data);
  652. if (term->datatype->type != VOLK_TERM_IRIREF) {
  653. log_error (
  654. "Literal data type is not an IRI: %s",
  655. term->datatype->data);
  656. return VOLK_VALUE_ERR;
  657. }
  658. VOLK_Term *ex = NULL;
  659. VOLK_term_set_add (VOLK_term_cache, term->datatype, &ex);
  660. if (ex && ex != term->datatype) {
  661. // Replace datatype handle with the one in term cache, and free
  662. // the new one.
  663. if (term->datatype != VOLK_default_datatype)
  664. VOLK_term_free (term->datatype);
  665. term->datatype = ex;
  666. }
  667. //LOG_TRACE("Datatype address: %p", term->datatype);
  668. LOG_TRACE("Datatype hash: %lx", VOLK_term_hash (term->datatype));
  669. } else if (term->type == VOLK_TERM_BNODE) {
  670. // TODO This is not usable for global skolemization.
  671. term->bnode_id = VOLK_HASH (
  672. term->data, strlen (term->data) + 1, VOLK_HASH_SEED);
  673. }
  674. return VOLK_OK;
  675. }
  676. /**
  677. * @brief scan an IRI string and parse IRI parts.
  678. *
  679. * Experimental replacement of a regex engine for better performance.
  680. *
  681. * Slightly adapted from regex on
  682. * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
  683. * parts of the IRI.
  684. *
  685. * Reference regex and group numbering:
  686. * ^((?([^:/?#]+):)?(?//([^/?#]*))?)((?[^?#]*)(?\?([^#]*))?(?#(.*))?)
  687. * 1 2 3 4 5 6
  688. *
  689. * Capturing groups:
  690. *
  691. * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
  692. * #1: Prefix (http://example.org)
  693. * #2: Scheme (http)
  694. * #3: Authority (example.org)
  695. * #4: Path, including query and fragment (/123/456/?query=blah#frag)
  696. * #5: Query (query=blah)
  697. * #6: Fragment (frag)
  698. *
  699. *
  700. * @param iri_str[in] IRI string to parse.
  701. *
  702. * @param match_coord_t[out] coord Coordinates to be stored. This must be a
  703. * pre-allocated array of at least 7 elements.
  704. *
  705. * The first size_t of each element stores the relative position of a match,
  706. * and the second one stores the length of the match. A length of 0 indicates
  707. * no match.
  708. */
  709. static VOLK_rc
  710. parse_iri (char *iri_str, MatchCoord coord[]) {
  711. char *cur = iri_str;
  712. size_t iri_len = strlen (iri_str);
  713. MatchCoord tmp = {}; // Temporary storage for capture groups
  714. // Redundant if only called by term_init.
  715. // memset (coord, 0, sizeof(*coord));
  716. //LOG_DEBUG("Parsing IRI: %s", iri_str);
  717. // #2: ([^:/?#]+)
  718. while (
  719. *cur != ':' && *cur != '/' && *cur != '?'
  720. && *cur != '#' && *cur != '\0') {
  721. tmp.size++;
  722. cur++;
  723. }
  724. // Non-capturing: (?([^:/?#]+):)?
  725. if (tmp.size > 0 && *cur == ':') {
  726. // Got capture groups #2 and #3. Store them.
  727. coord[2].offset = 0;
  728. coord[2].size = tmp.size;
  729. cur++;
  730. //LOG_DEBUG("Group #2: %lu, %lu", coord[2].offset, coord[2].size);
  731. } else cur = iri_str; // Backtrack if no match.
  732. // Non-capturing: (?//([^/?#]*))?
  733. if (*cur == '/' && *(cur + 1) == '/') {
  734. cur += 2;
  735. tmp.offset = cur - iri_str;
  736. tmp.size = 0;
  737. // #3: ([^/?#]*)
  738. while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
  739. tmp.size++;
  740. cur++;
  741. }
  742. coord[3].offset = tmp.offset;
  743. coord[3].size = tmp.size;
  744. //LOG_DEBUG("Group #3: %lu, %lu", coord[3].offset, coord[3].size);
  745. }
  746. // Capture group 1.
  747. coord[1].offset = 0;
  748. coord[1].size = cur - iri_str;
  749. //LOG_DEBUG("Group #1: %lu, %lu", coord[1].offset, coord[1].size);
  750. tmp.offset = cur - iri_str;
  751. tmp.size = 0;
  752. coord[4].offset = tmp.offset;
  753. coord[4].size = iri_len - tmp.offset;
  754. //LOG_DEBUG("Group #4: %lu, %lu", coord[4].offset, coord[4].size);
  755. // Non-capturing: (?[^?#]*)
  756. while (*cur != '?' && *cur != '#' && *cur != '\0') {
  757. tmp.size++;
  758. cur++;
  759. }
  760. // Non-capturing: (?\?([^#]*))
  761. if (*cur == '?') {
  762. // 5: ([^#]*)
  763. tmp.offset = ++cur - iri_str;
  764. tmp.size = 0;
  765. while (*cur != '#' && *cur != '\0') {
  766. tmp.size++;
  767. cur++;
  768. }
  769. if (tmp.size > 0) {
  770. // Got capture group #5.
  771. coord[5].offset = tmp.offset;
  772. coord[5].size = tmp.size;
  773. //LOG_DEBUG("Group #5: %lu, %lu", coord[5].offset, coord[5].size);
  774. }
  775. }
  776. // Non-capturing: (?#(.*))?
  777. if (*cur == '#') {
  778. // #6: (.*)
  779. coord[6].offset = ++cur - iri_str;
  780. coord[6].size = iri_str + iri_len - cur;
  781. //LOG_DEBUG("Group #6: %lu, %lu", coord[6].offset, coord[6].size);
  782. }
  783. coord[0].offset = 0;
  784. coord[0].size = iri_len;
  785. //LOG_DEBUG("Full match: %lu, %lu", coord[0].offset, coord[0].size);
  786. return VOLK_OK;
  787. }
  788. /*
  789. * Extern inline functions.
  790. */
  791. VOLK_Key VOLK_term_hash (const VOLK_Term *term);
  792. VOLK_Term *VOLK_iriref_new (const char *data);
  793. VOLK_Term *VOLK_iriref_new_ns (const char *data);
  794. VOLK_Term *VOLK_literal_new (const char *data, VOLK_Term *datatype);
  795. VOLK_Term *VOLK_lt_literal_new (const char *data, char *lang);
  796. VOLK_Term *VOLK_bnode_new (const char *data);
  797. bool VOLK_term_equals (const VOLK_Term *term1, const VOLK_Term *term2);
  798. VOLK_Term *VOLK_triple_pos (const VOLK_Triple *trp, VOLK_TriplePos n);
  799. VOLK_Key VOLK_triple_hash (const VOLK_Triple *trp);