term.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025
  1. #include "tpl.h"
  2. #include "term.h"
  /**
   * @brief tpl format string used to pack and unpack a term.
   *
   * Fields, in order: `c` - term type (char); `s` - term data (string);
   * `U` - type metadata, i.e. a `void *` cast to an 8-byte unsigned integer.
   */
  #define TERM_PACK_FMT "csU"

  /// Highest term type value. Meant for type validation.
  #define MAX_VALID_TERM_TYPE LSUP_TERM_BNODE
  10. /*
  11. * Data structures.
  12. */
  /**
   * @brief Coordinates of one sub-match in the results of IRI parsing.
   *
   * A size of 0 is used by the parser to indicate "no match".
   */
  struct match_coord_t {
      /// Offset of the match from the start of the parsed string.
      size_t offset;
      /// Length of the match.
      size_t size;
  };
  typedef struct match_coord_t MatchCoord;
  18. /// Matching sub-patterns for IRI parts.
  19. struct iri_info_t {
  20. LSUP_NSMap * nsm; ///< NSM handle for prefixed IRI.
  21. MatchCoord prefix; ///< URI prefix (scheme + authority).
  22. MatchCoord path; ///< URI path (including fragment).
  23. MatchCoord frag; ///< URI fragment.
  24. };
  25. /// Key-term pair in term set.
  26. typedef struct keyed_term {
  27. LSUP_Key key; ///< Key (hash) of the term.
  28. LSUP_Term * term; ///< Term handle.
  29. } KeyedTerm;
  30. /** @brief Single link between a term and a term set.
  31. *
  32. * This link is not qualified and may not be used by itself. It belongs
  33. * in a #LSUP_LinkMap which qualifies all links of the same type.
  34. */
  35. typedef struct link {
  36. KeyedTerm * term; ///< Linked term.
  37. LSUP_TermSet * tset; ///< Term set linked to the term.
  38. } Link;
  39. /// Opaque link map iterator.
  40. struct link_map_iter {
  41. const LSUP_LinkMap *map; ///< Link map to iterate.
  42. size_t i; ///< Linking term loop cursor.
  43. size_t j; ///< Term set loop cursor.
  44. LSUP_Term * ext; ///< External link to look for connections.
  45. Link * link; ///< Current link being retrieved.
  46. };
  47. /*
  48. * A link map is thus nested:
  49. *
  50. * - A link map contains a hash map of Link instances (link).
  51. * - Each Link contains a KeyedTerm (term) and a TermSet (tset).
  52. * - Each term set is a hash map of KeyedTerm instances.
  53. * - Each KeyedTerm contains a Term and its hash.
  54. */
  55. typedef struct link_map {
  56. LSUP_LinkType type; ///< Link type.
  57. struct hashmap * links; ///< Map of #Link instances.
  58. } LSUP_LinkMap;
  59. /*
  60. * External variables.
  61. */
  62. uint32_t LSUP_default_dtype_key = 0;
  63. LSUP_Term *LSUP_default_datatype = NULL;
  64. LSUP_TermSet *LSUP_term_cache = NULL;
  65. /*
  66. * Static variables.
  67. */
  /// Set of characters that are not allowed in a URI string.
  static const char invalid_uri_chars[] = "<>\" {}|\\^`";
  70. /*
  71. * Static prototypes.
  72. */
  73. static LSUP_rc
  74. term_init (
  75. LSUP_Term *term, LSUP_TermType type, const char *data, void *metadata);
  76. /*
  77. * Term set callbacks.
  78. */
  79. static uint64_t
  80. tset_hash_fn (
  81. const void *item, uint64_t seed0, uint64_t seed1)
  82. { return ((const KeyedTerm *) item)->key; }
  83. static int
  84. tset_cmp_fn (const void *a, const void *b, void *udata)
  85. {
  86. return
  87. ((const KeyedTerm *) a)->key -
  88. ((const KeyedTerm *) b)->key;
  89. }
  90. static void
  91. tset_free_fn (void *item)
  92. { LSUP_term_free (((KeyedTerm *) item)->term); }
  93. /*
  94. * Link map callbacks.
  95. */
  96. static uint64_t
  97. link_map_hash_fn (
  98. const void *item, uint64_t seed0, uint64_t seed1)
  99. { return ((const Link *)item)->term->key; }
  100. static int
  101. link_map_cmp_fn (const void *a, const void *b, void *udata)
  102. {
  103. return
  104. ((const Link *)a)->term->key -
  105. ((const Link *)b)->term->key;
  106. }
  107. static void
  108. link_map_free_fn (void *item)
  109. {
  110. Link *link = item;
  111. LSUP_term_free (link->term->term);
  112. free (link->term);
  113. LSUP_term_set_free (link->tset);
  114. }
  115. static LSUP_rc parse_iri (char *iri, MatchCoord coords[]);
  116. /*
  117. * Term API.
  118. */
  119. LSUP_Term *
  120. LSUP_term_new (
  121. LSUP_TermType type, const char *data, void *metadata)
  122. {
  123. LSUP_Term *term;
  124. CALLOC_GUARD (term, NULL);
  125. // If undefined, just set the type.
  126. if (type == LSUP_TERM_UNDEFINED) term->type = type;
  127. else if (UNLIKELY (term_init (
  128. term, type, data, metadata) != LSUP_OK)) {
  129. free (term);
  130. return NULL;
  131. }
  132. return term;
  133. }
  134. LSUP_Term *
  135. LSUP_term_copy (const LSUP_Term *src)
  136. {
  137. void *metadata = NULL;
  138. if (LSUP_IS_IRI (src))
  139. metadata = (void *) LSUP_iriref_nsm (src);
  140. else if (src->type == LSUP_TERM_LITERAL)
  141. metadata = (void *) src->datatype;
  142. else if (src->type == LSUP_TERM_LT_LITERAL) {
  143. metadata = (void *) src->lang;
  144. }
  145. return LSUP_term_new (src->type, src->data, metadata);
  146. }
  147. LSUP_Term *
  148. LSUP_term_new_from_buffer (const LSUP_Buffer *sterm)
  149. {
  150. if (UNLIKELY (!sterm)) return NULL;
  151. LSUP_Term *term = NULL;
  152. LSUP_TermType type = LSUP_TERM_UNDEFINED;
  153. char *data = NULL;
  154. void *metadata;
  155. tpl_node *tn;
  156. tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
  157. if (UNLIKELY (!tn)) goto finally;
  158. if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
  159. log_error ("Error loading serialized term.");
  160. goto finally;
  161. }
  162. if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
  163. log_error ("Error unpacking serialized term.");
  164. goto finally;
  165. }
  166. if (type == LSUP_TERM_LT_LITERAL)
  167. term = LSUP_lt_literal_new (data, (char *)&metadata);
  168. else term = LSUP_term_new (type, data, metadata);
  169. finally:
  170. tpl_free (tn);
  171. free (data);
  172. return term;
  173. }
  174. LSUP_Term *
  175. LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
  176. {
  177. if (! LSUP_IS_IRI (iri)) {
  178. log_error ("Provided path is not an IRI.");
  179. return NULL;
  180. }
  181. if (! LSUP_IS_IRI (root)) {
  182. log_error ("Provided root is not an IRI.");
  183. return NULL;
  184. }
  185. char *data, *pfx = LSUP_iriref_prefix (iri);
  186. if (pfx) data = iri->data;
  187. else if (iri->data[0] == '/') {
  188. free (pfx);
  189. pfx = LSUP_iriref_prefix (root);
  190. data = malloc (strlen (iri->data) + strlen (pfx) + 1);
  191. if (!data) return NULL;
  192. sprintf (data, "%s%s", pfx, iri->data);
  193. } else {
  194. data = malloc (strlen (iri->data) + strlen (root->data) + 1);
  195. if (!data) return NULL;
  196. sprintf (data, "%s%s", root->data, iri->data);
  197. }
  198. free (pfx);
  199. LSUP_Term *ret = LSUP_iriref_new (data, NULL);
  200. if (data != iri->data) free (data);
  201. return ret;
  202. }
  203. LSUP_Term *
  204. LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri)
  205. {
  206. if (! LSUP_IS_IRI (iri)) {
  207. log_error ("Provided path is not an IRI.");
  208. return NULL;
  209. }
  210. if (! LSUP_IS_IRI (root)) {
  211. log_error ("Provided root is not an IRI.");
  212. return NULL;
  213. }
  214. size_t offset = (
  215. strstr (iri->data, root->data) == iri->data ?
  216. strlen (root->data) : 0);
  217. return LSUP_iriref_new (iri->data + offset, LSUP_iriref_nsm (iri));
  218. }
  219. LSUP_Buffer *
  220. LSUP_term_serialize (const LSUP_Term *term)
  221. {
  222. /*
  223. * In serializing a term, the fact that two terms of different types may
  224. * be semantically identical must be taken into account. Specifically, a
  225. * namespace-prefixed IRI ref is identical to its fully qualified version,
  226. * and a LSUP_TERM_LT_LITERAL with no language tag is identical to a
  227. * LSUP_TERM_LITERAL of xsd:string type, made up of the same string. Such
  228. * terms must have identical serializations.
  229. */
  230. if (UNLIKELY (!term)) return NULL;
  231. LSUP_Term *tmp_term;
  232. void *metadata = NULL;
  233. if (term->type == LSUP_TERM_NS_IRIREF) {
  234. // For IRI refs, simply serialize the FQ version of the term.
  235. char *fq_uri;
  236. if (LSUP_nsmap_normalize_uri (
  237. term->iri_info->nsm, term->data, &fq_uri
  238. ) != LSUP_OK) return NULL;
  239. tmp_term = LSUP_iriref_new (fq_uri, NULL);
  240. free (fq_uri);
  241. } else if (term->type == LSUP_TERM_LT_LITERAL) {
  242. // For LT literals with empty lang tag, convert to a normal xsd:string.
  243. if (strlen (term->lang) == 0)
  244. tmp_term = LSUP_literal_new (term->data, NULL);
  245. else tmp_term = LSUP_lt_literal_new (term->data, (char *) term->lang);
  246. } else tmp_term = LSUP_term_new (
  247. term->type, term->data, (void *) term->datatype);
  248. // "datatype" can be anything here since it's cast to void *.
  249. // metadata field is ignored for IRI ref.
  250. if (tmp_term->type == LSUP_TERM_LITERAL)
  251. metadata = tmp_term->datatype;
  252. else if (tmp_term->type == LSUP_TERM_LT_LITERAL)
  253. memcpy (&metadata, tmp_term->lang, sizeof (metadata));
  254. LSUP_Buffer *sterm;
  255. MALLOC_GUARD (sterm, NULL);
  256. //log_trace ("Effective term being serialized: %s", tmp_term->data);
  257. int rc = tpl_jot (
  258. TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
  259. &tmp_term->type, &tmp_term->data, &metadata);
  260. LSUP_term_free (tmp_term);
  261. if (rc != 0) {
  262. LSUP_buffer_free (sterm);
  263. return NULL;
  264. }
  265. return sterm;
  266. }
  267. LSUP_Key
  268. LSUP_term_hash (const LSUP_Term *term)
  269. {
  270. LSUP_Buffer *buf;
  271. if (UNLIKELY (!term)) buf = BUF_DUMMY;
  272. else buf = LSUP_term_serialize (term);
  273. LSUP_Key key = LSUP_buffer_hash (buf);
  274. LSUP_buffer_free (buf);
  275. return key;
  276. }
  277. void
  278. LSUP_term_free (LSUP_Term *term)
  279. {
  280. if (UNLIKELY (!term)) return;
  281. if (LSUP_IS_IRI (term)) free (term->iri_info);
  282. free (term->data);
  283. free (term);
  284. }
  285. LSUP_NSMap *
  286. LSUP_iriref_nsm (const LSUP_Term *iri)
  287. {
  288. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  289. log_error ("Term is not a IRI ref type.");
  290. return NULL;
  291. }
  292. return iri->iri_info->nsm;
  293. }
  294. char *
  295. LSUP_iriref_prefix (const LSUP_Term *iri)
  296. {
  297. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  298. log_error ("Term is not a IRI ref type.");
  299. return NULL;
  300. }
  301. if (iri->iri_info->prefix.size == 0) return NULL;
  302. return strndup (
  303. iri->data + iri->iri_info->prefix.offset,
  304. iri->iri_info->prefix.size);
  305. }
  306. char *
  307. LSUP_iriref_path (const LSUP_Term *iri)
  308. {
  309. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  310. log_error ("Term is not a IRI ref type.");
  311. return NULL;
  312. }
  313. if (iri->iri_info->path.size == 0) return NULL;
  314. return strndup (
  315. iri->data + iri->iri_info->path.offset,
  316. iri->iri_info->path.size);
  317. }
  318. char *
  319. LSUP_iriref_frag (const LSUP_Term *iri)
  320. {
  321. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  322. log_error ("Term is not a IRI ref type.");
  323. return NULL;
  324. }
  325. if (iri->iri_info->frag.size == 0) return NULL;
  326. return strndup (
  327. iri->data + iri->iri_info->frag.offset,
  328. iri->iri_info->frag.size);
  329. }
  330. /*
  331. * Triple API.
  332. */
  333. LSUP_Triple *
  334. LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  335. {
  336. LSUP_Triple *spo = malloc (sizeof (*spo));
  337. if (!spo) return NULL;
  338. if (UNLIKELY (LSUP_triple_init (spo, s, p, o))) {
  339. free (spo);
  340. return NULL;
  341. }
  342. return spo;
  343. }
  344. LSUP_Triple *
  345. LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo)
  346. {
  347. LSUP_Triple *spo = malloc (sizeof (*spo));
  348. if (!spo) return NULL;
  349. spo->s = LSUP_term_new_from_buffer (sspo->s);
  350. spo->p = LSUP_term_new_from_buffer (sspo->p);
  351. spo->o = LSUP_term_new_from_buffer (sspo->o);
  352. return spo;
  353. }
  354. LSUP_BufferTriple *
  355. LSUP_triple_serialize (const LSUP_Triple *spo)
  356. {
  357. LSUP_BufferTriple *sspo = malloc (sizeof (*sspo));
  358. if (!sspo) return NULL;
  359. sspo->s = LSUP_term_serialize (spo->s);
  360. sspo->p = LSUP_term_serialize (spo->p);
  361. sspo->o = LSUP_term_serialize (spo->o);
  362. return sspo;
  363. }
  364. LSUP_rc
  365. LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  366. {
  367. /* FIXME TRP_DUMMY is a problem here.
  368. if (! LSUP_IS_IRI (s) && s->type != LSUP_TERM_BNODE) {
  369. log_error ("Subject is not of a valid term type: %d", s->type);
  370. return LSUP_VALUE_ERR;
  371. }
  372. if (! LSUP_IS_IRI (p)) {
  373. log_error ("Predicate is not of a valid term type: %d", p->type);
  374. return LSUP_VALUE_ERR;
  375. }
  376. */
  377. spo->s = s;
  378. spo->p = p;
  379. spo->o = o;
  380. return LSUP_OK;
  381. }
  382. void
  383. LSUP_triple_done (LSUP_Triple *spo)
  384. {
  385. if (UNLIKELY (!spo)) return;
  386. LSUP_term_free (spo->s);
  387. LSUP_term_free (spo->p);
  388. LSUP_term_free (spo->o);
  389. }
  390. void
  391. LSUP_triple_free (LSUP_Triple *spo)
  392. {
  393. if (UNLIKELY (!spo)) return;
  394. LSUP_term_free (spo->s);
  395. LSUP_term_free (spo->p);
  396. LSUP_term_free (spo->o);
  397. free (spo);
  398. }
  399. /*
  400. * Multi-add functions.
  401. */
  402. LSUP_TermSet *
  403. LSUP_term_set_new ()
  404. {
  405. // Capacity of 4 is an arbitrary guess.
  406. LSUP_TermSet *ts = hashmap_new (
  407. sizeof (KeyedTerm), 4, LSUP_HASH_SEED, 0,
  408. tset_hash_fn, tset_cmp_fn, tset_free_fn, NULL);
  409. if (UNLIKELY (hashmap_oom (ts))) return NULL;
  410. return ts;
  411. }
  412. LSUP_rc
  413. LSUP_term_set_add (LSUP_TermSet *ts, LSUP_Term *term, LSUP_Term **existing)
  414. {
  415. LSUP_Hash key = LSUP_term_hash (term);
  416. KeyedTerm entry_s = {.key=key, .term=term};
  417. KeyedTerm *ex = hashmap_get (ts, &entry_s);
  418. if (ex) {
  419. if (existing) *existing = ex->term;
  420. return LSUP_NOACTION;
  421. }
  422. hashmap_set (ts, &entry_s);
  423. if (hashmap_oom (ts)) return LSUP_MEM_ERR;
  424. return LSUP_OK;
  425. }
  426. const LSUP_Term *
  427. LSUP_term_set_get (LSUP_TermSet *ts, LSUP_Key key)
  428. {
  429. KeyedTerm *entry = hashmap_get (ts, &(KeyedTerm){.key=key});
  430. if (entry) log_trace ("ID found for key %lx: %s", key, entry->term->data);
  431. else log_trace ("No ID found for key %lx.", key);
  432. return (entry) ? entry->term : NULL;
  433. }
  434. LSUP_rc
  435. LSUP_term_set_next (LSUP_TermSet *ts, size_t *i, LSUP_Term **term)
  436. {
  437. KeyedTerm *kt = NULL;
  438. if (!hashmap_iter (ts, i, (void **)&kt)) return LSUP_END;
  439. if (term) *term = kt->term;
  440. return LSUP_OK;
  441. }
  442. void
  443. LSUP_term_set_free (LSUP_TermSet *ts)
  444. { hashmap_free (ts); }
  445. LSUP_LinkMap *
  446. LSUP_link_map_new (LSUP_LinkType type)
  447. {
  448. LSUP_LinkMap *cm;
  449. MALLOC_GUARD (cm, NULL);
  450. cm->type = type;
  451. cm->links = hashmap_new (
  452. sizeof (Link), 0, LSUP_HASH_SEED, 0,
  453. link_map_hash_fn, link_map_cmp_fn, link_map_free_fn, NULL);
  454. return cm;
  455. }
  456. void
  457. LSUP_link_map_free (LSUP_LinkMap *cm)
  458. {
  459. hashmap_free (cm->links);
  460. free (cm);
  461. }
  462. LSUP_LinkType
  463. LSUP_link_map_type (const LSUP_LinkMap *map)
  464. { return map->type; }
  465. // TODO Memory error handling.
  466. LSUP_rc
  467. LSUP_link_map_add (
  468. LSUP_LinkMap *cmap, LSUP_Term *term, LSUP_TermSet *tset)
  469. {
  470. // Keyed term to look up the link term and insert it, if necessary.
  471. KeyedTerm entry_s = {.key=LSUP_term_hash (term), .term=term};
  472. Link *ex = hashmap_get (cmap->links, &(Link){.term=&entry_s});
  473. if (ex) {
  474. // Add terms one by one to the existing term set.
  475. log_trace (
  476. "Linking term %s exists. Adding individual terms.",
  477. ex->term->term->data);
  478. size_t i = 0;
  479. KeyedTerm *kt;
  480. while (hashmap_iter (tset, &i, (void **)&kt)) {
  481. log_trace (
  482. "Adding term %s to link %s",
  483. kt->term->data, ex->term->term->data);
  484. if (hashmap_get (ex->tset, kt))
  485. // Term already exist, free the new one and move on.
  486. LSUP_term_free (kt->term);
  487. else
  488. // Insert KeyedTerm, the term set now owns the underlying term.
  489. hashmap_set (ex->tset, kt);
  490. }
  491. // Free link term that hasn't been used.
  492. LSUP_term_free (term);
  493. } else {
  494. // Add the new term and the termset wholesale.
  495. log_trace ("Adding new linking term %s.", term->data);
  496. // Allocate inserted member on heap, it will be owned by the map.
  497. KeyedTerm *ins;
  498. MALLOC_GUARD (ins, LSUP_MEM_ERR);
  499. memcpy (ins, &entry_s, sizeof (entry_s));
  500. Link link = {.term=ins, .tset=tset};
  501. hashmap_set (cmap->links, &link);
  502. }
  503. return LSUP_OK;
  504. }
  505. LSUP_LinkMapIterator *
  506. LSUP_link_map_iter_new (const LSUP_LinkMap *lmap, LSUP_Term *ext)
  507. {
  508. LSUP_LinkMapIterator *it;
  509. CALLOC_GUARD (it, NULL);
  510. it->map = lmap;
  511. it->ext = ext;
  512. return it;
  513. }
  514. void
  515. LSUP_link_map_iter_free (LSUP_LinkMapIterator *it)
  516. { free (it); }
  517. LSUP_rc
  518. LSUP_link_map_next (
  519. LSUP_LinkMapIterator *it, LSUP_Term **lt, LSUP_TermSet **ts)
  520. {
  521. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  522. return LSUP_END;
  523. *lt = it->link->term->term;
  524. *ts = it->link->tset;
  525. return LSUP_OK;
  526. }
  527. // TODO dismantle if the only triple generator is for the graph.
  528. LSUP_rc
  529. LSUP_link_map_triples (
  530. LSUP_LinkMapIterator *it, LSUP_Triple *spo)
  531. {
  532. // Assign external (related) term.
  533. if (it->map->type == LSUP_LINK_INBOUND)
  534. spo->o = it->ext;
  535. else if (it->map->type == LSUP_LINK_OUTBOUND)
  536. spo->s = it->ext;
  537. else spo->p = it->ext;
  538. KeyedTerm *kt;
  539. // If we are already handling a link, continue the internal loop.
  540. if (it->link) goto int_loop;
  541. ext_loop:
  542. // Advance external counter and start new internal loop.
  543. it->j = 0;
  544. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  545. return LSUP_END;
  546. int_loop:
  547. // If end of the term set is reached, start with a new linking term.
  548. if (!hashmap_iter (it->link->tset, &it->j, (void **)&kt)) goto ext_loop;
  549. // Continue pulling from term set.
  550. // Assign linking term.
  551. if (it->map->type == LSUP_LINK_EDGE) spo->s = it->link->term->term;
  552. else spo->p = it->link->term->term;
  553. // Assign term in term set.
  554. if (it->map->type == LSUP_LINK_INBOUND) spo->s = kt->term;
  555. else spo->o = kt->term;
  556. return LSUP_OK;
  557. }
  558. /*
  559. * Static functions.
  560. */
  561. static LSUP_rc
  562. term_init (
  563. LSUP_Term *term, LSUP_TermType type,
  564. const char *data, void *metadata)
  565. {
  566. if (UNLIKELY (!LSUP_IS_INIT)) {
  567. log_error ("Environment not initialized. Did you call LSUP_init()?");
  568. return LSUP_ERROR;
  569. }
  570. // This can never be LSUP_TERM_UNDEFINED.
  571. if (type == LSUP_TERM_UNDEFINED) {
  572. log_error ("%d is not a valid term type.", type);
  573. return LSUP_VALUE_ERR;
  574. }
  575. term->type = type;
  576. if (data) {
  577. // Validate IRI.
  578. if (LSUP_IS_IRI (term)) {
  579. char *fquri;
  580. // Find fully qualified IRI to parse.
  581. if (term->type == LSUP_TERM_NS_IRIREF) {
  582. if (LSUP_nsmap_normalize_uri (metadata, data, &fquri) < 0) {
  583. log_error ("Error normalizing IRI data.");
  584. return LSUP_VALUE_ERR;
  585. }
  586. log_debug ("Fully qualified IRI: %s", fquri);
  587. } else fquri = (char *) data;
  588. if (strpbrk (fquri, invalid_uri_chars) != NULL) {
  589. log_warn (
  590. "Characters %s are not valid in a URI. Got: %s\n",
  591. invalid_uri_chars, fquri);
  592. #if 0
  593. // TODO This causes W3C TTL test #29 to fail. Remove?
  594. return LSUP_VALUE_ERR;
  595. #endif
  596. }
  597. // Capture interesting IRI parts.
  598. MatchCoord matches[7] = {}; // Initialize all to 0.
  599. if (UNLIKELY (parse_iri (fquri, matches) != LSUP_OK)) {
  600. log_error ("Error matching URI pattern.");
  601. return LSUP_VALUE_ERR;
  602. }
  603. if (term->type == LSUP_TERM_NS_IRIREF) free (fquri);
  604. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  605. term->iri_info->prefix = matches[1];
  606. term->iri_info->path = matches[4];
  607. term->iri_info->frag = matches[6];
  608. term->iri_info->nsm = metadata;
  609. }
  610. term->data = strdup (data);
  611. } else {
  612. // No data. Make up a random UUID or URI if allowed.
  613. if (type == LSUP_TERM_IRIREF || type == LSUP_TERM_BNODE) {
  614. uuid_t uuid;
  615. uuid_generate_random (uuid);
  616. uuid_str_t uuid_str;
  617. uuid_unparse_lower (uuid, uuid_str);
  618. if (type == LSUP_TERM_IRIREF) {
  619. term->data = malloc (UUID4_URN_SIZE);
  620. snprintf (
  621. term->data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);
  622. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  623. // Allocate IRI match patterns manually.
  624. term->iri_info->prefix.offset = 0;
  625. term->iri_info->prefix.size = 4;
  626. term->iri_info->path.offset = 4;
  627. term->iri_info->path.size = UUIDSTR_SIZE + 6;
  628. term->iri_info->frag.offset = 0;
  629. term->iri_info->frag.size = 0;
  630. term->iri_info->nsm = NULL;
  631. } else term->data = strdup (uuid_str);
  632. } else {
  633. log_error ("No data provided for term.");
  634. return LSUP_VALUE_ERR;
  635. }
  636. }
  637. if (term->type == LSUP_TERM_LT_LITERAL) {
  638. if (!metadata) {
  639. log_warn ("Lang tag is NULL. Creating a non-tagged literal.");
  640. term->type = LSUP_TERM_LITERAL;
  641. } else {
  642. char *lang_str = (char *) metadata;
  643. log_trace("Lang string: '%s'", lang_str);
  644. // Lang tags longer than 7 characters will be truncated.
  645. strncpy(term->lang, lang_str, sizeof (term->lang) - 1);
  646. if (strlen (term->lang) < 1) {
  647. log_error ("Lang tag cannot be an empty string.");
  648. return LSUP_VALUE_ERR;
  649. }
  650. term->lang[7] = '\0';
  651. }
  652. }
  653. if (term->type == LSUP_TERM_LITERAL) {
  654. term->datatype = metadata;
  655. if (! term->datatype) term->datatype = LSUP_default_datatype;
  656. log_trace ("Storing data type: %s", term->datatype->data);
  657. if (! LSUP_IS_IRI (term->datatype)) {
  658. log_error (
  659. "Literal data type is not an IRI: %s",
  660. term->datatype->data);
  661. return LSUP_VALUE_ERR;
  662. }
  663. LSUP_Term *ex = NULL;
  664. LSUP_term_set_add (LSUP_term_cache, term->datatype, &ex);
  665. if (ex && ex != term->datatype) {
  666. // Replace datatype handle with the one in term cache, and free
  667. // the new one.
  668. if (term->datatype != LSUP_default_datatype)
  669. LSUP_term_free (term->datatype);
  670. term->datatype = ex;
  671. }
  672. //log_trace ("Datatype address: %p", term->datatype);
  673. log_trace ("Datatype hash: %lx", LSUP_term_hash (term->datatype));
  674. } else if (term->type == LSUP_TERM_BNODE) {
  675. // TODO This is not usable for global skolemization.
  676. term->bnode_id = LSUP_HASH (
  677. term->data, strlen (term->data) + 1, LSUP_HASH_SEED);
  678. }
  679. return LSUP_OK;
  680. }
  681. /**
  682. * @brief scan an IRI string and parse IRI parts.
  683. *
  684. * Experimental replacement of a regex engine for better performance.
  685. *
  686. * Slightly adapted from regex on
  687. * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B to capture relevant
  688. * parts of the IRI.
  689. *
  690. * Reference regex and group numbering:
  691. * ^((?([^:/?#]+):)?(?//([^/?#]*))?)((?[^?#]*)(?\?([^#]*))?(?#(.*))?)
  692. * 1 2 3 4 5 6
  693. *
  694. * Capturing groups:
  695. *
  696. * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
  697. * #1: Prefix (http://example.org)
  698. * #2: Scheme (http)
  699. * #3: Authority (example.org)
  700. * #4: Path, including query and fragment (/123/456/?query=blah#frag)
  701. * #5: Query (query=blah)
  702. * #6: Fragment (frag)
  703. *
  704. *
  705. * @param iri_str[in] IRI string to parse.
  706. *
  707. * @param match_coord_t[out] coord Coordinates to be stored. This must be a
  708. * pre-allocated array of at least 7 elements.
  709. *
  710. * The first size_t of each element stores the relative position of a match,
  711. * and the second one stores the length of the match. A length of 0 indicates
  712. * no match.
  713. */
  714. static LSUP_rc
  715. parse_iri (char *iri_str, MatchCoord coord[]) {
  716. char *cur = iri_str;
  717. size_t iri_len = strlen (iri_str);
  718. MatchCoord tmp = {}; // Temporary storage for capture groups
  719. // #2: ([^:/?#]+)
  720. while (
  721. *cur != ':' && *cur != '/' && *cur != '?'
  722. && *cur != '#' && *cur != '\0') {
  723. tmp.size++;
  724. cur++;
  725. }
  726. // Non-capturing: (?([^:/?#]+):)?
  727. if (tmp.size > 0 && *(++cur) == ':') {
  728. // Got capture groups #2 and #3. Store them.
  729. tmp.size++;
  730. coord[3].offset = tmp.offset;
  731. coord[3].size = tmp.size - 1;
  732. }
  733. // Non-capturing: (?//([^/?#]*))?
  734. if (*(cur + 1) == '/' && *(cur + 2) == '/') {
  735. cur++;
  736. tmp.offset = cur - iri_str;
  737. tmp.size = 2;
  738. cur += 2;
  739. // #3: ([^/?#]*)
  740. while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
  741. tmp.size++;
  742. cur++;
  743. }
  744. // Maybe got capture group #5.
  745. coord[3].offset = tmp.offset + 2;
  746. coord[3].size = tmp.size -2;
  747. }
  748. // Capture group 1 and advance cursor.
  749. coord[1].offset = 0;
  750. coord[1].size = cur++ - iri_str;
  751. // Non-capturing: (?[^?#]*)
  752. tmp.offset = cur - iri_str;
  753. tmp.size = 0;
  754. while (*cur != '?' && *cur != '#' && *cur != '\0') {
  755. tmp.size++;
  756. cur++;
  757. }
  758. if (tmp.size > 0) {
  759. coord[4].offset = tmp.offset;
  760. coord[4].size = iri_str + iri_len - cur;
  761. } else return LSUP_NORESULT; // This group is the only mandatory match.
  762. // Non-capturing: (?\?([^#]*))
  763. if (*(++cur) == '?') {
  764. // 5: ([^#]*)
  765. tmp.offset = ++cur - iri_str;
  766. tmp.size = 0;
  767. while (*cur != '#' && *cur != '\0') {
  768. tmp.size++;
  769. cur++;
  770. }
  771. if (tmp.size > 0) {
  772. // Got capture group #5.
  773. coord[5].offset = tmp.offset;
  774. coord[5].size = tmp.size;
  775. }
  776. }
  777. // Non-capturing: (?#(.*))?
  778. if (*(++cur) == '#') {
  779. // #6: (.*)
  780. coord[6].offset = ++cur - iri_str;
  781. coord[6].size = iri_str + iri_len - cur;
  782. }
  783. coord[0].offset = 0;
  784. coord[0].size = iri_len;
  785. return LSUP_OK;
  786. }
  787. /*
  788. * Extern inline functions.
  789. */
  790. LSUP_Key LSUP_term_hash (const LSUP_Term *term);
  791. LSUP_Term *LSUP_iriref_new (const char *data, LSUP_NSMap *nsm);
  792. LSUP_Term *LSUP_literal_new (const char *data, LSUP_Term *datatype);
  793. LSUP_Term *LSUP_lt_literal_new (const char *data, char *lang);
  794. LSUP_Term *LSUP_bnode_new (const char *data);
  795. bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2);
  796. LSUP_Term *LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n);
  797. LSUP_Key LSUP_triple_hash (const LSUP_Triple *trp);