term.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019
  1. #include "tpl.h"
  2. #include "term.h"
  /** @brief Format string used to (un)pack a term with tpl.
   *
   * Packed elements, in order:
   *  1. term type (char);
   *  2. term data (string);
   *  3. type metadata: a void pointer cast to an 8-byte unsigned integer.
   */
  #define TERM_PACK_FMT "csU"

  /// Upper bound for term type validation.
  #define MAX_VALID_TERM_TYPE LSUP_TERM_BNODE
  10. /*
  11. * Data structures.
  12. */
  13. /// Sub-string coordinates for IRI matches.
  14. typedef size_t match_coord_t[2];
  15. /// Matching sub-patterns for IRI parts.
  16. struct iri_info_t {
  17. LSUP_NSMap * nsm; ///< NSM handle for prefixed IRI.
  18. match_coord_t prefix; ///< Matching group #1.
  19. match_coord_t path; ///< Matching group #5.
  20. match_coord_t frag; ///< Matching group #10.
  21. };
  22. /// Key-term pair in term set.
  23. typedef struct keyed_term {
  24. LSUP_Key key; ///< Key (hash) of the term.
  25. LSUP_Term * term; ///< Term handle.
  26. } KeyedTerm;
  27. /** @brief Single link between a term and a term set.
  28. *
  29. * This link is not qualified and may not be used by itself. It belongs
  30. * in a #LSUP_LinkMap which qualifies all links of the same type.
  31. */
  32. typedef struct link {
  33. KeyedTerm * term; ///< Linked term.
  34. LSUP_TermSet * tset; ///< Term set linked to the term.
  35. } Link;
  36. /// Opaque link map iterator.
  37. struct link_map_iter {
  38. const LSUP_LinkMap *map; ///< Link map to iterate.
  39. size_t i; ///< Linking term loop cursor.
  40. size_t j; ///< Term set loop cursor.
  41. LSUP_Term * ext; ///< External link to look for connections.
  42. Link * link; ///< Current link being retrieved.
  43. };
  44. /*
  45. * A link map is thus nested:
  46. *
  47. * - A link map contains a hash map of Link instances (link).
  48. * - Each Link contains a KeyedTerm (term) and a TermSet (tset).
  49. * - Each term set is a hash map of KeyedTerm instances.
  50. * - Each KeyedTerm contains a Term and its hash.
  51. */
  52. typedef struct link_map {
  53. LSUP_LinkType type; ///< Link type.
  54. struct hashmap * links; ///< Map of #Link instances.
  55. } LSUP_LinkMap;
  56. /*
  57. * External variables.
  58. */
  59. uint32_t LSUP_default_dtype_key = 0;
  60. regex_t *LSUP_uri_ptn;
  61. LSUP_Term *LSUP_default_datatype = NULL;
  62. /*
  63. * Static variables.
  64. */
  /// Set of characters that must not appear in a URI string.
  static const char *invalid_uri_chars = "<>\" {}|\\^`";
  67. /*
  68. * Static prototypes.
  69. */
  70. static LSUP_rc
  71. term_init (
  72. LSUP_Term *term, LSUP_TermType type, const char *data, void *metadata);
  73. /*
  74. * Term set callbacks.
  75. */
  76. static uint64_t
  77. tset_hash_fn (
  78. const void *item, uint64_t seed0, uint64_t seed1)
  79. { return ((const KeyedTerm *) item)->key; }
  80. static int
  81. tset_cmp_fn (const void *a, const void *b, void *udata)
  82. {
  83. return
  84. ((const KeyedTerm *) a)->key -
  85. ((const KeyedTerm *) b)->key;
  86. }
  87. static void
  88. tset_free_fn (void *item)
  89. { LSUP_term_free (((KeyedTerm *) item)->term); }
  90. /*
  91. * Link map callbacks.
  92. */
  93. static uint64_t
  94. link_map_hash_fn (
  95. const void *item, uint64_t seed0, uint64_t seed1)
  96. { return ((const Link *)item)->term->key; }
  97. static int
  98. link_map_cmp_fn (const void *a, const void *b, void *udata)
  99. {
  100. return
  101. ((const Link *)a)->term->key -
  102. ((const Link *)b)->term->key;
  103. }
  104. static void
  105. link_map_free_fn (void *item)
  106. {
  107. Link *link = item;
  108. LSUP_term_free (link->term->term);
  109. free (link->term);
  110. LSUP_term_set_free (link->tset);
  111. }
  112. /*
  113. * Term API.
  114. */
  115. LSUP_Term *
  116. LSUP_term_new (
  117. LSUP_TermType type, const char *data, void *metadata)
  118. {
  119. LSUP_Term *term;
  120. CALLOC_GUARD (term, NULL);
  121. // If undefined, just set the type.
  122. if (type == LSUP_TERM_UNDEFINED) term->type = type;
  123. else if (UNLIKELY (term_init (
  124. term, type, data, metadata) != LSUP_OK)) {
  125. free (term);
  126. return NULL;
  127. }
  128. return term;
  129. }
  130. LSUP_Term *
  131. LSUP_term_copy (const LSUP_Term *src)
  132. {
  133. void *metadata = NULL;
  134. if (LSUP_IS_IRI (src))
  135. metadata = (void *) LSUP_iriref_nsm (src);
  136. else if (src->type == LSUP_TERM_LITERAL)
  137. metadata = (void *) src->datatype;
  138. else if (src->type == LSUP_TERM_LT_LITERAL) {
  139. metadata = (void *) src->lang;
  140. }
  141. return LSUP_term_new (src->type, src->data, metadata);
  142. }
  143. LSUP_Term *
  144. LSUP_term_new_from_buffer (const LSUP_Buffer *sterm)
  145. {
  146. if (UNLIKELY (!sterm)) return NULL;
  147. LSUP_Term *term = NULL;
  148. LSUP_TermType type = LSUP_TERM_UNDEFINED;
  149. char *data = NULL;
  150. void *metadata;
  151. tpl_node *tn;
  152. tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
  153. if (UNLIKELY (!tn)) goto finally;
  154. if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
  155. log_error ("Error loading serialized term.");
  156. goto finally;
  157. }
  158. if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
  159. log_error ("Error unpacking serialized term.");
  160. goto finally;
  161. }
  162. if (type == LSUP_TERM_LT_LITERAL)
  163. term = LSUP_lt_literal_new (data, (char *)&metadata);
  164. else term = LSUP_term_new (type, data, metadata);
  165. finally:
  166. tpl_free (tn);
  167. free (data);
  168. return term;
  169. }
  170. LSUP_Term *
  171. LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
  172. {
  173. if (! LSUP_IS_IRI (iri)) {
  174. log_error ("Provided path is not an IRI.");
  175. return NULL;
  176. }
  177. if (! LSUP_IS_IRI (root)) {
  178. log_error ("Provided root is not an IRI.");
  179. return NULL;
  180. }
  181. char *data, *pfx = LSUP_iriref_prefix (iri);
  182. if (pfx) data = iri->data;
  183. else if (iri->data[0] == '/') {
  184. free (pfx);
  185. pfx = LSUP_iriref_prefix (root);
  186. data = malloc (strlen (iri->data) + strlen (pfx) + 1);
  187. if (!data) return NULL;
  188. sprintf (data, "%s%s", pfx, iri->data);
  189. } else {
  190. data = malloc (strlen (iri->data) + strlen (root->data) + 1);
  191. if (!data) return NULL;
  192. sprintf (data, "%s%s", root->data, iri->data);
  193. }
  194. free (pfx);
  195. LSUP_Term *ret = LSUP_iriref_new (data, NULL);
  196. if (data != iri->data) free (data);
  197. return ret;
  198. }
  199. LSUP_Term *
  200. LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri)
  201. {
  202. if (! LSUP_IS_IRI (iri)) {
  203. log_error ("Provided path is not an IRI.");
  204. return NULL;
  205. }
  206. if (! LSUP_IS_IRI (root)) {
  207. log_error ("Provided root is not an IRI.");
  208. return NULL;
  209. }
  210. size_t offset = (
  211. strstr (iri->data, root->data) == iri->data ?
  212. strlen (root->data) : 0);
  213. return LSUP_iriref_new (iri->data + offset, LSUP_iriref_nsm (iri));
  214. }
  215. LSUP_Buffer *
  216. LSUP_term_serialize (const LSUP_Term *term)
  217. {
  218. /*
  219. * In serializing a term, the fact that two terms of different types may
  220. * be semantically identical must be taken into account. Specifically, a
  221. * namespace-prefixed IRI ref is identical to its fully qualified version,
  222. * and a LSUP_TERM_LT_LITERAL with no language tag is identical to a
  223. * LSUP_TERM_LITERAL of xsd:string type, made up of the same string. Such
  224. * terms must have identical serializations.
  225. */
  226. if (UNLIKELY (!term)) return NULL;
  227. LSUP_Term *tmp_term;
  228. void *metadata = NULL;
  229. if (term->type == LSUP_TERM_NS_IRIREF) {
  230. // For IRI refs, simply serialize the FQ version of the term.
  231. char *fq_uri;
  232. if (LSUP_nsmap_normalize_uri (
  233. term->iri_info->nsm, term->data, &fq_uri
  234. ) != LSUP_OK) return NULL;
  235. tmp_term = LSUP_iriref_new (fq_uri, NULL);
  236. free (fq_uri);
  237. } else if (term->type == LSUP_TERM_LT_LITERAL) {
  238. // For LT literals with empty lang tag, convert to a normal xsd:string.
  239. if (strlen (term->lang) == 0)
  240. tmp_term = LSUP_literal_new (term->data, NULL);
  241. else tmp_term = LSUP_lt_literal_new (term->data, (char *) term->lang);
  242. } else tmp_term = LSUP_term_new (
  243. term->type, term->data, (void *) term->datatype);
  244. // "datatype" can be anything here since it's cast to void *.
  245. // metadata field is ignored for IRI ref.
  246. if (tmp_term->type == LSUP_TERM_LITERAL)
  247. metadata = tmp_term->datatype;
  248. else if (tmp_term->type == LSUP_TERM_LT_LITERAL)
  249. memcpy (&metadata, tmp_term->lang, sizeof (metadata));
  250. LSUP_Buffer *sterm;
  251. MALLOC_GUARD (sterm, NULL);
  252. //log_trace ("Effective term being serialized: %s", tmp_term->data);
  253. int rc = tpl_jot (
  254. TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
  255. &tmp_term->type, &tmp_term->data, &metadata);
  256. LSUP_term_free (tmp_term);
  257. if (rc != 0) {
  258. LSUP_buffer_free (sterm);
  259. return NULL;
  260. }
  261. return sterm;
  262. }
  263. LSUP_Key
  264. LSUP_term_hash (const LSUP_Term *term)
  265. {
  266. LSUP_Buffer *buf;
  267. if (UNLIKELY (!term)) buf = BUF_DUMMY;
  268. else buf = LSUP_term_serialize (term);
  269. LSUP_Key key = LSUP_buffer_hash (buf);
  270. LSUP_buffer_free (buf);
  271. return key;
  272. }
  273. void
  274. LSUP_term_free (LSUP_Term *term)
  275. {
  276. if (UNLIKELY (!term)) return;
  277. if (LSUP_IS_IRI (term)) free (term->iri_info);
  278. free (term->data);
  279. free (term);
  280. }
  281. LSUP_NSMap *
  282. LSUP_iriref_nsm (const LSUP_Term *iri)
  283. {
  284. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  285. log_error ("Term is not a IRI ref type.");
  286. return NULL;
  287. }
  288. return iri->iri_info->nsm;
  289. }
  290. char *
  291. LSUP_iriref_prefix (const LSUP_Term *iri)
  292. {
  293. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  294. log_error ("Term is not a IRI ref type.");
  295. return NULL;
  296. }
  297. if (iri->iri_info->prefix.rm_so == -1) return NULL;
  298. size_t len = iri->iri_info->prefix.rm_eo - iri->iri_info->prefix.rm_so;
  299. if (len == 0) return NULL;
  300. return strndup (iri->data + iri->iri_info->prefix.rm_so, len);
  301. }
  302. char *
  303. LSUP_iriref_path (const LSUP_Term *iri)
  304. {
  305. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  306. log_error ("Term is not a IRI ref type.");
  307. return NULL;
  308. }
  309. if (iri->iri_info->path.rm_so == -1) return NULL;
  310. size_t len = iri->iri_info->path.rm_eo - iri->iri_info->path.rm_so;
  311. if (len == 0) return NULL;
  312. return strndup (iri->data + iri->iri_info->path.rm_so, len);
  313. }
  314. char *
  315. LSUP_iriref_frag (const LSUP_Term *iri)
  316. {
  317. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  318. log_error ("Term is not a IRI ref type.");
  319. return NULL;
  320. }
  321. if (iri->iri_info->frag.rm_so == -1) return NULL;
  322. size_t len = iri->iri_info->frag.rm_eo - iri->iri_info->frag.rm_so;
  323. return strndup (iri->data + iri->iri_info->frag.rm_so, len);
  324. }
  325. /*
  326. * Triple API.
  327. */
  328. LSUP_Triple *
  329. LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  330. {
  331. LSUP_Triple *spo = malloc (sizeof (*spo));
  332. if (!spo) return NULL;
  333. if (UNLIKELY (LSUP_triple_init (spo, s, p, o))) {
  334. free (spo);
  335. return NULL;
  336. }
  337. return spo;
  338. }
  339. LSUP_Triple *
  340. LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo)
  341. {
  342. LSUP_Triple *spo = malloc (sizeof (*spo));
  343. if (!spo) return NULL;
  344. spo->s = LSUP_term_new_from_buffer (sspo->s);
  345. spo->p = LSUP_term_new_from_buffer (sspo->p);
  346. spo->o = LSUP_term_new_from_buffer (sspo->o);
  347. return spo;
  348. }
  349. LSUP_BufferTriple *
  350. LSUP_triple_serialize (const LSUP_Triple *spo)
  351. {
  352. LSUP_BufferTriple *sspo = malloc (sizeof (*sspo));
  353. if (!sspo) return NULL;
  354. sspo->s = LSUP_term_serialize (spo->s);
  355. sspo->p = LSUP_term_serialize (spo->p);
  356. sspo->o = LSUP_term_serialize (spo->o);
  357. return sspo;
  358. }
  359. LSUP_rc
  360. LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  361. {
  362. /* FIXME TRP_DUMMY is a problem here.
  363. if (! LSUP_IS_IRI (s) && s->type != LSUP_TERM_BNODE) {
  364. log_error ("Subject is not of a valid term type: %d", s->type);
  365. return LSUP_VALUE_ERR;
  366. }
  367. if (! LSUP_IS_IRI (p)) {
  368. log_error ("Predicate is not of a valid term type: %d", p->type);
  369. return LSUP_VALUE_ERR;
  370. }
  371. */
  372. spo->s = s;
  373. spo->p = p;
  374. spo->o = o;
  375. return LSUP_OK;
  376. }
  377. void
  378. LSUP_triple_done (LSUP_Triple *spo)
  379. {
  380. if (UNLIKELY (!spo)) return;
  381. LSUP_term_free (spo->s);
  382. LSUP_term_free (spo->p);
  383. LSUP_term_free (spo->o);
  384. }
  385. void
  386. LSUP_triple_free (LSUP_Triple *spo)
  387. {
  388. if (UNLIKELY (!spo)) return;
  389. LSUP_term_free (spo->s);
  390. LSUP_term_free (spo->p);
  391. LSUP_term_free (spo->o);
  392. free (spo);
  393. }
  394. /*
  395. * Multi-add functions.
  396. */
  397. LSUP_TermSet *
  398. LSUP_term_set_new ()
  399. {
  400. // Capacity of 4 is an arbitrary guess.
  401. LSUP_TermSet *ts = hashmap_new (
  402. sizeof (KeyedTerm), 4, LSUP_HASH_SEED, 0,
  403. tset_hash_fn, tset_cmp_fn, tset_free_fn, NULL);
  404. if (UNLIKELY (hashmap_oom (ts))) return NULL;
  405. return ts;
  406. }
  407. LSUP_rc
  408. LSUP_term_set_add (LSUP_TermSet *ts, LSUP_Term *term, LSUP_Term **existing)
  409. {
  410. LSUP_Hash key = LSUP_term_hash (term);
  411. KeyedTerm entry_s = {.key=key, .term=term};
  412. KeyedTerm *ex = hashmap_get (ts, &entry_s);
  413. if (ex) {
  414. if (existing) *existing = ex->term;
  415. return LSUP_NOACTION;
  416. }
  417. hashmap_set (ts, &entry_s);
  418. if (hashmap_oom (ts)) return LSUP_MEM_ERR;
  419. return LSUP_OK;
  420. }
  421. const LSUP_Term *
  422. LSUP_term_set_get (LSUP_TermSet *ts, LSUP_Key key)
  423. {
  424. KeyedTerm *entry = hashmap_get (ts, &(KeyedTerm){.key=key});
  425. if (entry) log_trace ("ID found for key %lx: %s", key, entry->term->data);
  426. else log_trace ("No ID found for key %lx.", key);
  427. return (entry) ? entry->term : NULL;
  428. }
  429. LSUP_rc
  430. LSUP_term_set_next (LSUP_TermSet *ts, size_t *i, LSUP_Term **term)
  431. {
  432. KeyedTerm *kt = NULL;
  433. if (!hashmap_iter (ts, i, (void **)&kt)) return LSUP_END;
  434. if (term) *term = kt->term;
  435. return LSUP_OK;
  436. }
  437. void
  438. LSUP_term_set_free (LSUP_TermSet *ts)
  439. { hashmap_free (ts); }
  440. LSUP_LinkMap *
  441. LSUP_link_map_new (LSUP_LinkType type)
  442. {
  443. LSUP_LinkMap *cm;
  444. MALLOC_GUARD (cm, NULL);
  445. cm->type = type;
  446. cm->links = hashmap_new (
  447. sizeof (Link), 0, LSUP_HASH_SEED, 0,
  448. link_map_hash_fn, link_map_cmp_fn, link_map_free_fn, NULL);
  449. return cm;
  450. }
  451. void
  452. LSUP_link_map_free (LSUP_LinkMap *cm)
  453. {
  454. hashmap_free (cm->links);
  455. free (cm);
  456. }
  457. LSUP_LinkType
  458. LSUP_link_map_type (const LSUP_LinkMap *map)
  459. { return map->type; }
  460. // TODO Memory error handling.
  461. LSUP_rc
  462. LSUP_link_map_add (
  463. LSUP_LinkMap *cmap, LSUP_Term *term, LSUP_TermSet *tset)
  464. {
  465. // Keyed term to look up the link term and insert it, if necessary.
  466. KeyedTerm entry_s = {.key=LSUP_term_hash (term), .term=term};
  467. Link *ex = hashmap_get (cmap->links, &(Link){.term=&entry_s});
  468. if (ex) {
  469. // Add terms one by one to the existing term set.
  470. log_trace (
  471. "Linking term %s exists. Adding individual terms.",
  472. ex->term->term->data);
  473. size_t i = 0;
  474. KeyedTerm *kt;
  475. while (hashmap_iter (tset, &i, (void **)&kt)) {
  476. log_trace (
  477. "Adding term %s to link %s",
  478. kt->term->data, ex->term->term->data);
  479. if (hashmap_get (ex->tset, kt))
  480. // Term already exist, free the new one and move on.
  481. LSUP_term_free (kt->term);
  482. else
  483. // Insert KeyedTerm, the term set now owns the underlying term.
  484. hashmap_set (ex->tset, kt);
  485. }
  486. // Free link term that hasn't been used.
  487. LSUP_term_free (term);
  488. } else {
  489. // Add the new term and the termset wholesale.
  490. log_trace ("Adding new linking term %s.", term->data);
  491. // Allocate inserted member on heap, it will be owned by the map.
  492. KeyedTerm *ins;
  493. MALLOC_GUARD (ins, LSUP_MEM_ERR);
  494. memcpy (ins, &entry_s, sizeof (entry_s));
  495. Link link = {.term=ins, .tset=tset};
  496. hashmap_set (cmap->links, &link);
  497. }
  498. return LSUP_OK;
  499. }
  500. LSUP_LinkMapIterator *
  501. LSUP_link_map_iter_new (const LSUP_LinkMap *lmap, LSUP_Term *ext)
  502. {
  503. LSUP_LinkMapIterator *it;
  504. CALLOC_GUARD (it, NULL);
  505. it->map = lmap;
  506. it->ext = ext;
  507. return it;
  508. }
  509. void
  510. LSUP_link_map_iter_free (LSUP_LinkMapIterator *it)
  511. { free (it); }
  512. LSUP_rc
  513. LSUP_link_map_next (
  514. LSUP_LinkMapIterator *it, LSUP_Term **lt, LSUP_TermSet **ts)
  515. {
  516. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  517. return LSUP_END;
  518. *lt = it->link->term->term;
  519. *ts = it->link->tset;
  520. return LSUP_OK;
  521. }
  522. // TODO dismantle if the only triple generator is for the graph.
  523. LSUP_rc
  524. LSUP_link_map_triples (
  525. LSUP_LinkMapIterator *it, LSUP_Triple *spo)
  526. {
  527. // Assign external (related) term.
  528. if (it->map->type == LSUP_LINK_INBOUND)
  529. spo->o = it->ext;
  530. else if (it->map->type == LSUP_LINK_OUTBOUND)
  531. spo->s = it->ext;
  532. else spo->p = it->ext;
  533. KeyedTerm *kt;
  534. // If we are already handling a link, continue the internal loop.
  535. if (it->link) goto int_loop;
  536. ext_loop:
  537. // Advance external counter and start new internal loop.
  538. it->j = 0;
  539. if (!hashmap_iter (it->map->links, &it->i, (void **)&it->link))
  540. return LSUP_END;
  541. int_loop:
  542. // If end of the term set is reached, start with a new linking term.
  543. if (!hashmap_iter (it->link->tset, &it->j, (void **)&kt)) goto ext_loop;
  544. // Continue pulling from term set.
  545. // Assign linking term.
  546. if (it->map->type == LSUP_LINK_EDGE) spo->s = it->link->term->term;
  547. else spo->p = it->link->term->term;
  548. // Assign term in term set.
  549. if (it->map->type == LSUP_LINK_INBOUND) spo->s = kt->term;
  550. else spo->o = kt->term;
  551. return LSUP_OK;
  552. }
  553. /*
  554. * Static functions.
  555. */
  556. static LSUP_rc
  557. term_init (
  558. LSUP_Term *term, LSUP_TermType type,
  559. const char *data, void *metadata)
  560. {
  561. if (UNLIKELY (!LSUP_uri_ptn)) {
  562. log_error ("Environment not initialized. Did you call LSUP_init()?");
  563. return LSUP_ERROR;
  564. }
  565. // This can never be LSUP_TERM_UNDEFINED.
  566. if (type == LSUP_TERM_UNDEFINED) {
  567. log_error ("%d is not a valid term type.", type);
  568. return LSUP_VALUE_ERR;
  569. }
  570. term->type = type;
  571. if (data) {
  572. // Validate IRI.
  573. if (LSUP_IS_IRI (term)) {
  574. char *fquri;
  575. // Find fully qualified IRI to parse.
  576. if (term->type == LSUP_TERM_NS_IRIREF) {
  577. if (LSUP_nsmap_normalize_uri (metadata, data, &fquri) < 0) {
  578. log_error ("Error normalizing IRI data.");
  579. return LSUP_VALUE_ERR;
  580. }
  581. log_debug ("Fully qualified IRI: %s", fquri);
  582. } else fquri = (char *) data;
  583. if (strpbrk (fquri, invalid_uri_chars) != NULL) {
  584. log_warn (
  585. "Characters %s are not valid in a URI. Got: %s\n",
  586. invalid_uri_chars, fquri);
  587. #if 0
  588. // TODO This causes W3C TTL test #29 to fail. Remove?
  589. return LSUP_VALUE_ERR;
  590. #endif
  591. }
  592. // Capture interesting IRI parts.
  593. regmatch_t matches[11];
  594. if (UNLIKELY (regexec (LSUP_uri_ptn, fquri, 11, matches, 0) != 0)) {
  595. fprintf (stderr, "Error matching URI pattern.\n");
  596. return LSUP_VALUE_ERR;
  597. }
  598. if (term->type == LSUP_TERM_NS_IRIREF) free (fquri);
  599. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  600. term->iri_info->prefix = matches[1];
  601. term->iri_info->path = matches[5];
  602. term->iri_info->frag = matches[10];
  603. term->iri_info->nsm = metadata;
  604. }
  605. term->data = strdup (data);
  606. } else {
  607. // No data. Make up a random UUID or URI if allowed.
  608. if (type == LSUP_TERM_IRIREF || type == LSUP_TERM_BNODE) {
  609. uuid_t uuid;
  610. uuid_generate_random (uuid);
  611. uuid_str_t uuid_str;
  612. uuid_unparse_lower (uuid, uuid_str);
  613. if (type == LSUP_TERM_IRIREF) {
  614. term->data = malloc (UUID4_URN_SIZE);
  615. snprintf (
  616. term->data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);
  617. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  618. // Allocate IRI match patterns manually.
  619. term->iri_info->prefix.rm_so = 0;
  620. term->iri_info->prefix.rm_eo = 4;
  621. term->iri_info->path.rm_so = 4;
  622. term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
  623. term->iri_info->frag.rm_so = -1;
  624. term->iri_info->frag.rm_eo = -1;
  625. term->iri_info->nsm = NULL;
  626. } else term->data = strdup (uuid_str);
  627. } else {
  628. log_error ("No data provided for term.");
  629. return LSUP_VALUE_ERR;
  630. }
  631. }
  632. if (term->type == LSUP_TERM_LT_LITERAL) {
  633. if (!metadata) {
  634. log_warn ("Lang tag is NULL. Creating a non-tagged literal.");
  635. term->type = LSUP_TERM_LITERAL;
  636. } else {
  637. char *lang_str = (char *) metadata;
  638. log_trace("Lang string: '%s'", lang_str);
  639. // Lang tags longer than 7 characters will be truncated.
  640. strncpy(term->lang, lang_str, sizeof (term->lang) - 1);
  641. if (strlen (term->lang) < 1) {
  642. log_error ("Lang tag cannot be an empty string.");
  643. return LSUP_VALUE_ERR;
  644. }
  645. term->lang[7] = '\0';
  646. }
  647. }
  648. if (term->type == LSUP_TERM_LITERAL) {
  649. term->datatype = metadata;
  650. if (! term->datatype) term->datatype = LSUP_default_datatype;
  651. log_trace ("Storing data type: %s", term->datatype->data);
  652. if (! LSUP_IS_IRI (term->datatype)) {
  653. log_error (
  654. "Literal data type is not an IRI: %s",
  655. term->datatype->data);
  656. return LSUP_VALUE_ERR;
  657. }
  658. LSUP_Term *ex = NULL;
  659. LSUP_term_set_add (LSUP_term_cache, term->datatype, &ex);
  660. if (ex && ex != term->datatype) {
  661. // Replace datatype handle with the one in term cache, and free
  662. // the new one.
  663. if (term->datatype != LSUP_default_datatype)
  664. LSUP_term_free (term->datatype);
  665. term->datatype = ex;
  666. }
  667. //log_trace ("Datatype address: %p", term->datatype);
  668. log_trace ("Datatype hash: %lx", LSUP_term_hash (term->datatype));
  669. } else if (term->type == LSUP_TERM_BNODE) {
  670. // TODO This is not usable for global skolemization.
  671. term->bnode_id = LSUP_HASH (
  672. term->data, strlen (term->data) + 1, LSUP_HASH_SEED);
  673. }
  674. return LSUP_OK;
  675. }
  676. /**
  677. * @brief scan an IRI string and parse IRI parts.
  678. *
  679. * Experimental replacement of a regex engine for better performance.
  680. *
  681. * https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
  682. *
  683. * Reference regex and group numbering:
  684. * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  685. * 12 3 4 5 6 7 8 9
  686. *
  687. * Capturing groups:
  688. *
  689. * scheme = $2
  690. * authority = $4
  691. * path = $5
  692. * query = $7
  693. * fragment = $9
  694. *
  695. * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
  696. * #1: Domain prefix (http://example.org)
  697. * #2: Protocol (http:)
  698. * #4: Authority (example.org)
  699. * #5: Path, excluding query and fragment (/123/456/)
  700. * #7: Query (query=blah)
  701. * #9: Fragment (frag)
  702. */
  703. static LSUP_rc
  704. parse_iri (char *iri_str, LSUP_IRIInfo *iri_info) {
  705. char *cur = iri_str;
  706. size_t
  707. coord[2][10] = {0}, // Store capture group relative position & length
  708. tmp[2] = {0}; // Temporary storage for capture groups
  709. // #1: (([^:/?#]+:)?(//([^/?#]*))?)
  710. while (*cur != '\0') {
  711. // #2: ([^:/?#]+:)?
  712. tmp[0] = cur - iri_str;
  713. while (
  714. *cur != ':' && *cur != '/' && *cur != '?'
  715. && *cur != '#' && *cur != '\0') {
  716. tmp[1]++;
  717. cur++;
  718. }
  719. if (tmp[1] > 0 && *cur == ':') {
  720. // Got capture group #2. Store it.
  721. tmp[1]++;
  722. coord[3][0] = tmp[0];
  723. coord[3][1] = tmp[1];
  724. cur++;
  725. }
  726. // #3: (//([^/?#]*))?
  727. if (*cur == '/' && *(cur + 1) == '/') {
  728. tmp[0] = cur - iri_str;
  729. tmp[1] = 2;
  730. cur += 2;
  731. // #4: ([^/?#]*)
  732. while (*cur != '/' && *cur != '?' && *cur != '#' && *cur != '\0') {
  733. // Continue recording length for #3, coordinates for #4 can
  734. // be inferred.
  735. tmp[1]++;
  736. cur++;
  737. }
  738. if (tmp[1] > 2) {
  739. // Got capture groups #3 and #4. Store them.
  740. coord[3][0] = tmp[0];
  741. coord[3][1] = tmp[1];
  742. coord[4][0] = tmp[0] + 2;
  743. coord[4][1] = tmp[1] -2;
  744. }
  745. }
  746. }
  747. // #5: ([^?#]*)
  748. tmp[0] = cur - iri_str;
  749. tmp[1] = 0;
  750. while (*cur != '?' && *cur != '#' && *cur != '\0') {
  751. tmp[1]++;
  752. cur++;
  753. }
  754. if (tmp[1] > 0) {
  755. // Got capture group #5. Store it.
  756. coord[5][0] = tmp[0];
  757. coord[5][1] = tmp[1];
  758. }
  759. // #6: (\?([^#]*))
  760. if (*cur == '?') {
  761. // Advance cursor by one and skip storing '?'.
  762. tmp[0] = ++cur - iri_str;
  763. tmp[1] = 0;
  764. // 7: ([^#]*)
  765. while (*cur != '#' && *cur != '\0') {
  766. tmp[1]++;
  767. cur++;
  768. }
  769. if (tmp[1] > 0) {
  770. // Got capture group #7. Store it.
  771. // Group #6 (query including '?') is ignored.
  772. coord[7][0] = tmp[0];
  773. coord[7][1] = tmp[1];
  774. }
  775. }
  776. // 8: (#(.*))
  777. if (*cur == '#') {
  778. // 9: (.*)
  779. // Skip storing '#'.
  780. tmp[0] = ++cur - iri_str;
  781. // Store the length of the remaining string.
  782. tmp[1] = strlen(iri_str) + iri_str - cur;
  783. }
  784. return LSUP_OK;
  785. }
  786. /*
  787. * Extern inline functions.
  788. */
  789. LSUP_Key LSUP_term_hash (const LSUP_Term *term);
  790. LSUP_Term *LSUP_iriref_new (const char *data, LSUP_NSMap *nsm);
  791. LSUP_Term *LSUP_literal_new (const char *data, LSUP_Term *datatype);
  792. LSUP_Term *LSUP_lt_literal_new (const char *data, char *lang);
  793. LSUP_Term *LSUP_bnode_new (const char *data);
  794. bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2);
  795. LSUP_Term *LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n);
  796. LSUP_Key LSUP_triple_hash (const LSUP_Triple *trp);