/*
 * term.c
 */
  1. #include "tpl.h"
  2. #include "term.h"
  3. /** @brief tpl packing format for a term.
  4. *
  5. * The pack elements are: 1. term type (char); 2. data (string); 3. void* type
  6. * metadata, cast to 8-byte unsigned.
  7. */
  8. #define TERM_PACK_FMT "csU"
  9. #define MAX_VALID_TERM_TYPE LSUP_TERM_BNODE /* For type validation. */
  10. /*
  11. * Data structures.
  12. */
  13. struct iri_info_t {
  14. LSUP_NSMap * nsm; // NSM handle for prefixed IRI.
  15. regmatch_t prefix; // Matching group #1.
  16. regmatch_t path; // Matching group #5.
  17. regmatch_t frag; // Matching group #10.
  18. };
  19. /*
  20. * Extern variables.
  21. */
  22. struct hashmap *LSUP_term_cache = NULL;
  23. uint32_t LSUP_default_dtype_key = 0;
  24. regex_t *LSUP_uri_ptn;
  25. LSUP_Term *LSUP_default_datatype = NULL;
  26. /*
  27. * Static variables.
  28. */
  29. // Characters not allowed in a URI string.
  30. static const char *invalid_uri_chars = "<>\" {}|\\^`";
  31. /*
  32. * Static prototypes.
  33. */
  34. static LSUP_rc
  35. term_init (
  36. LSUP_Term *term, LSUP_TermType type, const char *data, void *metadata);
  37. /*
  38. * Term API.
  39. */
  40. LSUP_Term *
  41. LSUP_term_new (
  42. LSUP_TermType type, const char *data, void *metadata)
  43. {
  44. LSUP_Term *term;
  45. CALLOC_GUARD (term, NULL);
  46. // If undefined, just set the type.
  47. if (type == LSUP_TERM_UNDEFINED) term->type = type;
  48. else if (UNLIKELY (term_init (
  49. term, type, data, metadata) != LSUP_OK)) {
  50. free (term);
  51. return NULL;
  52. }
  53. return term;
  54. }
  55. LSUP_Term *
  56. LSUP_term_copy (const LSUP_Term *src)
  57. {
  58. void *metadata = NULL;
  59. if (LSUP_IS_IRI (src))
  60. metadata = (void *) LSUP_iriref_nsm (src);
  61. else if (src->type == LSUP_TERM_LITERAL)
  62. metadata = (void *) src->datatype;
  63. else if (src->type == LSUP_TERM_LT_LITERAL) {
  64. metadata = (void *) src->lang;
  65. }
  66. return LSUP_term_new (src->type, src->data, metadata);
  67. }
  68. LSUP_Term *
  69. LSUP_term_new_from_buffer (const LSUP_Buffer *sterm)
  70. {
  71. if (UNLIKELY (!sterm)) return NULL;
  72. LSUP_Term *term = NULL;
  73. LSUP_TermType type = LSUP_TERM_UNDEFINED;
  74. char *data = NULL;
  75. void *metadata;
  76. tpl_node *tn;
  77. tn = tpl_map (TERM_PACK_FMT, &type, &data, &metadata);
  78. if (UNLIKELY (!tn)) goto finally;
  79. if (UNLIKELY (tpl_load (tn, TPL_MEM, sterm->addr, sterm->size) < 0)) {
  80. log_error ("Error loading serialized term.");
  81. goto finally;
  82. }
  83. if (UNLIKELY (tpl_unpack (tn, 0) < 0)) {
  84. log_error ("Error unpacking serialized term.");
  85. goto finally;
  86. }
  87. if (type == LSUP_TERM_LT_LITERAL)
  88. term = LSUP_lt_literal_new (data, (char *)&metadata);
  89. else term = LSUP_term_new (type, data, metadata);
  90. finally:
  91. tpl_free (tn);
  92. free (data);
  93. return term;
  94. }
  95. LSUP_Term *
  96. LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri)
  97. {
  98. if (! LSUP_IS_IRI (iri)) {
  99. log_error ("Provided path is not an IRI.");
  100. return NULL;
  101. }
  102. if (! LSUP_IS_IRI (root)) {
  103. log_error ("Provided root is not an IRI.");
  104. return NULL;
  105. }
  106. char *data, *pfx = LSUP_iriref_prefix (iri);
  107. if (pfx) data = iri->data;
  108. else if (iri->data[0] == '/') {
  109. free (pfx);
  110. pfx = LSUP_iriref_prefix (root);
  111. data = malloc (strlen (iri->data) + strlen (pfx) + 1);
  112. if (!data) return NULL;
  113. sprintf (data, "%s%s", pfx, iri->data);
  114. } else {
  115. data = malloc (strlen (iri->data) + strlen (root->data) + 1);
  116. if (!data) return NULL;
  117. sprintf (data, "%s%s", root->data, iri->data);
  118. }
  119. free (pfx);
  120. LSUP_Term *ret = LSUP_iriref_new (data, NULL);
  121. if (data != iri->data) free (data);
  122. return ret;
  123. }
  124. LSUP_Term *
  125. LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri)
  126. {
  127. if (! LSUP_IS_IRI (iri)) {
  128. log_error ("Provided path is not an IRI.");
  129. return NULL;
  130. }
  131. if (! LSUP_IS_IRI (root)) {
  132. log_error ("Provided root is not an IRI.");
  133. return NULL;
  134. }
  135. size_t offset = (
  136. strstr (iri->data, root->data) == iri->data ?
  137. strlen (root->data) : 0);
  138. return LSUP_iriref_new (iri->data + offset, LSUP_iriref_nsm (iri));
  139. }
  140. LSUP_Buffer *
  141. LSUP_term_serialize (const LSUP_Term *term)
  142. {
  143. /*
  144. * In serializing a term, the fact that two terms of different types may
  145. * be semantically identical must be taken into account. Specifically, a
  146. * namespace-prefixed IRI ref is identical to its fully qualified version,
  147. * and a LSUP_TERM_LT_LITERAL with no language tag is identical to a
  148. * LSUP_TERM_LITERAL of xsd:string type, made up of the same string. Such
  149. * terms must have identical serializations.
  150. */
  151. if (UNLIKELY (!term)) return NULL;
  152. LSUP_Term *tmp_term;
  153. void *metadata = NULL;
  154. if (term->type == LSUP_TERM_NS_IRIREF) {
  155. // For IRI refs, simply serialize the FQ version of the term.
  156. char *fq_uri;
  157. if (LSUP_nsmap_normalize_uri (
  158. term->iri_info->nsm, term->data, &fq_uri
  159. ) != LSUP_OK) return NULL;
  160. tmp_term = LSUP_iriref_new (fq_uri, NULL);
  161. free (fq_uri);
  162. } else if (term->type == LSUP_TERM_LT_LITERAL) {
  163. // For LT literals with empty lang tag, convert to a normal xsd:string.
  164. if (strlen (term->lang) == 0)
  165. tmp_term = LSUP_literal_new (term->data, NULL);
  166. else tmp_term = LSUP_lt_literal_new (term->data, (char *) term->lang);
  167. } else tmp_term = LSUP_term_new (
  168. term->type, term->data, (void *) term->datatype);
  169. // "datatype" can be anything here since it's cast to void *.
  170. // metadata field is ignored for IRI ref.
  171. if (tmp_term->type == LSUP_TERM_LITERAL)
  172. metadata = tmp_term->datatype;
  173. else if (tmp_term->type == LSUP_TERM_LT_LITERAL)
  174. memcpy (&metadata, tmp_term->lang, sizeof (metadata));
  175. LSUP_Buffer *sterm;
  176. MALLOC_GUARD (sterm, NULL);
  177. int rc = tpl_jot (
  178. TPL_MEM, &sterm->addr, &sterm->size, TERM_PACK_FMT,
  179. &tmp_term->type, &tmp_term->data, &metadata);
  180. LSUP_term_free (tmp_term);
  181. if (rc != 0) {
  182. LSUP_buffer_free (sterm);
  183. return NULL;
  184. }
  185. return sterm;
  186. }
  187. LSUP_Key
  188. LSUP_term_hash (const LSUP_Term *term)
  189. {
  190. LSUP_Buffer *buf;
  191. if (UNLIKELY (!term)) buf = BUF_DUMMY;
  192. else buf = LSUP_term_serialize (term);
  193. LSUP_Key key = LSUP_buffer_hash (buf);
  194. LSUP_buffer_free (buf);
  195. return key;
  196. }
  197. void
  198. LSUP_term_free (LSUP_Term *term)
  199. {
  200. if (UNLIKELY (!term)) return;
  201. if (LSUP_IS_IRI (term)) free (term->iri_info);
  202. free (term->data);
  203. free (term);
  204. }
  205. LSUP_NSMap *
  206. LSUP_iriref_nsm (const LSUP_Term *iri)
  207. {
  208. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  209. log_error ("Term is not a IRI ref type.");
  210. return NULL;
  211. }
  212. return iri->iri_info->nsm;
  213. }
  214. char *
  215. LSUP_iriref_prefix (const LSUP_Term *iri)
  216. {
  217. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  218. log_error ("Term is not a IRI ref type.");
  219. return NULL;
  220. }
  221. if (iri->iri_info->prefix.rm_so == -1) return NULL;
  222. size_t len = iri->iri_info->prefix.rm_eo - iri->iri_info->prefix.rm_so;
  223. if (len == 0) return NULL;
  224. return strndup (iri->data + iri->iri_info->prefix.rm_so, len);
  225. }
  226. char *
  227. LSUP_iriref_path (const LSUP_Term *iri)
  228. {
  229. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  230. log_error ("Term is not a IRI ref type.");
  231. return NULL;
  232. }
  233. if (iri->iri_info->path.rm_so == -1) return NULL;
  234. size_t len = iri->iri_info->path.rm_eo - iri->iri_info->path.rm_so;
  235. if (len == 0) return NULL;
  236. return strndup (iri->data + iri->iri_info->path.rm_so, len);
  237. }
  238. char *
  239. LSUP_iriref_frag (const LSUP_Term *iri)
  240. {
  241. if (iri->type != LSUP_TERM_IRIREF && iri->type != LSUP_TERM_NS_IRIREF) {
  242. log_error ("Term is not a IRI ref type.");
  243. return NULL;
  244. }
  245. if (iri->iri_info->frag.rm_so == -1) return NULL;
  246. size_t len = iri->iri_info->frag.rm_eo - iri->iri_info->frag.rm_so;
  247. return strndup (iri->data + iri->iri_info->frag.rm_so, len);
  248. }
  249. /*
  250. * Triple API.
  251. */
  252. LSUP_Triple *
  253. LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  254. {
  255. LSUP_Triple *spo = malloc (sizeof (*spo));
  256. if (!spo) return NULL;
  257. if (UNLIKELY (LSUP_triple_init (spo, s, p, o))) {
  258. free (spo);
  259. return NULL;
  260. }
  261. return spo;
  262. }
  263. LSUP_Triple *
  264. LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo)
  265. {
  266. LSUP_Triple *spo = malloc (sizeof (*spo));
  267. if (!spo) return NULL;
  268. spo->s = LSUP_term_new_from_buffer (sspo->s);
  269. spo->p = LSUP_term_new_from_buffer (sspo->p);
  270. spo->o = LSUP_term_new_from_buffer (sspo->o);
  271. return spo;
  272. }
  273. LSUP_BufferTriple *
  274. LSUP_triple_serialize (const LSUP_Triple *spo)
  275. {
  276. LSUP_BufferTriple *sspo = malloc (sizeof (*sspo));
  277. if (!sspo) return NULL;
  278. sspo->s = LSUP_term_serialize (spo->s);
  279. sspo->p = LSUP_term_serialize (spo->p);
  280. sspo->o = LSUP_term_serialize (spo->o);
  281. return sspo;
  282. }
  283. LSUP_rc
  284. LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o)
  285. {
  286. /* FIXME TRP_DUMMY is a problem here.
  287. if (! LSUP_IS_IRI (s) && s->type != LSUP_TERM_BNODE) {
  288. log_error ("Subject is not of a valid term type: %d", s->type);
  289. return LSUP_VALUE_ERR;
  290. }
  291. if (! LSUP_IS_IRI (p)) {
  292. log_error ("Predicate is not of a valid term type: %d", p->type);
  293. return LSUP_VALUE_ERR;
  294. }
  295. */
  296. spo->s = s;
  297. spo->p = p;
  298. spo->o = o;
  299. return LSUP_OK;
  300. }
  301. void
  302. LSUP_triple_done (LSUP_Triple *spo)
  303. {
  304. if (UNLIKELY (!spo)) return;
  305. LSUP_term_free (spo->s);
  306. LSUP_term_free (spo->p);
  307. LSUP_term_free (spo->o);
  308. }
  309. void
  310. LSUP_triple_free (LSUP_Triple *spo)
  311. {
  312. if (UNLIKELY (!spo)) return;
  313. LSUP_term_free (spo->s);
  314. LSUP_term_free (spo->p);
  315. LSUP_term_free (spo->o);
  316. free (spo);
  317. }
  318. LSUP_rc
  319. LSUP_tcache_add (const LSUP_Key key, const LSUP_Term *term)
  320. {
  321. LSUP_KeyedTerm entry_s = {.key=key, .term=(LSUP_Term *)term};
  322. // Many calls will likely attempt inserting duplicates after the first one.
  323. if (LIKELY (hashmap_get (LSUP_term_cache, &entry_s))) return LSUP_NOACTION;
  324. hashmap_set (LSUP_term_cache, &entry_s);
  325. return LSUP_OK;
  326. }
  327. const LSUP_Term *
  328. LSUP_tcache_get (LSUP_Key key)
  329. {
  330. LSUP_KeyedTerm *entry = hashmap_get (
  331. LSUP_term_cache, &(LSUP_KeyedTerm){.key=key});
  332. if (entry) log_trace ("ID found for key %lx: %s", key, entry->term->data);
  333. else log_trace ("No ID found for key %lx.", key);
  334. return (entry) ? entry->term : NULL;
  335. }
  336. /*
  337. * Static functions.
  338. */
  339. static LSUP_rc
  340. term_init (
  341. LSUP_Term *term, LSUP_TermType type,
  342. const char *data, void *metadata)
  343. {
  344. if (UNLIKELY (!LSUP_uri_ptn)) {
  345. log_error ("Environment not initialized. Did you call LSUP_init()?");
  346. return LSUP_ERROR;
  347. }
  348. // This can never be LSUP_TERM_UNDEFINED.
  349. if (type == LSUP_TERM_UNDEFINED) {
  350. log_error ("%d is not a valid term type.", type);
  351. return LSUP_VALUE_ERR;
  352. }
  353. term->type = type;
  354. if (data) {
  355. // Validate IRI.
  356. if (LSUP_IS_IRI (term)) {
  357. char *fquri;
  358. // Find fully qualified IRI to parse.
  359. if (term->type == LSUP_TERM_NS_IRIREF) {
  360. if (LSUP_nsmap_normalize_uri (
  361. metadata, data, &fquri) != LSUP_OK
  362. ) {
  363. log_error ("Error normalizing IRI data.");
  364. return LSUP_VALUE_ERR;
  365. }
  366. log_debug ("Fully qualified IRI: %s", fquri);
  367. } else fquri = (char *) data;
  368. if (strpbrk (fquri, invalid_uri_chars) != NULL) {
  369. log_warn (
  370. "Characters %s are not valid in a URI. Got: %s\n",
  371. invalid_uri_chars, fquri);
  372. #if 0
  373. // TODO This causes W3C TTL test #29 to fail. Remove?
  374. return LSUP_VALUE_ERR;
  375. #endif
  376. }
  377. // Capture interesting IRI parts.
  378. regmatch_t matches[11];
  379. if (UNLIKELY (regexec (LSUP_uri_ptn, fquri, 11, matches, 0) != 0)) {
  380. fprintf (stderr, "Error matching URI pattern.\n");
  381. return LSUP_VALUE_ERR;
  382. }
  383. if (term->type == LSUP_TERM_NS_IRIREF) free (fquri);
  384. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  385. term->iri_info->prefix = matches[1];
  386. term->iri_info->path = matches[5];
  387. term->iri_info->frag = matches[10];
  388. term->iri_info->nsm = metadata;
  389. }
  390. term->data = strdup (data);
  391. } else {
  392. // No data. Make up a random UUID or URI if allowed.
  393. if (type == LSUP_TERM_IRIREF || type == LSUP_TERM_BNODE) {
  394. uuid_t uuid;
  395. uuid_generate_random (uuid);
  396. uuid_str_t uuid_str;
  397. uuid_unparse_lower (uuid, uuid_str);
  398. if (type == LSUP_TERM_IRIREF) {
  399. term->data = malloc (UUID4_URN_SIZE);
  400. snprintf (
  401. term->data, UUID4_URN_SIZE, "urn:uuid4:%s", uuid_str);
  402. MALLOC_GUARD (term->iri_info, LSUP_MEM_ERR);
  403. // Allocate IRI match patterns manually.
  404. term->iri_info->prefix.rm_so = 0;
  405. term->iri_info->prefix.rm_eo = 4;
  406. term->iri_info->path.rm_so = 4;
  407. term->iri_info->path.rm_eo = UUIDSTR_SIZE + 6;
  408. term->iri_info->frag.rm_so = -1;
  409. term->iri_info->frag.rm_eo = -1;
  410. term->iri_info->nsm = NULL;
  411. } else term->data = strdup (uuid_str);
  412. } else {
  413. log_error ("No data provided for term.");
  414. return LSUP_VALUE_ERR;
  415. }
  416. }
  417. if (term->type == LSUP_TERM_LT_LITERAL) {
  418. if (!metadata) {
  419. log_warn ("Lang tag is NULL. Creating a non-tagged literal.");
  420. term->type = LSUP_TERM_LITERAL;
  421. } else {
  422. char *lang_str = (char *) metadata;
  423. log_trace("Lang string: '%s'", lang_str);
  424. // Lang tags longer than 7 characters will be truncated.
  425. strncpy(term->lang, lang_str, sizeof (term->lang) - 1);
  426. if (strlen (term->lang) < 1) {
  427. log_error ("Lang tag cannot be an empty string.");
  428. return LSUP_VALUE_ERR;
  429. }
  430. term->lang[7] = '\0';
  431. }
  432. }
  433. if (term->type == LSUP_TERM_LITERAL) {
  434. term->datatype = metadata;
  435. if (! term->datatype) term->datatype = LSUP_default_datatype;
  436. log_trace ("Storing data type: %s", term->datatype->data);
  437. if (! LSUP_IS_IRI (term->datatype)) {
  438. log_error (
  439. "Literal data type is not an IRI: %s",
  440. term->datatype->data);
  441. return LSUP_VALUE_ERR;
  442. }
  443. uint32_t dtype_hash = LSUP_term_hash (term->datatype);
  444. const LSUP_Term *tmp = LSUP_tcache_get (dtype_hash);
  445. if (!tmp) LSUP_tcache_add (dtype_hash, term->datatype);
  446. else if (term->datatype != tmp) {
  447. if (term->datatype != LSUP_default_datatype)
  448. LSUP_term_free (term->datatype);
  449. term->datatype = (LSUP_Term *)tmp;
  450. }
  451. //log_trace ("Datatype address: %p", term->datatype);
  452. log_trace ("Datatype hash: %lx", LSUP_term_hash (term->datatype));
  453. } else if (term->type == LSUP_TERM_BNODE) {
  454. // TODO This is not usable for global skolemization.
  455. term->bnode_id = LSUP_HASH (
  456. term->data, strlen (term->data) + 1, LSUP_HASH_SEED);
  457. }
  458. return LSUP_OK;
  459. }
  460. /*
  461. * Extern inline functions.
  462. */
  463. LSUP_Key LSUP_term_hash (const LSUP_Term *term);
  464. LSUP_Term *LSUP_iriref_new (const char *data, LSUP_NSMap *nsm);
  465. LSUP_Term *LSUP_literal_new (const char *data, LSUP_Term *datatype);
  466. LSUP_Term *LSUP_lt_literal_new (const char *data, char *lang);
  467. LSUP_Term *LSUP_bnode_new (const char *data);
  468. bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2);
  469. LSUP_Term *LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n);
  470. LSUP_Key LSUP_triple_hash (const LSUP_Triple *trp);