term.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. #ifndef _LSUP_TERM_H
  2. #define _LSUP_TERM_H
  3. #include <assert.h>
  4. #include <regex.h>
  5. #include "buffer.h"
  6. #include "namespace.h"
  /** @brief #UUIDSTR_SIZE plus 10; presumably the buffer size for the
   * UUID4-based URNs generated for IRI references (TODO confirm).
   *
   * The replacement list is parenthesized so that the macro expands as a
   * single operand inside larger expressions, e.g. `UUID4_URN_SIZE * 2`.
   */
  #define UUID4_URN_SIZE (UUIDSTR_SIZE + 10)
  /*
   * Term types. Values assignable to LSUP_TermType.
   */

  /** @brief Undefined placeholder or result of an error.
   *
   * Invalid for most operations.
   */
  #define LSUP_TERM_UNDEFINED     0

  /** @brief IRI reference. */
  #define LSUP_TERM_IRIREF        1

  /** @brief Namespace-prefixed IRI reference. */
  #define LSUP_TERM_NS_IRIREF     2

  /** @brief Literal without language tag. */
  #define LSUP_TERM_LITERAL       3

  /** @brief Language-tagged string literal. */
  #define LSUP_TERM_LT_LITERAL    4

  /** @brief Blank node. */
  #define LSUP_TERM_BNODE         5
  /** @brief Default data type for untyped literals.
   *
   * This is the fully qualified IRI of the XML Schema `string` type, not a
   * namespace-prefixed form.
   */
  #define DEFAULT_DTYPE "http://www.w3.org/2001/XMLSchema#string"
  /** @brief URI parsing regular expression.
   *
   * Based on RFC3986 (see https://tools.ietf.org/html/rfc3986#appendix-B) and
   * modified for use in this application. Relevant matching groups are the
   * following, for a sample URI `http://example.org/123/456/?query=blah#frag`:
   *
   * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
   * #1: Domain prefix (http://example.org)
   * #2: Protocol (http:)
   * #4: Authority (example.org)
   * #5: Path relative to domain (/123/456/?query=blah#frag)
   * #6: Path, excluding query and fragment (/123/456/)
   * #8: Query (query=blah)
   * #10: Fragment (frag)
   *
   * Groups #3, #7 and #9 are the same as #4, #8 and #10 respectively, but
   * include the leading delimiter (`//`, `?`, `#`).
   *
   * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:`, the
   * authority (#4) does not participate in the match, and the path (#5, and
   * #6 as well since there is no query or fragment) is `s:0`.
   */
  #define LSUP_URI_REGEX_STR \
      "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
  /*
   * Data types.
   */

  /** @brief Term type tag; holds one of the LSUP_TERM_* constants. */
  typedef char LSUP_TermType;

  /** @brief Fixed 8-byte character array holding a language tag.
   *
   * NOTE(review): whether the tag must be NUL-terminated within the 8 bytes is
   * not visible from this header — confirm against the implementation.
   */
  typedef char LSUP_LangTag[8];
  /** @brief IRI information.
   *
   * Only forward-declared here: `struct iri_info_t` is not defined in this
   * header, so the type can only be handled through pointers.
   *
   * See regex matching group for #LSUP_URI_REGEX_STR for more information.
   */
  typedef struct iri_info_t LSUP_IRIInfo;
  /** @brief A term: IRI reference, literal, or blank node.
   *
   * The members of the anonymous union share storage; which one is meaningful
   * depends on the value of `type`, as noted on each member.
   */
  typedef struct term_t {
      char * data; // URI, literal value, or BNode label.
      union {
          struct term_t * datatype; // Data type IRI for LSUP_TERM_LITERAL.
          LSUP_LangTag lang; // Lang tag for LSUP_TERM_LT_LITERAL.
          LSUP_Key bnode_id; // BNode ID for comparison & skolemization.
          // IRI information structure; presumably used by the IRI ref term
          // types (TODO confirm).
          LSUP_IRIInfo * iri_info;
      };
      LSUP_TermType type; // Term type: one of the LSUP_TERM_* constants.
  } LSUP_Term;
  /** @brief Tell whether a term is an IRI reference, either plain
   * (#LSUP_TERM_IRIREF) or namespace-prefixed (#LSUP_TERM_NS_IRIREF).
   *
   * @param term Pointer to a term. The argument may be evaluated more than
   *  once, so it must not have side effects.
   *
   * @return Non-zero if the term is an IRI of either kind, zero otherwise.
   */
  #define LSUP_IS_IRI(term)                                                   \
      (                                                                       \
          (term)->type == LSUP_TERM_NS_IRIREF                                 \
          || (term)->type == LSUP_TERM_IRIREF                                 \
      )
  /** @brief Tell whether a term is a literal, either without a language tag
   * (#LSUP_TERM_LITERAL) or language-tagged (#LSUP_TERM_LT_LITERAL).
   *
   * @param term Pointer to a term. The argument may be evaluated more than
   *  once, so it must not have side effects.
   *
   * @return Non-zero if the term is a literal of either kind, zero otherwise.
   */
  #define LSUP_IS_LITERAL(term)                                               \
      (                                                                       \
          (term)->type == LSUP_TERM_LT_LITERAL                                \
          || (term)->type == LSUP_TERM_LITERAL                                \
      )
  /** @brief Triple of term handles.
   *
   * The structure only stores pointers to its terms.
   */
  typedef struct triple_t {
      LSUP_Term *s; // Subject.
      LSUP_Term *p; // Predicate.
      LSUP_Term *o; // Object.
  } LSUP_Triple;
  /** @brief Key-term pair.
   *
   * Judging by the struct tag, this is the entry type of the term cache
   * (TODO confirm against the implementation).
   */
  typedef struct term_cache_entry_t {
      LSUP_Key key; // Key (hash) of the term.
      LSUP_Term * term; // Term handle.
  } LSUP_KeyedTerm;
  85. /*
  86. * Extern variables.
  87. */
  /** @brief Global term cache.
   *
   * Stores frequently used terms, e.g. data type URIs. Only declared here; the
   * definition lives elsewhere.
   */
  extern struct hashmap *LSUP_term_cache;

  /** @brief Hash of the default literal data type.
   *
   * NOTE(review): declared as uint32_t while other hashes in this header use
   * LSUP_Key — confirm that the two types agree.
   */
  extern uint32_t LSUP_default_dtype_key;

  /** @brief URI validation pattern, compiled in #LSUP_init().
   */
  extern regex_t *LSUP_uri_ptn;

  /** @brief Default literal data type URI.
   *
   * Literal terms created with undefined data type will have it set to this
   * URI implicitly.
   */
  extern LSUP_Term *LSUP_default_datatype;
  105. /*
  106. * Function prototypes.
  107. */
  /** @brief Create a new term.
   *
   * This is a generic function; it is recommended to use specialized functions
   * such as #LSUP_iriref_new(), #LSUP_literal_new(), etc. as they have strict
   * type checks for the metadata parameter.
   *
   * @param[in] type Term type. One of the LSUP_TERM_* constants.
   *
   * @param[in] data Term data: textual URI, literal value without data type
   *  or langtag, etc.
   *
   * @param[in] metadata Namespace map (LSUP_NSMap *) for IRI refs; language tag
   *  (LSUP_LangTag *; #LSUP_lt_literal_new() passes its `char *lang` argument
   *  here) for language-tagged literals; or data type (LSUP_Term *) for other
   *  literals. It may be NULL.
   *
   * @return New term, which must be freed with #LSUP_term_free after use; or
   *  NULL on error.
   */
  LSUP_Term *
  LSUP_term_new (LSUP_TermType type, const char *data, void *metadata);
  /** @brief Placeholder term to use with LSUP_term_reset.
   *
   * Every expansion is a separate call to #LSUP_term_new() with type
   * #LSUP_TERM_UNDEFINED and no data or metadata, so each use yields its own
   * term that must be freed with #LSUP_term_free (and may be NULL on error).
   */
  #define TERM_DUMMY LSUP_term_new (LSUP_TERM_UNDEFINED, NULL, NULL)
  131. /** @brief Shortcut to create an IRI reference.
  132. *
  133. * Must be freed with #LSUP_term_free.
  134. *
  135. * @param data[in] The URI string. If NULL, a UUID4-based URN is generated.
  136. * This cannot be NULL if the nsm parameter is not NULL.
  137. *
  138. * @param nsm[in] Namespace map. If not NULL, a namespace-prefixed
  139. * (#LSUP_TERM_NS_IRIREF) is created, otherwise a regular one
  140. * (#LSUP_TERM_IRIREF).
  141. *
  142. * @return same as #LSUP_term_new().
  143. */
  144. inline LSUP_Term *
  145. LSUP_iriref_new (const char *data, LSUP_NSMap *nsm)
  146. {
  147. return (
  148. nsm ? LSUP_term_new (LSUP_TERM_NS_IRIREF, data, nsm) :
  149. LSUP_term_new (LSUP_TERM_IRIREF, data, NULL));
  150. }
  /** @brief Create a new absolute IRI from a path relative to a root IRI.
   *
   * The term is always of type LSUP_TERM_IRIREF (i.e. not namespace-prefixed).
   *
   * Resolution rules:
   *
   * - If the provided IRI is already a fully qualified IRI (i.e. it has a
   *   prefix), the result is semantically identical to the input.
   * - If the provided IRI begins with a '/', the resulting IRI is relative to
   *   the web root of the root IRI, i.e. any path that the root IRI has after
   *   the web root is ignored.
   * - Otherwise, the resulting IRI is relative to the full root string.
   *
   * @param[in] root Root IRI that the new IRI should be relative to.
   *
   * @param[in] iri Term with an IRI relative to the webroot.
   *
   * @return New absolute IRI, or NULL if either term is not an IRI.
   */
  LSUP_Term *
  LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri);
  /** @brief Create a new relative IRI from an absolute IRI and a web root IRI.
   *
   * This works with namespace-prefixed IRIs and returns a term of the same type
   * as the input.
   *
   * Parameters are documented in signature order: the root comes first.
   *
   * @param[in] root Root IRI that the new IRI should be relative to.
   *
   * @param[in] iri Full IRI.
   *
   * @return New IRI, or NULL if either term is not an IRI. If the input IRI is
   *  not a path under the root IRI, the result will be identical to the input.
   */
  LSUP_Term *
  LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri);
  186. /** @brief Shortcut to create a literal term.
  187. *
  188. * Must be freed with #LSUP_term_free.
  189. *
  190. * @param data[in] The literal string.
  191. *
  192. * @param datatype[in] Data type URI string. If NULL, the default data type
  193. * (xsd:string) is used. The new term takes ownership of the pointer.
  194. *
  195. * @return same as #LSUP_term_new().
  196. */
  197. inline LSUP_Term *
  198. LSUP_literal_new (const char *data, LSUP_Term *datatype)
  199. { return LSUP_term_new (LSUP_TERM_LITERAL, data, datatype); }
  200. /** @brief Shortcut to create a language-tagged literal term.
  201. *
  202. * Must be freed with #LSUP_term_free.
  203. *
  204. * @param data[in] The literal string.
  205. *
  206. * @param lang[in] Language tag string.
  207. *
  208. * @return same as #LSUP_term_new().
  209. */
  210. inline LSUP_Term *
  211. LSUP_lt_literal_new (const char *data, char *lang)
  212. { return LSUP_term_new (LSUP_TERM_LT_LITERAL, data, lang); }
  213. /** @brief Shortcut to create a blank node.
  214. *
  215. * Must be freed with #LSUP_term_free.
  216. *
  217. * @param data[in] The BNode identifier.
  218. *
  219. * @return same as #LSUP_term_new().
  220. */
  221. inline LSUP_Term *
  222. LSUP_bnode_new (const char *data)
  223. { return LSUP_term_new (LSUP_TERM_BNODE, data, NULL); }
  /** @brief Copy a term.
   *
   * @param[in] src The term to copy.
   *
   * @return A new duplicate term handle. NOTE(review): presumably it must be
   *  freed with #LSUP_term_free() like other new terms — confirm.
   */
  LSUP_Term *
  LSUP_term_copy (const LSUP_Term *src);
  /** @brief Deserialize a buffer into a term.
   *
   * This is the inverse of #LSUP_term_serialize().
   *
   * @param[in] sterm Buffer to convert into a term. It must be a valid
   *  serialized term from store or obtained with #LSUP_term_serialize().
   *
   * @return New term handle. It must be freed with #LSUP_term_free().
   */
  LSUP_Term *
  LSUP_term_new_from_buffer (const LSUP_Buffer *sterm);
  /** @brief Serialize a term into a buffer.
   *
   * The result can be turned back into a term with
   * #LSUP_term_new_from_buffer().
   *
   * @param[in] term Term to convert into a buffer.
   *
   * @return New buffer handle. It must be freed with #LSUP_buffer_free().
   */
  LSUP_Buffer *
  LSUP_term_serialize (const LSUP_Term *term);
  /** @brief Hash a term.
   *
   * #LSUP_term_equals() compares terms by the value returned here.
   *
   * @param[in] term Term to hash.
   *
   * @return Hash key of the term.
   */
  LSUP_Key
  LSUP_term_hash (const LSUP_Term *term);
  253. /** @brief Compare two terms.
  254. *
  255. * The terms evaluate as equal if their hashes are equal—i.e. if they are
  256. * semantically equivalent.
  257. */
  258. inline bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2)
  259. { return LSUP_term_hash (term1) == LSUP_term_hash (term2); }
  /** @brief Free a term.
   *
   * This is the function that terms created with #LSUP_term_new() and its
   * shortcuts must be released with.
   *
   * @param[in] term Term to be freed. NOTE(review): whether NULL is accepted is
   *  not visible from this header — confirm.
   */
  void
  LSUP_term_free (LSUP_Term *term);
  /** @brief Namespace map of an IRI ref.
   *
   * @param[in] iri IRI reference handle.
   *
   * @return A pointer to the namespace map associated with the IRI. It is
   *  freed at program shutdown, so the caller should not free it.
   */
  LSUP_NSMap *
  LSUP_iriref_nsm (const LSUP_Term *iri);
  /** @brief Get the prefix portion of an IRI ref.
   *
   * @param[in] iri IRI reference handle.
   *
   * @return String containing the protocol and domain name part of the IRI
   *  (matching group #1 of #LSUP_URI_REGEX_STR, going by that macro's
   *  description). It should be freed by the caller after use.
   */
  char *
  LSUP_iriref_prefix (const LSUP_Term *iri);
  /** @brief Get the path portion of an IRI ref.
   *
   * @param[in] iri IRI reference handle.
   *
   * @return String containing the path of the IRI relative to the web root. For
   *  a URN, such as `urn:myns:myid`, it would be `myns:myid`. This string
   *  should be freed by the caller after use.
   */
  char *
  LSUP_iriref_path (const LSUP_Term *iri);
  /** @brief Get the fragment portion of an IRI ref.
   *
   * @param[in] iri IRI reference handle.
   *
   * @return String containing the fragment part of the IRI, or NULL if the IRI
   *  contains no fragment. A non-NULL result should be freed by the caller
   *  after use.
   */
  char *
  LSUP_iriref_frag (const LSUP_Term *iri);
  299. /*
  300. * TRIPLES
  301. */
  /** @brief Create a new triple from three terms.
   *
   * Terms are NOT copied. To free them with the triple, use #LSUP_triple_free().
   * To only free the triple, use free().
   *
   * TODO Term types are not validated at the moment.
   *
   * @param[in] s Triple subject. It must be an IRIRef or BNode.
   *
   * @param[in] p Triple predicate. It must be an IRIRef.
   *
   * @param[in] o Triple object.
   *
   * @return New triple handle. NOTE(review): the failure behavior is not
   *  visible from this header — confirm.
   */
  LSUP_Triple *
  LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  /** @brief Dummy triple with NULL slots. It is not a valid triple.
   *
   * Every expansion is a separate call to #LSUP_triple_new(), so each use
   * yields its own triple handle that the caller has to release.
   */
  #define TRP_DUMMY LSUP_triple_new (NULL, NULL, NULL)
  /** @brief Create a triple from a buffer triple.
   *
   * NOTE(review): presumably the inverse of #LSUP_triple_serialize() — confirm
   * against the implementation, including who owns the resulting terms.
   *
   * @param[in] sspo Buffer triple to convert.
   *
   * @return Triple handle.
   */
  LSUP_Triple *
  LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo);
  /** @brief Serialize a triple into a buffer triple.
   *
   * @param[in] spo Triple to serialize.
   *
   * @return Buffer triple handle. #LSUP_triple_hash() releases the result of
   *  this function with LSUP_btriple_free(), so other callers presumably have
   *  to do the same (TODO confirm).
   */
  LSUP_BufferTriple *
  LSUP_triple_serialize (const LSUP_Triple *spo);
  /** @brief Initialize internal term pointers in a heap-allocated triple.
   *
   * Terms are NOT copied. To free them with the triple, use #LSUP_triple_free().
   * To only free the triple, use free().
   *
   * @param[in] spo Triple pointer to initialize.
   *
   * @param[in] s Subject term to set.
   *
   * @param[in] p Predicate term to set.
   *
   * @param[in] o Object term to set.
   *
   * @return Return code. The possible values are not visible from this header.
   */
  LSUP_rc
  LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  /** @brief Free the internal pointers of a triple.
   *
   * NOTE(review): presumably the triple structure itself stays allocated, in
   * contrast with #LSUP_triple_free() — confirm against the implementation.
   *
   * @param[in] spo Triple whose internal pointers are to be freed.
   */
  void
  LSUP_triple_done (LSUP_Triple *spo);
  /** @brief Free a triple and all its internal pointers.
   *
   * NOTE: If the term pointers are not to be freed (e.g. they are owned by a
   * back end), use a simple free(spo) instead of this.
   *
   * @param[in] spo Triple to be freed.
   */
  void
  LSUP_triple_free (LSUP_Triple *spo);
  349. /** @brief Get triple by term position.
  350. *
  351. * Useful for looping over all terms.
  352. *
  353. * @param trp[in] Triple pointer.
  354. *
  355. * @param n[in] A number between 0÷2.
  356. *
  357. * @return Corresponding triple term or NULL if n is out of range.
  358. */
  359. inline LSUP_Term *
  360. LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n)
  361. {
  362. if (n == TRP_POS_S) return trp->s;
  363. if (n == TRP_POS_P) return trp->p;
  364. if (n == TRP_POS_O) return trp->o;
  365. return NULL;
  366. }
  367. /** @brief Hash a triple.
  368. *
  369. * TODO This doesn't handle blank nodes correctly.
  370. */
  371. inline LSUP_Key
  372. LSUP_triple_hash (const LSUP_Triple *trp)
  373. {
  374. LSUP_BufferTriple *strp = LSUP_triple_serialize (trp);
  375. LSUP_Key hash = LSUP_btriple_hash (strp);
  376. LSUP_btriple_free (strp);
  377. return hash;
  378. }
  /** @brief Add an identifier to the term cache.
   *
   * @param[in] key Hash of the inserted term.
   *
   * @param[in] term Term to insert. A copy of the term is stored in the cache,
   *  which is freed on application teardown; the caller keeps ownership of the
   *  term passed in.
   *
   * @return Return code. The possible values are not visible from this header.
   */
  LSUP_rc
  LSUP_tcache_add (const LSUP_Key key, const LSUP_Term *term);
  /** @brief Get an identifier from the cache.
   *
   * @param[in] key Key for the queried term.
   *
   * @return The retrieved term if found, or NULL. The returned term must not
   *  be modified or freed.
   */
  const LSUP_Term *
  LSUP_tcache_get (LSUP_Key key);
  397. #endif