term.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. #ifndef _LSUP_TERM_H
  2. #define _LSUP_TERM_H
  3. #include <assert.h>
  4. #include <regex.h>
  5. #include "uthash.h"
  6. #include "buffer.h"
  7. #include "namespace.h"
  8. #define UUID4_URN_SIZE UUIDSTR_SIZE + 10
  9. /*
  10. * Term types.
  11. */
  12. /* Undefined placeholder or result of an error. Invalid for most operations. */
  13. #define LSUP_TERM_UNDEFINED 0
  14. /* IRI reference. */
  15. #define LSUP_TERM_IRIREF 1
  16. /* Namespace-prefixed IRI reference. */
  17. #define LSUP_TERM_NS_IRIREF 2
  18. /* Literal without language tag. */
  19. #define LSUP_TERM_LITERAL 3
  20. /* Language-tagged string literal. */
  21. #define LSUP_TERM_LT_LITERAL 4
  22. /* Blank node. */
  23. #define LSUP_TERM_BNODE 5
  24. /** @brief Default data type for untyped literals (fully qualified IRI).
  25. */
  26. #define DEFAULT_DTYPE "http://www.w3.org/2001/XMLSchema#string"
  27. /** @brief URI parsing regular expression.
  28. *
  29. * Based on RFC3986 (see https://tools.ietf.org/html/rfc3986#appendix-B) and
  30. * modified for use in this application. Relevant matching groups are the
  31. * following, for a sample URI `http://example.org/123/456/?query=blah#frag`:
  32. *
  33. * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
  34. * #1: Domain prefix (http://example.org)
  35. * #2: Protocol (http:)
  36. * #4: Authority (example.org)
  37. * #5: Path relative to domain (/123/456/?query=blah#frag)
  38. * #6: Path, excluding query and fragment (/123/456/)
  39. * #8: Query (query=blah)
  40. * #10: Fragment (frag)
  41. *
  42. * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:` and
  43. * the path (#5) is `s:0`.
  44. */
  45. #define LSUP_URI_REGEX_STR \
  46. "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
  47. /*
  48. * Data types.
  49. */
  50. typedef XXH64_hash_t LSUP_Hash64;
  51. typedef char LSUP_TermType;
  52. typedef char LSUP_LangTag[8];
  53. /** @brief IRI information.
  54. *
  55. * See regex matching group for #LSUP_URI_REGEX_STR for more information.
  56. */
  57. typedef struct iri_info_t LSUP_IRIInfo;
  58. typedef struct term_t {
  59. char * data; // URI, literal value, or BNode label.
  60. union {
  61. struct term_t * datatype; // Data type IRI for LSUP_TERM_LITERAL.
  62. LSUP_LangTag lang; // Lang tag for LSUP_TERM_LT_LITERAL.
  63. LSUP_Key bnode_id; // BNode ID for comparison & skolemization.
  64. LSUP_IRIInfo * iri_info; // IRI information structure.
  65. };
  66. LSUP_TermType type; // Term type.
  67. } LSUP_Term;
  68. /** @brief Shorthand to test if a term is a IRI of any kind.
  69. */
  70. #define LSUP_IS_IRI(term) \
  71. (term->type == LSUP_TERM_IRIREF || term->type == LSUP_TERM_NS_IRIREF)
  72. /** @brief Shorthand to test if a term is a literal of any kind.
  73. */
  74. #define LSUP_IS_LITERAL(term) \
  75. term->type == LSUP_TERM_LITERAL || term->type == LSUP_TERM_LT_LITERAL
  76. /** @brief Hash cache for data types.
  77. */
  78. struct term_cache_t {
  79. LSUP_Key key;
  80. LSUP_Term * term;
  81. UT_hash_handle hh;
  82. };
  83. typedef struct triple_t {
  84. LSUP_Term *s;
  85. LSUP_Term *p;
  86. LSUP_Term *o;
  87. } LSUP_Triple;
  88. /*
  89. * Extern variables.
  90. */
  91. /** @brief Global term cache.
  92. *
  93. * Stores frequently used terms, e.g. data type URIs.
  94. */
  95. extern struct term_cache_t *LSUP_term_cache;
  96. /** @brief Compiled hash of default literal data type.
  97. */
  98. extern uint32_t LSUP_default_dtype_key;
  99. /** @brief URI validation pattern, compiled in #LSUP_init().
  100. */
  101. extern regex_t *LSUP_uri_ptn;
  102. /** @brief Default literal data type URI.
  103. *
  104. * Literal terms created with undefined data type will have it set to this
  105. * URI implicitly.
  106. */
  107. extern LSUP_Term *LSUP_default_datatype;
  108. /*
  109. * Function prototypes.
  110. */
  111. /** @brief Create a new term.
  112. *
  113. * This is a generic function; it is recommended to use specialized functions
  114. * such as #LSUP_iriref_new(), #LSUP_literal_new(), etc. as they have strict type
  115. * checks for the metadata parameter.
  116. *
  117. * @param type[in] Term type. One of #LSUP_TermType.
  118. *
  119. * @param data[in] Term data: textual URI, literal value without data type
  120. * or langtag, etc.
  121. *
  122. * @param metadata[in] Namespace map (LSUP_NSMap *) for IRI refs; language tag
  123. * (LSUP_LangTag *) for language-tagged literals; or data type (LSUP_Term *)
  124. * for other literals. It may be NULL.
  125. *
  126. * @return New term, which must be freed with #LSUP_term_free after use; or
  127. * NULL on error.
  128. */
  129. LSUP_Term *
  130. LSUP_term_new (LSUP_TermType type, const char *data, void *metadata);
  131. /** @brief Placeholder term to use with LSUP_term_reset.
  132. */
  133. #define TERM_DUMMY LSUP_term_new (LSUP_TERM_UNDEFINED, NULL, NULL)
  134. /** @brief Shortcut to create an IRI reference.
  135. *
  136. * Must be freed with #LSUP_term_free.
  137. *
  138. * @param data[in] The URI string. If NULL, a UUID4-based URN is generated.
  139. * This cannot be NULL if the nsm parameter is not NULL.
  140. *
  141. * @param nsm[in] Namespace map. If not NULL, a namespace-prefixed
  142. * (#LSUP_TERM_NS_IRIREF) is created, otherwise a regular one
  143. * (#LSUP_TERM_IRIREF).
  144. *
  145. * @return same as #LSUP_term_new().
  146. */
  147. inline LSUP_Term *
  148. LSUP_iriref_new (const char *data, LSUP_NSMap *nsm)
  149. {
  150. return (
  151. nsm ? LSUP_term_new (LSUP_TERM_NS_IRIREF, data, nsm) :
  152. LSUP_term_new (LSUP_TERM_IRIREF, data, NULL));
  153. }
  154. /** @brief Shortcut to create a literal term.
  155. *
  156. * Must be freed with #LSUP_term_free.
  157. *
  158. * @param data[in] The literal string.
  159. *
  160. * @param datatype[in] Data type URI string. If NULL, the default data type
  161. * (xsd:string) is used.
  162. *
  163. * @return same as #LSUP_term_new().
  164. */
  165. inline LSUP_Term *
  166. LSUP_literal_new (const char *data, LSUP_Term *datatype)
  167. { return LSUP_term_new (LSUP_TERM_LITERAL, data, datatype); }
  168. /** @brief Shortcut to create a language-tagged literal term.
  169. *
  170. * Must be freed with #LSUP_term_free.
  171. *
  172. * @param data[in] The literal string.
  173. *
  174. * @param lang[in] Language tag string.
  175. *
  176. * @return same as #LSUP_term_new().
  177. */
  178. inline LSUP_Term *
  179. LSUP_lt_literal_new (const char *data, char *lang)
  180. { return LSUP_term_new (LSUP_TERM_LT_LITERAL, data, lang); }
  181. /** @brief Shortcut to create a blank node.
  182. *
  183. * Must be freed with #LSUP_term_free.
  184. *
  185. * @param data[in] The BNode identifier.
  186. *
  187. * @return same as #LSUP_term_new().
  188. */
  189. inline LSUP_Term *
  190. LSUP_bnode_new (const char *data)
  191. { return LSUP_term_new (LSUP_TERM_BNODE, data, NULL); }
  192. /** @brief Deserialize a buffer into a term.
  193. *
  194. * @param[in] sterm Buffer to convert into a term. It must be a valid
  195. * serialized term from store or obtained with #LSUP_term_serialize().
  196. *
  197. * @return New term handle. It must be freed with #LSUP_term_free().
  198. */
  199. LSUP_Term *
  200. LSUP_term_new_from_buffer (const LSUP_Buffer *sterm);
  201. /** @brief Serialize a term into a buffer.
  202. *
  203. * @param[in] term Term to convert into a buffer.
  204. *
  205. * @return New buffer handle. It must be freed with #LSUP_buffer_free().
  206. */
  207. LSUP_Buffer *
  208. LSUP_term_serialize (const LSUP_Term *term);
  209. /** @brief Hash a term.
  210. */
  211. LSUP_Key
  212. LSUP_term_hash (const LSUP_Term *term);
  213. /** @brief Compare two terms.
  214. *
  215. * The terms evaluate as equal if their hashes are equal—i.e. if they are
  216. * semantically equivalent.
  217. */
  218. inline bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2)
  219. { return LSUP_term_hash (term1) == LSUP_term_hash (term2); }
  220. void
  221. LSUP_term_done (LSUP_Term *term);
  222. void
  223. LSUP_term_free (LSUP_Term *term);
  224. /** @brief Namespace map of a IRI ref.
  225. *
  226. * @param[in] iri IRI reference handle.
  227. *
  228. * @return A pointer to the namespace map associated with the IRI. It is
  229. * freed at program shutdown.
  230. */
  231. LSUP_NSMap *
  232. LSUP_iriref_nsm (LSUP_Term *iri);
  233. /** @brief Get the prefix portion of a IRI ref.
  234. *
  235. * @param[in] iri IRI reference handle.
  236. *
  237. * @return String containing the protocol and domain name part of the IRI. It
  238. * should be freed after use.
  239. */
  240. char *
  241. LSUP_iriref_prefix (LSUP_Term *iri);
  242. /** @brief Get the path portion of a IRI ref.
  243. *
  244. * @param[in] iri IRI reference handle.
  245. *
  246. * @return String containing the path of the IRI relative to the web root. For
  247. * a URN, such as `urn:myns:myid`, it would be `myns:myid`. This string should
  248. * be freed after use.
  249. */
  250. char *
  251. LSUP_iriref_path (LSUP_Term *iri);
  252. /** @brief Get the fragment portion of a IRI ref.
  253. *
  254. * @param[in] iri IRI reference handle.
  255. *
  256. * @return String containing the fragment part of the IRI, or NULL if the IRI
  257. * contains no fragment. It should be freed after use.
  258. */
  259. char *
  260. LSUP_iriref_frag (LSUP_Term *iri);
  261. /*
  262. * TRIPLES
  263. */
  264. /** @brief Create a new triple from three terms.
  265. *
  266. * TODO Term types are not validated at the moment.
  267. *
  268. * @param[in] s Triple subject. It must be an IRIRef or BNode.
  269. *
  270. * @param[in] p Triple predicate. It must be an IRIRef.
  271. *
  272. * @param[in] o Triple object.
  273. *
  274. */
  275. LSUP_Triple *
  276. LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  277. /** @brief Dummy triple with NULL slots. It is not a valid triple.
  278. */
  279. #define TRP_DUMMY LSUP_triple_new (NULL, NULL, NULL)
  280. LSUP_Triple *
  281. LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo);
  282. LSUP_BufferTriple *
  283. LSUP_triple_serialize (const LSUP_Triple *spo);
  284. /** @brief Initialize internal term pointers in a heap-allocated triple.
  285. *
  286. * NOTE: the term structures are not copied. If the triple is freed with
  287. * #LSUP_triple_free(), the originally provided terms are freed too.
  288. *
  289. * @param spo[in] Triple pointer to initialize.
  290. */
  291. LSUP_rc
  292. LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  293. /** @brief Free the internal pointers of a triple.
  294. *
  295. * @param spo[in] Triple to be freed.
  296. */
  297. void
  298. LSUP_triple_done (LSUP_Triple *spo);
  299. /** @brief Free a triple and all its internal pointers.
  300. *
  301. * NOTE: If the term pointers are not to be freed (e.g. they are owned by a
  302. * back end), use a simple free(spo) instead of this.
  303. *
  304. * @param spo[in] Triple to be freed.
  305. */
  306. void
  307. LSUP_triple_free (LSUP_Triple *spo);
  308. /** @brief Get triple by term position.
  309. *
  310. * Useful for looping over all terms.
  311. *
  312. * @param trp[in] Triple pointer.
  313. *
  314. * @param n[in] A number between 0÷2.
  315. *
  316. * @return Corresponding triple term or NULL if n is out of range.
  317. */
  318. inline LSUP_Term *
  319. LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n)
  320. {
  321. if (n == TRP_POS_S) return trp->s;
  322. if (n == TRP_POS_P) return trp->p;
  323. if (n == TRP_POS_O) return trp->o;
  324. return NULL;
  325. }
  326. /** @brief Hash a triple.
  327. *
  328. * TODO This doesn't handle blank nodes correctly.
  329. */
  330. inline LSUP_Key
  331. LSUP_triple_hash (const LSUP_Triple *trp)
  332. {
  333. LSUP_BufferTriple *strp = LSUP_triple_serialize (trp);
  334. LSUP_Key hash = LSUP_btriple_hash (strp);
  335. LSUP_btriple_free (strp);
  336. return hash;
  337. }
  338. /** @brief Add an identifier to the term cache.
  339. *
  340. * @param[in] key Hash of the inserted term.
  341. *
  342. * @param[in] term Term to insert.
  343. */
  344. LSUP_rc
  345. LSUP_tcache_add (const LSUP_Key key, LSUP_Term *term);
  346. /** @brief Get an identifier from the cache.
  347. *
  348. * @param[in] key Key for the queried term.
  349. *
  350. * @return The retrieved term if found, or NULL. The term must not be
  351. * modified or freed.
  352. */
  353. const LSUP_Term *
  354. LSUP_tcache_get (const LSUP_Key key);
  355. #endif