term.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. #ifndef _LSUP_TERM_H
  2. #define _LSUP_TERM_H
  3. #include <assert.h>
  4. #include <regex.h>
  5. #include "buffer.h"
  6. #include "namespace.h"
  7. #define UUID4_URN_SIZE UUIDSTR_SIZE + 10
  8. /** @brief Default data type for untyped literals (prefixed IRI).
  9. */
  10. #define DEFAULT_DTYPE "http://www.w3.org/2001/XMLSchema#string"
  11. /** @brief URI parsing regular expression.
  12. *
  13. * Based on RFC3986 (see https://tools.ietf.org/html/rfc3986#appendix-B) and
  14. * modified for use in this application. Relevant matching groups are the
  15. * following, for a sample URI `http://example.org/123/456/?query=blah#frag`:
  16. *
  17. * #0: Full parsed URI (http://example.org/123/456/?query=blah#frag)
  18. * #1: Domain prefix (http://example.org)
  19. * #2: Protocol (http:)
  20. * #4: Authority (example.org)
  21. * #5: Path relative to domain (/123/456/?query=blah#frag)
  22. * #6: Path, excluding query and fragment (/123/456/)
  23. * #8: Query (query=blah)
  24. * #10: Fragment (frag)
  25. *
  26. * For URN-like URIs, such as `urn:s:0`, the prefix part (#1) is `urn:` and
  27. * the path (#4) is `s:0`.
  28. */
  29. #define LSUP_URI_REGEX_STR \
  30. "^(([^:/?#]+:)?(//([^/?#]*))?)?(([^?#]*)(\\?([^#]*))?(#(.*))?)"
  31. /*
  32. * Data types.
  33. */
  34. /// Language tag, currently restricted to 7 characters.
  35. typedef char LSUP_LangTag[8];
  36. /// Term type.
  37. typedef enum {
  38. LSUP_TERM_UNDEFINED = 0,/**<
  39. * Undefined placeholder or result of an error.
  40. * Invalid for most operations.
  41. */
  42. LSUP_TERM_IRIREF, ///< IRI reference.
  43. LSUP_TERM_NS_IRIREF, ///< Namespace-prefixed IRI reference.
  44. LSUP_TERM_LITERAL, ///< Literal without language tag.
  45. LSUP_TERM_LT_LITERAL, ///< Language-tagged string literal.
  46. LSUP_TERM_BNODE, ///< Blank node.
  47. } LSUP_TermType;
  48. /** @brief IRI information.
  49. *
  50. * See regex matching group for #LSUP_URI_REGEX_STR for more information.
  51. */
  52. typedef struct iri_info_t LSUP_IRIInfo;
  53. /// RDF term.
  54. typedef struct term_t {
  55. char * data; // URI, literal value, or BNode label.
  56. union {
  57. struct term_t * datatype; // Data type IRI for LSUP_TERM_LITERAL.
  58. LSUP_LangTag lang; // Lang tag for LSUP_TERM_LT_LITERAL.
  59. LSUP_Key bnode_id; // BNode ID for comparison & skolemization.
  60. LSUP_IRIInfo * iri_info; // IRI information structure.
  61. };
  62. LSUP_TermType type; // Term type.
  63. } LSUP_Term;
  64. /** @brief Shorthand to test if a term is a IRI of any kind.
  65. */
  66. #define LSUP_IS_IRI(term) \
  67. ((term)->type == LSUP_TERM_IRIREF || (term)->type == LSUP_TERM_NS_IRIREF)
  68. /** @brief Shorthand to test if a term is a literal of any kind.
  69. */
  70. #define LSUP_IS_LITERAL(term) \
  71. ((term)->type == LSUP_TERM_LITERAL || (term)->type == LSUP_TERM_LT_LITERAL)
  72. typedef struct triple_t {
  73. LSUP_Term *s;
  74. LSUP_Term *p;
  75. LSUP_Term *o;
  76. } LSUP_Triple;
  77. /** @brief Key-term pair.
  78. */
  79. typedef struct term_cache_entry_t {
  80. LSUP_Key key; // Key (hash) of the term.
  81. LSUP_Term * term; // Term handle.
  82. } LSUP_KeyedTerm;
  83. /// Connection type.
  84. typedef enum {
  85. LSUP_CONN_INBOUND, ///< Inbound connection (sp).
  86. LSUP_CONN_OUTBOUND, ///< Outbound connection (po).
  87. LSUP_CONN_EDGE, ///< Edge connection (so).
  88. } LSUP_ConnectionType;
  89. /** @brief Connection list.
  90. *
  91. * A list of predicates and related lists of terms, that can be used to list
  92. * inbound or outbound connections to a node.
  93. *
  94. * Each term in the NUL-terminated `p` list represent a term which is
  95. * paired with a list of terms in the `tl` list. The index of each term in this
  96. * list corresponds to the same index of a term list in `tl`.
  97. *
  98. * If the type of the connection list is `LSUP_CONN_INBOUND`, the term list
  99. * represent subjects and a term that is associated with the connection list is
  100. * the related object; if `LSUP_CONN_OUTBOUND`, the term list represents
  101. * objects, and a term that is associated with the connection list represents
  102. * the subject. If `LSUP_CONN_EDGE`, the members of the connection list
  103. * represent subjects and objects, and the associated term is the predicate.
  104. *
  105. */
  106. typedef struct {
  107. LSUP_ConnectionType type; ///< Inbound or outbound connection.
  108. LSUP_Term ** t; ///< NUL-terminated array of term handles.
  109. LSUP_Term *** tl; /**<
  110. * NUL-terminated array of
  111. * NUL-terminated arrays of term handles.
  112. */
  113. } LSUP_ConnectionList;
  114. /*
  115. * Extern variables.
  116. */
  117. /** @brief Global term cache.
  118. *
  119. * Stores frequently used terms, e.g. data type URIs.
  120. */
  121. extern struct hashmap *LSUP_term_cache;
  122. /** @brief Compiled hash of default literal data type.
  123. */
  124. extern uint32_t LSUP_default_dtype_key;
  125. /** @brief URI validation pattern, compiled in #LSUP_init().
  126. */
  127. extern regex_t *LSUP_uri_ptn;
  128. /** @brief Default literal data type URI.
  129. *
  130. * Literal terms created with undefined data type will have it set to this
  131. * URI implicitly.
  132. */
  133. extern LSUP_Term *LSUP_default_datatype;
  134. /*
  135. * API functions.
  136. */
  137. /** @brief Create a new term.
  138. *
  139. * This is a generic function; it is recommended to use specialized functions
  140. * such as #LSUP_term_new(), #LSUP_literal_new(), etc. as they have strict type
  141. * checks for the metadata parameter.
  142. *
  143. * @param type[in] Term type. One of #LSUP_TermType.
  144. *
  145. * @param data[in] Term data: textual URI, literal value without data type
  146. * or langtag, etc. It may be NULL for IRI refs and BNodes, in which case a
  147. * random identifier is generated.
  148. *
  149. * @param metadata[in] Namespace map (LSUP_NSMap *) for IRI refs; language tag
  150. * (LSUP_LangTag *) for language-tagged literals; or data type (LSUP_Term *)
  151. * for other literals. It may be NULL.
  152. *
  153. * @return New term, which must be freed with #LSUP_term_free after use; or
  154. * NULL on error.
  155. */
  156. LSUP_Term *
  157. LSUP_term_new (LSUP_TermType type, const char *data, void *metadata);
  158. /** @brief Placeholder term to use with LSUP_term_reset.
  159. */
  160. #define TERM_DUMMY LSUP_term_new (LSUP_TERM_UNDEFINED, NULL, NULL)
  161. /** @brief Shortcut to create an IRI reference.
  162. *
  163. * Must be freed with #LSUP_term_free.
  164. *
  165. * @param data[in] The URI string. If NULL, a UUID4-based URN is generated.
  166. * This cannot be NULL if the nsm parameter is not NULL.
  167. *
  168. * @param nsm[in] Namespace map. If not NULL, a namespace-prefixed
  169. * (#LSUP_TERM_NS_IRIREF) is created, otherwise a regular one
  170. * (#LSUP_TERM_IRIREF).
  171. *
  172. * @return same as #LSUP_term_new().
  173. */
  174. inline LSUP_Term *
  175. LSUP_iriref_new (const char *data, LSUP_NSMap *nsm)
  176. {
  177. return (
  178. nsm ? LSUP_term_new (LSUP_TERM_NS_IRIREF, data, nsm) :
  179. LSUP_term_new (LSUP_TERM_IRIREF, data, NULL));
  180. }
  181. /** @brief Create a new absolute IRI from a path relative to a root IRI.
  182. *
  183. * The term is always of type LSUP_TERM_IRIREF (i.e. not namespace-prefixed).
  184. *
  185. * If the provided IRI is already a fully qualified IRI (i.e. it has a prefix)
  186. * the result is semantically identical to the input.
  187. *
  188. * If the relative IRI begins with a '/', the resulting IRI is relative to the
  189. * web root of the root IRI. I.e. if a root IRI has a path after the webroot,
  190. * it is ignored.
  191. *
  192. * Otherwise, the resulting IRI is relative to the full root string.
  193. *
  194. * @param[in] root Root IRI that the new IRI should be relative to.
  195. *
  196. * @param[in] iri Term with an IRI relative to the webroot.
  197. *
  198. * @return New absolute IRI, or NULL if either term is not an IRI.
  199. */
  200. LSUP_Term *
  201. LSUP_iriref_absolute (const LSUP_Term *root, const LSUP_Term *iri);
  202. /** @brief Create a new relative IRI from an absolute IRI and a web root IRI.
  203. *
  204. * This works with namespace-prefixed IRIs and returns a term of the same type
  205. * as the input.
  206. *
  207. * @param[in] iri Full IRI.
  208. *
  209. * @param[in] root Root IRI that the new IRI should be relative to.
  210. *
  211. * @return New IRI, or NULL if either term is not an IRI. If the input IRI is
  212. * not a path under the root IRI, the result will be identical to the input.
  213. */
  214. LSUP_Term *
  215. LSUP_iriref_relative (const LSUP_Term *root, const LSUP_Term *iri);
  216. /** @brief Shortcut to create a literal term.
  217. *
  218. * Must be freed with #LSUP_term_free.
  219. *
  220. * @param data[in] The literal string.
  221. *
  222. * @param datatype[in] Data type URI string. If NULL, the default data type
  223. * (xsd:string) is used. The new term takes ownership of the pointer.
  224. *
  225. * @return same as #LSUP_term_new().
  226. */
  227. inline LSUP_Term *
  228. LSUP_literal_new (const char *data, LSUP_Term *datatype)
  229. { return LSUP_term_new (LSUP_TERM_LITERAL, data, datatype); }
  230. /** @brief Shortcut to create a language-tagged literal term.
  231. *
  232. * Must be freed with #LSUP_term_free.
  233. *
  234. * @param data[in] The literal string.
  235. *
  236. * @param lang[in] Language tag string.
  237. *
  238. * @return same as #LSUP_term_new().
  239. */
  240. inline LSUP_Term *
  241. LSUP_lt_literal_new (const char *data, char *lang)
  242. { return LSUP_term_new (LSUP_TERM_LT_LITERAL, data, lang); }
  243. /** @brief Shortcut to create a blank node.
  244. *
  245. * Must be freed with #LSUP_term_free.
  246. *
  247. * @param data[in] The BNode identifier.
  248. *
  249. * @return same as #LSUP_term_new().
  250. */
  251. inline LSUP_Term *
  252. LSUP_bnode_new (const char *data)
  253. { return LSUP_term_new (LSUP_TERM_BNODE, data, NULL); }
  254. /** @brief Copy a term.
  255. *
  256. * @param[in] src The term to copy.
  257. *
  258. * @return A new duplicate term handle.
  259. */
  260. LSUP_Term *
  261. LSUP_term_copy (const LSUP_Term *src);
  262. /** @brief Deserialize a buffer into a term.
  263. *
  264. * @param[in] sterm Buffer to convert into a term. It must be a valid
  265. * serialized term from store or obtained with #LSUP_term_serialize().
  266. *
  267. * @return New term handle. It must be freed with #LSUP_term_free().
  268. */
  269. LSUP_Term *
  270. LSUP_term_new_from_buffer (const LSUP_Buffer *sterm);
  271. /** @brief Serialize a term into a buffer.
  272. *
  273. * @param[in] sterm Term to convert into a buffer.
  274. *
  275. * @return New buffer handle. It must be freed with #LSUP_buffer_free().
  276. */
  277. LSUP_Buffer *
  278. LSUP_term_serialize (const LSUP_Term *term);
  279. /** @brief Hash a buffer.
  280. */
  281. LSUP_Key
  282. LSUP_term_hash (const LSUP_Term *term);
  283. /** @brief Compare two terms.
  284. *
  285. * The terms evaluate as equal if their hashes are equal—i.e. if they are
  286. * semantically equivalent.
  287. */
  288. inline bool LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2)
  289. { return LSUP_term_hash (term1) == LSUP_term_hash (term2); }
  290. void
  291. LSUP_term_free (LSUP_Term *term);
  292. /** @brief Namespace map of a IRI ref.
  293. *
  294. * @param[in] iri IRI reference handle.
  295. *
  296. * @return A pointer to the namespace map associated with the IRI. It is
  297. * freed at program shutdown.
  298. */
  299. LSUP_NSMap *
  300. LSUP_iriref_nsm (const LSUP_Term *iri);
  301. /** @brief Get the prefix portion of a IRI ref.
  302. *
  303. * @param[in] iri IRI reference handle.
  304. *
  305. * @return String containing the protocol and domain name part of the IRI. It
  306. * should be freed after use.
  307. */
  308. char *
  309. LSUP_iriref_prefix (const LSUP_Term *iri);
  310. /** @brief Get the path portion of a IRI ref.
  311. *
  312. * @param[in] iri IRI reference handle.
  313. *
  314. * @return String containing the path of the IRI relative to the web root. For
  315. * a URN, such as `urn:myns:myid`, it would be `myns:myid`. This string should
  316. * be freed after use.
  317. */
  318. char *
  319. LSUP_iriref_path (const LSUP_Term *iri);
  320. /** @brief Get the fragment portion of a IRI ref.
  321. *
  322. * @param[in] iri IRI reference handle.
  323. *
  324. * @return String containing the fragment part of the IRI, or NULL if the IRI
  325. * contains no fragment. It should be freed after use.
  326. */
  327. char *
  328. LSUP_iriref_frag (const LSUP_Term *iri);
  329. /*
  330. * TRIPLES
  331. */
  332. /** @brief Create a new triple from three terms.
  333. *
  334. * Terms are NOT copied. To free them with the triple, use #LSUP_triple_free().
  335. * To only free the triple, use free().
  336. *
  337. * TODO Term types are not validated at the moment.
  338. *
  339. * @param[in] s Triple subject. It must be an IRIRef or BNode.
  340. *
  341. * @param[in] p Triple predicate. It must be an IRIRef.
  342. *
  343. * @param[in] o Triple object.
  344. *
  345. */
  346. LSUP_Triple *
  347. LSUP_triple_new(LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  348. /** @brief Dummy triple with NULL slots. It is not a valid triple.
  349. */
  350. #define TRP_DUMMY LSUP_triple_new (NULL, NULL, NULL)
  351. LSUP_Triple *
  352. LSUP_triple_new_from_btriple (const LSUP_BufferTriple *sspo);
  353. LSUP_BufferTriple *
  354. LSUP_triple_serialize (const LSUP_Triple *spo);
  355. /** @brief Initialize internal term pointers in a heap-allocated triple.
  356. *
  357. * Terms are NOT copied. To free them with the triple, use #LSUP_triple_free().
  358. * To only free the triple, use free().
  359. *
  360. * @param spo[in] Triple pointer to initialize.
  361. */
  362. LSUP_rc
  363. LSUP_triple_init (LSUP_Triple *spo, LSUP_Term *s, LSUP_Term *p, LSUP_Term *o);
  364. /** @brief Free the internal pointers of a triple.
  365. *
  366. * @param spo[in] Triple to be freed.
  367. */
  368. void
  369. LSUP_triple_done (LSUP_Triple *spo);
  370. /** @brief Free a triple and all its internal pointers.
  371. *
  372. * NOTE: If the term pointers are not to be freed (e.g. they are owned by a
  373. * back end), use a simple free(spo) instead of this.
  374. *
  375. * @param spo[in] Triple to be freed.
  376. */
  377. void
  378. LSUP_triple_free (LSUP_Triple *spo);
  379. /** @brief Get triple by term position.
  380. *
  381. * Useful for looping over all terms.
  382. *
  383. * @param trp[in] Triple pointer.
  384. *
  385. * @param n[in] A number between 0÷2.
  386. *
  387. * @return Corresponding triple term or NULL if n is out of range.
  388. */
  389. inline LSUP_Term *
  390. LSUP_triple_pos (const LSUP_Triple *trp, LSUP_TriplePos n)
  391. {
  392. if (n == TRP_POS_S) return trp->s;
  393. if (n == TRP_POS_P) return trp->p;
  394. if (n == TRP_POS_O) return trp->o;
  395. return NULL;
  396. }
  397. /** @brief Hash a triple.
  398. *
  399. * TODO This doesn't handle blank nodes correctly.
  400. */
  401. inline LSUP_Key
  402. LSUP_triple_hash (const LSUP_Triple *trp)
  403. {
  404. LSUP_BufferTriple *strp = LSUP_triple_serialize (trp);
  405. LSUP_Key hash = LSUP_btriple_hash (strp);
  406. LSUP_btriple_free (strp);
  407. return hash;
  408. }
  409. /** @brief Add an identifier to the term cache.
  410. *
  411. * @param[in] key Hash of the inserted term.
  412. *
  413. * @param[in] term Term to insert. A copy of the term is stored in the cache,
  414. * which is freed on application teardown.
  415. */
  416. LSUP_rc
  417. LSUP_tcache_add (const LSUP_Key key, const LSUP_Term *term);
  418. /** @brief Get an identifier from the cache.
  419. *
  420. * @param[in] key Key for the queried term.
  421. *
  422. * @return The retrieved term if found, or NULL. The string must not be
  423. * modified or freed.
  424. */
  425. const LSUP_Term *
  426. LSUP_tcache_get (LSUP_Key key);
  427. /** @brief Add term to a term list.
  428. *
  429. * @param[in] tl Array of term handles to be added to. The handle must be NUL-
  430. * terminated. On success, this handle will be reallocated and the new address
  431. * returned, so the passed handle should no longer be used. On failure, it
  432. * remains unchanged and may be reused.
  433. *
  434. * @param[in] t Term to be added to the list. The object list will take
  435. * ownership of the term.
  436. *
  437. * @return Reallocated list on success; NULL on failure.
  438. */
  439. LSUP_Term **
  440. LSUP_term_list_add (LSUP_Term **tl, LSUP_Term *t);
  441. /** @brief New connection list.
  442. *
  443. * The initial state of the returned list is: `{t: [NULL], tl: [NULL]}`
  444. *
  445. * Predicates and term lists can be added with #LSUP_conn_list_add, and terms
  446. * can be added to a term list with #LSUP_term_list_add.
  447. *
  448. * @return a new empty predicate-object list.
  449. */
  450. LSUP_ConnectionList *
  451. LSUP_conn_list_new (LSUP_ConnectionType type);
  452. /** @brief Free a predicate-object list.
  453. *
  454. * All arrays and term handles are recursively freed.
  455. *
  456. * @param[in] pol Predicate-object list handle obtained with
  457. * #LSUP_conn_list_new().
  458. */
  459. void
  460. LSUP_conn_list_free (LSUP_ConnectionList *pol);
  461. /** @brief Add a term - term list pair to a connection list.
  462. *
  463. * @param[in] cl Connection list handle obtained with
  464. * #LSUP_conn_list_new().
  465. *
  466. * @param[in] t Term to be associated with the given object list. The
  467. * connection list structure takes ownership of the term.
  468. *
  469. * @param[in] o NULL-terminated array of object term handles to be associated
  470. * with the given predicate. The connection list structire takes ownership of
  471. * the whole term array.
  472. *
  473. * @return LSUP_OK on success; LSUP_MEM_ERR on allocation error.
  474. */
  475. LSUP_rc
  476. LSUP_conn_list_add (
  477. LSUP_ConnectionList *cl, LSUP_Term *t, LSUP_Term **tl);
  478. #endif