/*
 * codec.h
 *
 * Common codec interface: iterator and parser state types, encoder and
 * decoder callback prototypes, the codec descriptor structure, and shared
 * utility functions.
 */
  1. #ifndef _LSUP_CODEC_BASE_H
  2. #define _LSUP_CODEC_BASE_H
  3. #include "graph.h"
  4. /**
  5. * Max data size passed to the scanner and parser at each iteration.
  6. */
  7. #ifdef LSUP_RDF_STREAM_CHUNK_SIZE
  8. #define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE
  9. #else
  10. #define CHUNK_SIZE 8192
  11. #endif
  12. typedef struct codec_t LSUP_Codec;
  13. /** @brief Codec iterator type.
  14. *
  15. * This structure holds state data including input and output for encoding a
  16. * graph into RDF. Normally it should not be inspected or manipulated directly,
  17. * but rather passed to codec iteration functions for processing RDF.
  18. *
  19. * NOTE: This should be used as an opaque handle, however it is exposed here
  20. * for easier inclusion into each codec.
  21. */
  22. typedef struct codec_iter_t {
  23. const LSUP_Codec * codec; // Codec that generated this iterator.
  24. LSUP_Triple * trp; // RDF fragment being encoded.
  25. LSUP_GraphIterator *gr_it; // Graph iterator.
  26. const LSUP_NSMap * nsm; // Namespace map.
  27. size_t cur; // Internal cursor.
  28. LSUP_rc rc; // Internal return code.
  29. char * rep, // String representation of a RDF fragment.
  30. * str_s, // Temporary string.
  31. * str_p, // Temporary string.
  32. * str_o; // Temporary string.
  33. } LSUP_CodecIterator;
  34. /// Predicate and object list. Used for Turtle.
  35. typedef struct {
  36. LSUP_Term ** p; ///< NULL-terminated array of term handles.
  37. LSUP_Term *** o; /**<
  38. * NULL-terminated array of
  39. * NULL-terminated arrays of term handles.
  40. * The indices of the outer array are
  41. * equal to the indices of the associated
  42. * predicate in the predicate list.
  43. */
  44. } LSUP_PredObjList;
  45. /// Parser state.
  46. typedef struct {
  47. LSUP_GraphIterator * it; ///< Iterator used to build the graph.
  48. LSUP_NSMap * nsm; ///< NS map used in the document.
  49. LSUP_Term * base; ///< Base IRI used in the document.
  50. size_t ct; ///< Statements parsed.
  51. LSUP_rc rc; ///< Internal return code.
  52. } LSUP_TTLParserState;
/*
 * Parse error information.
 *
 * TODO A plain string will suffice for now. The structure below is kept as a
 * sketch of a possible richer error type and is not compiled.
 *
 * typedef struct parse_error_t {
 *     unsigned int line;   // Line number where the error occurred.
 *     unsigned int linec;  // Position in line of the offending token.
 *     char * token;        // String representation of the token.
 * } LSUP_ParseError;
 */
  63. /*
  64. * Interface prototypes.
  65. */
  66. /** @brief Term encoder callback type.
  67. *
  68. * @param[in] term Single term handle.
  69. *
  70. * @param[in] nsm Namespace map. May be NULL for no prefix shortening.
  71. *
  72. * @param[out] rep Pointer to a string to be filled with the encoded term. The
  73. * caller is in charge of freeing the string after use. Returns undefined on
  74. * error.
  75. *
  76. * @return LSUP_OK on successful encoding; <0 for other errors.
  77. */
  78. typedef LSUP_rc (*term_enc_fn_t)(
  79. const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep);
  80. /** @brief Initialize a graph encoding loop.
  81. *
  82. * This prototype is to be implemented by graph encoding loops. It should
  83. * create an iterator and perform all initial setup for finding triples.
  84. *
  85. * Implementations MUST set the "codec" member of the iterator to the address
  86. * of the codec that generated it.
  87. *
  88. * @param[in] gr The graph to be encoded. The graph's namespace map is used by
  89. * the codec for namespace prefixing. The graph may only be freed after the
  90. * loop is finalized.
  91. *
  92. * @return A codec iterator handle to be passed to a #gr_codec_iter_fn_t
  93. * function and, eventually, to a #gr_codec_done_fn_t function.
  94. */
  95. typedef LSUP_CodecIterator * (*gr_encode_init_fn_t)(const LSUP_Graph *gr);
  96. /** @brief Perform one encoding iteration.
  97. *
  98. * Implementations of this prototype MUST perform all the steps to encode one
  99. * or more complete triples into an RDF fragment representing those triples.
  100. * The input and output units are up to the implementation and a caller SHOULD
  101. * assume that multiple lines may be yielded at each iteration.
  102. *
  103. * @param[in] it Iterator handle.
  104. *
  105. * @param[out] res Handle to be populated with a string obtained from encoding.
  106. * The output data should be UTF-8 [TODO or UTF-16] encoded. This pointer
  107. * must be initialized (even to NULL) and should be eventually freed manually
  108. * at the end of the loop. It is reallocated at each iteration, so memory from
  109. * a previous iteration may be overwritten with new data.
  110. *
  111. * @return LSUP_OK if a new token was processed; LSUP_END if the end of the
  112. * loop was reached.
  113. */
  114. typedef LSUP_rc (*gr_encode_iter_fn_t)(
  115. LSUP_CodecIterator *it, unsigned char **res);
  116. /** @brief Finalize an encoding operation.
  117. *
  118. * Implementations SHOULD use this function to perform all necessary steps to
  119. * clean up memory and free the iterator handle after a graph has been
  120. * completely encoded.
  121. *
  122. * @param[in] it Iterator handle.
  123. */
  124. typedef void (*gr_encode_done_fn_t)(LSUP_CodecIterator *it);
  125. /** @brief Prototype for decoding a string into a LSUP_Term.
  126. *
  127. * Implementations MAY ignore any other tokens after finding the first one.
  128. *
  129. * @param[in] rep NT representation of the term.
  130. *
  131. * @param[in] nsm Namespace map handle.
  132. *
  133. * @param[out] Pointer to the term handle to be created. Implementaions SHOULD
  134. * return NULL on a parse error.
  135. *
  136. * @return Implementations MUST return LSUP_OK on success and a negative value
  137. * on parsing error.
  138. */
  139. typedef LSUP_rc (*term_decode_fn_t)(
  140. const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term);
  141. /** @brief Prototype for decoding a complete RDF document into a graph.
  142. *
  143. * Implementations SHOULD consume data from the file handle in chunks.
  144. *
  145. * @param[in] rep Open file handle pointing to the RDF data. Implementations
  146. * MUST NOT close the file handle.
  147. *
  148. * @param[out] gr Pointer to a graph handle to be generated from decoding.
  149. *
  150. * @param[out] ct If not NULL, it may be populated with the number of triples
  151. * parsed (which may be different from the resulting graph size).
  152. * Implementations may choose not not use this, and they must account for the
  153. * value to be NULL.
  154. *
  155. * @param[out] err Pointer to error info string. If no error occurs, it yields
  156. * NULL.
  157. *
  158. * @return Implementations MUST return LSUP_OK on success and a negative value
  159. * on parsing error.
  160. */
  161. typedef LSUP_rc (*gr_decode_fn_t)(
  162. FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);
  163. /** @brief Codec structure.
  164. *
  165. * An instance of this structure is usually defined at compile time (see
  166. * examples in "include/codec_*.h" and "src/codec_*.c") and should have the
  167. * following defined:
  168. *
  169. * - name: A brief (16-char max), human-readable to identify the codec.
  170. * - mimetype: MIME type (32-char max) associated with the codec.
  171. * - extension: File extension associated with the serialized file.
  172. *
  173. * - encode_term: Encode a single term.
  174. *
  175. * - encode_graph_init: Initialize a graph decoding loop.
  176. * - encode_graph_iter: Run one iteration of encoding on one or more triples.
  177. * - encode_graph_done: Finalize the encoding loop and free the support data.
  178. *
  179. * - decode_term: Decode a single term.
  180. * - decode_graph: Decode a RDF document into a graph.
  181. *
  182. * For documentation on the individual encoding and decoding callbacks, see the
  183. * related function prototypes.
  184. */
  185. struct codec_t {
  186. char name[16]; // Name of the codec.
  187. char mimetype[32]; // MIME type associated with the codec.
  188. char extension[8]; // Serialized file extension.
  189. // Encoding.
  190. term_enc_fn_t encode_term; // Term encoder function.
  191. gr_encode_init_fn_t encode_graph_init; // Graph encoder initialization.
  192. gr_encode_iter_fn_t encode_graph_iter; // Graph encoder iteration.
  193. gr_encode_done_fn_t encode_graph_done; // Graph encoder finalization.
  194. // Decoding.
  195. term_decode_fn_t decode_term; // Term decoder function.
  196. gr_decode_fn_t decode_graph; // Graph decoder function.
  197. };
  198. /*
  199. * Common utility functions.
  200. */
  201. /** @brief strdup() for unsigned char.
  202. *
  203. * This is to be used with uint8_t sequences considered to be UTF-8 sequences,
  204. * requird by re2c (it won't work with byte sequences containing `NUL`).
  205. */
  206. inline uint8_t
  207. *uint8_dup (const uint8_t *str)
  208. { return (uint8_t *) strdup ((char *) str); }
  209. /** @brief strndup() for unsigned char.
  210. *
  211. * This is to be used with uint8_t sequences considered to be UTF-8 sequences,
  212. * requird by re2c (it won't work with byte sequences containing `NUL`).
  213. */
  214. inline uint8_t
  215. *uint8_ndup (const uint8_t *str, size_t size)
  216. { return (uint8_t *) strndup ((char *) str, size); }
  217. /** Replace non-printable characters with their literal byte.
  218. *
  219. * Escape backslash is to be added separately.
  220. */
  221. static inline char
  222. escape_char (const char c) {
  223. switch (c) {
  224. case '\t': return 't';
  225. case '\b': return 'b';
  226. case '\n': return 'n';
  227. case '\r': return 'r';
  228. case '\f': return 'f';
  229. default: return c;
  230. }
  231. }
  232. /** @brief Unescape a single character.
  233. *
  234. * Convert escaped special characters such as `\t`, `\n`, etc. into their
  235. * corresponding code points.
  236. *
  237. * Non-special characters are returned unchanged.
  238. *
  239. * @param[in] c Character to unescape. Note that this is the single character
  240. * after `\`.
  241. *
  242. * @return Code point corresponding to the escaped character.
  243. */
  244. inline char
  245. unescape_char (const char c)
  246. {
  247. switch (c) {
  248. case 't': return '\t';
  249. case 'b': return '\b';
  250. case 'n': return '\n';
  251. case 'r': return '\r';
  252. case 'f': return '\f';
  253. default: return c;
  254. }
  255. }
  256. /** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
  257. *
  258. * @param[in] esc_str Escaped string.
  259. *
  260. * @param[in] size Maximum number of characters to scan, à la strncpy().
  261. *
  262. * @return String with escape sequences replaced by Unicode bytes.
  263. */
  264. uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size);
  265. /** @brief Add an object to an objct list.
  266. *
  267. * @param[in] ol Array of object handles to be added to. On success, this
  268. * handle will be reallocated and the new address returned, so the passed
  269. * handle should no longer be used. On failure, it remains unchanged and may
  270. * be reused.
  271. *
  272. * @param[in] o Object to be added to the list.
  273. *
  274. * @return Reallocated list on success; NULL on failure.
  275. */
  276. LSUP_Term **
  277. LSUP_obj_list_add (LSUP_Term **ol, LSUP_Term *o);
  278. /** @brief New predicate-object list.
  279. *
  280. * @return a new empty predicate-object list.
  281. */
  282. LSUP_PredObjList *
  283. LSUP_pred_obj_list_new (void);
  284. /** @brief Free a predicate-object list.
  285. *
  286. * @param[in] pol Predicate-object list handle obtained with
  287. * #LSUP_pred_obj_list_new().
  288. */
  289. void
  290. LSUP_pred_obj_list_free (LSUP_PredObjList *pol);
  291. /** @brief Add a predicate-object list pair to a PO list.
  292. *
  293. * @param[in] pol Predicate-object list handle obtained with
  294. * #LSUP_pred_obj_list_new().
  295. *
  296. * @param[in] p Predicate to be associated with the given object list.
  297. *
  298. * @param[in] o NULL-terminated array of object term handles to be associated
  299. * with the given predicate.
  300. *
  301. * @return LSUP_OK on success; LSUP_MEM_ERR on allocation error.
  302. */
  303. LSUP_rc
  304. LSUP_pred_obj_list_add (LSUP_PredObjList *pol, LSUP_Term *p, LSUP_Term **o);
  305. /** @brief Add triples for a subject and a PO list to a graph.
  306. *
  307. * @param[in] it Graph iterator to use for insertion.
  308. *
  309. * @param[in] s Subject of all the triples.
  310. *
  311. * @param[in] po Predicate-object list.
  312. *
  313. * @return Number of triples added on success; <0 (LSUP_*_ERR) on error.
  314. */
  315. size_t
  316. LSUP_spo_list_add_triples (
  317. LSUP_GraphIterator *it, LSUP_Term *s, const LSUP_PredObjList *po);
  318. /** @brief Add triples for an anonymous collection to a graph.
  319. *
  320. * The `rdf:first`, `rdf:rest`, etc. terms are automatically added and the term
  321. * for the first item in the list is returned.
  322. *
  323. * @param[in] it Graph iterator to use for insertion.
  324. *
  325. * @param[in] ol NULL-terminated term array.
  326. *
  327. * @return Blank node representing the first list item.
  328. */
  329. LSUP_Term *
  330. LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol);
  331. #endif