codec.h 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. #ifndef _LSUP_CODEC_BASE_H
  2. #define _LSUP_CODEC_BASE_H
  3. #include "graph.h"
  4. /** @defgroup codec RDF codec module
  5. * @ingroup private
  6. * @{
  7. */
  /**
   * Maximum size of the data handed to the scanner and parser on each
   * iteration.
   *
   * The value is taken from LSUP_RDF_STREAM_CHUNK_SIZE when that macro is
   * defined; otherwise it falls back to 8192.
   */
  #ifndef LSUP_RDF_STREAM_CHUNK_SIZE
  #define CHUNK_SIZE 8192
  #else
  #define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE
  #endif
  /// Codec type name; it is an alias for `struct codec_t`.
  struct codec_t;
  typedef struct codec_t LSUP_Codec;
  17. /// Parser state.
  18. typedef struct ttl_parser_state {
  19. LSUP_GraphIterator * it; ///< Iterator used to build the graph.
  20. LSUP_NSMap * nsm; ///< NS map used in the document.
  21. LSUP_Term * base; ///< Base IRI used in the document.
  22. LSUP_Term * lms; ///< Link map subject.
  23. size_t ct; ///< Statements parsed.
  24. LSUP_rc rc; ///< Internal return code.
  25. } LSUP_TTLParserState;
  26. /** @brief Parse error information.
  27. *
  28. */
  29. /* TODO A plain string will suffice for now.
  30. typedef struct parse_error_t {
  31. unsigned int line; // Line number where the error occurred.
  32. unsigned int linec; // Position in line of the offending token.
  33. char * token; // String representation of the token.
  34. } LSUP_ParseError;
  35. */
  36. /*
  37. * Interface prototypes.
  38. */
  39. /** @brief Term encoder callback type.
  40. *
  41. * @param[in] term Single term handle.
  42. *
  43. * @param[in] nsm Namespace map. May be NULL for no prefix shortening.
  44. *
  45. * @param[out] rep Pointer to a string to be filled with the encoded term. The
  46. * string is reallocated and, if reused for multiple calls to this function,
  47. * it only needs to be freed after the last call. It should be initialized to
  48. * NULL at the beginning.
  49. *
  50. * @return LSUP_OK on successful encoding; <0 for other errors.
  51. */
  52. typedef LSUP_rc (*term_enc_fn_t)(
  53. const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep);
  54. /** @brief Initialize a graph encoding loop.
  55. *
  56. * This prototype is to be implemented by graph encoding loops. It should
  57. * create an iterator and perform all initial setup for finding triples.
  58. *
  59. * Implementations MUST set the "codec" member of the iterator to the address
  60. * of the codec that generated it.
  61. *
  62. * @param[in] gr The graph to be encoded. The graph's namespace map is used by
  63. * the codec for namespace prefixing. The graph may only be freed after the
  64. * loop is finalized.
  65. *
  66. * @return A codec iterator handle to be passed to a #gr_encode_iter_fn_t
  67. * function and, eventually, to a #gr_encode_done_fn_t function. This
  68. * structure is opaque and defined by each codec according to its own needs.
  69. */
  70. typedef void * (*gr_encode_init_fn_t)(const LSUP_Graph *gr);
  71. /** @brief Perform one encoding iteration.
  72. *
  73. * Implementations of this prototype MUST perform all the steps to encode one
  74. * or more complete triples into an RDF fragment representing those triples.
  75. * The input and output units are up to the implementation and a caller SHOULD
  76. * assume that multiple lines may be yielded at each iteration.
  77. *
  78. * @param[in] it Iterator handle.
  79. *
  80. * @param[out] res Handle to be populated with a string obtained from encoding.
  81. * The output data should be UTF-8 encoded. This pointer must be initialized
  82. * (even to NULL) and should be eventually freed manually at the end of the
  83. * loop. It is reallocated at each iteration, so memory from a previous
  84. * iteration may be overwritten with new data.
  85. *
  86. * @return LSUP_OK if a new token was processed; LSUP_END if the end of the
  87. * loop was reached.
  88. */
  89. typedef LSUP_rc (*gr_encode_iter_fn_t)(void *it, char **res);
  /** @brief Callback type that ends an encoding operation.
   *
   * Implementations SHOULD use this hook to carry out every step needed to
   * clean up memory and to free the iterator handle once a graph has been
   * completely encoded.
   *
   * @param[in] it Iterator handle.
   */
  typedef void (*gr_encode_done_fn_t) (void *it);
  99. /** @brief Prototype for decoding a string into a LSUP_Term.
  100. *
  101. * Implementations MAY ignore any other tokens after finding the first one.
  102. *
  103. * @param[in] rep NT representation of the term.
  104. *
  105. * @param[in] nsm Namespace map handle.
  106. *
  107. * @param[out] term Pointer to the term handle to be created. Implementaions
  108. * SHOULD return NULL on a parse error.
  109. *
  110. * @return Implementations MUST return LSUP_OK on success and a negative value
  111. * on parsing error.
  112. */
  113. typedef LSUP_rc (*term_decode_fn_t)(
  114. const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term);
  115. /** @brief Prototype for decoding a complete RDF document file into a graph.
  116. *
  117. * Implementations SHOULD consume data from the file handle in chunks.
  118. *
  119. * @param[in] rep Open file handle pointing to the RDF data. Implementations
  120. * MUST NOT close the file handle.
  121. *
  122. * @param[out] gr Pointer to a graph handle to be generated from decoding.
  123. * The handle need not be initialized. The graph URI will be randomly assigned
  124. * and can be changed at a later time.
  125. *
  126. * @param[out] ct If not NULL, it may be populated with the number of triples
  127. * parsed (which may be different from the resulting graph size).
  128. * Implementations may choose not not use this, and they must account for the
  129. * value to be NULL.
  130. *
  131. * @param[out] err Pointer to error info string. If no error occurs, it yields
  132. * NULL.
  133. *
  134. * @return Implementations MUST return LSUP_OK on success and a negative value
  135. * on parsing error.
  136. */
  137. typedef LSUP_rc (*gr_decode_fn_t)(
  138. FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);
  139. /** @brief Codec structure.
  140. *
  141. * An instance of this structure is usually defined at compile time (see
  142. * examples in "include/codec_*.h" and "src/codec_*.c") and should have the
  143. * following defined:
  144. *
  145. * - name: A brief (16-char max), human-readable to identify the codec.
  146. * - mimetype: MIME type (32-char max) associated with the codec.
  147. * - extension: File extension associated with the serialized file.
  148. *
  149. * - encode_term: Encode a single term.
  150. *
  151. * - encode_graph_init: Initialize a graph decoding loop.
  152. * - encode_graph_iter: Run one iteration of encoding on one or more triples.
  153. * - encode_graph_done: Finalize the encoding loop and free the support data.
  154. *
  155. * - decode_term: Decode a single term.
  156. * - decode_graph: Decode a RDF document into a graph.
  157. *
  158. * For documentation on the individual encoding and decoding callbacks, see the
  159. * related function prototypes.
  160. */
  161. struct codec_t {
  162. char name[16]; ///< Name of the codec.
  163. char mimetype[32]; ///< MIME type associated with the codec.
  164. char extension[8]; ///< Serialized file extension.
  165. // Encoding.
  166. term_enc_fn_t encode_term; ///< Term encoder function.
  167. gr_encode_init_fn_t encode_graph_init; ///< Graph encoder initialization.
  168. gr_encode_iter_fn_t encode_graph_iter; ///< Graph encoder iteration.
  169. gr_encode_done_fn_t encode_graph_done; ///< Graph encoder finalization.
  170. // Decoding.
  171. term_decode_fn_t decode_term; ///< Term decoder function.
  172. gr_decode_fn_t decode_graph; ///< Graph decoder function.
  173. };
  174. /*
  175. * Common utility functions.
  176. */
  /** @brief strdup() for unsigned char.
   *
   * Meant for uint8_t sequences that are treated as UTF-8 strings, as
   * required by re2c. Copying stops at the first `NUL`, so it won't work with
   * byte sequences containing `NUL`.
   *
   * The function is `static inline`: every translation unit that includes
   * this header gets its own usable definition, so a call that the compiler
   * decides not to inline still links without a separate external
   * definition.
   *
   * @param[in] str NUL-terminated sequence to duplicate.
   *
   * @return Value returned by strdup(), as a uint8_t pointer: a newly
   *  allocated copy to be released with free(), or NULL if the allocation
   *  failed.
   */
  static inline uint8_t *
  uint8_dup (const uint8_t *str)
  {
      /* Only the signedness changes in the cast; constness is preserved. */
      return (uint8_t *) strdup ((const char *) str);
  }
  /** @brief strndup() for unsigned char.
   *
   * Meant for uint8_t sequences that are treated as UTF-8 strings, as
   * required by re2c. Copying stops at the first `NUL`, so it won't work with
   * byte sequences containing `NUL`.
   *
   * The function is `static inline`: every translation unit that includes
   * this header gets its own usable definition, so a call that the compiler
   * decides not to inline still links without a separate external
   * definition.
   *
   * @param[in] str Sequence to duplicate.
   *
   * @param[in] size Maximum number of bytes to copy from `str`.
   *
   * @return Value returned by strndup(), as a uint8_t pointer: a newly
   *  allocated, NUL-terminated copy to be released with free(), or NULL if
   *  the allocation failed.
   */
  static inline uint8_t *
  uint8_ndup (const uint8_t *str, size_t size)
  {
      /* Only the signedness changes in the cast; constness is preserved. */
      return (uint8_t *) strndup ((const char *) str, size);
  }
  193. /** @brief Add escape character (backslash) to illegal literal characters.
  194. *
  195. * @param[in] in Input string.
  196. *
  197. * @param[out] out Output string.
  198. *
  199. * @return LSUP_OK on success; LSUP_MEM_ERR on memory error.
  200. */
  201. LSUP_rc
  202. escape_lit (const char *in, char **out);
  /** @brief Map a control character to the letter used in its escape sequence.
   *
   * Tab, backspace, newline, carriage return and form feed are turned into
   * `t`, `b`, `n`, `r` and `f` respectively. Any other character is returned
   * unchanged.
   *
   * The escape backslash is not produced here; it has to be added separately.
   *
   * @param[in] c Character to convert.
   *
   * @return Letter standing for `c` in an escape sequence, or `c` itself.
   */
  static inline char
  escape_char (const char c)
  {
      return
          c == '\t' ? 't' :
          c == '\b' ? 'b' :
          c == '\n' ? 'n' :
          c == '\r' ? 'r' :
          c == '\f' ? 'f' :
          c;
  }
  /** @brief Unescape a single character.
   *
   * Convert the letter of an escaped special character such as `\t`, `\n`,
   * etc. into the corresponding code point.
   *
   * Non-special characters are returned unchanged.
   *
   * The function is `static inline`, like escape_char(): every translation
   * unit that includes this header gets its own usable definition, so a call
   * that the compiler decides not to inline still links without a separate
   * external definition.
   *
   * @param[in] c Character to unescape. Note that this is the single
   *  character after `\`.
   *
   * @return Code point corresponding to the escaped character.
   */
  static inline char
  unescape_char (const char c)
  {
      switch (c) {
          case 't': return '\t';
          case 'b': return '\b';
          case 'n': return '\n';
          case 'r': return '\r';
          case 'f': return '\f';
          default:  return c;   /* Not a special letter: pass through. */
      }
  }
  /** @brief Turn `\uxxxx` and `\Uxxxxxxxx` sequences into Unicode bytes.
   *
   * @param[in] esc_str Escaped string.
   *
   * @param[in] size Maximum number of characters to scan, à la strncpy().
   *
   * @return String in which the escape sequences have been replaced by
   *  Unicode bytes.
   */
  uint8_t *
  unescape_unicode (const uint8_t *esc_str, size_t size);
  /** @brief Build an informational header string.
   *
   * The header carries the software version and the current date. It starts
   * with the string given in `pfx` and ends with a newline + NUL. No comment
   * characters are added on top of `pfx`.
   *
   * @param[in] pfx Prefix to put in front of the string. It may be a comment
   *  starter, such as `# `.
   *
   * @return The formatted header string.
   */
  char *
  fmt_header (char *pfx);
  261. /// @} END defgroup codec
  262. #endif