#ifndef _LSUP_CODEC_BASE_H #define _LSUP_CODEC_BASE_H #include "graph.h" /** * Max data size passed to the scanner and parser at each iteration. */ #ifdef LSUP_RDF_STREAM_CHUNK_SIZE #define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE #else #define CHUNK_SIZE 8192 #endif typedef struct codec_t LSUP_Codec; /// Parser state. typedef struct { LSUP_GraphIterator * it; ///< Iterator used to build the graph. LSUP_NSMap * nsm; ///< NS map used in the document. LSUP_Term * base; ///< Base IRI used in the document. size_t ct; ///< Statements parsed. LSUP_rc rc; ///< Internal return code. } LSUP_TTLParserState; /** @brief Parse error information. * */ /* TODO A plain string will suffice for now. typedef struct parse_error_t { unsigned int line; // Line number where the error occurred. unsigned int linec; // Position in line of the offending token. char * token; // String representation of the token. } LSUP_ParseError; */ /* * Interface prototypes. */ /** @brief Term encoder callback type. * * @param[in] term Single term handle. * * @param[in] nsm Namespace map. May be NULL for no prefix shortening. * * @param[out] rep Pointer to a string to be filled with the encoded term. The * string is reallocated and, if reused for multiple calls to this function, * it only needs to be freed after the last call. It should be initialized to * NULL at the beginning. * * @return LSUP_OK on successful encoding; <0 for other errors. */ typedef LSUP_rc (*term_enc_fn_t)( const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep); /** @brief Initialize a graph encoding loop. * * This prototype is to be implemented by graph encoding loops. It should * create an iterator and perform all initial setup for finding triples. * * Implementations MUST set the "codec" member of the iterator to the address * of the codec that generated it. * * @param[in] gr The graph to be encoded. The graph's namespace map is used by * the codec for namespace prefixing. The graph may only be freed after the * loop is finalized. * * @return A codec iterator handle to be passed to a #gr_encode_iter_fn_t * function and, eventually, to a #gr_encode_done_fn_t function. This * structure is opaque and defined by each codec according to its own needs. */ typedef void * (*gr_encode_init_fn_t)(const LSUP_Graph *gr); /** @brief Perform one encoding iteration. * * Implementations of this prototype MUST perform all the steps to encode one * or more complete triples into an RDF fragment representing those triples. * The input and output units are up to the implementation and a caller SHOULD * assume that multiple lines may be yielded at each iteration. * * @param[in] it Iterator handle. * * @param[out] res Handle to be populated with a string obtained from encoding. * The output data should be UTF-8 encoded. This pointer must be initialized * (even to NULL) and should be eventually freed manually at the end of the * loop. It is reallocated at each iteration, so memory from a previous * iteration may be overwritten with new data. * * @return LSUP_OK if a new token was processed; LSUP_END if the end of the * loop was reached. */ typedef LSUP_rc (*gr_encode_iter_fn_t)(void *it, char **res); /** @brief Finalize an encoding operation. * * Implementations SHOULD use this function to perform all necessary steps to * clean up memory and free the iterator handle after a graph has been * completely encoded. * * @param[in] it Iterator handle. */ typedef void (*gr_encode_done_fn_t)(void *it); /** @brief Prototype for decoding a string into a LSUP_Term. * * Implementations MAY ignore any other tokens after finding the first one. * * @param[in] rep NT representation of the term. * * @param[in] nsm Namespace map handle. * * @param[out] Pointer to the term handle to be created. Implementaions SHOULD * return NULL on a parse error. * * @return Implementations MUST return LSUP_OK on success and a negative value * on parsing error. */ typedef LSUP_rc (*term_decode_fn_t)( const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term); /** @brief Prototype for decoding a complete RDF document into a graph. * * Implementations SHOULD consume data from the file handle in chunks. * * @param[in] rep Open file handle pointing to the RDF data. Implementations * MUST NOT close the file handle. * * @param[out] gr Pointer to a graph handle to be generated from decoding. * * @param[out] ct If not NULL, it may be populated with the number of triples * parsed (which may be different from the resulting graph size). * Implementations may choose not not use this, and they must account for the * value to be NULL. * * @param[out] err Pointer to error info string. If no error occurs, it yields * NULL. * * @return Implementations MUST return LSUP_OK on success and a negative value * on parsing error. */ typedef LSUP_rc (*gr_decode_fn_t)( FILE *rep, LSUP_Graph **gr, size_t *ct, char **err); /** @brief Codec structure. * * An instance of this structure is usually defined at compile time (see * examples in "include/codec_*.h" and "src/codec_*.c") and should have the * following defined: * * - name: A brief (16-char max), human-readable to identify the codec. * - mimetype: MIME type (32-char max) associated with the codec. * - extension: File extension associated with the serialized file. * * - encode_term: Encode a single term. * * - encode_graph_init: Initialize a graph decoding loop. * - encode_graph_iter: Run one iteration of encoding on one or more triples. * - encode_graph_done: Finalize the encoding loop and free the support data. * * - decode_term: Decode a single term. * - decode_graph: Decode a RDF document into a graph. * * For documentation on the individual encoding and decoding callbacks, see the * related function prototypes. */ struct codec_t { char name[16]; ///< Name of the codec. char mimetype[32]; ///< MIME type associated with the codec. char extension[8]; ///< Serialized file extension. // Encoding. term_enc_fn_t encode_term; ///< Term encoder function. gr_encode_init_fn_t encode_graph_init; ///< Graph encoder initialization. gr_encode_iter_fn_t encode_graph_iter; ///< Graph encoder iteration. gr_encode_done_fn_t encode_graph_done; ///< Graph encoder finalization. // Decoding. term_decode_fn_t decode_term; ///< Term decoder function. gr_decode_fn_t decode_graph; ///< Graph decoder function. }; /* * Common utility functions. */ /** @brief strdup() for unsigned char. * * This is to be used with uint8_t sequences considered to be UTF-8 sequences, * requird by re2c (it won't work with byte sequences containing `NUL`). */ inline uint8_t *uint8_dup (const uint8_t *str) { return (uint8_t *) strdup ((char *) str); } /** @brief strndup() for unsigned char. * * This is to be used with uint8_t sequences considered to be UTF-8 sequences, * requird by re2c (it won't work with byte sequences containing `NUL`). */ inline uint8_t *uint8_ndup (const uint8_t *str, size_t size) { return (uint8_t *) strndup ((char *) str, size); } /** @brief Add escape character (backslash) to illegal literal characters. * * @param[in] in Input string. * * @param[out] out Output string. * * @return LSUP_OK on success; LSUP_MEM_ERR on memory error. */ LSUP_rc escape_lit (const char *in, char **out); /** @brief Replace non-printable characters with their literal byte. * * Escape backslash is to be added separately. */ static inline char escape_char (const char c) { switch (c) { case '\t': return 't'; case '\b': return 'b'; case '\n': return 'n'; case '\r': return 'r'; case '\f': return 'f'; default: return c; } } /** @brief Unescape a single character. * * Convert escaped special characters such as `\t`, `\n`, etc. into their * corresponding code points. * * Non-special characters are returned unchanged. * * @param[in] c Character to unescape. Note that this is the single character * after `\`. * * @return Code point corresponding to the escaped character. */ inline char unescape_char (const char c) { switch (c) { case 't': return '\t'; case 'b': return '\b'; case 'n': return '\n'; case 'r': return '\r'; case 'f': return '\f'; default: return c; } } /** @brief Replace `\uxxxx` and `\Uxxxxxxxx` with Unicode bytes. * * @param[in] esc_str Escaped string. * * @param[in] size Maximum number of characters to scan, à la strncpy(). * * @return String with escape sequences replaced by Unicode bytes. */ uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size); /** @brief Format an informational header. * * The information includes software version and current date. It is terminated * by a newline + NUL and prefixed with the string specified in `pfx`. It is * NOT prefixed by any comment characters. * * @param[in] pfx Prefix to add to the string. It may be a comment starter, * such as `# `. */ char *fmt_header (char *pfx); #endif