#ifndef _LSUP_CODEC_BASE_H #define _LSUP_CODEC_BASE_H #include "graph.h" /** * Max data size passed to the scanner and parser at each iteration. */ #ifdef LSUP_RDF_STREAM_CHUNK_SIZE #define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE #else #define CHUNK_SIZE 8192 #endif typedef struct codec_t LSUP_Codec; /** @brief Codec iterator type. * * This structure holds state data including input and output for encoding a * graph into RDF. Normally it should not be inspected or manipulated directly, * but rather passed to codec iteration functions for processing RDF. * * NOTE: This should be used as an opaque handle, however it is exposed here * for easier inclusion into each codec. */ typedef struct codec_iter_t { const LSUP_Codec * codec; // Codec that generated this iterator. LSUP_Triple * trp; // RDF fragment being encoded. LSUP_GraphIterator *gr_it; // Graph iterator. const LSUP_NSMap * nsm; // Namespace map. size_t cur; // Internal cursor. LSUP_rc rc; // Internal return code. char * rep, // String representation of a RDF fragment. * str_s, // Temporary string. * str_p, // Temporary string. * str_o; // Temporary string. } LSUP_CodecIterator; /// Predicate and object list. Used for Turtle. typedef struct { LSUP_Term ** p; ///< NULL-terminated array of term handles. LSUP_Term *** o; /**< * NULL-terminated array of * NULL-terminated arrays of term handles. * The indices of the outer array are * equal to the indices of the associated * predicate in the predicate list. */ } LSUP_PredObjList; /// Parser state. typedef struct { LSUP_GraphIterator * it; ///< Iterator used to build the graph. LSUP_NSMap * nsm; ///< NS map used in the document. LSUP_Term * base; ///< Base IRI used in the document. size_t ct; ///< Statements parsed. } LSUP_TTLParserState; /** @brief Parse error information. * */ /* TODO A plain string will suffice for now. typedef struct parse_error_t { unsigned int line; // Line number where the error occurred. unsigned int linec; // Position in line of the offending token. char * token; // String representation of the token. } LSUP_ParseError; */ /* * Interface prototypes. */ /** @brief Term encoder callback type. * * @param[in] term Single term handle. * * @param[in] nsm Namespace map. May be NULL for no prefix shortening. * * @param[out] rep Pointer to a string to be filled with the encoded term. The * caller is in charge of freeing the string after use. Returns undefined on * error. * * @return LSUP_OK on successful encoding; <0 for other errors. */ typedef LSUP_rc (*term_enc_fn_t)( const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep); /** @brief Initialize a graph encoding loop. * * This prototype is to be implemented by graph encoding loops. It should * create an iterator and perform all initial setup for finding triples. * * Implementations MUST set the "codec" member of the iterator to the address * of the codec that generated it. * * @param[in] gr The graph to be encoded. The graph's namespace map is used by * the codec for namespace prefixing. The graph may only be freed after the * loop is finalized. * * @return A codec iterator handle to be passed to a #gr_codec_iter_fn_t * function and, eventually, to a #gr_codec_done_fn_t function. */ typedef LSUP_CodecIterator * (*gr_encode_init_fn_t)(const LSUP_Graph *gr); /** @brief Perform one encoding iteration. * * Implementations of this prototype MUST perform all the steps to encode one * or more complete triples into an RDF fragment representing those triples. * The input and output units are up to the implementation and a caller SHOULD * assume that multiple lines may be yielded at each iteration. * * @param[in] it Iterator handle. * * @param[out] res Handle to be populated with a string obtained from encoding. * The output data should be UTF-8 [TODO or UTF-16] encoded. This pointer * must be initialized (even to NULL) and should be eventually freed manually * at the end of the loop. It is reallocated at each iteration, so memory from * a previous iteration may be overwritten with new data. * * @return LSUP_OK if a new token was processed; LSUP_END if the end of the * loop was reached. */ typedef LSUP_rc (*gr_encode_iter_fn_t)( LSUP_CodecIterator *it, unsigned char **res); /** @brief Finalize an encoding operation. * * Implementations SHOULD use this function to perform all necessary steps to * clean up memory and free the iterator handle after a graph has been * completely encoded. * * @param[in] it Iterator handle. */ typedef void (*gr_encode_done_fn_t)(LSUP_CodecIterator *it); /** @brief Prototype for decoding a string into a LSUP_Term. * * Implementations MAY ignore any other tokens after finding the first one. * * @param[in] rep NT representation of the term. * * @param[in] nsm Namespace map handle. * * @param[out] Pointer to the term handle to be created. Implementaions SHOULD * return NULL on a parse error. * * @return Implementations MUST return LSUP_OK on success and a negative value * on parsing error. */ typedef LSUP_rc (*term_decode_fn_t)( const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term); /** @brief Prototype for decoding a complete RDF document into a graph. * * Implementations SHOULD consume data from the file handle in chunks. * * @param[in] rep Open file handle pointing to the RDF data. Implementations * MUST NOT close the file handle. * * @param[out] gr Pointer to a graph handle to be generated from decoding. * * @param[out] ct If not NULL, it may be populated with the number of triples * parsed (which may be different from the resulting graph size). * Implementations may choose not not use this, and they must account for the * value to be NULL. * * @param[out] err Pointer to error info string. If no error occurs, it yields * NULL. * * @return Implementations MUST return LSUP_OK on success and a negative value * on parsing error. */ typedef LSUP_rc (*gr_decode_fn_t)( FILE *rep, LSUP_Graph **gr, size_t *ct, char **err); /** @brief Codec structure. * * An instance of this structure is usually defined at compile time (see * examples in "include/codec_*.h" and "src/codec_*.c") and should have the * following defined: * * - name: A brief (16-char max), human-readable to identify the codec. * - mimetype: MIME type (32-char max) associated with the codec. * - extension: File extension associated with the serialized file. * * - encode_term: Encode a single term. * * - encode_graph_init: Initialize a graph decoding loop. * - encode_graph_iter: Run one iteration of encoding on one or more triples. * - encode_graph_done: Finalize the encoding loop and free the support data. * * - decode_term: Decode a single term. * - decode_graph: Decode a RDF document into a graph. * * For documentation on the individual encoding and decoding callbacks, see the * related function prototypes. */ struct codec_t { char name[16]; // Name of the codec. char mimetype[32]; // MIME type associated with the codec. char extension[8]; // Serialized file extension. // Encoding. term_enc_fn_t encode_term; // Term encoder function. gr_encode_init_fn_t encode_graph_init; // Graph encoder initialization. gr_encode_iter_fn_t encode_graph_iter; // Graph encoder iteration. gr_encode_done_fn_t encode_graph_done; // Graph encoder finalization. // Decoding. term_decode_fn_t decode_term; // Term decoder function. gr_decode_fn_t decode_graph; // Graph decoder function. }; /* * Common utility functions. */ /** @brief strdup() for unsigned char. * * This is to be used with uint8_t sequences considered to be UTF-8 sequences, * requird by re2c (it won't work with byte sequences containing `NUL`). */ inline uint8_t *uint8_dup (const uint8_t *str) { return (uint8_t *) strdup ((char *) str); } /** @brief strndup() for unsigned char. * * This is to be used with uint8_t sequences considered to be UTF-8 sequences, * requird by re2c (it won't work with byte sequences containing `NUL`). */ inline uint8_t *uint8_ndup (const uint8_t *str, size_t size) { return (uint8_t *) strndup ((char *) str, size); } /** @brief Unescape a single character. * * Convert escaped special characters such as `\t`, `\n`, etc. into their * corresponding code points. * * Non-special characters are returned unchanged. * * @param[in] c Character to unescape. Note that this is the single character * after `\`. * * @return Code point corresponding to the escaped character. */ inline char unescape_char (const char c) { switch (c) { case 't': return '\t'; case 'b': return '\b'; case 'n': return '\n'; case 'r': return '\r'; case 'f': return '\f'; default: return c; } } /** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes. * * @param[in] esc_str Escaped string. * * @param[in] size Maximum number of characters to scan, à la strncpy(). * * @return String with escape sequences replaced by Unicode bytes. */ uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size); /** @brief Add an object to an objct list. * * @param[in] ol Array of object handles to be added to. On success, this * handle will be reallocated and the new address returned, so the passed * handle should no longer be used. On failure, it remains unchanged and may * be reused. * * @param[in] o Object to be added to the list. * * @return Reallocated list on success; NULL on failure. */ LSUP_Term ** LSUP_obj_list_add (LSUP_Term **ol, LSUP_Term *o); /** @brief New predicate-object list. * * @return a new empty predicate-object list. */ LSUP_PredObjList * LSUP_pred_obj_list_new (void); /** @brief Free a predicate-object list. * * @param[in] pol Predicate-object list handle obtained with * #LSUP_pred_obj_list_new(). */ void LSUP_pred_obj_list_free (LSUP_PredObjList *pol); /** @brief Add a predicate-object list pair to a PO list. * * @param[in] pol Predicate-object list handle obtained with * #LSUP_pred_obj_list_new(). * * @param[in] p Predicate to be associated with the given object list. * * @param[in] o NULL-terminated array of object term handles to be associated * with the given predicate. * * @return LSUP_OK on success; LSUP_MEM_ERR on allocation error. */ LSUP_rc LSUP_pred_obj_list_add (LSUP_PredObjList *pol, LSUP_Term *p, LSUP_Term **o); /** @brief Add triples for a subject and a PO list to a graph. * * @param[in] it Graph iterator to use for insertion. * * @param[in] s Subject of all the triples. * * @param[in] po Predicate-object list. * * @return Number of triples added on success; <0 (LSUP_*_ERR) on error. */ size_t LSUP_spo_list_add_triples ( LSUP_GraphIterator *it, LSUP_Term *s, const LSUP_PredObjList *po); /** @brief Add triples for an anonymous collection to a graph. * * The `rdf:first`, `rdf:rest`, etc. terms are automatically added and the term * for the first item in the list is returned. * * @param[in] it Graph iterator to use for insertion. * * @param[in] ol NULL-terminated term array. * * @return Blank node representing the first list item. */ LSUP_Term * LSUP_bnode_add_collection (LSUP_GraphIterator *it, LSUP_Term **ol); #endif