/*
 * scossu / lsup_rdf
 */

							#ifndef _LSUP_CODEC_BASE_H
#define _LSUP_CODEC_BASE_H

#include "graph.h"


/**
 * Max data size passed to the scanner and parser at each iteration.
 *
 * Defaults to 8192. Define LSUP_RDF_STREAM_CHUNK_SIZE at compile time to
 * override the default.
 */
#ifdef LSUP_RDF_STREAM_CHUNK_SIZE
#define CHUNK_SIZE LSUP_RDF_STREAM_CHUNK_SIZE
#else
#define CHUNK_SIZE 8192
#endif


/// Codec type. See the `struct codec_t` definition for its members.
typedef struct codec_t LSUP_Codec;


/** @brief Parser state.
 *
 * Groups the graph iterator, namespace map, base IRI, statement counter and
 * return code of a parser into a single structure.
 *
 * NOTE(review): the type name suggests the Turtle (TTL) parser although it is
 * declared in the base codec header; confirm whether other codecs share it.
 */
typedef struct {
    LSUP_GraphIterator *    it;     ///< Iterator used to build the graph.
    LSUP_NSMap *            nsm;    ///< NS map used in the document.
    LSUP_Term *             base;   ///< Base IRI used in the document.
    size_t                  ct;     ///< Statements parsed.
    LSUP_rc                 rc;     ///< Internal return code.
} LSUP_TTLParserState;

/* TODO Parse error information.
 *
 * A plain string will suffice for now.
 *
typedef struct parse_error_t {
    unsigned int        line;       // Line number where the error occurred.
    unsigned int        linec;      // Position in line of the offending token.
    char *              token;      // String representation of the token.
} LSUP_ParseError;
*/


/*
 * Interface prototypes.
 */

/** @brief Term encoder callback type.
 *
 * @param[in] term Single term handle.
 *
 * @param[in] nsm Namespace map. May be NULL for no prefix shortening.
 *
 * @param[in,out] rep Pointer to a string to be filled with the encoded term.
 *  The string is reallocated, so when the same pointer is reused for multiple
 *  calls to this function it only needs to be freed after the last call. It
 *  should be initialized to NULL before the first call.
 *
 * @return LSUP_OK on successful encoding; <0 for other errors.
 */
typedef LSUP_rc (*term_enc_fn_t)(
        const LSUP_Term *term, const LSUP_NSMap *nsm, char **rep);


/** @brief Initialize a graph encoding loop.
 *
 * This prototype is to be implemented by graph encoding loops. An
 * implementation should create an iterator and perform all the initial setup
 * needed for finding triples.
 *
 * Implementations MUST set the "codec" member of the iterator to the address
 * of the codec that generated it.
 *
 * @param[in] gr The graph to be encoded. The graph's namespace map is used by
 *  the codec for namespace prefixing. The graph may only be freed after the
 *  loop is finalized.
 *
 * @return A codec iterator handle to be passed to a #gr_encode_iter_fn_t
 *  function and, eventually, to a #gr_encode_done_fn_t function. The handle
 *  is an untyped pointer: the structure behind it is opaque and defined by
 *  each codec according to its own needs.
 */
typedef void * (*gr_encode_init_fn_t)(const LSUP_Graph *gr);


/** @brief Perform one encoding iteration.
 *
 * Implementations of this prototype MUST perform all the steps to encode one
 * or more complete triples into an RDF fragment representing those triples.
 * The input and output units are up to the implementation, and a caller SHOULD
 * assume that multiple lines may be yielded at each iteration.
 *
 * @param[in] it Iterator handle.
 *
 * @param[in,out] res Handle to be populated with a string obtained from
 *  encoding. The output data should be UTF-8 encoded. This pointer must be
 *  initialized (even to NULL) before the first iteration, and should
 *  eventually be freed manually at the end of the loop. It is reallocated at
 *  each iteration, so data from a previous iteration may be overwritten.
 *
 * @return LSUP_OK if a new token was processed; LSUP_END if the end of the
 *  loop was reached.
 */
typedef LSUP_rc (*gr_encode_iter_fn_t)(void *it, char **res);


/** @brief Finalize an encoding operation.
 *
 * Implementations SHOULD use this function to perform all necessary steps to
 * clean up memory and free the iterator handle after a graph has been
 * completely encoded.
 *
 * @param[in] it Iterator handle, as obtained from a #gr_encode_init_fn_t
 *  function.
 */
typedef void (*gr_encode_done_fn_t)(void *it);


/** @brief Prototype for decoding a string into a LSUP_Term.
 *
 * Implementations MAY ignore any other tokens after finding the first one.
 *
 * @param[in] rep NT representation of the term.
 *
 * @param[in] nsm Namespace map handle.
 *
 * @param[out] term Pointer to the term handle to be created. Implementations
 *  SHOULD set the handle to NULL on a parse error.
 *
 * @return Implementations MUST return LSUP_OK on success and a negative value
 *  on parsing error.
 */
typedef LSUP_rc (*term_decode_fn_t)(
        const char *rep, const LSUP_NSMap *nsm, LSUP_Term **term);


/** @brief Prototype for decoding a complete RDF document into a graph.
 *
 * Implementations SHOULD consume data from the file handle in chunks.
 *
 * @param[in] rep Open file handle pointing to the RDF data. Implementations
 *  MUST NOT close the file handle.
 *
 * @param[out] gr Pointer to a graph handle to be generated from decoding.
 *
 * @param[out] ct If not NULL, it may be populated with the number of triples
 *  parsed (which may be different from the resulting graph size).
 *  Implementations may choose not to use this, and they must account for the
 *  pointer being NULL.
 *
 * @param[out] err Pointer to error info string. If no error occurs, it yields
 *  NULL.
 *
 * @return Implementations MUST return LSUP_OK on success and a negative value
 *  on parsing error.
 */
typedef LSUP_rc (*gr_decode_fn_t)(
        FILE *rep, LSUP_Graph **gr, size_t *ct, char **err);


/** @brief Codec structure.
 *
 * An instance of this structure is usually defined at compile time (see
 * examples in "include/codec_*.h" and "src/codec_*.c") and should have the
 * following defined:
 *
 * - name: A brief, human-readable name identifying the codec. Stored in a
 *   16-byte buffer.
 * - mimetype: MIME type associated with the codec. Stored in a 32-byte buffer.
 * - extension: File extension associated with the serialized file. Stored in
 *   an 8-byte buffer.
 *
 * NOTE(review): if these fields are meant to hold NUL-terminated strings, the
 * usable lengths are 15, 31 and 7 characters respectively; confirm against
 * the codec definitions.
 *
 * - encode_term: Encode a single term.
 *
 * - encode_graph_init: Initialize a graph encoding loop.
 * - encode_graph_iter: Run one iteration of encoding on one or more triples.
 * - encode_graph_done: Finalize the encoding loop and free the support data.
 *
 * - decode_term: Decode a single term.
 * - decode_graph: Decode an RDF document into a graph.
 *
 * For documentation on the individual encoding and decoding callbacks, see the
 * related function prototypes.
 */
struct codec_t {
    char                name[16];           ///< Name of the codec.
    char                mimetype[32]; ///< MIME type associated with the codec.
    char                extension[8];       ///< Serialized file extension.

    // Encoding.
    term_enc_fn_t       encode_term;        ///< Term encoder function.

    gr_encode_init_fn_t encode_graph_init;  ///< Graph encoder initialization.
    gr_encode_iter_fn_t encode_graph_iter;  ///< Graph encoder iteration.
    gr_encode_done_fn_t encode_graph_done;  ///< Graph encoder finalization.

    // Decoding.
    term_decode_fn_t    decode_term;        ///< Term decoder function.
    gr_decode_fn_t      decode_graph;       ///< Graph decoder function.
};


/*
 * Common utility functions.
 */

/** @brief strdup() for unsigned char.
 *
 * This is to be used with uint8_t sequences considered to be UTF-8 sequences,
 * required by re2c (it won't work with byte sequences containing `NUL`).
 *
 * @param[in] str NUL-terminated byte sequence to duplicate.
 *
 * @return The pointer returned by strdup(), cast to `uint8_t *`. Per POSIX,
 *  strdup() returns newly allocated memory that is released with free(), or
 *  NULL if the allocation fails.
 *
 * NOTE(review): declared `inline` without `static`; presumably one translation
 * unit provides the external definition. Confirm.
 */
inline uint8_t
*uint8_dup (const uint8_t *str)
{ return (uint8_t *) strdup ((char *) str); }


/** @brief strndup() for unsigned char.
 *
 * This is to be used with uint8_t sequences considered to be UTF-8 sequences,
 * required by re2c (it won't work with byte sequences containing `NUL`).
 *
 * @param[in] str Byte sequence to duplicate.
 *
 * @param[in] size Maximum number of bytes to copy; passed unchanged to
 *  strndup().
 *
 * @return The pointer returned by strndup(), cast to `uint8_t *`. Per POSIX,
 *  strndup() returns newly allocated, NUL-terminated memory that is released
 *  with free(), or NULL if the allocation fails.
 *
 * NOTE(review): declared `inline` without `static`; presumably one translation
 * unit provides the external definition. Confirm.
 */
inline uint8_t
*uint8_ndup (const uint8_t *str, size_t size)
{ return (uint8_t *) strndup ((char *) str, size); }


/** @brief Add escape character (backslash) to illegal literal characters.
 *
 * @param[in] in Input string.
 *
 * @param[out] out Pointer to be set to the escaped output string.
 *  NOTE(review): allocation and ownership of the output are not visible in
 *  this header. Given the LSUP_MEM_ERR return code, the string is presumably
 *  allocated by this function and freed by the caller. Confirm against the
 *  implementation.
 *
 * @return LSUP_OK on success; LSUP_MEM_ERR on memory error.
 */
LSUP_rc
escape_lit (const char *in, char **out);


/** @brief Map a control character to the letter of its escape sequence.
 *
 * Tab, backspace, newline, carriage return and form feed are converted to
 * `t`, `b`, `n`, `r` and `f` respectively. Any other character is returned
 * unchanged.
 *
 * The escape backslash is not produced here; it is to be added separately.
 *
 * @param[in] c Character to convert.
 *
 * @return The escape letter for `c`, or `c` itself if it has no mapping.
 */
static inline char
escape_char (const char c) {
    return
        c == '\t' ? 't' :
        c == '\b' ? 'b' :
        c == '\n' ? 'n' :
        c == '\r' ? 'r' :
        c == '\f' ? 'f' :
        c;
}


/** @brief Unescape a single character.
 *
 * Convert escaped special characters such as `\t`, `\n`, etc. into their
 * corresponding code points.
 *
 * Non-special characters are returned unchanged.
 *
 * The function is `static inline` so that every translation unit including
 * this header gets its own definition. A plain `inline` definition would
 * require a separate external definition whenever a call is not inlined
 * (e.g. in unoptimized builds).
 *
 * @param[in] c Character to unescape. Note that this is the single character
 *  after `\`.
 *
 * @return Code point corresponding to the escaped character.
 */
static inline char
unescape_char (const char c)
{
    switch (c) {
        case 't': return '\t';
        case 'b': return '\b';
        case 'n': return '\n';
        case 'r': return '\r';
        case 'f': return '\f';
        // Quotes, backslashes and anything else stand for themselves.
        default: return c;
    }
}


/** @brief Replace `\uxxxx` and `\Uxxxxxxxx` with Unicode bytes.
 *
 * @param[in] esc_str Escaped string.
 *
 * @param[in] size Maximum number of characters to scan, à la strncpy().
 *
 * @return String with escape sequences replaced by Unicode bytes.
 *  NOTE(review): ownership is not visible in this header; presumably the
 *  string is newly allocated and must be freed by the caller. Confirm against
 *  the implementation.
 */
uint8_t *unescape_unicode (const uint8_t *esc_str, size_t size);


/** @brief Format an informational header.
 *
 * The information includes software version and current date. It is terminated
 * by a newline + NUL and prefixed with the string specified in `pfx`. It is
 * NOT prefixed by any comment characters.
 *
 * @param[in] pfx Prefix to add to the string. It may be a comment starter,
 *  such as `# `.
 *  NOTE(review): the parameter is not `const`-qualified; confirm whether the
 *  implementation modifies it before passing string literals.
 *
 * @return The formatted header string.
 *  NOTE(review): ownership is not visible in this header; presumably the
 *  string is allocated by this function and must be freed by the caller.
 *  Confirm against the implementation.
 */
char *fmt_header (char *pfx);


#endif