#include "term.h"

// URI parsing regular expression. Conforms to RFC3986.
#define URI_REGEX_STR \
    "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"

// Length of a string, or 0 for a NULL pointer. Fully parenthesized so that it
// can be used safely inside larger expressions.
#define NLEN(str)   ((str) == NULL ? 0 : strlen ((str)))

// Characters that cause a URI to be rejected by LSUP_term_init().
#define INVALID_URI_CHARS "<>\" {}|\\^`"

// Compiled URI pattern, lazily built on the first URI validation.
static regex_t ptn;
static bool ptn_init = false;


/* Global inline prototypes. */

LSUP_Term *LSUP_uri_new (const char *data);
LSUP_rc LSUP_uri_init (LSUP_Term *term, const char *data);


/**
 * Length of a language tag stored in (or destined for) a fixed-width field.
 *
 * Scans at most LANG_SIZE bytes, so it is safe both for NUL-terminated strings
 * shorter than the field and for a completely filled, unterminated field.
 *
 * @param lang Language tag; must not be NULL.
 * @return Number of bytes before the first NUL, capped at LANG_SIZE.
 */
static size_t
term_lang_len (const char *lang)
{
    size_t len = 0;

    while (len < (size_t) LANG_SIZE && lang[len] != '\0') len++;

    return len;
}


/**
 * Free global regex struct. Register with atexit().
 */
void
term_cleanup (void)
{
    if (ptn_init) {
        regfree (&ptn);
        // Allow the pattern to be rebuilt if a URI is validated afterwards.
        ptn_init = false;
    }
}


/**
 * Allocate a new term and, unless the type is LSUP_TERM_UNDEFINED,
 * initialize it with LSUP_term_init().
 *
 * @param type     Term type. With LSUP_TERM_UNDEFINED only the type is set
 *                 and all other fields stay zeroed.
 * @param data     Term data, copied into the term.
 * @param datatype Data type string, or NULL. Copied into the term.
 * @param lang     Language tag, or NULL. Copied into the term.
 * @return New term to be released with LSUP_term_free(), or NULL on
 *         allocation or initialization failure.
 */
LSUP_Term *
LSUP_term_new (
        LSUP_term_type type, const char *data, char *datatype, char *lang)
{
    LSUP_Term *term;

    CALLOC_GUARD (term, NULL);

    // If undefined, just set the type.
    if (type == LSUP_TERM_UNDEFINED) {
        term->type = type;
        return term;
    }

    if (UNLIKELY (LSUP_term_init (
                    term, type, data, datatype, lang) != LSUP_OK)) {
        // Initialization may have failed after allocating one of the strings.
        free (term->data);
        free (term->datatype);
        free (term);
        return NULL;
    }

    return term;
}


/**
 * Allocate a new term and fill it from a serialized buffer.
 *
 * @param sterm Buffer in the format produced by LSUP_term_serialize().
 * @return New term to be released with LSUP_term_free(), or NULL on
 *         allocation or deserialization failure.
 */
LSUP_Term *
LSUP_term_new_from_buffer (const LSUP_Buffer *sterm)
{
    LSUP_Term *term;

    // The term must be zeroed: LSUP_term_init() calls realloc() on the data
    // and datatype pointers, which must therefore start out as NULL.
    CALLOC_GUARD (term, NULL);

    if (UNLIKELY (LSUP_term_deserialize (sterm, term) != LSUP_OK)) {
        free (term->data);
        free (term->datatype);
        free (term);
        return NULL;
    }

    return term;
}


/**
 * Allocate a new buffer and serialize a term into it.
 *
 * @param term Term to serialize.
 * @return New buffer, or NULL if allocation or serialization failed
 *         (including when term is NULL).
 */
LSUP_Buffer *
LSUP_buffer_new_from_term (const LSUP_Term *term)
{
    LSUP_Buffer *sterm;

    MALLOC_GUARD (sterm, NULL);
    // Start from an empty buffer so that no indeterminate value is read.
    sterm->addr = NULL;
    sterm->size = 0;

    if (LSUP_term_serialize (term, sterm) != LSUP_OK) {
        free (sterm);
        return NULL;
    }

    return sterm;
}


/**
 * Initialize, or re-initialize, a term in place.
 *
 * The data and datatype strings are copied with realloc(), so the term must
 * either be zero-initialized or have been initialized by this function before.
 *
 * @param term     Term to initialize.
 * @param type     Term type. For LSUP_TERM_URI the data is validated.
 * @param data     Term data; must not be NULL.
 * @param datatype Data type string, or NULL to clear the data type.
 * @param lang     Language tag, or NULL to clear it. At most LANG_SIZE bytes
 *                 are stored.
 * @return LSUP_OK on success; LSUP_VALUE_ERR if data is NULL or is not an
 *         acceptable URI; LSUP_ERROR if the URI pattern cannot be compiled;
 *         LSUP_MEM_ERR on allocation failure.
 */
LSUP_rc
LSUP_term_init (
        LSUP_Term *term, LSUP_term_type type,
        const char *data, char *datatype, char *lang)
{
    // This can never be LSUP_TERM_UNDEFINED.
    if (!data) return LSUP_VALUE_ERR;

    term->type = type;

    // Validate URI.
    if (term->type == LSUP_TERM_URI) {
        // TODO Cheap fix. Should url-encode all invalid chars.
        if (strpbrk (data, INVALID_URI_CHARS) != NULL) {
            fprintf (
                    stderr, "Characters %s are not allowed.\n",
                    INVALID_URI_CHARS);

            return LSUP_VALUE_ERR;
        }

        // Compile the pattern once and release it at process exit.
        if (UNLIKELY (!ptn_init)) {
            int rc = regcomp (&ptn, URI_REGEX_STR, REG_EXTENDED);
            if (rc != 0) return LSUP_ERROR;
            ptn_init = true;
            atexit (term_cleanup);
        }

        if (regexec (&ptn, data, 0, NULL, 0) != 0) {
            fprintf (stderr, "Error matching URI pattern.\n");

            return LSUP_VALUE_ERR;
        }
    }

    // Go through a temporary so the old pointer survives a failed realloc().
    char *data_tmp = realloc (term->data, strlen (data) + 1);
    if (UNLIKELY (!data_tmp)) return LSUP_MEM_ERR;
    term->data = data_tmp;
    strcpy (term->data, data);

    if (datatype) {
        data_tmp = realloc (term->datatype, strlen (datatype) + 1);
        if (UNLIKELY (!data_tmp)) return LSUP_MEM_ERR;
        term->datatype = data_tmp;
        strcpy (term->datatype, datatype);
    } else {
        free (term->datatype);
        term->datatype = NULL;
    }

    // Zero-pad the language field, then copy no more than the source holds:
    // the source string may be shorter than LANG_SIZE.
    memset (term->lang, 0, LANG_SIZE);
    if (lang) {
        // TODO validate language and country code
        memcpy (term->lang, lang, term_lang_len (lang));
    }

    return LSUP_OK;
}


/*
 * This function fills the buffer with the following byte sequence:
 *
 * - 1 byte for the term type;
 * - the NUL-terminated term data;
 * - if a data type is set: the NUL-terminated data type;
 * - if a data type is set and the language tag is not empty: the
 *   NUL-terminated language tag.
 *
 * The index for `data` is always 1. The index for `datatype` is found by the
 * terminating NUL of `data`, and the index for `lang` by the terminating NUL
 * of `datatype`.
 *
 * Serialized representations of some RDF terms:
 *
 * <http://hello.org>
 *
 * 0      1                     size=18
 * | \x01 | http://hello.org\x00 |
 *   type   data
 *
 * "hello"
 *
 * 0      1          size=7
 * | \x03 | hello\x00 |
 *   type   data
 *
 * "hello"^^xsd:string
 *
 * 0      1          7                size=18
 * | \x03 | hello\x00 | xsd:string\x00 |
 *   type   data        datatype
 *
 * (note: the "xsd:" prefix is used for simplification here, it would
 * normally be a fully qualified URI)
 *
 * "hello"@en-US
 *
 * 0      1          7                18          size=24
 * | \x03 | hello\x00 | xsd:string\x00 | en-US\x00 |
 *   type   data        datatype         lang
 *
 * Returns LSUP_OK on success, LSUP_NOACTION if `term` is NULL,
 * LSUP_VALUE_ERR if the term has no data, and LSUP_MEM_ERR if the buffer has
 * no storage after LSUP_buffer_init().
 */
LSUP_rc
LSUP_term_serialize (const LSUP_Term *term, LSUP_Buffer *sterm)
{
    if (UNLIKELY (term == NULL)) return LSUP_NOACTION;
    // An undefined term carries no data and cannot be serialized.
    if (UNLIKELY (term->data == NULL)) return LSUP_VALUE_ERR;

    const size_t data_idx = 1;
    size_t data_len = strlen (term->data) + 1;
    size_t datatype_idx = 0, datatype_len = 0;
    size_t lang_idx = 0, lang_len = 0;
    size_t size = data_idx + data_len;

    if (term->datatype != NULL) {
        datatype_idx = size;
        datatype_len = strlen (term->datatype) + 1;
        size += datatype_len;

        // The language field may be completely filled and unterminated.
        lang_len = term_lang_len (term->lang);
        if (lang_len > 0) {
            lang_idx = size;
            size += lang_len + 1;
        }
    }

    //TRACE ("Serialized term size: %lu", size);
    LSUP_buffer_init (sterm, size, NULL);
    // NOTE(review): assumes a failed LSUP_buffer_init() leaves addr NULL for
    // a buffer that started out empty; confirm against its implementation.
    if (UNLIKELY (sterm->addr == NULL)) return LSUP_MEM_ERR;

    char *out = (char *) sterm->addr;

    // Store the type by value, so that the byte written does not depend on
    // the width or byte order of the type field.
    out[0] = (char) term->type;

    // Copy data.
    memcpy (out + data_idx, term->data, data_len);

    if (term->datatype != NULL) {
        // Copy data type.
        memcpy (out + datatype_idx, term->datatype, datatype_len);

        // Copy lang tag and terminate it explicitly.
        if (lang_len > 0) {
            memcpy (out + lang_idx, term->lang, lang_len);
            out[lang_idx + lang_len] = '\0';
        }
    }

    return LSUP_OK;
}


/**
 * Initialize a term from a buffer produced by LSUP_term_serialize().
 *
 * Every string is looked up within the bounds given by the buffer size; a
 * buffer with a missing terminator is rejected instead of being read past its
 * end. The data type and language tag are only read for literal terms.
 *
 * @param sterm Serialized term.
 * @param term  Term to initialize. It must satisfy the requirements of
 *              LSUP_term_init().
 * @return LSUP_VALUE_ERR if the buffer is NULL, empty or malformed; otherwise
 *         the return value of LSUP_term_init().
 */
LSUP_rc
LSUP_term_deserialize (const LSUP_Buffer *sterm, LSUP_Term *term)
{
    // The smallest valid buffer holds the type byte and an empty data string.
    if (UNLIKELY (sterm == NULL || sterm->addr == NULL || sterm->size < 2))
        return LSUP_VALUE_ERR;

    char *base = (char *) sterm->addr;
    size_t size = sterm->size;
    char type = base[0];
    char *datatype = NULL;
    // One spare byte guarantees termination of a full-width tag.
    char lang[LANG_SIZE + 1] = {0};

    size_t cur = 1;
    char *data = base + cur;
    char *data_end = memchr (data, '\0', size - cur);
    if (UNLIKELY (data_end == NULL)) return LSUP_VALUE_ERR;
    cur = (size_t) (data_end - base) + 1;

    if (type == LSUP_TERM_LITERAL && cur < size) {
        datatype = base + cur;
        char *datatype_end = memchr (datatype, '\0', size - cur);
        if (UNLIKELY (datatype_end == NULL)) return LSUP_VALUE_ERR;
        cur = (size_t) (datatype_end - base) + 1;

        // An empty data type string means "no data type".
        if (datatype_end == datatype) datatype = NULL;

        if (cur < size) {
            const char *src = base + cur;
            size_t max_len = size - cur;
            if (max_len > (size_t) LANG_SIZE) max_len = (size_t) LANG_SIZE;

            size_t lang_len = 0;
            while (lang_len < max_len && src[lang_len] != '\0') lang_len++;
            memcpy (lang, src, lang_len);
        }
    }

    return LSUP_term_init (term, type, data, datatype, lang);
}


/**
 * Compare two terms for equality.
 *
 * Terms are equal if they have the same type and data. Literal terms must
 * also have the same data type (or both none) and the same language tag.
 *
 * @param term1 First term; must not be NULL.
 * @param term2 Second term; must not be NULL.
 * @return true if the terms are equal, false otherwise.
 */
bool
LSUP_term_equals (const LSUP_Term *term1, const LSUP_Term *term2)
{
    if (term1->type != term2->type) return false;

    // Terms without data (e.g. undefined ones) only equal each other.
    if (term1->data == NULL || term2->data == NULL) {
        if (term1->data != term2->data) return false;
    } else if (strcmp (term1->data, term2->data) != 0) {
        return false;
    }

    if (term1->type == LSUP_TERM_LITERAL) {
        if ((term1->datatype == NULL) != (term2->datatype == NULL)) // XOR
            return false;

        if (
                term1->datatype != NULL &&
                strcmp (term1->datatype, term2->datatype) != 0)
            return false;

        // The language tag is a fixed-width field that may be unterminated
        // when full, so the comparison is bounded by the field width.
        if (strncmp (term1->lang, term2->lang, LANG_SIZE) != 0)
            return false;
    }

    return true;
}


/**
 * Release the strings owned by a term, leaving the term structure itself
 * allocated and reusable.
 *
 * @param term Term to clear; must not be NULL.
 */
void
LSUP_term_done (LSUP_Term *term)
{
    // free(NULL) is a no-op, so no guards are needed.
    free (term->data);
    term->data = NULL;

    free (term->datatype);
    term->datatype = NULL;
}


/**
 * Release a term and everything it owns.
 *
 * @param term Term to free. NULL is accepted and ignored.
 */
void
LSUP_term_free (LSUP_Term *term)
{
    if (UNLIKELY (term == NULL)) return;

    LSUP_term_done (term);
    free (term);
}


// Extern inline functions.

LSUP_Key LSUP_term_hash (const LSUP_Term *term);