from uuid import uuid4

from rdflib import URIRef, BNode, Literal

#from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport uint64_t
from libc.stdlib cimport free
from libc.string cimport memcpy

from cymem.cymem cimport Pool

from lakesuperior.cy_include cimport cytpl as tpl
from lakesuperior.model.base cimport Buffer, buffer_dump


DEF LSUP_TERM_TYPE_URIREF = 1
DEF LSUP_TERM_TYPE_BNODE = 2
DEF LSUP_TERM_TYPE_LITERAL = 3
DEF LSUP_TERM_PK_FMT = b'csss'
# Reflects the Term structure
DEF LSUP_TERM_STRUCT_PK_FMT = b'S(' + LSUP_TERM_PK_FMT + b')'
# URI parsing regular expression. Conforms to RFC3986.
#DEF URI_REGEX_STR = (
#    b'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
#)

#cdef char* ptn = URI_REGEX_STR
#regcomp(&uri_regex, ptn, REG_NOSUB)
# Compile with no catch groups.
# TODO This should be properly cleaned up on application shutdown:
# regfree(&uri_regex)

#cdef int term_new(
#    Term* term, char type, char* data, char* datatype=NULL, char* lang=NULL
#) except -1:
#    if regexec(&uri_regex, data, 0, NULL, 0) == REG_NOMATCH:
#        raise ValueError('Not a valid URI.')
#    term.type = type
#
#    term.data = (
#        data # TODO use C UUID v4 (RFC 4122) generator
#        if term.type == LSUP_TERM_TYPE_BNODE
#        else data
#    )
#    if term.type == LSUP_TERM_TYPE_LITERAL:
#        term.datatype = datatype
#        term.lang = lang
#
#    return 0


cdef int serialize(const Term *term, Buffer *sterm, Pool pool=None) except -1:
    """
    Serialize a Term into a binary buffer.

    :param term: Term structure to pack with the ``S(csss)`` tpl format.
    :param sterm: Output buffer; its ``addr`` and ``sz`` members are set.
    :param pool: Optional cymem pool. If given, the packed bytes are copied
        into memory allocated from the pool and the temporary tpl buffer is
        released. If ``None``, ``sterm.addr`` is the buffer produced by tpl
        itself and the caller is responsible for releasing it with ``free``.

    :raise MemoryError: If tpl fails to pack the term or the pool allocation
        fails.
    """
    cdef:
        unsigned char *addr = NULL
        size_t sz = 0

    # tpl_jot reports failure through its return value; without this check
    # ``addr`` and ``sz`` would be read uninitialized.
    if tpl.tpl_jot(
        tpl.TPL_MEM, &addr, &sz, LSUP_TERM_STRUCT_PK_FMT, term
    ) != 0 or addr is NULL:
        raise MemoryError('Error serializing term.')

    if pool is None:
        sterm.addr = addr
    else:
        # addr is within this function scope. Must be copied to the cymem
        # pool, after which the tpl-allocated original is no longer needed.
        try:
            sterm.addr = <unsigned char *>pool.alloc(sz, 1)
            if not sterm.addr:
                raise MemoryError()
            memcpy(sterm.addr, addr, sz)
        finally:
            free(addr)
    sterm.sz = sz

    return 0


cdef int deserialize(const Buffer *data, Term *term) except -1:
    """
    Return a term from serialized binary data.

    The leading ``csss`` fields of the packed structure are read straight
    into the members of ``term`` with a tpl data peek.

    NOTE(review): per the tpl documentation the three string members appear
    to be heap-allocated by tpl and owned by the caller afterwards (and may
    be NULL if a NULL string was packed) -- confirm and make sure callers
    release them.

    :param data: Buffer holding a term serialized with :py:func:`serialize`.
    :param term: Output Term structure.

    :raise MemoryError: If tpl cannot read the buffer.
    """
    cdef void *pk_fmt

    #print(f'Deserializing: {buffer_dump(data)}')
    pk_fmt = tpl.tpl_peek(
            tpl.TPL_MEM | tpl.TPL_DATAPEEK, data[0].addr, data[0].sz,
            LSUP_TERM_PK_FMT, &(term[0].type), &(term[0].data),
            &(term[0].datatype), &(term[0].lang))

    if pk_fmt is NULL:
        raise MemoryError('Error deserializing term.')

    # On success tpl_peek hands back a value that must be released here.
    free(pk_fmt)

    return 0


cdef int from_rdflib(term_obj, Term *term) except -1:
    """
    Return a Term struct obtained from a Python/RDFLib term.

    NOTE(review): the ``char *`` members of ``term`` point into the internal
    buffers of ``bytes`` objects that are local to this function. Unless
    something else keeps those objects alive, the pointers look dangling as
    soon as this function returns. Fixing this needs an ownership decision
    (e.g. a memory pool argument) that would change the interface -- confirm
    how callers use the result.

    :param term_obj: ``URIRef``, ``BNode`` or ``Literal`` instance.
    :param term: Output Term structure.

    :raise ValueError: If ``term_obj`` is none of the supported types.
    """
    _data = str(term_obj).encode()
    term[0].data = _data

    if isinstance(term_obj, Literal):
        _datatype = (getattr(term_obj, 'datatype') or '').encode()
        _lang = (getattr(term_obj, 'language') or '').encode()
        term[0].type = LSUP_TERM_TYPE_LITERAL
        term[0].datatype = _datatype
        term[0].lang = _lang
        return 0

    # Only literals carry a data type or a language tag.
    term[0].datatype = NULL
    term[0].lang = NULL
    if isinstance(term_obj, URIRef):
        term[0].type = LSUP_TERM_TYPE_URIREF
    elif isinstance(term_obj, BNode):
        term[0].type = LSUP_TERM_TYPE_BNODE
    else:
        raise ValueError(f'Unsupported term type: {type(term_obj)}')

    return 0


cdef int serialize_from_rdflib(
        term_obj, Buffer *data, Pool pool=None
    ) except -1:
    """
    Return a Buffer struct from a Python/RDFLib term.

    The RDFLib-to-Term conversion is deliberately done inline rather than by
    calling :py:func:`from_rdflib`: the encoded ``bytes`` objects backing the
    Term strings must stay alive until :py:func:`serialize` has packed them.

    :param term_obj: ``URIRef``, ``BNode`` or ``Literal`` instance.
    :param data: Output buffer, filled in by :py:func:`serialize`.
    :param pool: Optional cymem pool, passed on to :py:func:`serialize`.

    :raise ValueError: If ``term_obj`` is none of the supported types.
    """
    cdef Term _term

    # From RDFlib
    _data = str(term_obj).encode()
    _term.data = _data

    if isinstance(term_obj, Literal):
        _datatype = (getattr(term_obj, 'datatype') or '').encode()
        _lang = (getattr(term_obj, 'language') or '').encode()
        _term.type = LSUP_TERM_TYPE_LITERAL
        _term.datatype = _datatype
        _term.lang = _lang
    else:
        _term.datatype = NULL
        _term.lang = NULL
        if isinstance(term_obj, URIRef):
            _term.type = LSUP_TERM_TYPE_URIREF
        elif isinstance(term_obj, BNode):
            _term.type = LSUP_TERM_TYPE_BNODE
        else:
            raise ValueError(f'Unsupported term type: {type(term_obj)}')

    serialize(&_term, data, pool)

    return 0


cdef inline str _c_string_or_none(const char *c_str):
    """
    Decode a C string, mapping both NULL and the empty string to ``None``.
    """
    if c_str is NULL or c_str[0] == 0:
        return None

    return (<bytes>c_str).decode()


cdef object to_rdflib(const Term *term):
    """
    Return an RDFLib term.

    For literals, a non-empty language tag takes precedence over the data
    type; an empty or NULL language tag or data type is treated as absent.
    Both are handed to RDFLib as ``str``, not ``bytes``.

    :param term: Term structure to convert.

    :raise IOError: If the term type code is not one of the known ones.
    """
    cdef:
        str data = (<bytes>term[0].data).decode()
        str lang
        str datatype

    if term[0].type == LSUP_TERM_TYPE_LITERAL:
        # Test string *content*, not pointer nullness: an empty language tag
        # is stored as a non-NULL pointer to "" and must not hide the
        # data type.
        lang = _c_string_or_none(term[0].lang)
        datatype = (
            None if lang is not None
            else _c_string_or_none(term[0].datatype))

        return Literal(data, datatype=datatype, lang=lang)

    if term[0].type == LSUP_TERM_TYPE_URIREF:
        return URIRef(data)

    if term[0].type == LSUP_TERM_TYPE_BNODE:
        return BNode(data)

    raise IOError(f'Unknown term type code: {term[0].type}')


cdef object deserialize_to_rdflib(const Buffer *data):
    """
    Return a Python/RDFLib term from a serialized Cython term.

    :param data: Buffer holding a term serialized with :py:func:`serialize`.
    """
    cdef Term t

    deserialize(data, &t)
    try:
        return to_rdflib(&t)
    finally:
        # The strings were allocated by tpl during deserialization and the
        # RDFLib term holds its own copies, so release them here.
        # free(NULL) is a no-op, which covers NULL strings.
        free(t.data)
        free(t.datatype)
        free(t.lang)


cdef object to_bytes(const Term *term):
    """
    Return a Python bytes object of the serialized term.

    :param term: Term structure to serialize.
    """
    cdef:
        Buffer pk_t
        unsigned char *bytestream

    # No pool is passed, so this function owns the serialized buffer.
    serialize(term, &pk_t)
    bytestream = <unsigned char *>pk_t.addr

    try:
        # Slicing copies the data into a new Python bytes object.
        return <bytes>bytestream[:pk_t.sz]
    finally:
        free(bytestream)