import logging

from functools import wraps

from rdflib import Graph
from rdflib.term import Node

from lakesuperior import env

from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.string cimport memcmp

from lakesuperior.cy_include cimport calg
from lakesuperior.cy_include cimport cylmdb as lmdb
from lakesuperior.store.ldp_rs cimport term
from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
        KLEN, DBL_KLEN, TRP_KLEN, TripleKey, LmdbTriplestore)
from lakesuperior.store.ldp_rs.keyset cimport Keyset
from lakesuperior.store.ldp_rs.triple cimport Triple
from lakesuperior.util.hash cimport Hash64, hash64


logger = logging.getLogger(__name__)


def use_data(fn):
    """
    Decorator to indicate that a set operation between two SimpleGraph
    instances should use the ``data`` property of the second term. The second
    term can also be a simple set.

    :param callable fn: Binary method taking ``(self, other)``.

    :return: Wrapped method that unwraps ``other`` to a Python set if it is
        a ``SimpleGraph`` and then delegates to ``fn``.
    """
    @wraps(fn)
    def _wrapper(self, other):
        if isinstance(other, SimpleGraph):
            other = other.data
        # Delegate to the wrapped method; without this call every decorated
        # operator would do nothing and return ``None``.
        return fn(self, other)

    return _wrapper


cdef unsigned int set_item_hash_fn(calg.SetValue data):
    """
    Hash function for the CAlg set implementation.

    https://fragglet.github.io/c-algorithms/doc/set_8h.html#6c7986a2a80d7a3cb7b9d74e1c6fef97

    :param SetItem *data: Pointer to a SetItem structure.
    """
    cdef:
        Hash64 hash
        term.Buffer sr_data

    # NOTE(review): ``data`` is presumably meant to be cast to a ``SetItem``
    # pointer before member access (the cast seems to have been lost) —
    # confirm against the original file and the .pxd.
    sr_data.addr = (data).data
    sr_data.sz = (data).size

    hash64(&sr_data, &hash)

    return hash


cdef bint set_item_cmp_fn(calg.SetValue v1, calg.SetValue v2):
    """
    Compare function for two CAlg set items.

    https://fragglet.github.io/c-algorithms/doc/set_8h.html#40fa2c86d5b003c1b0b0e8dd1e4df9f4

    :return: ``True`` if the two items have the same size and the same
        bytes, ``False`` otherwise.
    """
    # NOTE(review): ``v1`` and ``v2`` are presumably meant to be cast to
    # ``SetItem`` pointers before indexing (the casts seem to have been
    # lost) — confirm against the original file and the .pxd.
    if (v1)[0].size != (v2)[0].size:
        return False

    # memcmp() returns 0 when the buffers are identical, whereas the set
    # equality callback must return non-zero for equal values.
    return memcmp(
            (v1)[0].data, (v2)[0].data,
            (v1)[0].size) == 0


cdef class SimpleGraph:
    """
    Fast and simple implementation of a graph.

    Most functions should mimic RDFLib's graph with less overhead. It uses
    the same funny but functional slicing notation.

    Instances of this class hold a set of pointers to
    :py:class:`~lakesuperior.store.ldp_rs.triple.Triple` structures. No data
    are copied but care must be taken when freeing the triples pointed to.

    A SimpleGraph can be obtained from a
    :py:class:`lakesuperior.store.keyset.Keyset` which is convenient because
    a Keyset can be obtained very efficiently from querying a store, then also
    very efficiently filtered and eventually converted into a set of readable
    terms.

    An instance of this class can also be converted to and from a
    ``rdflib.Graph`` instance. TODO verify that this frees Cython pointers.
    """

    def __cinit__(
            self, calg.Set *cdata=NULL, Keyset keyset=None, store=None,
            set data=set()):
        """
        Initialize the graph with pre-existing data or by looking up a store.

        One of ``cdata``, ``keyset``, or ``data`` can be provided. If more than
        one of these is provided, precedence is given in the mentioned order.
        If none of them is specified, an empty graph is initialized.

        :param calg.Set cdata: Initial data as a C ``Set`` struct.
        :param Keyset keyset: Keyset to create the graph from. Keys will be
            converted to set elements.
        :param lakesuperior.store.ldp_rs.LmdbTripleStore store: store to
            look up the keyset. Only used if ``keyset`` is specified. If not
            set, the environment store is used.
        :param set data: Initial data as a set of 3-tuples of RDFLib terms.
            The default set is only read here, never mutated.

        :raises MemoryError: If the triple buffer cannot be allocated.
        """
        self.store = store or env.app_defaults.rdf_store

        cdef:
            size_t i = 0
            TripleKey spok
            term.Buffer pk_t

        if cdata is not NULL:
            # Get data from provided C set.
            self._data = cdata

        else:
            # Initialize empty data set.
            self._data = calg.set_new(set_item_hash_fn, set_item_cmp_fn)
            if keyset is not None:
                # Populate with triples extracted from provided key set.
                # NOTE(review): every iteration writes to the same
                # ``self._trp`` and inserts the same address — confirm this
                # is the intended storage for keyset-derived triples.
                while keyset.next(spok):
                    self.store.lookup_term(spok[:KLEN], &pk_t)
                    term.deserialize(&pk_t, self._trp.s)

                    self.store.lookup_term(spok[KLEN:DBL_KLEN], &pk_t)
                    term.deserialize(&pk_t, self._trp.p)

                    self.store.lookup_term(spok[DBL_KLEN:TRP_KLEN], &pk_t)
                    term.deserialize(&pk_t, self._trp.o)

                    calg.set_insert(self._data, &self._trp)
            else:
                # Populate with provided Python set.
                # NOTE(review): the allocation result presumably needs a
                # cast to a ``Triple`` pointer (seems to have been lost) —
                # confirm against the original file.
                self._trp = PyMem_Malloc(sizeof(Triple) * len(data))
                if not self._trp:
                    raise MemoryError()

                for s, p, o in data:
                    term.from_rdflib(s, self._trp[i].s)
                    term.from_rdflib(p, self._trp[i].p)
                    term.from_rdflib(o, self._trp[i].o)
                    # Insert the address of the i-th slot and advance, so
                    # that each triple gets its own slot instead of all of
                    # them overwriting slot 0.
                    calg.set_insert(self._data, self._trp + i)
                    i += 1

    def __dealloc__(self):
        """
        Free the triple pointer.
        """
        PyMem_Free(self._trp)
        # TODO This should free the structs pointed to as well, unless they
        # were provided as ``cdata`` in the constructor (i.e. they were
        # generated externally).

    @property
    def data(self):
        """
        Triple data as a Python set.

        :rtype: set
        """
        return self._data_as_set()

    cdef void _data_from_lookup(
            self, LmdbTriplestore store, tuple trp_ptn, ctx=None) except *:
        """
        Look up triples in the triplestore and load them into ``data``.

        :param LmdbTriplestore store: Reference to a LMDB triplestore. This
            is normally set to ``lakesuperior.env.app_globals.rdf_store``.
        :param tuple trp_ptn: 3-tuple of RDFlib terms or ``None``.
        :param ctx: Context passed to the store lookup together with the
            pattern.
        """
        cdef:
            size_t i
            unsigned char spok[TRP_KLEN]

        self._data = calg.set_new(set_item_hash_fn, set_item_cmp_fn)
        with store.txn_ctx():
            keyset = store.triple_keys(trp_ptn, ctx)
            for i in range(keyset.ct):
                # Each key occupies TRP_KLEN bytes in the keyset buffer.
                spok = keyset.data + i * TRP_KLEN
                self.data.add(store.from_trp_key(spok[: TRP_KLEN]))
                # NOTE(review): ``serialize_triple`` is not defined or
                # imported in this module — confirm where it comes from.
                strp = serialize_triple(self._trp)

                calg.set_insert(self._data, strp)

    cdef _data_as_set(self):
        """
        Convert triple data to a Python set.

        Not implemented yet: currently returns ``None``.

        :rtype: set
        """
        pass

    # Basic set operations.

    def add(self, dataset):
        """ Set union. """
        self.data.add(dataset)

    def remove(self, item):
        """
        Remove one item from the graph.

        :param tuple item: A 3-tuple of RDFlib terms. Only exact terms, i.e.
            wildcards are not accepted.
        """
        self.data.remove(item)

    def __len__(self):
        """ Number of triples in the graph. """
        return len(self.data)

    @use_data
    def __eq__(self, other):
        """ Equality operator between ``SimpleGraph`` instances. """
        return self.data == other

    def __repr__(self):
        """
        String representation of the graph.

        It provides the number of triples in the graph and memory address of
        the instance.
        """
        return (f'<{self.__class__.__name__} @{hex(id(self))} '
            f'length={len(self.data)}>')

    def __str__(self):
        """ String dump of the graph triples. """
        return str(self.data)

    @use_data
    def __sub__(self, other):
        """ Set subtraction. """
        return self.data - other

    @use_data
    def __isub__(self, other):
        """ In-place set subtraction. """
        self.data -= other
        return self

    @use_data
    def __and__(self, other):
        """ Set intersection. """
        return self.data & other

    @use_data
    def __iand__(self, other):
        """ In-place set intersection. """
        self.data &= other
        return self

    @use_data
    def __or__(self, other):
        """ Set union. """
        return self.data | other

    @use_data
    def __ior__(self, other):
        """ In-place set union. """
        self.data |= other
        return self

    @use_data
    def __xor__(self, other):
        """ Set exclusive intersection (XOR). """
        return self.data ^ other

    @use_data
    def __ixor__(self, other):
        """ In-place set exclusive intersection (XOR). """
        self.data ^= other
        return self

    def __contains__(self, item):
        """
        Whether the graph contains a triple.

        :rtype: boolean
        """
        return item in self.data

    def __iter__(self):
        """ Graph iterator. It iterates over the set triples. """
        return self.data.__iter__()

    # Slicing.

    def __getitem__(self, item):
        """
        Slicing function.

        It behaves similarly to RDFLib graph slicing: the ``start``, ``stop``
        and ``step`` of the slice are used as subject, predicate and object.

        :raises TypeError: If ``item`` is not a slice.
        """
        if isinstance(item, slice):
            s, p, o = item.start, item.stop, item.step
            return self._slice(s, p, o)
        else:
            raise TypeError(f'Wrong slice format: {item}.')

    cpdef void set(self, tuple trp) except *:
        """
        Set a single value for subject and predicate.

        Remove all triples matching ``s`` and ``p`` before adding ``s p o``.

        :param tuple trp: 3-tuple of terms; none of them may be ``None``.

        :raises ValueError: If any term of the triple is ``None``.
        """
        # Validate first, so that an invalid triple does not wipe out the
        # existing values for ``s`` and ``p`` before the error is raised.
        if None in trp:
            raise ValueError(f'Invalid triple: {trp}')
        self.remove_triples((trp[0], trp[1], None))
        self.data.add(trp)

    cpdef void remove_triples(self, pattern) except *:
        """
        Remove triples by pattern.

        The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.

        :param tuple pattern: 3-tuple of terms; ``None`` acts as a wildcard.
        """
        s, p, o = pattern
        for match in self.lookup(s, p, o):
            logger.debug(f'Removing from graph: {match}.')
            self.data.remove(match)

    cpdef object as_rdflib(self):
        """
        Return the data set as an RDFLib Graph.

        :rtype: rdflib.Graph
        """
        gr = Graph()
        for trp in self.data:
            gr.add(trp)

        return gr

    cdef _slice(self, s, p, o):
        """
        Return terms filtered by other terms.

        This behaves like the rdflib.Graph slicing policy: the terms given
        are used as a filter and only the unspecified positions are
        returned. If all three terms are given, a boolean indicating whether
        the triple is in the graph is returned.
        """
        if s is None and p is None and o is None:
            return self.data
        elif s is None and p is None:
            return {(r[0], r[1]) for r in self.data if r[2] == o}
        elif s is None and o is None:
            return {(r[0], r[2]) for r in self.data if r[1] == p}
        elif p is None and o is None:
            return {(r[1], r[2]) for r in self.data if r[0] == s}
        elif s is None:
            return {r[0] for r in self.data if r[1] == p and r[2] == o}
        elif p is None:
            return {r[1] for r in self.data if r[0] == s and r[2] == o}
        elif o is None:
            return {r[2] for r in self.data if r[0] == s and r[1] == p}
        else:
            # all given
            return (s,p,o) in self.data

    cpdef lookup(self, s, p, o):
        """
        Look up triples by a pattern.

        :param s: Subject to match, or ``None`` for any.
        :param p: Predicate to match, or ``None`` for any.
        :param o: Object to match, or ``None`` for any.

        :return: Set of matching 3-tuples; empty if nothing matches.
        :rtype: set
        """
        logger.debug(f'Looking up in graph: {s}, {p}, {o}.')
        if s is None and p is None and o is None:
            return self.data
        elif s is None and p is None:
            return {r for r in self.data if r[2] == o}
        elif s is None and o is None:
            return {r for r in self.data if r[1] == p}
        elif p is None and o is None:
            return {r for r in self.data if r[0] == s}
        elif s is None:
            return {r for r in self.data if r[1] == p and r[2] == o}
        elif p is None:
            return {r for r in self.data if r[0] == s and r[2] == o}
        elif o is None:
            return {r for r in self.data if r[0] == s and r[1] == p}
        else:
            # all given: return a one-element set (not the bare tuple) so
            # that callers iterating the result get triples, like in every
            # other branch.
            return {(s, p, o)} if (s, p, o) in self.data else set()

    cpdef set terms(self, str type):
        """
        Get all terms of a type: subject, predicate or object.

        :param str type: One of ``s``, ``p`` or ``o``.

        :raises ValueError: If ``type`` is not found in ``'spo'``.
        """
        i = 'spo'.index(type)
        return {r[i] for r in self.data}


cdef class Imr(SimpleGraph):
    """
    In-memory resource data container.

    This is an extension of :py:class:`~SimpleGraph` that adds a subject URI to
    the data set and some convenience methods.

    An instance of this class can be converted to a ``rdflib.Resource``
    instance.

    Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
    will create a new ``Imr`` instance with the same subject URI.
    """

    def __init__(self, str uri, *args, **kwargs):
        """
        Initialize the graph with pre-existing data or by looking up a store.

        All arguments other than ``uri`` are passed on to the parent
        initializer.

        :param rdflib.URIRef uri: The graph URI. This will serve as the
            subject for some queries.
        """
        # NOTE(review): Cython passes constructor arguments to ``__cinit__``
        # as well, and ``SimpleGraph.__cinit__`` does not accept ``uri`` —
        # confirm how instances are meant to be constructed.
        super().__init__(*args, **kwargs)

        self.uri = uri

    @property
    def identifier(self):
        """
        IMR URI. For compatibility with RDFLib Resource.

        :rtype: string
        """
        return self.uri

    @property
    def graph(self):
        """
        Return a SimpleGraph with the same data.

        :rtype: SimpleGraph
        """
        # Pass the set by keyword: the first positional parameter of the
        # constructor is the C-level ``cdata`` pointer, not the Python set.
        return SimpleGraph(data=self.data)

    def __repr__(self):
        """
        String representation of an Imr.

        This includes the subject URI, number of triples contained and the
        memory address of the instance.
        """
        return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
            f'length={len(self.data)}>')

    @use_data
    def __sub__(self, other):
        """
        Set difference. This creates a new Imr with the same subject URI.
        """
        return self.__class__(uri=self.uri, data=self.data - other)

    @use_data
    def __and__(self, other):
        """
        Set intersection. This creates a new Imr with the same subject URI.
        """
        return self.__class__(uri=self.uri, data=self.data & other)

    @use_data
    def __or__(self, other):
        """
        Set union. This creates a new Imr with the same subject URI.
        """
        return self.__class__(uri=self.uri, data=self.data | other)

    @use_data
    def __xor__(self, other):
        """
        Set exclusive OR (XOR). This creates a new Imr with the same subject
        URI.
        """
        return self.__class__(uri=self.uri, data=self.data ^ other)

    def __getitem__(self, item):
        """
        Supports slicing notation.

        A slice is handled like in :py:class:`~SimpleGraph`; a single
        ``rdflib.term.Node`` is treated as a predicate of the IMR subject.

        :raises TypeError: If ``item`` is neither a slice nor a Node.
        """
        if isinstance(item, slice):
            s, p, o = item.start, item.stop, item.step
            return self._slice(s, p, o)

        elif isinstance(item, Node):
            # If a Node is given, return all values for that predicate.
            return {
                    r[2] for r in self.data
                    if r[0] == self.uri and r[1] == item}
        else:
            raise TypeError(f'Wrong slice format: {item}.')

    def value(self, p, strict=False):
        """
        Get an individual value.

        :param rdflib.termNode p: Predicate to search for.
        :param bool strict: If set to ``True`` the method raises an error if
            more than one value is found. If ``False`` (the default) only
            the first found result is returned.

        :raises RuntimeError: If ``strict`` is ``True`` and more than one
            value is found.

        :return: The first value found, or ``None`` if there is none.
        :rtype: rdflib.term.Node
        """
        values = self[p]

        if strict and len(values) > 1:
            raise RuntimeError('More than one value found for {}, {}.'.format(
                    self.uri, p))

        for ret in values:
            return ret

        return None

    cpdef as_rdflib(self):
        """
        Return the IMR as a RDFLib Resource.

        :rtype: rdflib.Resource
        """
        gr = Graph()
        for trp in self.data:
            gr.add(trp)

        return gr.resource(identifier=self.uri)