123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771 |
- import logging
- from functools import wraps
- from rdflib import Graph
- from rdflib.term import Node
- from lakesuperior import env
- from libc.string cimport memcmp, memcpy
- from libc.stdlib cimport free
- from lakesuperior.cy_include cimport calg
- from lakesuperior.cy_include cimport cylmdb as lmdb
- from lakesuperior.store.ldp_rs cimport term
- from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
- KLEN, DBL_KLEN, TRP_KLEN, TripleKey, LmdbTriplestore)
- from lakesuperior.store.ldp_rs.keyset cimport Keyset
- from lakesuperior.store.ldp_rs.triple cimport BufferTriple
- from lakesuperior.util.hash cimport Hash32, hash32
- #BUF_PTR_SZ = sizeof(Buffer *)
- logger = logging.getLogger(__name__)
- def use_data(fn):
- """
- Decorator to indicate that a set operation between two SimpleGraph
- instances should use the ``data`` property of the second term. The second
- term can also be a simple set.
- """
- @wraps(fn)
- def _wrapper(self, other):
- if isinstance(other, SimpleGraph):
- other = other.data
- return _wrapper
- cdef unsigned int term_hash_fn(const calg.SetValue data):
- """
- Hash function for sets of terms.
- https://fragglet.github.io/c-algorithms/doc/set_8h.html#6c7986a2a80d7a3cb7b9d74e1c6fef97
- :param SetValue *data: Pointer to a Buffer structure.
- """
- cdef:
- Hash32 hash
- hash32(<const Buffer *>&data, &hash)
- return hash
- cdef unsigned int trp_hash_fn(calg.SetValue btrp):
- """
- Hash function for sets of (serialized) triples.
- https://fragglet.github.io/c-algorithms/doc/set_8h.html#6c7986a2a80d7a3cb7b9d74e1c6fef97
- This function computes the hash of the concatenated pointer values in the
- s, p, o members of the triple. The triple structure is treated as a byte
- string. This is safe in spite of byte-wise struct evaluation being a
- frowned-upon practice (due to padding issues), because it is assumed that
- the input value is always the same type of structure.
- :param SetItem *data: Pointer to a BufferTriple structure.
- """
- cdef:
- Buffer data
- Hash32 hash
- data.addr = &btrp
- data.sz = sizeof(btrp)
- hash32(&data, &hash)
- return hash
- cdef bint buffer_cmp_fn(const calg.SetValue v1, const calg.SetValue v2):
- """
- Compare function for two Buffer objects.
- https://fragglet.github.io/c-algorithms/doc/set_8h.html#40fa2c86d5b003c1b0b0e8dd1e4df9f4
- """
- # No-cast option.
- #if v1[0].sz != v2[0].sz:
- # return False
- #return memcmp(v1[0].addr, v2[0].addr, v1[0].sz) == 0
- cdef:
- Buffer b1 = (<Buffer *>v1)[0]
- Buffer b2 = (<Buffer *>v2)[0]
- if b1.sz != b2.sz:
- return False
- return memcmp(b1.addr, b2.addr, b1.sz) == 0
- cdef bint triple_cmp_fn(const calg.SetValue v1, const calg.SetValue v2):
- """
- Compare function for two triples in a CAlg set.
- Here, pointers to terms are compared for s, p, o. The pointers should be
- guaranteed to point to unique values (i.e. no two pointers have the same
- term value within a graph).
- https://fragglet.github.io/c-algorithms/doc/set_8h.html#40fa2c86d5b003c1b0b0e8dd1e4df9f4
- """
- cdef:
- BufferTriple t1 = (<BufferTriple *>v1)[0]
- BufferTriple t2 = (<BufferTriple *>v2)[0]
- return(
- t1.s == t2.s and
- t1.p == t2.p and
- t1.o == t2.o)
- cdef inline bint lookup_none_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return True
- cdef inline bint lookup_s_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- """
- Lookup callback compare function for a given s in a triple.
- The function returns ``True`` if ``t1`` matches the first term.
- ``t2`` is not used and is declared only for compatibility with the
- other interchangeable functions.
- """
- return buffer_cmp_fn(t1, trp[0].s)
- cdef inline bint lookup_p_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return buffer_cmp_fn(t1, trp[0].p)
- cdef inline bint lookup_o_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return buffer_cmp_fn(t1, trp[0].o)
- cdef inline bint lookup_sp_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return (
- buffer_cmp_fn(t1, trp[0].s)
- and buffer_cmp_fn(t2, trp[0].p))
- cdef inline bint lookup_so_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return (
- buffer_cmp_fn(t1, trp[0].s)
- and buffer_cmp_fn(t2, trp[0].o))
- cdef inline bint lookup_po_cmp_fn(
- const BufferTriple *trp, const Buffer *t1, const Buffer *t2):
- return (
- buffer_cmp_fn(t1, trp[0].p)
- and buffer_cmp_fn(t2, trp[0].o))
- cdef class SimpleGraph:
- """
- Fast and simple implementation of a graph.
- Most functions should mimic RDFLib's graph with less overhead. It uses
- the same funny but functional slicing notation. No lookup functions within
- the graph are available at this time.
- Instances of this class hold a set of
- :py:class:`~lakesuperior.store.ldp_rs.term.Term` structures that stores
- unique terms within the graph, and a set of
- :py:class:`~lakesuperior.store.ldp_rs.triple.Triple` structures referencing
- those terms. Therefore, no data duplication occurs and the storage is quite
- sparse.
- A graph can be instantiated from a store lookup.
- A SimpleGraph can also be obtained from a
- :py:class:`lakesuperior.store.keyset.Keyset` which is convenient bacause
- a Keyset can be obtained very efficiently from querying a store, then also
- very efficiently filtered and eventually converted into a set of meaningful
- terms.
- An instance of this class can also be converted to and from a
- ``rdflib.Graph`` instance. TODO verify that this frees Cython pointers.
- """
- def __cinit__(
- self, Keyset keyset=None, store=None, set data=set()):
- """
- Initialize the graph with pre-existing data or by looking up a store.
- One of ``keyset``, or ``data`` can be provided. If more than
- one of these is provided, precedence is given in the mentioned order.
- If none of them is specified, an empty graph is initialized.
- :param rdflib.URIRef uri: The graph URI.
- This will serve as the subject for some queries.
- :param Keyset keyset: Keyset to create the graph from. Keys will be
- converted to set elements.
- :param lakesuperior.store.ldp_rs.LmdbTripleStore store: store to
- look up the keyset. Only used if ``keyset`` is specified. If not
- set, the environment store is used.
- :param set data: Initial data as a set of 3-tuples of RDFLib terms.
- :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
- E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
- Any and all elements may be ``None``.
- :param lmdbStore store: the store to look data up.
- """
- self.store = store or env.app_globals.rdf_store
- self._terms = calg.set_new(term_hash_fn, buffer_cmp_fn)
- self._triples = calg.set_new(trp_hash_fn, triple_cmp_fn)
- cdef:
- size_t i = 0
- TripleKey spok
- term.Buffer pk_t
- # Initialize empty data set.
- if keyset:
- # Populate with triples extracted from provided key set.
- self._data_from_keyset(keyset)
- elif data is not None:
- # Populate with provided Python set.
- for s, p, o in data:
- self._add_from_rdflib(s, p, o)
- def __dealloc__(self):
- """
- Free the triple pointers. TODO use a Cymem pool
- """
- free(self._triples)
- free(self._terms)
- @property
- def data(self):
- """
- Triple data as a Python set.
- :rtype: set
- """
- return self._data_as_set()
- cdef void _data_from_lookup(self, tuple trp_ptn, ctx=None) except *:
- """
- Look up triples in the triplestore and load them into ``data``.
- :param tuple lookup: 3-tuple of RDFlib terms or ``None``.
- :param LmdbTriplestore store: Reference to a LMDB triplestore. This
- is normally set to ``lakesuperior.env.app_globals.rdf_store``.
- """
- cdef:
- size_t i
- unsigned char spok[TRP_KLEN]
- with self.store.txn_ctx():
- keyset = self.store.triple_keys(trp_ptn, ctx)
- self.data_from_keyset(keyset)
- cdef void _data_from_keyset(self, Keyset data) except *:
- """Populate a graph from a Keyset."""
- cdef TripleKey spok
- while data.next(spok):
- self._add_from_spok(spok)
- cdef inline void _add_from_spok(self, const TripleKey spok) except *:
- """
- Add a triple from a TripleKey of term keys.
- """
- cdef:
- Buffer *ss = NULL
- Buffer *sp = NULL
- Buffer *so = NULL
- self.store.lookup_term(spok, ss)
- self.store.lookup_term(spok + KLEN, sp)
- self.store.lookup_term(spok + DBL_KLEN, so)
- self._add_triple(ss, sp, so)
- cdef inline void _add_triple(
- self, const Buffer *ss, const Buffer *sp, const Buffer *so
- ) except *:
- """
- Add a triple from 3 (TPL) serialized terms.
- Each of the terms is added to the term set if not existing. The triple
- also is only added if not existing.
- """
- cdef BufferTriple trp
- print('Adding terms.')
- print('ss: ')
- print((<unsigned char *>ss[0].addr)[:ss[0].sz])
- calg.set_insert(self._terms, ss)
- print('sp: ')
- print((<unsigned char *>sp[0].addr)[:sp[0].sz])
- calg.set_insert(self._terms, sp)
- print('so: ')
- print((<unsigned char *>so[0].addr)[:so[0].sz])
- calg.set_insert(self._terms, so)
- print('Added terms.')
- trp.s = ss
- trp.p = sp
- trp.o = so
- print('Adding triple.')
- calg.set_insert(self._triples, &trp)
- print('Added triple.')
- cdef set _data_as_set(self):
- """
- Convert triple data to a Python set.
- :rtype: set
- """
- cdef:
- calg.SetIterator ti
- BufferTriple *trp
- term.Term s, p, o
- graph_set = set()
- calg.set_iterate(self._triples, &ti)
- while calg.set_iter_has_more(&ti):
- trp = <BufferTriple *>calg.set_iter_next(&ti)
- graph_set.add((
- term.deserialize_to_rdflib(trp[0].s),
- term.deserialize_to_rdflib(trp[0].p),
- term.deserialize_to_rdflib(trp[0].o)))
- return graph_set
- # Basic set operations.
- def add(self, triple):
- """ Add one triple to the graph. """
- cdef Buffer ss, sp, so
- s, p, o = triple
- #print('Serializing s.')
- term.serialize_from_rdflib(s, &ss)
- #print('Serializing p.')
- term.serialize_from_rdflib(p, &sp)
- #print('Serializing o.')
- term.serialize_from_rdflib(o, &so)
- print('Adding triple from rdflib.')
- self._add_triple(&ss, &sp, &so)
- print('Added triple from rdflib.')
- def remove(self, item):
- """
- Remove one item from the graph.
- :param tuple item: A 3-tuple of RDFlib terms. Only exact terms, i.e.
- wildcards are not accepted.
- """
- self.data.remove(item)
- def __len__(self):
- """ Number of triples in the graph. """
- return len(self.data)
- @use_data
- def __eq__(self, other):
- """ Equality operator between ``SimpleGraph`` instances. """
- return self.data == other
- def __repr__(self):
- """
- String representation of the graph.
- It provides the number of triples in the graph and memory address of
- the instance.
- """
- return (f'<{self.__class__.__name__} @{hex(id(self))} '
- f'length={len(self.data)}>')
- def __str__(self):
- """ String dump of the graph triples. """
- return str(self.data)
- @use_data
- def __sub__(self, other):
- """ Set subtraction. """
- return self.data - other
- @use_data
- def __isub__(self, other):
- """ In-place set subtraction. """
- self.data -= other
- return self
- @use_data
- def __and__(self, other):
- """ Set intersection. """
- return self.data & other
- @use_data
- def __iand__(self, other):
- """ In-place set intersection. """
- self.data &= other
- return self
- @use_data
- def __or__(self, other):
- """ Set union. """
- return self.data | other
- @use_data
- def __ior__(self, other):
- """ In-place set union. """
- self.data |= other
- return self
- @use_data
- def __xor__(self, other):
- """ Set exclusive intersection (XOR). """
- return self.data ^ other
- @use_data
- def __ixor__(self, other):
- """ In-place set exclusive intersection (XOR). """
- self.data ^= other
- return self
- def __contains__(self, item):
- """
- Whether the graph contains a triple.
- :rtype: boolean
- """
- return item in self.data
- def __iter__(self):
- """ Graph iterator. It iterates over the set triples. """
- return self.data.__iter__()
- # Slicing.
- def __getitem__(self, item):
- """
- Slicing function.
- It behaves similarly to `RDFLib graph slicing
- <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
- """
- if isinstance(item, slice):
- s, p, o = item.start, item.stop, item.step
- return self._slice(s, p, o)
- else:
- raise TypeError(f'Wrong slice format: {item}.')
- cpdef void set(self, tuple trp) except *:
- """
- Set a single value for subject and predicate.
- Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
- """
- if None in trp:
- raise ValueError(f'Invalid triple: {trp}')
- self.remove_triples((trp[0], trp[1], None))
- self.add(trp)
- cpdef void remove_triples(self, pattern) except *:
- """
- Remove triples by pattern.
- The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
- """
- s, p, o = pattern
- for match in self.lookup(s, p, o):
- logger.debug(f'Removing from graph: {match}.')
- self.data.remove(match)
- cpdef object as_rdflib(self):
- """
- Return the data set as an RDFLib Graph.
- :rtype: rdflib.Graph
- """
- gr = Graph()
- for trp in self.data:
- gr.add(trp)
- return gr
- def _slice(self, s, p, o):
- """
- Return terms filtered by other terms.
- This behaves like the rdflib.Graph slicing policy.
- """
- _data = self.data
- logger.debug(f'Slicing graph by: {s}, {p}, {o}.')
- if s is None and p is None and o is None:
- return _data
- elif s is None and p is None:
- return {(r[0], r[1]) for r in _data if r[2] == o}
- elif s is None and o is None:
- return {(r[0], r[2]) for r in _data if r[1] == p}
- elif p is None and o is None:
- return {(r[1], r[2]) for r in _data if r[0] == s}
- elif s is None:
- return {r[0] for r in _data if r[1] == p and r[2] == o}
- elif p is None:
- return {r[1] for r in _data if r[0] == s and r[2] == o}
- elif o is None:
- return {r[2] for r in _data if r[0] == s and r[1] == p}
- else:
- # all given
- return (s,p,o) in _data
- def lookup(self, s, p, o):
- """
- Look up triples by a pattern.
- This function converts RDFLib terms into the serialized format stored
- in the graph's internal structure and compares them bytewise.
- Any and all of the lookup terms can be ``None``.
- """
- cdef:
- BufferTriple trp
- BufferTriple *trp_p
- calg.SetIterator ti
- const Buffer t1
- const Buffer t2
- lookup_fn_t fn
- res = set()
- # Decide comparison logic outside the loop.
- if s is not None and p is not None and o is not None:
- # Return immediately if 3-term match is requested.
- term.serialize_from_rdflib(s, trp.s)
- term.serialize_from_rdflib(p, trp.p)
- term.serialize_from_rdflib(o, trp.o)
- if calg.set_query(self._triples, &trp):
- res.add((s, p, o))
- return res
- elif s is not None:
- term.serialize_from_rdflib(s, &t1)
- if p is not None:
- fn = lookup_sp_cmp_fn
- term.serialize_from_rdflib(p, &t2)
- elif o is not None:
- fn = lookup_so_cmp_fn
- term.serialize_from_rdflib(o, &t2)
- else:
- fn = lookup_s_cmp_fn
- elif p is not None:
- term.serialize_from_rdflib(p, &t1)
- if o is not None:
- fn = lookup_po_cmp_fn
- term.serialize_from_rdflib(o, &t2)
- else:
- fn = lookup_p_cmp_fn
- elif o is not None:
- fn = lookup_o_cmp_fn
- term.serialize_from_rdflib(o, &t1)
- else:
- fn = lookup_none_cmp_fn
- # Iterate over serialized triples.
- calg.set_iterate(self._triples, &ti)
- while calg.set_iter_has_more(&ti):
- trp_p = <BufferTriple *>calg.set_iter_next(&ti)
- if fn(trp_p, &t1, &t2):
- res.add((
- term.deserialize_to_rdflib(trp_p[0].s),
- term.deserialize_to_rdflib(trp_p[0].p),
- term.deserialize_to_rdflib(trp_p[0].o),
- ))
- return res
- cpdef set terms(self, str type):
- """
- Get all terms of a type: subject, predicate or object.
- :param str type: One of ``s``, ``p`` or ``o``.
- """
- i = 'spo'.index(type)
- return {r[i] for r in self.data}
- cdef class Imr(SimpleGraph):
- """
- In-memory resource data container.
- This is an extension of :py:class:`~SimpleGraph` that adds a subject URI to
- the data set and some convenience methods.
- An instance of this class can be converted to a ``rdflib.Resource``
- instance.
- Some set operations that produce a new object (``-``, ``|``, ``&``, ``^``)
- will create a new ``Imr`` instance with the same subject URI.
- """
- def __init__(self, str uri, *args, **kwargs):
- """
- Initialize the graph with pre-existing data or by looking up a store.
- Either ``data``, or ``lookup`` *and* ``store``, can be provide.
- ``lookup`` and ``store`` have precedence. If none of them is specified,
- an empty graph is initialized.
- :param rdflib.URIRef uri: The graph URI.
- This will serve as the subject for some queries.
- :param set data: Initial data as a set of 3-tuples of RDFLib terms.
- :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
- E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
- Any and all elements may be ``None``.
- :param lmdbStore store: the store to look data up.
- """
- super().__init__(*args, **kwargs)
- self.uri = uri
- @property
- def identifier(self):
- """
- IMR URI. For compatibility with RDFLib Resource.
- :rtype: string
- """
- return self.uri
- @property
- def graph(self):
- """
- Return a SimpleGraph with the same data.
- :rtype: SimpleGraph
- """
- return SimpleGraph(self.data)
- def __repr__(self):
- """
- String representation of an Imr.
- This includes the subject URI, number of triples contained and the
- memory address of the instance.
- """
- return (f'<{self.__class__.__name__} @{hex(id(self))} uri={self.uri}, '
- f'length={len(self.data)}>')
- @use_data
- def __sub__(self, other):
- """
- Set difference. This creates a new Imr with the same subject URI.
- """
- return self.__class__(uri=self.uri, data=self.data - other)
- @use_data
- def __and__(self, other):
- """
- Set intersection. This creates a new Imr with the same subject URI.
- """
- return self.__class__(uri=self.uri, data=self.data & other)
- @use_data
- def __or__(self, other):
- """
- Set union. This creates a new Imr with the same subject URI.
- """
- return self.__class__(uri=self.uri, data=self.data | other)
- @use_data
- def __xor__(self, other):
- """
- Set exclusive OR (XOR). This creates a new Imr with the same subject
- URI.
- """
- return self.__class__(uri=self.uri, data=self.data ^ other)
- def __getitem__(self, item):
- """
- Supports slicing notation.
- """
- if isinstance(item, slice):
- s, p, o = item.start, item.stop, item.step
- return self._slice(s, p, o)
- elif isinstance(item, Node):
- # If a Node is given, return all values for that predicate.
- return {
- r[2] for r in self.data
- if r[0] == self.uri and r[1] == item}
- else:
- raise TypeError(f'Wrong slice format: {item}.')
- def value(self, p, strict=False):
- """
- Get an individual value.
- :param rdflib.termNode p: Predicate to search for.
- :param bool strict: If set to ``True`` the method raises an error if
- more than one value is found. If ``False`` (the default) only
- the first found result is returned.
- :rtype: rdflib.term.Node
- """
- values = self[p]
- if strict and len(values) > 1:
- raise RuntimeError('More than one value found for {}, {}.'.format(
- self.uri, p))
- for ret in values:
- return ret
- return None
- cpdef as_rdflib(self):
- """
- Return the IMR as a RDFLib Resource.
- :rtype: rdflib.Resource
- """
- gr = Graph()
- for trp in self.data:
- gr.add(trp)
- return gr.resource(identifier=self.uri)
|