scossu
/
lakesuperior_legacy


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672
							import logging

import rdflib

from lakesuperior import env

from libc.string cimport memcpy
from libc.stdlib cimport free

from cymem.cymem cimport Pool

cimport lakesuperior.cy_include.collections as cc
cimport lakesuperior.model.graph.callbacks as cb

from lakesuperior.model.base cimport Buffer, buffer_dump
from lakesuperior.model.structures.keyset cimport Keyset
from lakesuperior.model.graph cimport term
from lakesuperior.model.graph.triple cimport BufferTriple
from lakesuperior.model.structures.hash cimport term_hash_seed32

logger = logging.getLogger(__name__)


cdef class Graph:
    """
    Fast and simple implementation of a graph.

    Most functions should mimic RDFLib's graph with less overhead. It uses
    the same funny but functional slicing notation.

    A Graph can be instantiated from a store lookup. This makes it
    possible to use a Keyset to perform initial filtering via identity by key,
    then the filtered Keyset can be converted into a set of meaningful terms.

    An instance of this class can also be converted to and from a
    ``rdflib.Graph`` instance.
    """

    def __cinit__(
        self, store, size_t ct=0, str uri=None, set data=set()
    ):
        """
        Initialize the graph, optionally with Python data.

        :param set data: Initial data as a set of 3-tuples of RDFLib terms.
        """

        self.pool = Pool()

        if not store:
            store = env.app_globals.ldprs_store
        # Initialize empty data set.
        if data:
            # Populate with provided Python set.
            self.keys = Keyset(len(data))
            self.add_triples(data)
        else:
            self.keys = Keyset(ct)


    ## PROPERTIES ##

    @property
    def uri(self):
        """
        Get resource identifier as a RDFLib URIRef.

        :rtype: rdflib.URIRef.
        """
        return rdflib.URIRef(self.id)


    @property
    def data(self):
        """
        Triple data as a Python set.

        :rtype: set
        """
        cdef TripleKey spok

        ret = set()

        self.seek()
        while self.get_next(&spok):
            ret.add((
                self.store.from_key(trp[0]),
                self.store.from_key(trp[1]),
                self.store.from_key(trp[2])
            ))

        return ret


    ## MAGIC METHODS ##

    def __len__(self):
        """ Number of triples in the graph. """
        return self.keys.size()


    def __eq__(self, other):
        """ Equality operator between ``Graph`` instances. """
        return len(self ^ other) == 0


    def __repr__(self):
        """
        String representation of the graph.

        This includes the subject URI, number of triples contained and the
        memory address of the instance.
        """
        id_repr = f' id={self.id},' if self.id else ''
        return (
                f'<{self.__class__.__name__} @0x{id(self):02x}{id_repr} '
            f'length={len(self)}>'
        )


    def __str__(self):
        """ String dump of the graph triples. """
        return str(self.data)


    def __add__(self, other):
        """ Alias for set-theoretical union. """
        return self.union_(other)


    def __iadd__(self, other):
        """ Alias for in-place set-theoretical union. """
        self.ip_union(other)
        return self


    def __sub__(self, other):
        """ Set-theoretical subtraction. """
        return self.subtraction(other)


    def __isub__(self, other):
        """ In-place set-theoretical subtraction. """
        self.ip_subtraction(other)
        return self

    def __and__(self, other):
        """ Set-theoretical intersection. """
        return self.intersection(other)


    def __iand__(self, other):
        """ In-place set-theoretical intersection. """
        self.ip_intersection(other)
        return self


    def __or__(self, other):
        """ Set-theoretical union. """
        return self.union_(other)


    def __ior__(self, other):
        """ In-place set-theoretical union. """
        self.ip_union(other)
        return self


    def __xor__(self, other):
        """ Set-theoretical exclusive disjunction (XOR). """
        return self.xor(other)


    def __ixor__(self, other):
        """ In-place set-theoretical exclusive disjunction (XOR). """
        self.ip_xor(other)
        return self


    def __contains__(self, trp):
        """
        Whether the graph contains a triple.

        :rtype: boolean
        """
        cdef TripleKey spok

        spok = [
            self.store.to_key(trp[0]),
            self.store.to_key(trp[1]),
            self.store.to_key(trp[2]),
        ]

        return self.data.contains(&spok)


    def __iter__(self):
        """ Graph iterator. It iterates over the set triples. """
        yield from self.data


    # Slicing.

    def __getitem__(self, item):
        """
        Slicing function.

        It behaves similarly to `RDFLib graph slicing
        <https://rdflib.readthedocs.io/en/stable/utilities.html#slicing-graphs>`__
        """
        if isinstance(item, slice):
            s, p, o = item.start, item.stop, item.step
            return self._slice(s, p, o)
        elif self.id and isinstance(item, rdflib.Node):
            # If a Node is given, return all values for that predicate.
            return self._slice(self.uri, item, None)
        else:
            raise TypeError(f'Wrong slice format: {item}.')


    def __hash__(self):
        """ TODO Bogus """
        return self.id


    ## BASIC PYTHON-ACCESSIBLE SET OPERATIONS ##

    def value(self, p, strict=False):
        """
        Get an individual value.

        :param rdflib.termNode p: Predicate to search for.
        :param bool strict: If set to ``True`` the method raises an error if
            more than one value is found. If ``False`` (the default) only
            the first found result is returned.
        :rtype: rdflib.term.Node
        """
        if not self.id:
            raise ValueError('Cannot use `value` on a non-named graph.')

        # TODO use slice.
        values = {trp[2] for trp in self.lookup((self.uri, p, None))}

        if strict and len(values) > 1:
            raise RuntimeError('More than one value found for {}, {}.'.format(
                    self.id, p))

        for ret in values:
            return ret

        return None


    def terms_by_type(self, type):
        """
        Get all terms of a type: subject, predicate or object.

        :param str type: One of ``s``, ``p`` or ``o``.
        """
        i = 'spo'.index(type)
        return {r[i] for r in self.data}


    def add_triples(self, trp):
        """
        Add triples to the graph.

        This method checks for duplicates.

        :param iterable triples: iterable of 3-tuple triples.
        """
        for s, p, o in triples:
            self.keys.add([
                self.store.to_key(s),
                self.store.to_key(p),
                self.store.to_key(o),
            ], True)


    def remove(self, pattern):
        """
        Remove triples by pattern.

        The pattern used is similar to :py:meth:`LmdbTripleStore.delete`.
        """
        self._match_ptn_callback(
            pattern, self, cb.del_trp_callback, NULL
        )


    ## CYTHON-ACCESSIBLE BASIC METHODS ##

    cdef Graph empty_copy(self):
        """
        Create an empty copy carrying over some key properties.

        Override in subclasses to accommodate for different init properties.
        """
        return self.__class__(self.ct, self.store, uri=self.id)


    cpdef union_(self, Graph other):
        """
        Perform set union resulting in a new Graph instance.

        TODO Allow union of multiple graphs at a time.

        :param Graph other: The other graph to merge.

        :rtype: Graph
        :return: A new Graph instance.
        """
        cdef:
            void *cur
            cc.HashSetIter it
            BufferTriple *trp

        new_gr = self.empty_copy()

        for gr in (self, other):
            cc.hashset_iter_init(&it, gr._triples)
            while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
                bt = <BufferTriple*>cur
                new_gr.add_triple(bt, True)

        return new_gr


    cdef void ip_union(self, Graph other) except *:
        """
        Perform an in-place set union that adds triples to this instance

        TODO Allow union of multiple graphs at a time.

        :param Graph other: The other graph to merge.

        :rtype: void
        """
        cdef:
            void *cur
            cc.HashSetIter it

        cc.hashset_iter_init(&it, other._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            self.add_triple(bt, True)


    cpdef intersection(self, Graph other):
        """
        Graph intersection.

        :param Graph other: The other graph to intersect.

        :rtype: Graph
        :return: A new Graph instance.
        """
        cdef:
            void *cur
            cc.HashSetIter it

        new_gr = self.empty_copy()

        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if other.trp_contains(bt):
                new_gr.add_triple(bt, True)

        return new_gr


    cdef void ip_intersection(self, Graph other) except *:
        """
        In-place graph intersection.

        Triples not in common with another graph are removed from the current
        one.

        :param Graph other: The other graph to intersect.

        :rtype: void
        """
        cdef:
            void *cur
            cc.HashSetIter it

        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if not other.trp_contains(bt):
                self.remove_triple(bt)


    cpdef subtraction(self, Graph other):
        """
        Graph set-theoretical subtraction.

        Create a new graph with the triples of this graph minus the ones in
        common with the other graph.

        :param Graph other: The other graph to subtract to this.

        :rtype: Graph
        :return: A new Graph instance.
        """
        cdef:
            void *cur
            cc.HashSetIter it

        new_gr = self.empty_copy()

        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if not other.trp_contains(bt):
                new_gr.add_triple(bt, True)

        return new_gr


    cdef void ip_subtraction(self, Graph other) except *:
        """
        In-place graph subtraction.

        Triples in common with another graph are removed from the current one.

        :param Graph other: The other graph to intersect.

        :rtype: void
        """
        cdef:
            void *cur
            cc.HashSetIter it

        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if other.trp_contains(bt):
                self.remove_triple(bt)


    cpdef xor(self, Graph other):
        """
        Graph Exclusive disjunction (XOR).

        :param Graph other: The other graph to perform XOR with.

        :rtype: Graph
        :return: A new Graph instance.
        """
        cdef:
            void *cur
            cc.HashSetIter it
            BufferTriple* bt

        new_gr = self.empty_copy()

        # Add triples in this and not in other.
        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if not other.trp_contains(bt):
                new_gr.add_triple(bt, True)

        # Other way around.
        cc.hashset_iter_init(&it, other._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if not self.trp_contains(bt):
                new_gr.add_triple(bt, True)

        return new_gr


    cdef void ip_xor(self, Graph other) except *:
        """
        In-place graph XOR.

        Triples in common with another graph are removed from the current one,
        and triples not in common will be added from the other one.

        :param Graph other: The other graph to perform XOR with.

        :rtype: void
        """
        cdef:
            void *cur
            cc.HashSetIter it
            # TODO This could be more efficient to stash values in a simple
            # array, but how urgent is it to improve an in-place XOR?
            Graph tmp = Graph()

        # Add *to the tmp graph* triples in other graph and not in this graph.
        cc.hashset_iter_init(&it, other._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if not self.trp_contains(bt):
                tmp.add_triple(bt)

        # Remove triples in common.
        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            bt = <BufferTriple*>cur
            if other.trp_contains(bt):
                self.remove_triple(bt)

        self |= tmp


    cdef bint trp_contains(self, const BufferTriple* btrp):
        cdef:
            cc.HashSetIter it
            void* cur

        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            if self.trp_cmp_fn(cur, btrp) == 0:
                return True
        return False


    cpdef void set(self, tuple trp) except *:
        """
        Set a single value for subject and predicate.

        Remove all triples matching ``s`` and ``p`` before adding ``s p o``.
        """
        if None in trp:
            raise ValueError(f'Invalid triple: {trp}')
        self.remove((trp[0], trp[1], None))
        self.add((trp,))


    def as_rdflib(self):
        """
        Return the data set as an RDFLib Graph.

        :rtype: rdflib.Graph
        """
        gr = Graph(identifier=self.id)
        for trp in self.data:
            gr.add(trp)

        return gr


    def _slice(self, s, p, o):
        """
        Return terms filtered by other terms.

        This behaves like the rdflib.Graph slicing policy.
        """
        # If no terms are unbound, check for containment.
        if s is not None and p is not None and o is not None: # s p o
            return (s, p, o) in self

        # If some terms are unbound, do a lookup.
        res = self.lookup((s, p, o))
        if s is not None:
            if p is not None: # s p ?
                return {r[2] for r in res}

            if o is not None: # s ? o
                return {r[1] for r in res}

            # s ? ?
            return {(r[1], r[2]) for r in res}

        if p is not None:
            if o is not None: # ? p o
                return {r[0] for r in res}

            # ? p ?
            return {(r[0], r[2]) for r in res}

        if o is not None: # ? ? o
            return {(r[0], r[1]) for r in res}

        # ? ? ?
        return res


    def lookup(self, pattern):
        """
        Look up triples by a pattern.

        This function converts RDFLib terms into the serialized format stored
        in the graph's internal structure and compares them bytewise.

        Any and all of the lookup terms msy be ``None``.

        :rtype: Graph
        "return: New Graph instance with matching triples.
        """
        cdef:
            void* cur
            BufferTriple trp
            Graph res_gr = Graph()

        self._match_ptn_callback(pattern, res_gr, cb.add_trp_callback, NULL)

        return res_gr


    cdef void _match_ptn_callback(
        self, pattern, Graph gr,
        lookup_callback_fn_t callback_fn, void* ctx=NULL
    ) except *:
        """
        Execute an arbitrary function on a list of triples matching a pattern.

        The arbitrary function is appied to each triple found in the current
        graph, and to a discrete graph that can be the current graph itself
        or a different one.
        """
        cdef:
            void* cur
            Buffer t1, t2
            Buffer ss, sp, so
            BufferTriple trp
            BufferTriple* trp_p
            lookup_fn_t cmp_fn
            cc.HashSetIter it

            TripleKey spok

        s, p, o = pattern

        # Decide comparison logic outside the loop.
        if s is not None and p is not None and o is not None:
            # Shortcut for 3-term match.
            spok = [
                self.store.to_key(s),
                self.store.to_key(p),
                self.store.to_key(o),
            ]

            if self.keys.contains(spok):
                callback_fn(gr, &spok, ctx)
                return

        if s is not None:
            term.serialize_from_rdflib(s, &t1)
            if p is not None:
                cmp_fn = cb.lookup_sp_cmp_fn
                term.serialize_from_rdflib(p, &t2)
            elif o is not None:
                cmp_fn = cb.lookup_so_cmp_fn
                term.serialize_from_rdflib(o, &t2)
            else:
                cmp_fn = cb.lookup_s_cmp_fn
        elif p is not None:
            term.serialize_from_rdflib(p, &t1)
            if o is not None:
                cmp_fn = cb.lookup_po_cmp_fn
                term.serialize_from_rdflib(o, &t2)
            else:
                cmp_fn = cb.lookup_p_cmp_fn
        elif o is not None:
            cmp_fn = cb.lookup_o_cmp_fn
            term.serialize_from_rdflib(o, &t1)
        else:
            cmp_fn = cb.lookup_none_cmp_fn

        # Iterate over serialized triples.
        cc.hashset_iter_init(&it, self._triples)
        while cc.hashset_iter_next(&it, &cur) != cc.CC_ITER_END:
            trp_p = <BufferTriple*>cur
            if cmp_fn(trp_p, &t1, &t2):
                callback_fn(gr, trp_p, ctx)