Browse Source

WIP [ci skip]

Stefano Cossu 5 years ago
parent
commit
64d0436182

+ 23 - 22
README.rst

@@ -3,43 +3,44 @@ Lakesuperior
 
 |build status| |docs| |pypi| |codecov|
 
-Lakesuperior is an alternative `Fedora
-Repository <http://fedorarepository.org>`__ implementation.
+Lakesuperior is a Linked Data repository software. It is capable of storing and
+managing large volumes of files and their metadata regardless of their
+format, size, ethnicity, gender identity or expression.
 
-Fedora is a mature repository software system historically adopted by
-major cultural heritage institutions. It exposes an
-`LDP <https://www.w3.org/TR/ldp-primer/>`__ endpoint to manage
-any type of binary files and their metadata in Linked Data format.
+Lakesuperior is an alternative `Fedora Repository
+<http://fedorarepository.org>`__ implementation. Fedora is a mature repository
+software system historically adopted by major cultural heritage institutions
+which extends the `Linked Data Platform <https://www.w3.org/TR/ldp-primer/>`__
+protocol.
 
 Guiding Principles
 ------------------
 
-Lakesuperior aims at being an uncomplicated, efficient Fedora 4
-implementation.
+Lakesuperior aims at being a reliable and efficient Fedora 4 implementation.
 
 Its main goals are:
 
 -  **Reliability:** Based on solid technologies with stability in mind.
 -  **Efficiency:** Small memory and CPU footprint, high scalability.
--  **Ease of management:** Tools to perform monitoring and maintenance
-   included.
+-  **Ease of management:** Tools to perform migration, monitoring and
+   maintenance included.
 -  **Simplicity of design:** Straight-forward architecture, robustness
    over features.
 
 Key features
 ------------
 
--  Drop-in replacement for Fedora4
--  Very stable persistence layer based on
-   `LMDB <https://symas.com/lmdb/>`__ and filesystem. Fully
-   ACID-compliant writes guarantee consistency of data.
--  Term-based search and SPARQL Query API + UI
--  No performance penalty for storing many resources under the same
-   container, or having one resource link to many URIs
--  Extensible provenance metadata tracking
--  Multi-modal access: HTTP (REST), command line interface and native Python
-   API.
--  Fits in a pocket: you can carry 50M triples in an 8Gb memory stick.
+- Stores binary files and RDF metadata in one repository.
+- Multi-modal access: REST/LDP, command line and native Python API.
+- (`almost <fcrepo4_deltas>`_) Drop-in replacement for Fedora4
+- Very stable persistence layer based on
+  `LMDB <https://symas.com/lmdb/>`__ and filesystem. Fully
+  ACID-compliant writes guarantee consistency of data.
+- Term-based search and SPARQL Query API + UI
+- No performance penalty for storing many resources under the same
+  container, or having one resource link to many URIs
+- Extensible provenance metadata tracking
+- Fits in a pocket: you can carry 50M triples in an 8Gb memory stick.
 
 Installation & Documentation
 ----------------------------
@@ -50,7 +51,7 @@ With Docker::
     cd lakesuperior
     docker-compose up
 
-With pip (assuming you are familiar with it)::
+With pip (requires a C compiler to be installed)::
 
     pip install lakesuperior
 

+ 34 - 8
lakesuperior/store/ldp_rs/graph.pyx

@@ -5,6 +5,10 @@ from functools import wraps
 from rdflib import Graph
 from rdflib.term import Node
 
+from lakesuperior import env
+
+from libc.string cimport memcmp
+
 from lakesuperior.cy_include cimport calg
 from lakesuperior.store.ldp_rs.lmdb_triplestore cimport (
         TRP_KLEN, TripleKey, LmdbTriplestore)
@@ -51,7 +55,13 @@ cdef bint set_item_cmp_fn(calg.SetValue v1, calg.SetValue v2):
 
     https://fragglet.github.io/c-algorithms/doc/set_8h.html#40fa2c86d5b003c1b0b0e8dd1e4df9f4
     """
-    pass
+    if (<SetItem *>v1)[0].size != (<SetItem *>v2)[0].size:
+        return False
+
+    return memcmp(
+            (<SetItem *>v1)[0].data, (<SetItem *>v2)[0].data,
+            (<SetItem *>v1)[0].size)
+
 
 
 cdef class SimpleGraph:
@@ -59,16 +69,25 @@ cdef class SimpleGraph:
     Fast and simple implementation of a graph.
 
     Most functions should mimic RDFLib's graph with less overhead. It uses
-        the same funny but functional slicing notation.
+    the same funny but functional slicing notation.
+
+    A SimpleGraph can be obtained from a
+    :py:class:`lakesuperior.store.keyset.Keyset` which is convenient because
+    a Keyset can be obtained very efficiently from querying a store, then also
+    very efficiently filtered and eventually converted into a set of readable
+    terms.
 
-    An instance of this class can be converted to a ``rdflib.Graph`` instance.
+    An instance of this class can also be converted to and from a
+    ``rdflib.Graph`` instance.
     """
 
     cdef:
         calg.Set *_data
 
+
     def __cinit__(
-            self, calg.Set *cdata=NULL, Keyset keyset=None, set data=set()):
+            self, calg.Set *cdata=NULL, Keyset keyset=None, store=None,
+            set data=set()):
         """
         Initialize the graph with pre-existing data or by looking up a store.
 
@@ -81,25 +100,34 @@ cdef class SimpleGraph:
         :param calg.Set cdata: Initial data as a C ``Set`` struct.
         :param Keyset keyset: Keyset to create the graph from. Keys will be
             converted to set elements.
+        :param lakesuperior.store.ldp_rs.LmdbTripleStore store: store to
+            look up the keyset. Only used if ``keyset`` is specified. If not
+            set, the environment store is used.
         :param set data: Initial data as a set of 3-tuples of RDFLib terms.
         :param tuple lookup: tuple of a 3-tuple of lookup terms, and a context.
             E.g. ``((URIRef('urn:ns:a'), None, None), URIRef('urn:ns:ctx'))``.
             Any and all elements may be ``None``.
         :param lmdbStore store: the store to look data up.
         """
+        self.store = store or env.app_defaults.rdf_store
+
         cdef:
             Triple strp
             TripleKey spok
 
         if cdata is not NULL:
+            # Build data from provided C set.
             self._data = cdata
+
         else:
+            # Initialize empty data set.
             self._data = calg.set_new(set_item_hash_fn, set_item_cmp_fn)
             if keyset is not None:
+                # Populate with provided key set.
                 while keyset.next(spok):
-                    self._data = LmdbTriplestore.from_trp_key(
-                    )
+                    calg.set_insert(self._data, self.store.from_trp_key(spok))
             else:
+                # Populate with provided Python set.
                 for trp in data:
                     strp = serialize_triple(trp)
                     calg.set_insert(self._data, strp)
@@ -142,8 +170,6 @@ cdef class SimpleGraph:
         """
         Convert triple data to a Python set.
 
-        Internally the data are stored as a C struct.
-
         :rtype: set
         """
         pass

+ 21 - 6
lakesuperior/store/ldp_rs/lmdb_triplestore.pyx

@@ -13,6 +13,7 @@ from libc.string cimport memcpy
 
 cimport lakesuperior.cy_include.cylmdb as lmdb
 cimport lakesuperior.cy_include.cytpl as tpl
+from lakesuperior.store.ldp_rs.term cimport Term
 
 from lakesuperior.store.base_lmdb_store cimport (
         BaseLmdbStore, data_v, dbi, key_v)
@@ -1327,12 +1328,27 @@ cdef class LmdbTriplestore(BaseLmdbStore):
 
     cdef object from_key(self, Key key):
         """
-        Convert a single or multiple key into one or more terms.
+        Convert a single key into one term.
 
         :param Key key: The key to be converted.
         """
+        ser_term = self.lookup_term(key)
+
+        return deserialize(
+                <unsigned char *>ser_term.mv_data, ser_term.mv_size)
+
+
+    cdef inline lmdb.MDB_val lookup_term(self, Key key):
+        """
+        Look up a term by key.
+
+        :param Key key: The key to be looked up.
+
+        :rtype: lmdb.MDB_val
+        :return: LMDB value structure containing the serialized term.
+        """
         cdef:
-            unsigned char *pk
+            lmdb.MDB_val key_v, data_v
 
         key_v.mv_data = key
         key_v.mv_size = KLEN
@@ -1341,15 +1357,14 @@ cdef class LmdbTriplestore(BaseLmdbStore):
                 lmdb.mdb_get(self.txn, self.get_dbi('t:st'), &key_v, &data_v),
                 'Error getting data for key \'{}\'.'.format(key))
 
-        return deserialize(
-                <unsigned char *>data_v.mv_data, data_v.mv_size)
+        return data_v
 
 
     cdef tuple from_trp_key(self, TripleKey key):
         """
-        Convert a single or multiple key into one or more terms.
+        Convert a triple key into a tuple of 3 terms.
 
-        :param Key key: The key to be converted.
+        :param TripleKey key: The triple key to be converted.
         """
         #logger.debug(f'From triple key: {key[: TRP_KLEN]}')
         return (

+ 9 - 11
lakesuperior/store/ldp_rs/term.pxd

@@ -1,17 +1,15 @@
 from lakesuperior.cy_include cimport cytpl as tpl
 
 cdef class Term:
-    char type
-    char *data
-    char *datatype
-    char *lang
+    cdef:
+        char type
+        char *data
+        char *datatype
+        char *lang
 
-    # Temporary vars that get cleaned up on object deallocation.
-    char *_fmt
-    char *_pk
+        # Temporary vars that get cleaned up on object deallocation.
+        char *_fmt
+        char *_pk
 
-    tpl.tpl_bin serialize(self)
-    object to_python()
-
-    Term from_buffer(const unsigned char *data, const size_t size)
+        tpl.tpl_bin serialize(self)
 

+ 4 - 4
lakesuperior/store/ldp_rs/term.pyx

@@ -18,15 +18,15 @@ cdef class Term:
     """
     RDF term: URI reference, blank node or literal.
     """
-    def __cinit__(self, const tpl.tpl_bin data):
+    def __cinit__(self, const tpl.tpl_bin *data):
         """
         Initialize a Term from pack data.
 
-        :param tpl.tpl_bin data: a TPL binary buffer packed according to the
-            term structure format.
+        :param tpl.tpl_bin *data: a pointer to a TPL binary buffer packed
+            according to the term structure format.
         """
         self._pk = tpl.tpl_peek(
-                tpl.TPL_MEM | tpl.TPL_DATAPEEK, data.addr, data.sz,
+                tpl.TPL_MEM | tpl.TPL_DATAPEEK, data[0].addr, data[0].sz,
                 LSUP_TERM_PK_FMT, &self.term_type, &self.data, &self.datatype,
                 &self.lang)
 

+ 4 - 3
lakesuperior/store/ldp_rs/triple.pxd

@@ -1,9 +1,10 @@
 from lakesuperior.cy_include cimport cytpl as tpl
+from lakesuperior.store.ldp_rs.term cimport Term
 
 ctypedef struct Triple:
-    tpl.tpl_bin s
-    tpl.tpl_bin p
-    tpl.tpl_bin o
+    Term s
+    Term p
+    Term o
 
 
 cdef: