Ver código fonte

Complete strategy #5. Performance penalty.

Stefano Cossu 7 anos atrás
pai
commit
d900a13a3f

+ 21 - 0
doc/notes/performance.txt

@@ -31,8 +31,13 @@ Modified Duchamp VIAF dataset (343 triples; changed all subjects to `<>`)
 
 Retrieval of parent resource (11400 triples), pipe to /dev/null: 3.6"
 
+Database size: 1.2 Gb
+
+
 ## LMDB Back End Test
 
+### Strategy #4
+
 10K PUTs to new resources under the same container:
 
 ~29' running time
@@ -46,3 +51,19 @@ Database size: 633 Mb
 
 Retrieval of parent resource (11400 triples), pipe to /dev/null: 3.48"
 
+
+### Strategy #5
+
+10K PUTs to new resources under the same container:
+
+29' running time
+0.176" per resource
+3.4M triples total in repo at the end of the process
+
+Fewer gaps than strategy #4; however, overall timing is almost identical. The
+bottleneck seems to be somewhere else.
+
+Database size: 422 Mb
+
+Retrieval of parent resource (11400 triples), pipe to /dev/null: 7.5"
+

+ 87 - 51
lakesuperior/store_layouts/ldp_rs/lmdb_store.py

@@ -235,8 +235,8 @@ class LmdbStore(Store):
         'ns:pfx',
         # Term hash to triple key: 1:1
         'th:t',
-        # Lookups for one known term: 1:m, fixed-length values
-        's:spo', 'p:spo', 'o:spo', 'c:spo',
+        # Lookups: 1:m, fixed-length values
+        's:po', 'p:so', 'o:sp', 'c:spo',
     )
 
     '''
@@ -249,6 +249,16 @@ class LmdbStore(Store):
     '''
     _lookup_rank = ('s', 'o', 'p')
 
+    '''
+    Order of terms in the lookup indices. Used to rebuild a triple from lookup.
+    '''
+    _lookup_ordering = {
+        's:po': (0, 1, 2),
+        'p:so': (1, 0, 2),
+        'o:sp': (2, 0, 1),
+    }
+
+
     data_env = None
     idx_env = None
     db = None
@@ -515,6 +525,7 @@ class LmdbStore(Store):
                     if cur.set_key(trp_key):
                         cur.delete(dupdata=True)
 
+            #import pdb; pdb.set_trace()
             self._index('remove', trp_key, ck)
 
 
@@ -801,7 +812,7 @@ class LmdbStore(Store):
         self.data_env = lmdb.open(path + '/main', subdir=False, create=create,
                 map_size=self.MAP_SIZE, max_dbs=4, readahead=False)
         self.idx_env = lmdb.open(path + '/index', subdir=False, create=create,
-                map_size=self.MAP_SIZE, max_dbs=9, readahead=False)
+                map_size=self.MAP_SIZE, max_dbs=6, readahead=False)
 
         # Open and optionally create main databases.
         self.dbs = {
@@ -881,40 +892,88 @@ class LmdbStore(Store):
 
         @return iterator of matching triple keys.
         '''
+        def lookup_1bound(label, term):
+            '''
+            Lookup triples for a pattern with one bound term.
+            '''
+            #import pdb; pdb.set_trace()
+            k = self._to_key(term)
+            if not k:
+                return iter(())
+            idx_name = label + ':' + 'spo'.replace(label, '')
+            term_order = self._lookup_ordering[idx_name]
+            with self.cur(idx_name) as cur:
+                if cur.set_key(k):
+                    for match in cur.iternext_dup():
+                        subkeys = bytes(match).split(self.SEP_BYTE)
+
+                        # Compose result.
+                        out = [None, None, None]
+                        out[term_order[0]] = k
+                        out[term_order[1]] = subkeys[0]
+                        out[term_order[2]] = subkeys[1]
+
+                        yield self.SEP_BYTE.join(out)
+
+
         def lookup_2bound(bound_terms):
             '''
             Look up triples for a pattern with two bound terms.
+
+            @param bound_terms (dict) Triple labels and terms to search for,
+            in the format of, e.g. {'s': URIRef('urn:s:1'), 'o':
+            URIRef('urn:o:1')}
             '''
-            if not len(bound_terms) == 2:
+            #import pdb; pdb.set_trace()
+            if len(bound_terms) != 2:
                 raise ValueError(
                         'Exactly 2 terms need to be bound. Got {}'.format(
                             len(bound_terms)))
 
             # Establish lookup ranking.
-            luk = None
-            for k in self._lookup_rank:
-                if k in bound_terms.keys():
+            luc = None
+            for k_label in self._lookup_rank:
+                if k_label in bound_terms.keys():
                     # First match is lookup term.
-                    if not luk:
-                        # Lookup database key (cursor)
-                        luk = k + ':spo'
+                    if not luc:
+                        v_label = 'spo'.replace(k_label, '')
+                        # Lookup database key (cursor) name
+                        luc = k_label + ':' + v_label
+                        term_order = self._lookup_ordering[luc]
                         # Term to look up
-                        lut = bound_terms[k]
+                        luk = self._to_key(bound_terms[k_label])
+                        if not luk:
+                            return iter(())
+                        # Position of key in final triple.
                     # Second match is the filter.
                     else:
-                        # Filter key (position in triple key)
-                        fpos = 'spo'.index(k)
+                        # Filter key (position of sub-key in lookup results)
+                        fpos = v_label.index(k_label)
                         # Filter term
-                        ft = bound_terms[k]
+                        ft = self._to_key(bound_terms[k_label])
+                        if not ft:
+                            return iter(())
+                        break
 
             # Look up in index.
-            with self.cur(luk) as cur:
-                if cur.set_key(lut):
+            with self.cur(luc) as cur:
+                if cur.set_key(luk):
                     # Iterate over matches and filter by second term.
                     for match in cur.iternext_dup():
-                        subkey = bytes(match).split(self.SEP_BYTE)[fpos]
-                        if subkey == ft:
-                            yield match
+                        subkeys = bytes(match).split(self.SEP_BYTE)
+                        flt_subkey = subkeys[fpos]
+                        if flt_subkey == ft:
+                            # Remainder (not filter) key used to complete the
+                            # triple.
+                            r_subkey = subkeys[1-fpos]
+
+                            # Compose result.
+                            out = [None, None, None]
+                            out[term_order[0]] = luk
+                            out[term_order[fpos+1]] = flt_subkey
+                            out[term_order[2-fpos]] = r_subkey
+
+                            yield self.SEP_BYTE.join(out)
 
         s, p, o = triple_pattern
 
@@ -938,9 +997,7 @@ class LmdbStore(Store):
                     yield from lookup_2bound({'s': s, 'o': o})
                 # s ? ?
                 else:
-                    bound_terms = [s]
-                    cur_label = 's:po'
-                    order = (0, 1, 2)
+                    yield from lookup_1bound('s', s)
         else:
             if p is not None:
                 # ? p o
@@ -948,40 +1005,16 @@ class LmdbStore(Store):
                     yield from lookup_2bound({'p': p, 'o': o})
                 # ? p ?
                 else:
-                    bound_terms = [p]
-                    cur_label = 'p:so'
-                    order = (1, 0, 2)
+                    yield from lookup_1bound('p', p)
             else:
                 # ? ? o
                 if o is not None:
-                    bound_terms = [o]
-                    cur_label = 'o:sp'
-                    order = (1, 2, 0)
+                    yield from lookup_1bound('o', o)
                 # ? ? ?
                 else:
                     # Get all triples in the database.
                     with self.cur('spo:c') as cur:
                         yield from cur.iternext_nodup()
-                    return
-
-        tkey = self._to_key(bound_terms)
-        if not tkey:
-            return iter(())
-
-        with self.cur(cur_label) as cur:
-            if cur.set_key(tkey):
-                # @FIXME For some reason LMDB blows up if this iterator is not
-                # wrapped in a set. This may not be too bad because we can get
-                # rid of duplicates here.
-                for match in set(cur.iternext_dup()):
-                    # Combine bound and found in search order.
-                    comb_keys = (
-                            bytes(tkey).split(self.SEP_BYTE)
-                            + bytes(match).split(self.SEP_BYTE))
-                    # Rearrange term keys according to given order.
-                    yield self.SEP_BYTE.join([comb_keys[i] for i in order])
-            else:
-                return iter(())
 
 
     def _append(self, cur, values, **kwargs):
@@ -1018,13 +1051,16 @@ class LmdbStore(Store):
         # Split and rearrange-join keys for association and indices.
         triple = bytes(spok).split(self.SEP_BYTE)
         sk, pk, ok = triple[:3]
+        spk = self.SEP_BYTE.join(triple[:2])
+        sok = bytes(triple[0]) + self.SEP_BYTE + bytes(triple[2])
+        pok = self.SEP_BYTE.join(triple[1:3])
         spok = self.SEP_BYTE.join(triple[:3])
 
         # Associate cursor labels with k/v pairs.
         curs = {
-            's:spo': (sk, spok),
-            'p:spo': (pk, spok),
-            'o:spo': (ok, spok),
+            's:po': (sk, pok),
+            'p:so': (pk, sok),
+            'o:sp': (ok, spk),
             'c:spo': (ck, spok),
         }
 

+ 1 - 0
lakesuperior/store_layouts/ldp_rs/rsrc_centric_layout.py

@@ -338,6 +338,7 @@ class RsrcCentricLayout:
 
         @param subj_uri Subject URI.
         '''
+        #import pdb; pdb.set_trace()
         # Only search in non-historic graphs.
         qry = '''
         CONSTRUCT { ?s1 ?p1 ?s }