Browse Source

Break loop on alphabetical order.

Stefano Cossu 1 year ago
parent
commit
19063a2efa
2 changed files with 23 additions and 8 deletions
  1. 9 5
      TODO.md
  2. 14 3
      transliterator/trans.py

+ 9 - 5
TODO.md

@@ -14,16 +14,20 @@ discussion, etc.); *X* = not implementing.
 - *D* Basic REST API
 - *D* Basic UI
 - *D* Life cycle hooks for plugins
-- *W* Regular expressions in ignore lists
+- *B* Regular expressions in ignore lists
 - *W* Word boundaries (design)
   - *B* Define word boundary characters
   - *D* Mark end-of-word and beginning-of-word characters
-- *P* Optimize token lookup
-  - *P* Break loop early based on alphabetical order
-  - *P* Ignore word break characters
+- *B* Optimize token lookup
+  - *D* Break loop early based on alphabetical order
+  - *B* Ignore word break characters
 - *D* API documentation
-- *P* Config file documentation
+- *D* Config file documentation
 - *D* Hooks documentation
+- *P* Tests
+  - *P* Config parsing
+  - *P* Transliteration
+  - *P* REST API
 - *W* Complete conversion of existing tables to YAML
   - *P* Arabic
   - *P* Armenian

+ 14 - 3
transliterator/trans.py

@@ -11,7 +11,7 @@ MULTI_WS_RE = re.compile(r"\s{2,}")
 # per-table.
 WORD_BOUNDARY = " \n\t:;.,\"'"
 
-# Cursor flags.
+# Cursor bitwise flags.
 CUR_BOW = 1
 CUR_EOW = 2
 
@@ -157,9 +157,20 @@ def transliterate(src, lang, r2s=False):
             if hret == CONT:
                 continue
 
+            step = len(ctx.src_tk)
+
+            # If the first character of the token is greater (= higher code
+            # point value) than the current character, then break the loop
+            # without a match, because we know there won't be any more
+            # matches due to the alphabetical ordering.
+            if ctx.src_tk[0] > src[ctx.cur]:
+                logger.debug(
+                        f"{ctx.src_tk} is after {src[ctx.cur:ctx.cur + step]}."
+                        " Breaking loop.")
+                break
+
             # Longer tokens should be guaranteed to be scanned before their
             # substrings at this point.
-            step = len(ctx.src_tk)
             if ctx.src_tk == src[ctx.cur:ctx.cur + step]:
                 ctx.match = True
                 # This hook may skip this token or break out of the token
@@ -186,7 +197,7 @@ def transliterate(src, lang, r2s=False):
 
             # No match found. Copy non-mapped character (one at a time).
             logger.info(
-                    f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]})"
+                    f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]}) "
                     f"at position {ctx.cur} is not mapped.")
             ctx.dest_ls.append(src[ctx.cur])
             ctx.cur += 1