Browse Source

Define word boundary per config.

Stefano Cossu 1 year ago
parent
commit
3b2f1d1000
3 changed files with 17 additions and 13 deletions
  1. 6 5
      TODO.md
  2. 2 0
      scriptshifter/tables/__init__.py
  3. 9 8
      scriptshifter/trans.py

+ 6 - 5
TODO.md

@@ -1,6 +1,6 @@
 # Brief TODO list
 
-*P* = pengding; *W* = working no it; *D* = done; *B* = blocked (needs
+*P* = pending; *W* = working on it; *D* = done; *B* = blocked (needs
 discussion, etc.); *X* = not implementing.
 
 - *D* Basic table loading & parsing
@@ -14,20 +14,21 @@ discussion, etc.); *X* = not implementing.
 - *D* Basic REST API
 - *D* Basic UI
 - *D* Life cycle hooks for plugins
-- *B* Regular expressions in ignore lists
-- *W* Word boundaries (design)
-  - *B* Define word boundary characters
+- *X* Regular expressions in ignore lists
+- *D* Word boundaries
+  - *D* Define word boundary characters per config
   - *D* Mark end-of-word and beginning-of-word characters
 - *B* Optimize token lookup
   - *D* Break loop early based on alphabetical order
   - *B* Ignore word break characters
   - *W* Capitalization
     - *P* Separate capitalization function
-    - *P* Capitalize ligated letters (e.g. Cyrillic T͡͡S)
+    - *B* Capitalize ligated letters (e.g. Cyrillic T͡͡S)
     - *D* Option for capitalizing first word, all words, none, unchanged
 - *D* API documentation
 - *D* Config file documentation
 - *D* Hooks documentation
+- *D* Rebranding (ScriptShifter)
 - *W* Tests
   - *W* Config parsing
   - *W* Transliteration

+ 2 - 0
scriptshifter/tables/__init__.py

@@ -40,6 +40,8 @@ HOOKS = (
 )
 # Package path where hook functions are kept.
 HOOK_PKG_PATH = "scriptshifter.hooks"
+# Default characters defining a word boundary. This is configurable per-table.
+WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"
 
 logger = logging.getLogger(__name__)
 

+ 9 - 8
scriptshifter/trans.py

@@ -2,14 +2,11 @@ import logging
 import re
 
 from scriptshifter.exceptions import BREAK, CONT
-from scriptshifter.tables import load_table
+from scriptshifter.tables import WORD_BOUNDARY, load_table
 
 
 # Match multiple spaces.
 MULTI_WS_RE = re.compile(r"\s{2,}")
-# Default characters defining a word boundary. TODO Make this configurable
-# per-table.
-WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"
 
 # Cursor bitwise flags.
 CUR_BOW = 1
@@ -89,19 +86,22 @@ def transliterate(src, lang, r2s=False, capitalize=False):
     # the length of the token that eventually matches.
     ignore_list = langsec.get("ignore", [])  # Only present in R2S
     ctx.cur = 0
+    word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
     while ctx.cur < len(src):
         # Reset cursor position flags.
         ctx.cur_flags = 0
         cur_char = src[ctx.cur]
 
         # Look for a word boundary and flag the beginning/end of a word if found.
-        if (ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY) and (
-                cur_char not in WORD_BOUNDARY):
+        if (ctx.cur == 0 or src[ctx.cur - 1] in word_boundary) and (
+                cur_char not in word_boundary):
             # Beginning of word.
             logger.debug(f"Beginning of word at position {ctx.cur}.")
             ctx.cur_flags |= CUR_BOW
-        if (ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY) and (
-                cur_char not in WORD_BOUNDARY):
+        if (
+            ctx.cur == len(src) - 1
+            or src[ctx.cur + 1] in word_boundary
+        ) and (cur_char not in word_boundary):
             # Beginning of word.
             # End of word.
             logger.debug(f"End of word at position {ctx.cur}.")
@@ -218,6 +218,7 @@ def transliterate(src, lang, r2s=False, capitalize=False):
         delattr(ctx, "cur_flags")
 
     delattr(ctx, "cur")
+    delattr(ctx, "word_boundary")
 
     # This hook may take care of the assembly and cause the function to return
     # its own return value.