Browse Source

Initial word boundary implementation.

Stefano Cossu 1 year ago
parent
commit
8a4a41bb0a
3 changed files with 57 additions and 4 deletions
  1. 9 3
      TODO.md
  2. 27 0
      doc/hooks.md
  3. 21 1
      transliterator/trans.py

+ 9 - 3
TODO.md

@@ -15,19 +15,25 @@ discussion, etc.); *X* = not implementing.
 - *D* Basic UI
 - *D* Life cycle hooks for plugins
 - *P* Regular expressions in ignore lists
-- *P* Word boundaries (design)
+- *W* Word boundaries (design)
+  - *W* Define word boundary characters
+  - *D* Mark end-of-word and beginning-of-word characters
+- *P* Optimize token lookup
+  - *P* Break loop based on alphabetical order
+  - *P* Ignore space
 - *D* API documentation
 - *P* Config file documentation
 - *D* Hooks documentation
 - *W* Complete conversion of existing tables to YAML
   - *P* Arabic
   - *P* Armenian
+  - *D* Asian Cyrillic
  - *P* Azerbaijani
   - *D* Belarusian
   - *D* Bulgarian
   - *D* Chinese
-  - *P* Ethiopic
-  - *P* Georgian
+  - *D* Ethiopic
+  - *D* Georgian
   - *W* Greek
   - *P* Hebrew and Yiddish
   - *X* Japanese

+ 27 - 0
doc/hooks.md

@@ -154,6 +154,8 @@ of multiple symbols based on logical rules rather than a dictionary.
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
@@ -178,6 +180,8 @@ ignore term and when or when not to trigger a match.
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
@@ -202,6 +206,8 @@ scanning for more ignore tokens past the match.
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
@@ -229,6 +235,8 @@ number of characters, and/or exit the text scanning loop altogether.
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
@@ -254,6 +262,8 @@ also inject additional conditions and logic for the match, and revoke the
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list. The matching token will be added to it
   after this hook is run.
 - `ctx.general`: Configuration general options.
@@ -283,6 +293,8 @@ cursor position to the destination list, verbatim.
 
 - `ctx.src`: Source text. It should not be reassigned.
 - `ctx.cur`: cursor position.
+- `ctx.cur_flags`: flags associated with the current position. They are reset
+  at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
@@ -337,3 +349,18 @@ and return it before any further default processing is done.
 `"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
 immediately; otherwise it proceeds with standard adjustments of the output
 string before returning.
+
+## Cursor Flags
+
+At certain points of the processing, some boolean flags are associated with
+the current cursor position. These flags are stored as bits of the integer
+`ctx.cur_flags`; test one with a bitwise AND, e.g. `ctx.cur_flags & CUR_BOW`.
+The following flags are currently supported:
+
+- `CUR_BOW`: Beginning of word.
+- `CUR_EOW`: End of word.
+
+The beginning and end of word flags are useful for hooks to manipulate the
+transliteration where letters take different shapes based on their position
+within a word. Either, both, or neither flag may be set at any position. If
+both are set, the letter is standalone. If neither is set, the letter is
+medial.

+ 21 - 1
transliterator/trans.py

@@ -7,7 +7,13 @@ from transliterator.tables import load_table
 
 # Match multiple spaces.
 MULTI_WS_RE = re.compile(r"\s{2,}")
+# Default characters defining a word boundary. TODO Make this configurable
+# per-table.
+WORD_BOUNDARY = " \n\t:;.,\"'"
 
+# Cursor flags.
+CUR_BOW = 1
+CUR_EOW = 2
 
 logger = logging.getLogger(__name__)
 
@@ -84,6 +90,19 @@ def transliterate(src, lang, r2s=False):
     ignore_list = langsec.get("ignore", [])  # Only present in R2S
     ctx.cur = 0
     while ctx.cur < len(src):
+        # Reset cursor position flags.
+        ctx.cur_flags = 0
+
+        # Look for a word boundary and flag the beginning/end of word if found.
+        if ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY:
+            # Beginning of word.
+            logger.debug(f"Beginning of word at position {ctx.cur}.")
+            ctx.cur_flags |= CUR_BOW
+        if ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY:
+            # End of word.
+            logger.debug(f"End of word at position {ctx.cur}.")
+            ctx.cur_flags |= CUR_EOW
+
         # This hook may skip the parsing of the current
         # token or exit the scanning loop altogether.
         hret = _run_hook("begin_input_token", ctx, langsec_hooks)
@@ -94,7 +113,7 @@ def transliterate(src, lang, r2s=False):
             logger.debug("Skipping scanning iteration from hook signal.")
             continue
 
-        # Check ignore list first. Find as many subsequent ignore tokens
+        # Check ignore list. Find as many subsequent ignore tokens
         # as possible before moving on to looking for match tokens.
         ctx.tk = None
         while True:
@@ -173,6 +192,7 @@ def transliterate(src, lang, r2s=False):
             ctx.cur += 1
         else:
             delattr(ctx, "match")
+        delattr(ctx, "cur_flags")
 
     delattr(ctx, "cur")