3 years ago · 8a4a41bb0a
--- a/TODO.md
+++ b/TODO.md
@@ -15,19 +15,25 @@ discussion, etc.); *X* = not implementing.
 
				 - *D* Basic UI
			
 
				 - *D* Life cycle hooks for plugins
			
 
				 - *P* Regular expressions in ignore lists
			
 
				-- *P* Word boundaries (design)
			
 
				+- *W* Word boundaries (design)
			
 
				+  - *W* Define word boundary characters
			
 
				+  - *D* Mark end-of-word and beginning-of-word characters
			
 
				+- *P* Optimize token lookup
			
 
				+  - *P* Break loop based on alphabetical order
			
 
				+  - *P* Ignore space
			
 
				 - *D* API documentation
			
 
				 - *P* Config file documentation
			
 
				 - *D* Hooks documentation
			
 
				 - *W* Complete conversion of existing tables to YAML
			
 
				   - *P* Arabic
			
 
				   - *P* Armenian
			
 
				+  - *D* Asian Cyrillic
			
 
				   - *P* Azerbaijani
			
 
				   - *D* Belarusian
			
 
				   - *D* Bulgarian
			
 
				   - *D* Chinese
			
 
				-  - *P* Ethiopic
			
 
				-  - *P* Georgian
			
 
				+  - *D* Ethiopic
			
 
				+  - *D* Georgian
			
 
				   - *W* Greek
			
 
				   - *P* Hebrew and Yiddish
			
 
				   - *X* Japanese
			
--- a/doc/hooks.md
+++ b/doc/hooks.md
@@ -154,6 +154,8 @@ of multiple symbols based on logical rules rather than a dictionary.
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list.
			
 
				 - `ctx.general`: Configuration general options.
			
 
				 - `ctx.langsec`: language section (S2R or R2S) of configuration.
			
@@ -178,6 +180,8 @@ ignore term and when or when not to trigger a match.
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list.
			
 
				 - `ctx.general`: Configuration general options.
			
 
				 - `ctx.langsec`: language section (S2R or R2S) of configuration.
			
@@ -202,6 +206,8 @@ scanning for more ignore tokens past the match.
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list.
			
 
				 - `ctx.general`: Configuration general options.
			
 
				 - `ctx.langsec`: language section (S2R or R2S) of configuration.
			
@@ -229,6 +235,8 @@ number of characters, and/or exit the text scanning loop altogether.
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list.
			
 
				 - `ctx.general`: Configuration general options.
			
 
				 - `ctx.langsec`: language section (S2R or R2S) of configuration.
			
@@ -254,6 +262,8 @@ also inject additional conditions and logic for the match, and revoke the
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list. The matching token will be added to it
			
 
				   after this hook is run.
			
 
				 - `ctx.general`: Configuration general options.
			
@@ -283,6 +293,8 @@ cursor position to the destination list, verbatim.
 
				 
			
 
				 - `ctx.src`: Source text. It should not be reassigned.
			
 
				 - `ctx.cur`: cursor position.
			
 
				+- `ctx.cur_flags`: flags associated with the current position. They are reset
			
 
				+  at every character iteration. See "Cursor Flags" below.
			
 
				 - `ctx.dest_ls`: destination token list.
			
 
				 - `ctx.general`: Configuration general options.
			
 
				 - `ctx.langsec`: language section (S2R or R2S) of configuration.
			
@@ -337,3 +349,18 @@ and return it before any further default processing is done.
 
				 `"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
			
 
				 immediately; otherwise it proceeds with standard adjustments of the output
			
 
				 string before returning.
			
 
				+
			
 
				+## Cursor Flags
			
 
				+
			
 
				+At certain points of the processing, some boolean flags are associated with
			
 
				+the current cursor position. These flags are available under `ctx.cur_flags`.
			
 
				+The following flags are currently supported:
			
 
				+
			
 
				+- `CUR_BOW`: Beginning of word.
			
 
				+- `CUR_EOW`: End of word.
			
 
				+
			
 
				+The beginning and end of word flags are useful for hooks to manipulate the
			
 
				+transliteration where letters take different shapes based on their position
			
 
				+within a word. Either, both, or neither flag may be set at any position. If
			
 
				+both are set, the letter is standalone. If neither is set, the letter is
			
 
				+medial.
			
--- a/transliterator/trans.py
+++ b/transliterator/trans.py
@@ -7,7 +7,13 @@ from transliterator.tables import load_table
 
				 
			
 
				 # Match multiple spaces.
			
 
				 MULTI_WS_RE = re.compile(r"\s{2,}")
			
 
				+# Default characters defining a word boundary. TODO Make this configurable
			
 
				+# per-table.
			
 
				+WORD_BOUNDARY = " \n\t:;.,\"'"
			
 
				 
			
 
				+# Cursor flags.
			
 
				+CUR_BOW = 1
			
 
				+CUR_EOW = 2
			
 
				 
			
 
				 logger = logging.getLogger(__name__)
			
 
				 
			
@@ -84,6 +90,19 @@ def transliterate(src, lang, r2s=False):
 
				     ignore_list = langsec.get("ignore", [])  # Only present in R2S
			
 
				     ctx.cur = 0
			
 
				     while ctx.cur < len(src):
			
 
				+        # Reset cursor position flags.
			
 
				+        ctx.cur_flags = 0
			
 
				+
			
 
				+        # Look for a word boundary and flag word beginning/end if found.
			
 
				+        if ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY:
			
 
				+            # Beginning of word.
			
 
				+            logger.debug(f"Beginning of word at position {ctx.cur}.")
			
 
				+            ctx.cur_flags |= CUR_BOW
			
 
				+        if ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY:
			
 
				+            # End of word.
			
 
				+            logger.debug(f"End of word at position {ctx.cur}.")
			
 
				+            ctx.cur_flags |= CUR_EOW
			
 
				+
			
 
				         # This hook may skip the parsing of the current
			
 
				         # token or exit the scanning loop altogether.
			
 
				         hret = _run_hook("begin_input_token", ctx, langsec_hooks)
			
@@ -94,7 +113,7 @@ def transliterate(src, lang, r2s=False):
 
				             logger.debug("Skipping scanning iteration from hook signal.")
			
 
				             continue
			
 
				 
			
 
				-        # Check ignore list first. Find as many subsequent ignore tokens
			
 
				+        # Check ignore list. Find as many subsequent ignore tokens
			
 
				         # as possible before moving on to looking for match tokens.
			
 
				         ctx.tk = None
			
 
				         while True:
			
@@ -173,6 +192,7 @@ def transliterate(src, lang, r2s=False):
 
				             ctx.cur += 1
			
 
				         else:
			
 
				             delattr(ctx, "match")
			
 
				+        delattr(ctx, "cur_flags")
			
 
				 
			
 
				     delattr(ctx, "cur")