
Merge pull request #215 from lcnetdev/hook_sequence

Hook sequence
Stefano Cossu 3 months ago
commit 28fc35828c

+ 20 - 11
doc/hooks.md

@@ -114,6 +114,15 @@ after the hook function is executed. Possible return values are defined below
 for each hook. Some special return values, such as `BREAK` and `CONT`, are
 registered as constants under `scriptshifter.exceptions`.
 
+### Note on running multiple functions on a hook
+
+Currently, if multiple functions are defined for a hook, they are executed
+in the order specified in the configuration. There is no way to implicitly skip
+a function based on the outcome of the previous one. The only state that is
+passed around in this context is the `ctx` instance of the `Transliterator`
+class. This may change in the future as specific needs arise.
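+
+For illustration, two hypothetical functions registered on the same hook could
+coordinate by setting attributes on `ctx` (the function names below are made
+up, and it is assumed here that the context object accepts arbitrary
+attributes):
+
+```python
+def flag_digits_pre_assembly(ctx):
+    # First function in the configured sequence: record a finding on ctx.
+    ctx.has_digits = any(tk.strip().isdigit() for tk in ctx.dest_ls)
+
+
+def space_digits_pre_assembly(ctx):
+    # Second function: act on the state left behind by the first one.
+    if getattr(ctx, "has_digits", False):
+        ctx.dest_ls = [tk.strip() + " " for tk in ctx.dest_ls]
+```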
+
+
 ### Always available context members
 
 The following members of the context object are available in all the hooks:
@@ -191,7 +200,7 @@ ignore term and when or when not to trigger a match.
   at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
 current ignore token. `BREAK` stops looking up ignore tokens for the current
@@ -217,7 +226,7 @@ scanning for more ignore tokens past the match.
 - `ctx.ignoring`: whether an ignore token matched. If set to `False`, the rest
   of the workflow will assume a non-match.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
 on looking up the ignore list. `BREAK` stops looking up ignore tokens for the
@@ -242,7 +251,7 @@ number of characters, and/or exit the text scanning loop altogether.
 - `ctx.src_tk`: the input token being looked up.
 - `ctx.dest_tk`: the transliterated string associated with the current token.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
 current token. `BREAK` stops looking up all tokens for the current
@@ -269,7 +278,7 @@ also inject additional conditions and logic for the match, and revoke the
 - `ctx.match`: whether there was a match. If set to `False`, the rest of the
   workflow will assume a non-match.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
 on looking up the token list. `BREAK` stops looking up tokens for the
@@ -292,7 +301,7 @@ cursor position to the destination list, verbatim.
   at every character iteration. See "Cursor Flags" below.
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
 `CONT`, `BREAK`, or `None`. `CONT` skips to the next position in the input
 text. In this case, the function **must** advance the cursor. `BREAK` stops all
@@ -311,10 +320,10 @@ bypass any further output handling.
 
 - `ctx.dest_ls`: destination token list.
 
-#### Output
+#### Return
 
-A string or `None`. If the output is a string, the transliteration function
-returns this string immediately; otherwise it proceeds with standard
+`BREAK` or `None`. If `BREAK`, the content of `ctx.dest`, which should be set
+by the function, is returned immediately; otherwise the transliteration
+function proceeds with standard
 adjustments and assembly of the output list.
 
 ### `post_assembly`
@@ -333,9 +342,9 @@ and return it before any further default processing is done.
 
-#### Output
+#### Return
 
-String or `None`. If a string, the transliteration function returns that
-immediately; otherwise it proceeds with standard adjustments of the output
-string before returning.
+`BREAK` or `None`. If `BREAK`, the transliteration function returns the content
+of `ctx.dest` immediately; otherwise it proceeds with standard adjustments of
+the output string before returning.
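+
+As a sketch, a `post_assembly` function following this contract could look like
+the following (the function name is hypothetical):
+
+```python
+from scriptshifter.exceptions import BREAK
+
+
+def bracket_output_post_assembly(ctx):
+    # Rewrite the assembled string in place.
+    ctx.dest = f"[{ctx.dest}]"
+    # Returning BREAK makes the transliteration function return ctx.dest
+    # immediately; returning None lets the standard adjustments run instead.
+    return BREAK
+```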
 
 ## Cursor flags
 

+ 5 - 11
scriptshifter/hooks/chinese/__init__.py

@@ -4,8 +4,6 @@ __doc__ = """Chinese hooks."""
 from logging import getLogger
 from re import I, compile, search, sub
 
-from scriptshifter.hooks.general import normalize_spacing_post_assembly
-
 
 logger = getLogger(__name__)
 
@@ -21,7 +19,7 @@ def parse_numerals_pre_assembly(ctx):
     tk_ct = len(ctx.dest_ls)
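+    # Tokens handled here are expected in the form "<letters>#<digits>", e.g. a
+    # hypothetical "jiu#9 ": group 1 is the text version, group 2 the numeric
+    # version, and group 3 any trailing spacing.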
     token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
 
-    output = ""
+    output = []
 
     # Use manual loop as i is manipulated inside it.
     i = 0
@@ -36,7 +34,7 @@ def parse_numerals_pre_assembly(ctx):
             # characters representing numbers are converted to Arabic
             # numerals. When a non-numerical token (or end of string) is
             # encountered, the string of numerical tokens is evaluated to
-            # determine which version should be used in the output string.
+            # determine which version should be used in the output.
             # The outer loop then continues where the inner loop left off.
             logger.debug(f"Match number: {tk_i}.")
             text_v = num_v = ""
@@ -96,7 +94,7 @@ def parse_numerals_pre_assembly(ctx):
                             while search("[0-9] [0-9]", num_v):
                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
 
-                    output += num_v if use_num_v else text_v
+                    output.append(num_v if use_num_v else text_v)
 
                     # if the end of the string is not reached, backtrack to the
                     # delimiter after the last numerical token (i.e. two tokens
@@ -117,16 +115,12 @@ def parse_numerals_pre_assembly(ctx):
 
         else:
             logger.debug(f"No numeric match: adding {tk_i}.")
-            output += tk_i
+            output.append(tk_i)
 
         i += 1
 
     logger.debug(f"Use num version: {use_num_v}")
-    ctx.dest = output
-
-    # Skip main transliterate function joining.
-
-    return normalize_spacing_post_assembly(ctx)
+    ctx.dest_ls = output
 
 
 def person_name_pre_assembly(ctx):

+ 8 - 4
scriptshifter/hooks/general/__init__.py

@@ -5,13 +5,14 @@ General-purpose hooks.
 from logging import getLogger
 from re import compile
 
-from scriptshifter.trans import MULTI_WS_RE
 
+# Match two or more whitespace characters.
+MULTI_WS_RE = compile(r"(\s){2,}")
 
 # Punctuation and brackets.
 # TODO add angled brackets, opening and closing quotes, etc.
 NORM1_RE = compile(r"\s([.,;:\)\]}])")
-NORM2_RE = compile(r"([.,;:\)\]}])(\S)")
+NORM2_RE = compile(r"([,;\)\]}])(\S)")
 NORM3_RE = compile(r"([\(\[\{])\s")
 NORM4_RE = compile(r"(\S)([\(\[\{])")
 
@@ -42,12 +43,15 @@ def capitalize_post_assembly(ctx):
 
     dest_ls = _capitalize(dest_ls, ctx.options.get("capitalize"))
 
-    return " ".join(dest_ls)
+    ctx.dest = " ".join(dest_ls)
 
 
 def normalize_spacing_post_assembly(ctx):
     """
     Remove duplicate and unwanted whitespace around punctuation.
+
+    NOTE: This is called by default by transliterate() immediately after the
+    `post_assembly` hook.
     """
     # De-duplicate whitespace.
     logger.debug(f"Dest pre manipulation: {ctx.dest}")
@@ -70,7 +74,7 @@ def normalize_spacing_post_assembly(ctx):
     # Remove multiple white space characters.
     # norm = NORM8_RE.sub(r"\1\2", norm)
 
-    return norm
+    ctx.dest = norm
 
 
 def _capitalize(src, which):

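Since these helpers now write to `ctx.dest` in place instead of returning a
string, callers in other hooks drop the assignment (see the Hebrew and Korean
changes below). A minimal sketch of the new calling convention, with a
hypothetical hook and a stand-in external romanizer:

```python
from scriptshifter.exceptions import BREAK
from scriptshifter.hooks.general import (
        capitalize_post_assembly, normalize_spacing_post_assembly)


def my_external_romanizer(text):
    # Stand-in for whatever external call produces the romanized string.
    return text


def s2r_myservice_post_config(ctx):
    # Hypothetical hook, assuming ctx.src holds the source text.
    ctx.dest = my_external_romanizer(ctx.src)
    # Both helpers mutate ctx.dest in place; no reassignment is needed.
    capitalize_post_assembly(ctx)
    normalize_spacing_post_assembly(ctx)
    # BREAK tells transliterate() to return ctx.dest without running the
    # standard pipeline, as in the Hebrew and Korean hooks below.
    return BREAK
```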
+ 1 - 1
scriptshifter/hooks/hebrew/dicta_api.py

@@ -27,6 +27,6 @@ def s2r_post_config(ctx):
 
     ctx.dest = rsp.json().get("transliteration")
     if ctx.dest:
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK

+ 2 - 2
scriptshifter/hooks/korean/romanizer.py

@@ -66,7 +66,7 @@ def s2r_nonames_post_config(ctx):
         # FKR042: Capitalize all first letters
         # FKR043: Capitalize the first letter
         logger.debug(f"Before capitalization: {ctx.dest}")
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK
 
@@ -84,7 +84,7 @@ def s2r_names_post_config(ctx):
         # FKR042: Capitalize all first letters
         # FKR043: Capitalize the first letter
         logger.debug(f"Before capitalization: {ctx.dest}")
-        ctx.dest = capitalize_post_assembly(ctx)
+        capitalize_post_assembly(ctx)
 
     return BREAK
 

+ 6 - 12
scriptshifter/trans.py

@@ -5,15 +5,12 @@ from re import Pattern, compile
 from unicodedata import normalize as precomp_normalize
 
 from scriptshifter.exceptions import BREAK, CONT
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
 from scriptshifter.tables import (
         BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
         get_lang_ignore, get_lang_map, get_lang_normalize)
 
-
-# Match multiple spaces.
-MULTI_WS_RE = compile(r"(\s){2,}")
-
 logger = logging.getLogger(__name__)
 
 
@@ -389,20 +386,17 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
-        # This hook may take care of the assembly and cause the function to
-        # return its own return value.
+        # This hook may take care of the assembly and cause the function to
+        # return `ctx.dest` immediately.
-        hret = ctx.run_hook("pre_assembly")
-        if hret is not None:
-            return hret, ctx.warnings
+        if ctx.run_hook("pre_assembly") == BREAK:
+            return ctx.dest, ctx.warnings
 
         logger.debug(f"Output list: {ctx.dest_ls}")
         ctx.dest = "".join(ctx.dest_ls)
 
         # This hook may reassign the output string and/or cause the function to
         # return it immediately.
-        hret = ctx.run_hook("post_assembly")
-        if hret is not None:
-            return hret, ctx.warnings
+        if ctx.run_hook("post_assembly") == BREAK:
+            return ctx.dest, ctx.warnings
 
-        # Strip multiple spaces and leading/trailing whitespace.
-        ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
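+        # Replaces the previous in-line whitespace de-duplication; also
+        # normalizes spacing around punctuation (scriptshifter.hooks.general).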
+        normalize_spacing_post_assembly(ctx)
 
         return ctx.dest, ctx.warnings
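
For reference, a usage sketch of the entry point whose return flow changed above
(the input text and language label are placeholders; valid labels come from the
configuration tables):

```python
from scriptshifter.trans import transliterate

dest, warnings = transliterate("some source text", "some_lang", t_dir="s2r")
print(dest)      # the assembled output string (ctx.dest)
print(warnings)  # warnings accumulated on the context (ctx.warnings)
```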