Explorar el Código

Simplify signature for `post_normalize` hook; add doc.

scossu hace 3 meses
padre
commit
21d491281a
Se han modificado 2 ficheros con 20 adiciones y 7 borrados
  1. 15 3
      doc/hooks.md
  2. 5 4
      scriptshifter/trans.py

+ 15 - 3
doc/hooks.md

@@ -118,7 +118,9 @@ registered as constants under `scriptshifter.exceptions`.
 
 The following members of the context object are available in all the hooks:
 
-- `ctx.src`: Source text. Read only.
+- `ctx.src`: Source text. This should not be changed except in `post_config`
+  and `post_normalize` hooks. It may also change after applying table-based
+  normalization rules.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
 - `ctx.options`: language-specific options defined in configuration and set
@@ -150,7 +152,17 @@ or REST API.
 
 `None` or `BREAK`. In the former case the application proceeds to the usual
 transliteration process; in the latter case, it returns the value of
-`ctx.dest`, which the hook function should have set.
+`ctx.dest`, which the hook function should have set, along with any warnings
+in `ctx.warnings`.
+
+### `post_normalize`
+
+This hook may be used to normalize the source after the table-based
+normalization rules are applied.
+
+#### Return
+
+- `BREAK`: immediately return empty content, along with any accumulated warnings.
 
 ### `begin_input_token`
 
@@ -175,7 +187,7 @@ the parsing proceeds as normal. `CONT` causes the application to skip the
 parsing of the current token. `BREAK` interrupts the text scanning and
 proceeds directly to handling the result list for output. **CAUTION**: when
 returning CONT, it is the responsibility of the function to advance
-`ctx.cur` so that the loop doesn't become an infinite one. 
+`ctx.cur` so that the loop doesn't become an infinite one.
 
 ### `pre_ignore_token`
 

+ 5 - 4
scriptshifter/trans.py

@@ -5,7 +5,7 @@ from re import Pattern, compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
-        BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
+        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
         get_lang_ignore, get_lang_map, get_lang_normalize)
 
@@ -121,8 +121,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             return getattr(ctx, "dest", ""), ctx.warnings
 
         # _normalize_src returns the results of the post_normalize hook.
-        if _normalize_src(
-                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
+        if _normalize_src(ctx) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
         logger.debug(f"Normalized source: {ctx.src}")
@@ -335,13 +334,15 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         return ctx.dest, ctx.warnings
 
 
-def _normalize_src(ctx, norm_rules):
+def _normalize_src(ctx):
     """
     Normalize source text according to rules.
 
     NOTE: this manipulates the protected source attribute so it may not
     correspond to the originally provided source.
     """
+    norm_rules = get_lang_normalize(ctx.conn, ctx.lang_id)
+
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)