Browse source

Complete numerals:

* Transliterate all numeric examples correctly
* Modify hook return logic for consistency
* WIP: partial spacing fix.
scossu committed 1 year ago
parent
commit
6bcf7bd623

+ 1 - 1
doc/hooks.md

@@ -333,7 +333,7 @@ and return it before any further default processing is done.
 
 #### Output
 
-`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
+String or `None`. If a string, the transliteration function returns that
 immediately; otherwise it proceeds with standard adjustments of the output
 string before returning.
 

+ 21 - 17
scriptshifter/hooks/chinese/__init__.py

@@ -4,11 +4,13 @@ __doc__ = """Chinese hooks."""
 from logging import getLogger
 from re import I, compile, search, sub
 
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
+
 
 logger = getLogger(__name__)
 
 
-def parse_numerals(ctx):
+def parse_numerals_pre_assembly(ctx):
     """
     Parse Chinese numerals in the already romanized result.
 
@@ -18,9 +20,8 @@ def parse_numerals(ctx):
     use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
 
     # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
-    tokens = [tk.strip() for tk in ctx.dest_ls]
-    tk_ct = len(tokens)
-    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")
+    tk_ct = len(ctx.dest_ls)
+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
 
     output = ""
 
@@ -28,7 +29,7 @@ def parse_numerals(ctx):
     i = 0
 
     while i < tk_ct:
-        tk_i = tokens[i]
+        tk_i = ctx.dest_ls[i]
         if search(token_ptn, tk_i):
             # When a numerical token (containing #) is reached, the inner loop
             # consumes it and all consecutive numerical tokens found after it.
@@ -39,10 +40,10 @@ def parse_numerals(ctx):
             # encountered, the string of numerical tokens is evaluated to
             # determine which version should be used in the output string.
             # The outer loop then continues where the inner loop left off.
-            logger.debug(f"Match number: {tk_i}")
+            logger.debug(f"Match number: {tk_i}.")
             text_v = num_v = ""
             for j in range(i, tk_ct):
-                tk_j = tokens[j]
+                tk_j = ctx.dest_ls[j]
                 m = search(token_ptn, tk_j)
                 # if m:
                 #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
@@ -51,10 +52,10 @@ def parse_numerals(ctx):
                     logger.debug(f"Next token is not numeric: {tk_j}")
                     # If this runs, then we are on the last token and it is
                     # numeric. Add text after # (if present) to numerical
-                    # version
+                    # version and captured whitespace after the number.
                     if m:
-                        text_v += m[1] + " "
-                        num_v += m[2] if len(m[2]) else m[1]
+                        text_v += m[1] + m[3]
+                        num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
                         # Append white space.
                         num_v += " "
                     elif j == tk_ct - 1:
@@ -68,7 +69,7 @@ def parse_numerals(ctx):
                         search("^di [0-9]", num_v, flags=I) or
                         search("[0-9] [0-9] [0-9] [0-9]", num_v) or
                         search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
-                        search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                        search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
                     ):
                         use_num_v = True
                         # At this point, string may contain literal
@@ -97,8 +98,6 @@ def parse_numerals(ctx):
                             while search("[0-9] [0-9]", num_v):
                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
 
-                    logger.debug(f"num_v: {num_v}")
-                    logger.debug(f"text_v: {text_v}")
                     output += num_v if use_num_v else text_v
 
                     # if the end of the string is not reached, backtrack to the
@@ -111,17 +110,22 @@ def parse_numerals(ctx):
 
                 # this is run when we are not yet at the end of the string and
                 # have not yet reached a non-numerical token. This is identical
-                # to the code that is run above when the last token is numeric.
+                # to the code that is run above when the last token is numeric,
+                # except that whitespace after the token is stripped.
                 m = search(token_ptn, tk_j)
                 text_v += m[1] + " "
                 num_v += m[2] if len(m[2]) else m[1]
                 num_v += " "
 
         else:
-            logger.debug(f"No match: adding {tk_i}.")
-            output += tk_i + " "
+            logger.debug(f"No numeric match: adding {tk_i}.")
+            output += tk_i
 
         i += 1
 
-    print(f"Use num version: {use_num_v}")
+    logger.debug(f"Use num version: {use_num_v}")
     ctx.dest = output
+
+    # Skip main transliterate function joining.
+
+    return normalize_spacing_post_assembly(ctx)

+ 48 - 0
scriptshifter/hooks/general/__init__.py

@@ -0,0 +1,48 @@
+__doc__ = """
+General-purpose hooks.
+"""
+
+from logging import getLogger
+from re import compile
+
+from scriptshifter.trans import MULTI_WS_RE
+
+
+NORM_MAP = (
+    (" .", "."),
+    (" ;", ";"),
+    (" ,", ","),
+    ("( ", "("),
+    ("[ ", "["),
+    ("{ ", "{"),
+    (" )", ")"),
+    (" ]", "]"),
+    (" }", "}"),
+    ("- -", "--"),
+)
+
+NORM1_RE = compile(r"([.,;:\)\]}])\s")
+NORM2_RE = compile(r"(\S)([.,;:\)\]}])")
+NORM3_RE = compile(r"\s([\)\]\}])")
+NORM4_RE = compile(r"([\)\]\}])(\S)")
+
+logger = getLogger(__name__)
+
+
+def normalize_spacing_post_assembly(ctx):
+    """
+    Remove duplicate and unwanted whitespace around punctuation.
+    """
+    # De-duplicate whitespace.
+    logger.debug(f"Dest pre manipulation: {ctx.dest}")
+    norm = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+    norm = NORM1_RE.sub(r"\1", norm)
+    norm = NORM2_RE.sub(r"\1 \2", norm)
+    norm = NORM3_RE.sub(r"\1", norm)
+    norm = NORM4_RE.sub(r"\1 \2", norm)
+
+    # Normalize spacing around punctuation and parentheses.
+    for a, b in NORM_MAP:
+        norm = norm.replace(a, b)
+
+    return norm

+ 1 - 1
scriptshifter/tables/data/_chinese_base.yml

@@ -45605,7 +45605,7 @@ script_to_roman:
     "\u300D": "\" "
     "\u300C": "\" "
     "\u300B": "\" "
-    "\u300A": "\" "
+    "\u300A": "\""  # NOTE removed whitespace after opening quote.
     "\u3009": "\" "
     "\u3008": "\" "
     "\u201D": "\" "

+ 2 - 2
scriptshifter/tables/data/chinese.yml

@@ -25,9 +25,9 @@ script_to_roman:
     capitalize: true
 
   hooks:
-    post_assembly:
+    pre_assembly:
       -
-        - chinese.parse_numerals
+        - chinese.parse_numerals_pre_assembly
 
   map:
     "〇": "ling#0 "

+ 6 - 5
scriptshifter/trans.py

@@ -1,12 +1,13 @@
 import logging
-import re
+
+from re import compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table
 
 
 # Match multiple spaces.
-MULTI_WS_RE = re.compile(r"\s{2,}")
+MULTI_WS_RE = compile(r"(\s){2,}")
 
 logger = logging.getLogger(__name__)
 
@@ -288,11 +289,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     # This hook may reassign the output string and/or cause the function to
     # return it immediately.
     hret = _run_hook("post_assembly", ctx, langsec_hooks)
-    if hret == "ret":
-        return ctx.dest, ctx.warnings
+    if hret is not None:
+        return hret, ctx.warnings
 
     # Strip multiple spaces and leading/trailing whitespace.
-    ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
+    ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
 
     return ctx.dest, ctx.warnings