Browse source

Complete numerals:

* Transliterate all numeric examples correctly
* Modify hook return logic for consistency
* WIP: partial spacing fix.
scossu committed 1 year ago
parent
commit
6bcf7bd623

+ 1 - 1
doc/hooks.md

@@ -333,7 +333,7 @@ and return it before any further default processing is done.
 
 #### Output
 
-`"ret"` or `None`. If `"ret"`, the transliteration function returns `ctx.dest`
+String or `None`. If a string, the transliteration function returns that
 immediately; otherwise it proceeds with standard adjustments of the output
 string before returning.
 

+ 21 - 17
scriptshifter/hooks/chinese/__init__.py

@@ -4,11 +4,13 @@ __doc__ = """Chinese hooks."""
 from logging import getLogger
 from re import I, compile, search, sub
 
+from scriptshifter.hooks.general import normalize_spacing_post_assembly
+
 
 logger = getLogger(__name__)
 
 
-def parse_numerals(ctx):
+def parse_numerals_pre_assembly(ctx):
     """
     Parse Chinese numerals in the already romanized result.
 
@@ -18,9 +20,8 @@ def parse_numerals(ctx):
     use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
 
     # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
-    tokens = [tk.strip() for tk in ctx.dest_ls]
-    tk_ct = len(tokens)
-    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")
+    tk_ct = len(ctx.dest_ls)
+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
 
     output = ""
 
@@ -28,7 +29,7 @@ def parse_numerals(ctx):
     i = 0
 
     while i < tk_ct:
-        tk_i = tokens[i]
+        tk_i = ctx.dest_ls[i]
         if search(token_ptn, tk_i):
             # When a numerical token (containing #) is reached, the inner loop
             # consumes it and all consecutive numerical tokens found after it.
@@ -39,10 +40,10 @@ def parse_numerals(ctx):
             # encountered, the string of numerical tokens is evaluated to
             # determine which version should be used in the output string.
             # The outer loop then continues where the inner loop left off.
-            logger.debug(f"Match number: {tk_i}")
+            logger.debug(f"Match number: {tk_i}.")
             text_v = num_v = ""
             for j in range(i, tk_ct):
-                tk_j = tokens[j]
+                tk_j = ctx.dest_ls[j]
                 m = search(token_ptn, tk_j)
                 # if m:
                 #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
@@ -51,10 +52,10 @@ def parse_numerals(ctx):
                     logger.debug(f"Next token is not numeric: {tk_j}")
                     # If this runs, then we are on the last token and it is
                     # numeric. Add text after # (if present) to numerical
-                    # version
+                    # version and captured whitespace after the number.
                     if m:
-                        text_v += m[1] + " "
-                        num_v += m[2] if len(m[2]) else m[1]
+                        text_v += m[1] + m[3]
+                        num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
                         # Append white space.
                         num_v += " "
                     elif j == tk_ct - 1:
@@ -68,7 +69,7 @@ def parse_numerals(ctx):
                         search("^di [0-9]", num_v, flags=I) or
                         search("[0-9] [0-9] [0-9] [0-9]", num_v) or
                         search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
-                        search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                        search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
                     ):
                         use_num_v = True
                         # At this point, string may contain literal
@@ -97,8 +98,6 @@ def parse_numerals(ctx):
                             while search("[0-9] [0-9]", num_v):
                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
 
-                    logger.debug(f"num_v: {num_v}")
-                    logger.debug(f"text_v: {text_v}")
                     output += num_v if use_num_v else text_v
 
                     # if the end of the string is not reached, backtrack to the
@@ -111,17 +110,22 @@ def parse_numerals(ctx):
 
                 # this is run when we are not yet at the end of the string and
                 # have not yet reached a non-numerical token. This is identical
-                # to the code that is run above when the last token is numeric.
+                # to the code that is run above when the last token is numeric,
+                # except that whitespace after the token is stripped.
                 m = search(token_ptn, tk_j)
                 text_v += m[1] + " "
                 num_v += m[2] if len(m[2]) else m[1]
                 num_v += " "
 
         else:
-            logger.debug(f"No match: adding {tk_i}.")
-            output += tk_i + " "
+            logger.debug(f"No numeric match: adding {tk_i}.")
+            output += tk_i
 
         i += 1
 
-    print(f"Use num version: {use_num_v}")
+    logger.debug(f"Use num version: {use_num_v}")
     ctx.dest = output
+
+    # Skip main transliterate function joining.
+
+    return normalize_spacing_post_assembly(ctx)

+ 48 - 0
scriptshifter/hooks/general/__init__.py

@@ -0,0 +1,48 @@
+__doc__ = """
+General-purpose hooks.
+"""
+
+from logging import getLogger
+from re import compile
+
+from scriptshifter.trans import MULTI_WS_RE
+
+
+NORM_MAP = (
+    (" .", "."),
+    (" ;", ";"),
+    (" ,", ","),
+    ("( ", "("),
+    ("[ ", "["),
+    ("{ ", "{"),
+    (" )", ")"),
+    (" ]", "]"),
+    (" }", "}"),
+    ("- -", "--"),
+)
+
+NORM1_RE = compile(r"([.,;:\)\]}])\s")
+NORM2_RE = compile(r"(\S)([.,;:\)\]}])")
+NORM3_RE = compile(r"\s([\)\]\}])")
+NORM4_RE = compile(r"([\)\]\}])(\S)")
+
+logger = getLogger(__name__)
+
+
+def normalize_spacing_post_assembly(ctx):
+    """
+    Remove duplicate and unwanted whitespace around punctuation.
+    """
+    # De-duplicate whitespace.
+    logger.debug(f"Dest pre manipulation: {ctx.dest}")
+    norm = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+    norm = NORM1_RE.sub(r"\1", norm)
+    norm = NORM2_RE.sub(r"\1 \2", norm)
+    norm = NORM3_RE.sub(r"\1", norm)
+    norm = NORM4_RE.sub(r"\1 \2", norm)
+
+    # Normalize spacing around punctuation and parentheses.
+    for a, b in NORM_MAP:
+        norm = norm.replace(a, b)
+
+    return norm

+ 1 - 1
scriptshifter/tables/data/_chinese_base.yml

@@ -45605,7 +45605,7 @@ script_to_roman:
     "\u300D": "\" "
     "\u300C": "\" "
     "\u300B": "\" "
-    "\u300A": "\" "
+    "\u300A": "\""  # NOTE removed whitespace after opening quote.
     "\u3009": "\" "
     "\u3008": "\" "
     "\u201D": "\" "

+ 2 - 2
scriptshifter/tables/data/chinese.yml

@@ -25,9 +25,9 @@ script_to_roman:
     capitalize: true
 
   hooks:
-    post_assembly:
+    pre_assembly:
       -
-        - chinese.parse_numerals
+        - chinese.parse_numerals_pre_assembly
 
   map:
     "〇": "ling#0 "

+ 6 - 5
scriptshifter/trans.py

@@ -1,12 +1,13 @@
 import logging
-import re
+
+from re import compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table
 
 
 # Match multiple spaces.
-MULTI_WS_RE = re.compile(r"\s{2,}")
+MULTI_WS_RE = compile(r"(\s){2,}")
 
 logger = logging.getLogger(__name__)
 
@@ -288,11 +289,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     # This hook may reassign the output string and/or cause the function to
     # return it immediately.
     hret = _run_hook("post_assembly", ctx, langsec_hooks)
-    if hret == "ret":
-        return ctx.dest, ctx.warnings
+    if hret is not None:
+        return hret, ctx.warnings
 
     # Strip multiple spaces and leading/trailing whitespace.
-    ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
+    ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
 
     return ctx.dest, ctx.warnings