1 year ago · f4bf40f8b6
--- a/scriptshifter/hooks/chinese/__init__.py
+++ b/scriptshifter/hooks/chinese/__init__.py
@@ -0,0 +1,112 @@
 
															+__doc__ = """Chinese hooks."""
														
 
															+
														
 
															+
														
 
															+from re import I, compile, match, split, sub
														
 
															+
														
 
															+
														
 
															+def parse_numerals(ctx):
														
 
															+    """
														
 
															+    Parse Chinese numerals in the already romanized result.
														
 
															+
														
 
															+    This is run at post-assembly.
														
 
															+    """
														
 
															+    # Only apply to specific MARC fields.
														
 
															+    use_num_v = ctx.options.get("marc_field") in ("245", "830")
														
 
															+
														
 
															+    tokens = split(r"[\W^#]", ctx.dest)
														
 
															+    tk_ct = len(tokens)
														
 
															+
														
 
															+    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
														
 
															+
														
 
															+    output = ""
														
 
															+
														
 
															+    # Use manual loop as i is manipulated inside it.
														
 
															+    i = 0
														
 
															+    while i < tk_ct:
														
 
															+        tk_i = tokens[i]
														
 
															+        if match(token_ptn, tk_i):
														
 
															+            text_v = num_v = ""
														
 
															+            for j, tk_j in enumerate(tokens):
														
 
															+                m = match(token_ptn, tk_j)
														
 
															+                # a token without # (or the end of string) is reached
														
 
															+                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
														
 
															+                    # If this runs, then we are on the last token and it is
														
 
															+                    # numeric. Add text after # (if present) to numerical
														
 
															+                    # version
														
 
															+                    if m:
														
 
															+                        text_v += m[1]
														
 
															+                        num_v += m[2] if m[2] else m[1]
														
 
															+                    elif j == tk_ct - 1:
														
 
															+                        # if last token is non-numerical, just tack it on.
														
 
															+                        text_v += tk_j
														
 
															+                        num_v += tk_j
														
 
															+                    elif len(text_v) and len(num_v):
														
 
															+                        # if not at end of string yet and token is
														
 
															+                        # non-numerical, remove the last delimiter that was
														
 
															+                        # appended (outer loop will pick up at this point)
														
 
															+                        text_v = text_v[:-1]
														
 
															+                        num_v = num_v[:-1]
														
 
															+                    # evaluate numerical string that has been constructed so
														
 
															+                    # far. Use num version for ordinals and date strings
														
 
															+                    if (
														
 
															+                        match("^di [0-9]", num_v, flags=I) or
														
 
															+                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
														
 
															+                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
														
 
															+                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
														
 
															+                    ):
														
 
															+                        use_num_v = True
														
 
															+                        # At this point, string may contain literal
														
 
															+                        # translations of Chinese numerals Convert these to
														
 
															+                        # Arabic numerals (for example "2 10 7" = "27").
														
 
															+                        while (
														
 
															+                                match(num_v, "[0-9] 10+") or
														
 
															+                                match(num_v, "[1-9]0+ [1-9]")):
														
 
															+                            m = match(num_v, "([0-9]+) ([1-9]0+)")
														
 
															+                            if m:
														
 
															+                                parsed_sum = int(m[1]) + int(m[2])
														
 
															+                                num_v = sub(
														
 
															+                                        "[0-9]+ [1-9]0+", str(parsed_sum),
														
 
															+                                        num_v, 1)
														
 
															+                            else:
														
 
															+                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
														
 
															+                                if mb:
														
 
															+                                    parsed_sum_b = int(m[1]) + int(m[2])
														
 
															+                                    num_v = sub(
														
 
															+                                            "[1-9]0+ [0-9]+",
														
 
															+                                            str(parsed_sum_b), num_v, 1)
														
 
															+                                else:
														
 
															+                                    break
														
 
															+                        # A few other tweaks
														
 
															+                        num_v = sub(
														
 
															+                                "([0-9]) ([0-9]) ([0-9]) ([0-9])",
														
 
															+                                r"\1\2\3\4", num_v)
														
 
															+                        if ctx.options.get("marc_field") in ("245", "830"):
														
 
															+                            # TODO optimize without loop.
														
 
															+                            while match("[0-9] [0-9]", num_v):
														
 
															+                                num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
														
 
															+
														
 
															+                    output += num_v if use_num_v else text_v
														
 
															+
														
 
															+                    # if the end of the string is not reached, backtrack to the
														
 
															+                    # delimiter after the last numerical token (i.e. two tokens
														
 
															+                    # ago)
														
 
															+
														
 
															+                    i = j - 2 if j < tk_ct - 1 else j
														
 
															+                    break
														
 
															+
														
 
															+                # this is run when we are not yet at the end of the string and
														
 
															+                # have not yet reached a non-numerical token. This is identical
														
 
															+                # to the code that is run above when the last token is numeric.
														
 
															+
														
 
															+                if j % 2 == 0:
														
 
															+                    m = match(token_ptn, tk_j)
														
 
															+                    text_v += m[1]
														
 
															+                    num_v += m[2] if m[2] else m[1]
														
 
															+                else:
														
 
															+                    text_v += tk_j
														
 
															+                    num_v += tk_j
														
 
															+
														
 
															+        else:
														
 
															+            output += tk_i
														
 
															+
														
 
															+    ctx.dest = output
														
--- a/scriptshifter/tables/data/chinese.yml
+++ b/scriptshifter/tables/data/chinese.yml
@@ -3,12 +3,25 @@ general: # Section names and other keywords are all snake_cased.
 
															   parents:
														
 
															     - _ignore_base
														
 
															+options:
														
 
															+  - id: marc_field
														
 
															+    label: MARC field
														
 
															+    description: Romanize according to a specific MARC field format. Leave blank if not applicable.
														
 
															+    type: string
														
 
															+    default:
														
 
															+
														
 
															 script_to_roman:
														
 
															   directives: # Directives section.
														
 
															     # Capitalize the first letter of the string only; TODO
														
 
															     # Implement a list that includes all punctuation marks that
														
 
															     # want the following letter capitalized.
														
 
															     capitalize: true
														
 
															+
														
 
															+  hooks:
														
 
															+    post_assembly:
														
 
															+      -
														
 
															+        - chinese.parse_numerals
														
 
															+
														
 
															   map: # Mapping section.
														
 
															     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
														
 
															     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "