1 yıl önce · f4bf40f8b6
--- a/scriptshifter/hooks/chinese/__init__.py
+++ b/scriptshifter/hooks/chinese/__init__.py
@@ -0,0 +1,112 @@
 
				+__doc__ = """Chinese hooks."""
			
 
				+
			
 
				+
			
 
				+from re import I, compile, match, split, sub
			
 
				+
			
 
				+
			
 
				+def parse_numerals(ctx):
			
 
				+    """
			
 
				+    Parse Chinese numerals in the already romanized result.
			
 
				+
			
 
				+    This is run at post-assembly.
			
 
				+    """
			
 
				+    # Only apply to specific MARC fields.
			
 
				+    use_num_v = ctx.options.get("marc_field") in ("245", "830")
			
 
				+
			
 
				+    tokens = split(r"[\W^#]", ctx.dest)
			
 
				+    tk_ct = len(tokens)
			
 
				+
			
 
				+    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
			
 
				+
			
 
				+    output = ""
			
 
				+
			
 
				+    # Use manual loop as i is manipulated inside it.
			
 
				+    i = 0
			
 
				+    while i < tk_ct:
			
 
				+        tk_i = tokens[i]
			
 
				+        if match(token_ptn, tk_i):
			
 
				+            text_v = num_v = ""
			
 
				+            for j, tk_j in enumerate(tokens):
			
 
				+                m = match(token_ptn, tk_j)
			
 
				+                # a token without # (or the end of string) is reached
			
 
				+                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
			
 
				+                    # If this runs, then we are on the last token and it is
			
 
				+                    # numeric. Add text after # (if present) to numerical
			
 
				+                    # version
			
 
				+                    if m:
			
 
				+                        text_v += m[1]
			
 
				+                        num_v += m[2] if m[2] else m[1]
			
 
				+                    elif j == tk_ct - 1:
			
 
				+                        # if last token is non-numerical, just tack it on.
			
 
				+                        text_v += tk_j
			
 
				+                        num_v += tk_j
			
 
				+                    elif len(text_v) and len(num_v):
			
 
				+                        # if not at end of string yet and token is
			
 
				+                        # non-numerical, remove the last delimiter that was
			
 
				+                        # appended (outer loop will pick up at this point)
			
 
				+                        text_v = text_v[:-1]
			
 
				+                        num_v = num_v[:-1]
			
 
				+                    # evaluate numerical string that has been constructed so
			
 
				+                    # far. Use num version for ordinals and date strings
			
 
				+                    if (
			
 
				+                        match("^di [0-9]", num_v, flags=I) or
			
 
				+                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
			
 
				+                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
			
 
				+                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
			
 
				+                    ):
			
 
				+                        use_num_v = True
			
 
				+                        # At this point, string may contain literal
			
 
				+                        # translations of Chinese numerals Convert these to
			
 
				+                        # Arabic numerals (for example "2 10 7" = "27").
			
 
				+                        while (
			
 
				+                                match(num_v, "[0-9] 10+") or
			
 
				+                                match(num_v, "[1-9]0+ [1-9]")):
			
 
				+                            m = match(num_v, "([0-9]+) ([1-9]0+)")
			
 
				+                            if m:
			
 
				+                                parsed_sum = int(m[1]) + int(m[2])
			
 
				+                                num_v = sub(
			
 
				+                                        "[0-9]+ [1-9]0+", str(parsed_sum),
			
 
				+                                        num_v, 1)
			
 
				+                            else:
			
 
				+                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
			
 
				+                                if mb:
			
 
				+                                    parsed_sum_b = int(m[1]) + int(m[2])
			
 
				+                                    num_v = sub(
			
 
				+                                            "[1-9]0+ [0-9]+",
			
 
				+                                            str(parsed_sum_b), num_v, 1)
			
 
				+                                else:
			
 
				+                                    break
			
 
				+                        # A few other tweaks
			
 
				+                        num_v = sub(
			
 
				+                                "([0-9]) ([0-9]) ([0-9]) ([0-9])",
			
 
				+                                r"\1\2\3\4", num_v)
			
 
				+                        if ctx.options.get("marc_field") in ("245", "830"):
			
 
				+                            # TODO optimize without loop.
			
 
				+                            while match("[0-9] [0-9]", num_v):
			
 
				+                                num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
			
 
				+
			
 
				+                    output += num_v if use_num_v else text_v
			
 
				+
			
 
				+                    # if the end of the string is not reached, backtrack to the
			
 
				+                    # delimiter after the last numerical token (i.e. two tokens
			
 
				+                    # ago)
			
 
				+
			
 
				+                    i = j - 2 if j < tk_ct - 1 else j
			
 
				+                    break
			
 
				+
			
 
				+                # this is run when we are not yet at the end of the string and
			
 
				+                # have not yet reached a non-numerical token. This is identical
			
 
				+                # to the code that is run above when the last token is numeric.
			
 
				+
			
 
				+                if j % 2 == 0:
			
 
				+                    m = match(token_ptn, tk_j)
			
 
				+                    text_v += m[1]
			
 
				+                    num_v += m[2] if m[2] else m[1]
			
 
				+                else:
			
 
				+                    text_v += tk_j
			
 
				+                    num_v += tk_j
			
 
				+
			
 
				+        else:
			
 
				+            output += tk_i
			
 
				+
			
 
				+    ctx.dest = output
			
--- a/scriptshifter/tables/data/chinese.yml
+++ b/scriptshifter/tables/data/chinese.yml
@@ -3,12 +3,25 @@ general: # Section names and other keywords are all snake_cased.
 
				   parents:
			
 
				     - _ignore_base
			
 
				 
			
 
				+options:
			
 
				+  - id: marc_field
			
 
				+    label: MARC field
			
 
				+    description: Romanize according to a specific MARC field format. Leave blank if not applicable.
			
 
				+    type: string
			
 
				+    default:
			
 
				+
			
 
				 script_to_roman:
			
 
				   directives: # Directives section.
			
 
				     # Capitalize the first letter of the string only; TODO
			
 
				     # Implement a list that includes all punctuation marks that
			
 
				     # want the following letter capitalized.
			
 
				     capitalize: true
			
 
				+
			
 
				+  hooks:
			
 
				+    post_assembly:
			
 
				+      -
			
 
				+        - chinese.parse_numerals
			
 
				+
			
 
				   map: # Mapping section.
			
 
				     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
			
 
				     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "