Kaynağa Gözat

WIP Parse Chinese numerals.

scossu 1 yıl önce
ebeveyn
işleme
f4bf40f8b6

+ 112 - 0
scriptshifter/hooks/chinese/__init__.py

@@ -0,0 +1,112 @@
+__doc__ = """Chinese hooks."""
+
+
+from re import I, compile, match, split, sub
+
+
+def parse_numerals(ctx):
+    """
+    Parse Chinese numerals in the already romanized result.
+
+    This is run at post-assembly.
+    """
+    # Only apply to specific MARC fields.
+    use_num_v = ctx.options.get("marc_field") in ("245", "830")
+
+    tokens = split(r"[\W^#]", ctx.dest)
+    tk_ct = len(tokens)
+
+    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
+
+    output = ""
+
+    # Use manual loop as i is manipulated inside it.
+    i = 0
+    while i < tk_ct:
+        tk_i = tokens[i]
+        if match(token_ptn, tk_i):
+            text_v = num_v = ""
+            for j, tk_j in enumerate(tokens):
+                m = match(token_ptn, tk_j)
+                # a token without # (or the end of string) is reached
+                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
+                    # If this runs, then we are on the last token and it is
+                    # numeric. Add text after # (if present) to numerical
+                    # version
+                    if m:
+                        text_v += m[1]
+                        num_v += m[2] if m[2] else m[1]
+                    elif j == tk_ct - 1:
+                        # if last token is non-numerical, just tack it on.
+                        text_v += tk_j
+                        num_v += tk_j
+                    elif len(text_v) and len(num_v):
+                        # if not at end of string yet and token is
+                        # non-numerical, remove the last delimiter that was
+                        # appended (outer loop will pick up at this point)
+                        text_v = text_v[:-1]
+                        num_v = num_v[:-1]
+                    # evaluate numerical string that has been constructed so
+                    # far. Use num version for ordinals and date strings
+                    if (
+                        match("^di [0-9]", num_v, flags=I) or
+                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
+                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
+                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
+                    ):
+                        use_num_v = True
+                        # At this point, string may contain literal
+                        # translations of Chinese numerals Convert these to
+                        # Arabic numerals (for example "2 10 7" = "27").
+                        while (
+                                match(num_v, "[0-9] 10+") or
+                                match(num_v, "[1-9]0+ [1-9]")):
+                            m = match(num_v, "([0-9]+) ([1-9]0+)")
+                            if m:
+                                parsed_sum = int(m[1]) + int(m[2])
+                                num_v = sub(
+                                        "[0-9]+ [1-9]0+", str(parsed_sum),
+                                        num_v, 1)
+                            else:
+                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
+                                if mb:
+                                    parsed_sum_b = int(m[1]) + int(m[2])
+                                    num_v = sub(
+                                            "[1-9]0+ [0-9]+",
+                                            str(parsed_sum_b), num_v, 1)
+                                else:
+                                    break
+                        # A few other tweaks
+                        num_v = sub(
+                                "([0-9]) ([0-9]) ([0-9]) ([0-9])",
+                                r"\1\2\3\4", num_v)
+                        if ctx.options.get("marc_field") in ("245", "830"):
+                            # TODO optimize without loop.
+                            while match("[0-9] [0-9]", num_v):
+                                num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
+
+                    output += num_v if use_num_v else text_v
+
+                    # if the end of the string is not reached, backtrack to the
+                    # delimiter after the last numerical token (i.e. two tokens
+                    # ago)
+
+                    i = j - 2 if j < tk_ct - 1 else j
+                    break
+
+                # this is run when we are not yet at the end of the string and
+                # have not yet reached a non-numerical token. This is identical
+                # to the code that is run above when the last token is numeric.
+
+                if j % 2 == 0:
+                    m = match(token_ptn, tk_j)
+                    text_v += m[1]
+                    num_v += m[2] if m[2] else m[1]
+                else:
+                    text_v += tk_j
+                    num_v += tk_j
+
+        else:
+            output += tk_i
+
+    ctx.dest = output

+ 13 - 0
scriptshifter/tables/data/chinese.yml

@@ -3,12 +3,25 @@ general: # Section names and other keywords are all snake_cased.
   parents:
     - _ignore_base
 
+options:
+  - id: marc_field
+    label: MARC field
+    description: Romanize according to a specific MARC field format. Leave blank if not applicable.
+    type: string
+    default:
+
 script_to_roman:
   directives: # Directives section.
     # Capitalize the first letter of the string only; TODO
     # Implement a list that includes all punctuation marks that
     # want the following letter capitalized.
     capitalize: true
+
+  hooks:
+    post_assembly:
+      -
+        - chinese.parse_numerals
+
   map: # Mapping section.
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "