1 ano atrás · 4f3d021c63
--- a/scriptshifter/hooks/chinese/__init__.py
+++ b/scriptshifter/hooks/chinese/__init__.py
@@ -1,7 +1,33 @@
 
															 __doc__ = """Chinese hooks."""
														
 
															-from re import I, compile, match, split, sub
														
 
															+from logging import getLogger
														
 
															+from os import path
														
 
															+from re import I, compile, search, sub
														
 
															+
														
 
															+from yaml import load
														
 
															+try:
														
 
															+    from yaml import CLoader as Loader
														
 
															+except ImportError:
														
 
															+    from yaml import Loader
														
 
															+
														
 
															+
														
 
															+HOOK_DIR = path.dirname(path.realpath(__file__))
														
 
															+
														
 
															+logger = getLogger(__name__)
														
 
															+
														
 
															+
														
 
															+def merge_numerals_pre_config(tdata):
														
 
															+    """
														
 
															+    Add numerals mapping to configuration.
														
 
															+
														
 
															+    This overrides the existing character mappings.
														
 
															+    """
														
 
															+    num_map_yml = path.join(HOOK_DIR, "numerals.yml")
														
 
															+    with open(num_map_yml, "r") as fh:
														
 
															+        num_map = load(fh, Loader=Loader)
														
 
															+
														
 
															+    tdata["script_to_roman"]["map"].update(num_map)
														
 
															 def parse_numerals(ctx):
														
@@ -13,100 +39,111 @@ def parse_numerals(ctx):
 
															     # Only apply to specific MARC fields.
														
 
															     use_num_v = ctx.options.get("marc_field") in ("245", "830")
														
 
															-    tokens = split(r"[\W^#]", ctx.dest)
														
 
															+    # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
														
 
															+    tokens = [tk.strip() for tk in ctx.dest_ls]
														
 
															     tk_ct = len(tokens)
														
 
															-
														
 
															-    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
														
 
															+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")
														
 
															     output = ""
														
 
															     # Use manual loop as i is manipulated inside it.
														
 
															     i = 0
														
 
															+
														
 
															     while i < tk_ct:
														
 
															         tk_i = tokens[i]
														
 
															-        if match(token_ptn, tk_i):
														
 
															+        if search(token_ptn, tk_i):
														
 
															+            # When a numerical token (containing #) is reached, the inner loop
														
 
															+            # consumes it and all consecutive numerical tokens found after it.
														
 
															+            # Two versions of the string are maintained. The text_v string is
														
 
															+            # the original pinyin (minus the # suffixes). In the num_v string,
														
 
															+            # characters representing numbers are converted to Arabic
														
 
															+            # numerals. When a non-numerical token (or end of string) is
														
 
															+            # encountered, the string of numerical tokens is evaluated to
														
 
															+            # determine which version should be used in the output string.
														
 
															+            # The outer loop then continues where the inner loop left off.
														
 
															+            logger.debug(f"Match number: {tk_i}")
														
 
															             text_v = num_v = ""
														
 
															-            for j, tk_j in enumerate(tokens):
														
 
															-                m = match(token_ptn, tk_j)
														
 
															+            for j in range(i, tk_ct):
														
 
															+                tk_j = tokens[j]
														
 
															+                m = search(token_ptn, tk_j)
														
 
															+                # if m:
														
 
															+                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
														
 
															                 # a token without # (or the end of string) is reached
														
 
															-                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
														
 
															+                if not m or j == tk_ct - 1:
														
 
															+                    logger.debug(f"Next token is not numeric: {tk_j}")
														
 
															                     # If this runs, then we are on the last token and it is
														
 
															                     # numeric. Add text after # (if present) to numerical
														
 
															                     # version
														
 
															                     if m:
														
 
															-                        text_v += m[1]
														
 
															-                        num_v += m[2] if m[2] else m[1]
														
 
															+                        text_v += m[1] + " "
														
 
															+                        num_v += m[2] if len(m[2]) else m[1]
														
 
															+                        # Append white space.
														
 
															+                        num_v += " "
														
 
															                     elif j == tk_ct - 1:
														
 
															                         # if last token is non-numerical, just tack it on.
														
 
															+                        logger.debug(f"Last token is non-numerical: {tk_j}")
														
 
															                         text_v += tk_j
														
 
															                         num_v += tk_j
														
 
															-                    elif len(text_v) and len(num_v):
														
 
															-                        # if not at end of string yet and token is
														
 
															-                        # non-numerical, remove the last delimiter that was
														
 
															-                        # appended (outer loop will pick up at this point)
														
 
															-                        text_v = text_v[:-1]
														
 
															-                        num_v = num_v[:-1]
														
 
															                     # evaluate numerical string that has been constructed so
														
 
															                     # far. Use num version for ordinals and date strings
														
 
															                     if (
														
 
															-                        match("^di [0-9]", num_v, flags=I) or
														
 
															-                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
														
 
															-                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
														
 
															-                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
														
 
															+                        search("^di [0-9]", num_v, flags=I) or
														
 
															+                        search("[0-9] [0-9] [0-9] [0-9]", num_v) or
														
 
															+                        search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
														
 
															+                        search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
														
 
															                     ):
														
 
															                         use_num_v = True
														
 
															                         # At this point, string may contain literal
														
 
															                         # translations of Chinese numerals. Convert these to
														
 
															                         # Arabic numerals (for example "2 10 7" = "27").
														
 
															-                        while (
														
 
															-                                match(num_v, "[0-9] 10+") or
														
 
															-                                match(num_v, "[1-9]0+ [1-9]")):
														
 
															-                            m = match(num_v, "([0-9]+) ([1-9]0+)")
														
 
															-                            if m:
														
 
															-                                parsed_sum = int(m[1]) + int(m[2])
														
 
															-                                num_v = sub(
														
 
															-                                        "[0-9]+ [1-9]0+", str(parsed_sum),
														
 
															-                                        num_v, 1)
														
 
															+                        mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
														
 
															+                        sum_ptn = compile("([1-9]0+) ([0-9]+)")
														
 
															+                        while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
														
 
															+                            logger.debug(f"Match number combination: {_m}")
														
 
															+                            if m := mult_ptn.search(num_v):
														
 
															+                                logger.debug(f"Multiply: {m[1]}, {m[2]}")
														
 
															+                                parsed = int(m[1]) * int(m[2])
														
 
															+                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
														
 
															+                            elif m := sum_ptn.search(num_v):
														
 
															+                                logger.debug(f"Add: {m[1]}, {m[2]}")
														
 
															+                                parsed = int(m[1]) + int(m[2])
														
 
															+                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
														
 
															                             else:
														
 
															-                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
														
 
															-                                if mb:
														
 
															-                                    parsed_sum_b = int(m[1]) + int(m[2])
														
 
															-                                    num_v = sub(
														
 
															-                                            "[1-9]0+ [0-9]+",
														
 
															-                                            str(parsed_sum_b), num_v, 1)
														
 
															-                                else:
														
 
															-                                    break
														
 
															+                                break
														
 
															                         # A few other tweaks
														
 
															                         num_v = sub(
														
 
															                                 "([0-9]) ([0-9]) ([0-9]) ([0-9])",
														
 
															                                 r"\1\2\3\4", num_v)
														
 
															                         if ctx.options.get("marc_field") in ("245", "830"):
														
 
															                             # TODO optimize without loop.
														
 
															-                            while match("[0-9] [0-9]", num_v):
														
 
															+                            while search("[0-9] [0-9]", num_v):
														
 
															                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
														
 
															+                    logger.debug(f"num_v: {num_v}")
														
 
															+                    logger.debug(f"text_v: {text_v}")
														
 
															                     output += num_v if use_num_v else text_v
														
 
															                     # if the end of the string is not reached, backtrack to the
														
 
															                     # delimiter after the last numerical token (i.e. two tokens
														
 
															-                    # ago)
														
 
															-
														
 
															-                    i = j - 2 if j < tk_ct - 1 else j
														
 
															+                    # ago).
														
 
															+                    #
														
 
															+                    # Else, we are at the end of the string, so we are done!
														
 
															+                    i = j - 1 if j < tk_ct - 1 else j
														
 
															                     break
														
 
															                 # this is run when we are not yet at the end of the string and
														
 
															                 # have not yet reached a non-numerical token. This is identical
														
 
															                 # to the code that is run above when the last token is numeric.
														
 
															-
														
 
															-                if j % 2 == 0:
														
 
															-                    m = match(token_ptn, tk_j)
														
 
															-                    text_v += m[1]
														
 
															-                    num_v += m[2] if m[2] else m[1]
														
 
															-                else:
														
 
															-                    text_v += tk_j
														
 
															-                    num_v += tk_j
														
 
															+                m = search(token_ptn, tk_j)
														
 
															+                text_v += m[1] + " "
														
 
															+                num_v += m[2] if len(m[2]) else m[1]
														
 
															+                num_v += " "
														
 
															         else:
														
 
															-            output += tk_i
														
 
															+            logger.debug(f"No match: adding {tk_i}.")
														
 
															+            output += tk_i + " "
														
 
															+
														
 
															+        i += 1
														
 
															+    print(f"Use num version: {use_num_v}")
														
 
															     ctx.dest = output
														
--- a/scriptshifter/hooks/chinese/processNumbers.ts
+++ b/scriptshifter/hooks/chinese/processNumbers.ts
@@ -0,0 +1,144 @@
 
															+private processNumbers(pinyinString: string, tag: string, code: string): string {
														
 
															+    let outputString = "";
														
 
															+    let useNumVersion = false;
														
 
															+    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
														
 
															+    if ((tag == "245" || tag == "830") && code == "n") {
														
 
															+       useNumVersion = true;
														
 
															+    }
														
 
															+
														
 
															+    /*
														
 
															+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
														
 
															+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
														
 
															+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
														
 
															+     * values of j.
														
 
															+     */
														
 
															+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
														
 
															+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
														
 
															+    let numToken_re = new RegExp(numTokenPattern);
														
 
															+    let n = tokens.length
														
 
															+    //this.alert.info(tokens.join("|"),{autoClose: false})
														
 
															+    for (let i = 0; i < n; i++) {
														
 
															+        let toki = tokens[i];
														
 
															+        if (toki.match(numToken_re)) {
														
 
															+            /*
														
 
															+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
														
 
															+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
														
 
															+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
														
 
															+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
														
 
															+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
														
 
															+             */
														
 
															+            let textVersion = "";
														
 
															+            let numVersion = "";
														
 
															+            for (let j = i; j < n; j++) {
														
 
															+                let tokj = tokens[j];
														
 
															+                /* a token without # (or the end of string) is reached */
														
 
															+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
														
 
															+                    //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
														
 
															+                    let m = tokj.match(numToken_re);
														
 
															+                    if (m) {
														
 
															+                        textVersion += m[1]
														
 
															+                        if (m[2] == "") {
														
 
															+                            numVersion += m[1];
														
 
															+                        } else {
														
 
															+                            numVersion += m[2];
														
 
															+                        }
														
 
															+                    } else if (j == n - 1) {
														
 
															+                    //if last token is non-numerical, just tack it on.
														
 
															+                        textVersion += tokj;
														
 
															+                        numVersion += tokj;
														
 
															+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
														
 
															+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
														
 
															+                    //(outer loop will pick up at this point)
														
 
															+                        textVersion = textVersion.substring(0, textVersion.length - 1);
														
 
															+                        numVersion = numVersion.substring(0, numVersion.length - 1);
														
 
															+                    }
														
 
															+                    //evaluate numerical string that has been constructed so far
														
 
															+                    //use num version for ordinals and date strings
														
 
															+                    if (numVersion.match(/^di [0-9]/i) ||
														
 
															+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
														
 
															+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
														
 
															+                        numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) ||
														
 
															+                        useNumVersion
														
 
															+                       ) {
														
 
															+                        useNumVersion = true;
														
 
															+                        /*
														
 
															+                         * At this point, string may contain literal translations of Chinese numerals
														
 
															+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
														
 
															+                         */
														
 
															+
														
 
															+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
														
 
															+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
														
 
															+                            if (m) {
														
 
															+                                let sum = Number(m[1]) * Number(m[2]);
														
 
															+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
														
 
															+                            } else {
														
 
															+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
														
 
															+                                if (mb)
														
 
															+                                {
														
 
															+                                    let sumb = Number(mb[1]) + Number(mb[2]);
														
 
															+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
														
 
															+                                }
														
 
															+                                else
														
 
															+                                {
														
 
															+                                    break;
														
 
															+                                }
														
 
															+                            }
														
 
															+                        }
														
 
															+
														
 
															+                        //A few other tweaks
														
 
															+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
														
 
															+                        if ((tag == "245" || tag == "830") && code == "n") {
														
 
															+                            while (numVersion.match(/[0-9] [0-9]/)) {
														
 
															+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
														
 
															+                            }
														
 
															+                        }
														
 
															+                    }
														
 
															+                    if (useNumVersion)
														
 
															+                    {
														
 
															+                        outputString += numVersion;
														
 
															+                    }
														
 
															+                    else
														
 
															+                    {
														
 
															+                        outputString += textVersion;
														
 
															+                    }
														
 
															+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
														
 
															+                    //(i.e. two tokens ago)
														
 
															+                    if (j < n - 1)
														
 
															+                    {
														
 
															+                        i = j - 2;
														
 
															+                    }
														
 
															+                    else //we are at the end of the string, so we are done!
														
 
															+                    {
														
 
															+                        i = j;
														
 
															+                    }
														
 
															+                    break;
														
 
															+                }
														
 
															+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
														
 
															+                //This is identical to the code that is run above when the last token is numeric.
														
 
															+                if (j % 2 == 0)
														
 
															+                {
														
 
															+                    let m = tokj.match(numToken_re);
														
 
															+                    textVersion += m[1];
														
 
															+                    if (m[2]== "")
														
 
															+                    {
														
 
															+                        numVersion += m[1];
														
 
															+                    }
														
 
															+                    else
														
 
															+                    {
														
 
															+                        numVersion += m[2];
														
 
															+                    }
														
 
															+                }
														
 
															+                else //a delimiter, just tack it on.
														
 
															+                {
														
 
															+                    textVersion += tokj;
														
 
															+                    numVersion += tokj;
														
 
															+                }
														
 
															+            }
														
 
															+        }
														
 
															+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
														
 
															+        {
														
 
															+            outputString += toki;
														
 
															+        }
														
 
															+    }
														
 
															+    return outputString;
														
 
															+ }