há 1 ano atrás · 4f3d021c63
--- a/scriptshifter/hooks/chinese/__init__.py
+++ b/scriptshifter/hooks/chinese/__init__.py
@@ -1,7 +1,33 @@
 
				 __doc__ = """Chinese hooks."""
			
 
				 
			
 
				 
			
 
				-from re import I, compile, match, split, sub
			
 
				+from logging import getLogger
			
 
				+from os import path
			
 
				+from re import I, compile, search, sub
			
 
				+
			
 
				+from yaml import load
			
 
				+try:
			
 
				+    from yaml import CLoader as Loader
			
 
				+except ImportError:
			
 
				+    from yaml import Loader
			
 
				+
			
 
				+
			
 
				+HOOK_DIR = path.dirname(path.realpath(__file__))
			
 
				+
			
 
				+logger = getLogger(__name__)
			
 
				+
			
 
				+
			
 
				def merge_numerals_pre_config(tdata):
				    """
				    Add numerals mapping to configuration.

				    The mapping is loaded from the ``numerals.yml`` file located in the
				    same directory as this module and merged in place into
				    ``tdata["script_to_roman"]["map"]``. Keys already present in that map
				    are overridden by the values from the file.

				    :param dict tdata: configuration data; must contain a
				        ``script_to_roman`` entry with a ``map`` entry supporting
				        ``update()``.
				    """
				    num_map_yml = path.join(HOOK_DIR, "numerals.yml")
				    # YAML is a Unicode format; read it explicitly as UTF-8 instead of
				    # relying on the platform's default encoding, which may differ.
				    # NOTE(review): the file presumably contains CJK keys - confirm.
				    with open(num_map_yml, "r", encoding="utf-8") as fh:
				        # NOTE(review): the full (non-safe) YAML loader is used here. This
				        # is only appropriate because the file ships with the package;
				        # do not point this at externally supplied data.
				        num_map = load(fh, Loader=Loader)

				    tdata["script_to_roman"]["map"].update(num_map)
			
 
				 
			
 
				 
			
 
				 def parse_numerals(ctx):
			
@@ -13,100 +39,111 @@ def parse_numerals(ctx):
 
				     # Only apply to specific MARC fields.
			
 
				     use_num_v = ctx.options.get("marc_field") in ("245", "830")
			
 
				 
			
 
				-    tokens = split(r"[\W^#]", ctx.dest)
			
 
				+    # tokens = split(r"[\W^#]", ctx.dest)  # Original logic.
			
 
				+    tokens = [tk.strip() for tk in ctx.dest_ls]
			
 
				     tk_ct = len(tokens)
			
 
				-
			
 
				-    token_ptn = compile("^([A-Za-z]+)#([0-9]*)$")
			
 
				+    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)$")
			
 
				 
			
 
				     output = ""
			
 
				 
			
 
				     # Use manual loop as i is manipulated inside it.
			
 
				     i = 0
			
 
				+
			
 
				     while i < tk_ct:
			
 
				         tk_i = tokens[i]
			
 
				-        if match(token_ptn, tk_i):
			
 
				+        if search(token_ptn, tk_i):
			
 
				+            # When a numerical token (containing #) is reached, the inner loop
			
 
				+            # consumes it and all consecutive numerical tokens found after it.
			
 
				+            # Two versions of the string are maintained. The textVersion is
			
 
				+            # the original pinyin (minus the # suffixes). In the numVersion,
			
 
				+            # characters representing numbers are converted to Arabic
			
 
				+            # numerals. When a non-numerical token (or end of string) is
			
 
				+            # encountered, the string of numerical tokens is evaluated to
			
 
				+            # determine which version should be used in the output string.
			
 
				+            # The outer loop then continues where the inner loop left off.
			
 
				+            logger.debug(f"Match number: {tk_i}")
			
 
				             text_v = num_v = ""
			
 
				-            for j, tk_j in enumerate(tokens):
			
 
				-                m = match(token_ptn, tk_j)
			
 
				+            for j in range(i, tk_ct):
			
 
				+                tk_j = tokens[j]
			
 
				+                m = search(token_ptn, tk_j)
			
 
				+                # if m:
			
 
				+                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
			
 
				                 # a token without # (or the end of string) is reached
			
 
				-                if ((j % 2 == 0 and not m) or j == len(tokens) - 1):
			
 
				+                if not m or j == tk_ct - 1:
			
 
				+                    logger.debug(f"Next token is not numeric: {tk_j}")
			
 
				                     # If this runs, then we are on the last token and it is
			
 
				                     # numeric. Add text after # (if present) to numerical
			
 
				                     # version
			
 
				                     if m:
			
 
				-                        text_v += m[1]
			
 
				-                        num_v += m[2] if m[2] else m[1]
			
 
				+                        text_v += m[1] + " "
			
 
				+                        num_v += m[2] if len(m[2]) else m[1]
			
 
				+                        # Append white space.
			
 
				+                        num_v += " "
			
 
				                     elif j == tk_ct - 1:
			
 
				                         # if last token is non-numerical, just tack it on.
			
 
				+                        logger.debug(f"Last token is non-numerical: {tk_j}")
			
 
				                         text_v += tk_j
			
 
				                         num_v += tk_j
			
 
				-                    elif len(text_v) and len(num_v):
			
 
				-                        # if not at end of string yet and token is
			
 
				-                        # non-numerical, remove the last delimiter that was
			
 
				-                        # appended (outer loop will pick up at this point)
			
 
				-                        text_v = text_v[:-1]
			
 
				-                        num_v = num_v[:-1]
			
 
				                     # evaluate numerical string that has been constructed so
			
 
				                     # far. Use num version for ordinals and date strings
			
 
				                     if (
			
 
				-                        match("^di [0-9]", num_v, flags=I) or
			
 
				-                        match("[0-9] [0-9] [0-9] [0-9]", num_v) or
			
 
				-                        match("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
			
 
				-                        match("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
			
 
				+                        search("^di [0-9]", num_v, flags=I) or
			
 
				+                        search("[0-9] [0-9] [0-9] [0-9]", num_v) or
			
 
				+                        search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
			
 
				+                        search("\"[0-9]+ yue [0-9]+ ri", num_v, flags=I)
			
 
				                     ):
			
 
				                         use_num_v = True
			
 
				                         # At this point, string may contain literal
			
 
				                         # translations of Chinese numerals Convert these to
			
 
				                         # Arabic numerals (for example "2 10 7" = "27").
			
 
				-                        while (
			
 
				-                                match(num_v, "[0-9] 10+") or
			
 
				-                                match(num_v, "[1-9]0+ [1-9]")):
			
 
				-                            m = match(num_v, "([0-9]+) ([1-9]0+)")
			
 
				-                            if m:
			
 
				-                                parsed_sum = int(m[1]) + int(m[2])
			
 
				-                                num_v = sub(
			
 
				-                                        "[0-9]+ [1-9]0+", str(parsed_sum),
			
 
				-                                        num_v, 1)
			
 
				+                        mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
			
 
				+                        sum_ptn = compile("([1-9]0+) ([0-9]+)")
			
 
				+                        while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
			
 
				+                            logger.debug(f"Match number combination: {_m}")
			
 
				+                            if m := mult_ptn.search(num_v):
			
 
				+                                logger.debug(f"Multiply: {m[1]}, {m[2]}")
			
 
				+                                parsed = int(m[1]) * int(m[2])
			
 
				+                                num_v = mult_ptn.sub(str(parsed), num_v, 1)
			
 
				+                            elif m := sum_ptn.search(num_v):
			
 
				+                                logger.debug(f"Add: {m[1]}, {m[2]}")
			
 
				+                                parsed = int(m[1]) + int(m[2])
			
 
				+                                num_v = sum_ptn.sub(str(parsed), num_v, 1)
			
 
				                             else:
			
 
				-                                mb = match(num_v, "([1-9]0+) ([0-9]+)")
			
 
				-                                if mb:
			
 
				-                                    parsed_sum_b = int(m[1]) + int(m[2])
			
 
				-                                    num_v = sub(
			
 
				-                                            "[1-9]0+ [0-9]+",
			
 
				-                                            str(parsed_sum_b), num_v, 1)
			
 
				-                                else:
			
 
				-                                    break
			
 
				+                                break
			
 
				                         # A few other tweaks
			
 
				                         num_v = sub(
			
 
				                                 "([0-9]) ([0-9]) ([0-9]) ([0-9])",
			
 
				                                 r"\1\2\3\4", num_v)
			
 
				                         if ctx.options.get("marc_field") in ("245", "830"):
			
 
				                             # TODO optimize without loop.
			
 
				-                            while match("[0-9] [0-9]", num_v):
			
 
				+                            while search("[0-9] [0-9]", num_v):
			
 
				                                 num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
			
 
				 
			
 
				+                    logger.debug(f"num_v: {num_v}")
			
 
				+                    logger.debug(f"text_v: {text_v}")
			
 
				                     output += num_v if use_num_v else text_v
			
 
				 
			
 
				                     # if the end of the string is not reached, backtrack to the
			
 
				                     # delimiter after the last numerical token (i.e. two tokens
			
 
				-                    # ago)
			
 
				-
			
 
				-                    i = j - 2 if j < tk_ct - 1 else j
			
 
				+                    # ago).
			
 
				+                    #
			
 
				+                    # Else, we are at the end of the string, so we are done!
			
 
				+                    i = j - 1 if j < tk_ct - 1 else j
			
 
				                     break
			
 
				 
			
 
				                 # this is run when we are not yet at the end of the string and
			
 
				                 # have not yet reached a non-numerical token. This is identical
			
 
				                 # to the code that is run above when the last token is numeric.
			
 
				-
			
 
				-                if j % 2 == 0:
			
 
				-                    m = match(token_ptn, tk_j)
			
 
				-                    text_v += m[1]
			
 
				-                    num_v += m[2] if m[2] else m[1]
			
 
				-                else:
			
 
				-                    text_v += tk_j
			
 
				-                    num_v += tk_j
			
 
				+                m = search(token_ptn, tk_j)
			
 
				+                text_v += m[1] + " "
			
 
				+                num_v += m[2] if len(m[2]) else m[1]
			
 
				+                num_v += " "
			
 
				 
			
 
				         else:
			
 
				-            output += tk_i
			
 
				+            logger.debug(f"No match: adding {tk_i}.")
			
 
				+            output += tk_i + " "
			
 
				+
			
 
				+        i += 1
			
 
				 
			
 
				+    print(f"Use num version: {use_num_v}")
			
 
				     ctx.dest = output
			
--- a/scriptshifter/hooks/chinese/processNumbers.ts
+++ b/scriptshifter/hooks/chinese/processNumbers.ts
@@ -0,0 +1,144 @@
 
				/**
				 * Replace `#`-marked numeric tokens in a pinyin string.
				 *
				 * A numeric token has the form `<letters>#<digits>`; the digits may be
				 * empty. Each run of consecutive numeric tokens is rendered either as
				 * text (the letters) or as numerals (the digits, falling back to the
				 * letters for a token that has none). The numeral form is used when tag
				 * is 245/830 and code is "n", or when the run looks like an ordinal
				 * ("di 3"), a four-digit sequence, or a date. Once the numeral form has
				 * been selected it stays in effect for the rest of the string.
				 *
				 * @param pinyinString input whose numeric tokens are marked with `#`
				 * @param tag field tag; "245" and "830" receive special handling
				 * @param code subfield code; "n" receives special handling
				 * @returns the input with its numeric tokens replaced
				 */
				private processNumbers(pinyinString: string, tag: string, code: string): string {
				    // In these subfields numbers are always treated as numbers.
				    const forceNumeric = (tag === "245" || tag === "830") && code === "n";
				    let useNumVersion = forceNumeric;
				    let outputString = "";

				    /*
				     * Split on any whitespace or punctuation character except '#'. The
				     * delimiters are captured, so even indexes hold the real tokens and
				     * odd indexes hold the single-character delimiters.
				     */
				    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
				    const numToken_re = /^([A-Za-z]+)#([0-9]*)$/;
				    const n = tokens.length;

				    for (let i = 0; i < n; i++) {
				        const toki = tokens[i];
				        if (!numToken_re.test(toki)) {
				            // Non-numeric token or delimiter: copy it through unchanged.
				            outputString += toki;
				            continue;
				        }

				        /*
				         * A numeric token starts a run. The inner loop consumes it and all
				         * consecutive numeric tokens (with the delimiters between them),
				         * building the text and numeral renderings side by side.
				         */
				        let textVersion = "";
				        let numVersion = "";
				        for (let j = i; j < n; j++) {
				            const tokj = tokens[j];
				            const m = tokj.match(numToken_re);
				            const isDelimiter = j % 2 !== 0;
				            const isLast = j === n - 1;

				            if (!isLast && (isDelimiter || m)) {
				                // Still inside the run: append this token and keep going.
				                if (m && !isDelimiter) {
				                    textVersion += m[1];
				                    numVersion += m[2] === "" ? m[1] : m[2];
				                } else {
				                    textVersion += tokj;
				                    numVersion += tokj;
				                }
				                continue;
				            }

				            // The run ends here: on a non-numeric token or at end of string.
				            if (m) {
				                // Last token of the string and it is numeric.
				                textVersion += m[1];
				                numVersion += m[2] === "" ? m[1] : m[2];
				            } else if (isLast) {
				                // Last token of the string and it is not numeric: tack it on.
				                textVersion += tokj;
				                numVersion += tokj;
				            } else if (textVersion.length > 0 && numVersion.length > 0) {
				                // Drop the delimiter appended last; the outer loop resumes
				                // on that delimiter and emits it.
				                textVersion = textVersion.slice(0, -1);
				                numVersion = numVersion.slice(0, -1);
				            }

				            // Use the numeral rendering for ordinals and date strings.
				            // NOTE(review): the last pattern starts with a literal double
				            // quote, which looks like a typo; it is kept as-is. Confirm.
				            if (useNumVersion ||
				                /^di [0-9]/i.test(numVersion) ||
				                /[0-9] [0-9] [0-9] [0-9]/.test(numVersion) ||
				                /[0-9]+ nian [0-9]+ yue/i.test(numVersion) ||
				                /"[0-9]+ yue [0-9]+ ri/i.test(numVersion)) {
				                useNumVersion = true;

				                /*
				                 * The string may contain literal translations of Chinese
				                 * numerals; fold them into Arabic numerals (for example
				                 * "2 10 7" = "27").
				                 */
				                while (/[0-9] 10+/.test(numVersion) || /[1-9]0+ [1-9]/.test(numVersion)) {
				                    const product = numVersion.match(/([0-9]+) ([1-9]0+)/);
				                    // Multiply only when the left operand does not exceed the
				                    // unit on its right ("3 100" -> 300). A larger left
				                    // operand ("300 50") is a sum and is handled below;
				                    // multiplying it would yield 15000 instead of 350.
				                    if (product && Number(product[1]) <= Number(product[2])) {
				                        const value = Number(product[1]) * Number(product[2]);
				                        numVersion = numVersion.replace(product[0], String(value));
				                        continue;
				                    }
				                    const sum = numVersion.match(/([1-9]0+) ([0-9]+)/);
				                    if (!sum) {
				                        break;
				                    }
				                    numVersion = numVersion.replace(sum[0], String(Number(sum[1]) + Number(sum[2])));
				                }

				                // Join four space-separated digits into one number.
				                numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
				                if (forceNumeric) {
				                    // Remove every space that sits between two digits. The
				                    // lookahead leaves the right-hand digit unconsumed so
				                    // chains such as "1 2 3" collapse in a single pass.
				                    numVersion = numVersion.replace(/([0-9]) (?=[0-9])/g, "$1");
				                }
				            }

				            outputString += useNumVersion ? numVersion : textVersion;

				            // At end of string we are done. Otherwise step back so that the
				            // outer loop's increment lands on the delimiter that precedes
				            // the non-numeric token.
				            i = isLast ? j : j - 2;
				            break;
				        }
				    }
				    return outputString;
				}