123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- __doc__ = """Chinese hooks."""
- from logging import getLogger
- from re import I, compile, search, sub
- from scriptshifter.hooks.general import normalize_spacing_post_assembly
- logger = getLogger(__name__)
- def parse_numerals_pre_assembly(ctx):
- """
- Parse Chinese numerals in the already romanized result.
- """
- # Only apply to specific MARC fields.
- use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
- # tokens = split(r"[\W^#]", ctx.dest) # Original logic.
- tk_ct = len(ctx.dest_ls)
- token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
- output = ""
- # Use manual loop as i is manipulated inside it.
- i = 0
- while i < tk_ct:
- tk_i = ctx.dest_ls[i]
- if search(token_ptn, tk_i):
- # When a numerical token (containing #) is reached, the inner loop
- # consumes it and all consecutive numerical tokens found after it.
- # Two versions of the string are maintained. The textVersion is
- # the original pinyin (minus the # suffixes). In the numVersion,
- # characters representing numbers are converted to Arabic
- # numerals. When a non-numerical token (or end of string) is
- # encountered, the string of numerical tokens is evaluated to
- # determine which version should be used in the output string.
- # The outer loop then continues where the inner loop left off.
- logger.debug(f"Match number: {tk_i}.")
- text_v = num_v = ""
- for j in range(i, tk_ct):
- tk_j = ctx.dest_ls[j]
- m = search(token_ptn, tk_j)
- # if m:
- # logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
- # a token without # (or the end of string) is reached
- if not m or j == tk_ct - 1:
- logger.debug(f"Next token is not numeric: {tk_j}")
- # If this runs, then we are on the last token and it is
- # numeric. Add text after # (if present) to numerical
- # version and captured whitespace after the number.
- if m:
- text_v += m[1] + m[3]
- num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
- # Append white space.
- num_v += " "
- elif j == tk_ct - 1:
- # if last token is non-numerical, just tack it on.
- logger.debug(f"Last token is non-numerical: {tk_j}")
- text_v += tk_j
- num_v += tk_j
- # evaluate numerical string that has been constructed so
- # far. Use num version for ordinals and date strings
- if (
- search("^di [0-9]", num_v, flags=I) or
- search("[0-9] [0-9] [0-9] [0-9]", num_v) or
- search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
- search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
- ):
- use_num_v = True
- # At this point, string may contain literal
- # translations of Chinese numerals Convert these to
- # Arabic numerals (for example "2 10 7" = "27").
- mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
- sum_ptn = compile("([1-9]0+) ([0-9]+)")
- while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
- logger.debug(f"Match number combination: {_m}")
- if m := mult_ptn.search(num_v):
- logger.debug(f"Multiply: {m[1]}, {m[2]}")
- parsed = int(m[1]) * int(m[2])
- num_v = mult_ptn.sub(str(parsed), num_v, 1)
- elif m := sum_ptn.search(num_v):
- logger.debug(f"Add: {m[1]}, {m[2]}")
- parsed = int(m[1]) + int(m[2])
- num_v = sum_ptn.sub(str(parsed), num_v, 1)
- else:
- break
- # A few other tweaks
- num_v = sub(
- "([0-9]) ([0-9]) ([0-9]) ([0-9])",
- r"\1\2\3\4", num_v)
- if ctx.options.get("marc_field") in ("245", "830"):
- # TODO optimize without loop.
- while search("[0-9] [0-9]", num_v):
- num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
- output += num_v if use_num_v else text_v
- # if the end of the string is not reached, backtrack to the
- # delimiter after the last numerical token (i.e. two tokens
- # ago).
- #
- # Else, we are at the end of the string, so we are done!
- i = j - 1 if j < tk_ct - 1 else j
- break
- # this is run when we are not yet at the end of the string and
- # have not yet reached a non-numerical token. This is identical
- # to the code that is run above when the last token is numeric,
- # except that whitespace after the token is stripped.
- m = search(token_ptn, tk_j)
- text_v += m[1] + " "
- num_v += m[2] if len(m[2]) else m[1]
- num_v += " "
- else:
- logger.debug(f"No numeric match: adding {tk_i}.")
- output += tk_i
- i += 1
- logger.debug(f"Use num version: {use_num_v}")
- ctx.dest = output
- # Skip main transliterate function joining.
- return normalize_spacing_post_assembly(ctx)
- def person_name_pre_assembly(ctx):
- """
- Parse a personal name from a specific MARC field.
- """
- if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
- return
- ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
- ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
- if len(ctx.dest_ls) > 2:
- ctx.dest_ls[1] = ctx.dest_ls[1].strip()
- if ctx.dest_ls[2][0] in "aeiou":
- ctx.dest_ls[1] += "'"
- ctx.dest_ls[1] += ctx_ls[2]
- del(ctx_ls[2])
|