# scriptshifter/hooks/chinese/__init__.py — Chinese romanization hooks.
__doc__ = """Chinese hooks."""

from logging import getLogger
from re import I, compile, search, sub

# Project-local helper that collapses spacing in the assembled output.
from scriptshifter.hooks.general import normalize_spacing_post_assembly

logger = getLogger(__name__)
def parse_numerals_pre_assembly(ctx):
    """
    Parse Chinese numerals in the already romanized result.

    Numeric tokens carry a ``#`` suffix optionally followed by an Arabic
    digit equivalent and trailing whitespace (see ``token_ptn``). Runs of
    consecutive numeric tokens are collected into two parallel renderings:
    the plain pinyin text and an Arabic-numeral version. Which rendering is
    emitted depends on the "marc_field" option and on the shape of the run
    (ordinals like "di N", 4-digit sequences, and date strings force the
    numeric version).

    :param ctx: transliteration context; reads ``ctx.options`` (dict-like)
        and ``ctx.dest_ls`` (list of romanized tokens), writes the joined
        result to ``ctx.dest``. NOTE(review): the exact ctx type is defined
        elsewhere in the project — confirm attribute contract there.
    :return: result of ``normalize_spacing_post_assembly(ctx)``.
    """
    # Only apply to specific MARC fields.
    use_num_v = ctx.options.get("marc_field") in ("245n", "830n")
    # tokens = split(r"[\W^#]", ctx.dest) # Original logic.
    tk_ct = len(ctx.dest_ls)
    # Numeric token: pinyin letters, "#", optional Arabic digits, optional
    # trailing whitespace (captured so spacing can be preserved).
    token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")
    output = ""
    # Use manual loop as i is manipulated inside it.
    i = 0
    while i < tk_ct:
        tk_i = ctx.dest_ls[i]
        if search(token_ptn, tk_i):
            # When a numerical token (containing #) is reached, the inner loop
            # consumes it and all consecutive numerical tokens found after it.
            # Two versions of the string are maintained. The textVersion is
            # the original pinyin (minus the # suffixes). In the numVersion,
            # characters representing numbers are converted to Arabic
            # numerals. When a non-numerical token (or end of string) is
            # encountered, the string of numerical tokens is evaluated to
            # determine which version should be used in the output string.
            # The outer loop then continues where the inner loop left off.
            logger.debug(f"Match number: {tk_i}.")
            text_v = num_v = ""
            for j in range(i, tk_ct):
                tk_j = ctx.dest_ls[j]
                m = search(token_ptn, tk_j)
                # if m:
                #     logger.debug(f"m[1]: {m[1]} - m[2]: {m[2]}")
                # a token without # (or the end of string) is reached
                if not m or j == tk_ct - 1:
                    logger.debug(f"Next token is not numeric: {tk_j}")
                    # If this runs, then we are on the last token and it is
                    # numeric. Add text after # (if present) to numerical
                    # version and captured whitespace after the number.
                    if m:
                        text_v += m[1] + m[3]
                        num_v += m[2] + m[3] if len(m[2]) else m[1] + m[3]
                        # Append white space.
                        num_v += " "
                    elif j == tk_ct - 1:
                        # if last token is non-numerical, just tack it on.
                        logger.debug(f"Last token is non-numerical: {tk_j}")
                        text_v += tk_j
                        num_v += tk_j
                    # evaluate numerical string that has been constructed so
                    # far. Use num version for ordinals and date strings
                    if (
                            search("^di [0-9]", num_v, flags=I) or
                            search("[0-9] [0-9] [0-9] [0-9]", num_v) or
                            search("[0-9]+ nian [0-9]+ yue", num_v, flags=I) or
                            search("[0-9]+ yue [0-9]+ ri", num_v, flags=I)
                    ):
                        use_num_v = True
                    # At this point, string may contain literal
                    # translations of Chinese numerals Convert these to
                    # Arabic numerals (for example "2 10 7" = "27").
                    mult_ptn = compile(r"(\b[0-9]) ([1-9]0+)")
                    sum_ptn = compile("([1-9]0+) ([0-9]+)")
                    # Repeatedly collapse "digit × power-of-ten" and
                    # "power-of-ten + digit" pairs until no combination is
                    # left (e.g. "2 10 7" -> "20 7" -> "27").
                    while _m := search("[0-9] 10+|[1-9]0+ [1-9]", num_v):
                        logger.debug(f"Match number combination: {_m}")
                        if m := mult_ptn.search(num_v):
                            logger.debug(f"Multiply: {m[1]}, {m[2]}")
                            parsed = int(m[1]) * int(m[2])
                            num_v = mult_ptn.sub(str(parsed), num_v, 1)
                        elif m := sum_ptn.search(num_v):
                            logger.debug(f"Add: {m[1]}, {m[2]}")
                            parsed = int(m[1]) + int(m[2])
                            num_v = sum_ptn.sub(str(parsed), num_v, 1)
                        else:
                            break
                    # A few other tweaks
                    num_v = sub(
                            "([0-9]) ([0-9]) ([0-9]) ([0-9])",
                            r"\1\2\3\4", num_v)
                    if ctx.options.get("marc_field") in ("245", "830"):
                        # TODO optimize without loop.
                        while search("[0-9] [0-9]", num_v):
                            num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)
                    output += num_v if use_num_v else text_v
                    # if the end of the string is not reached, backtrack to the
                    # delimiter after the last numerical token (i.e. two tokens
                    # ago).
                    #
                    # Else, we are at the end of the string, so we are done!
                    i = j - 1 if j < tk_ct - 1 else j
                    break
                # this is run when we are not yet at the end of the string and
                # have not yet reached a non-numerical token. This is identical
                # to the code that is run above when the last token is numeric,
                # except that whitespace after the token is stripped.
                m = search(token_ptn, tk_j)
                text_v += m[1] + " "
                num_v += m[2] if len(m[2]) else m[1]
                num_v += " "
        else:
            logger.debug(f"No numeric match: adding {tk_i}.")
            output += tk_i
        i += 1
    logger.debug(f"Use num version: {use_num_v}")
    ctx.dest = output
    # Skip main transliterate function joining.
    return normalize_spacing_post_assembly(ctx)
  111. def person_name_pre_assembly(ctx):
  112. """
  113. Parse a personal name from a specific MARC field.
  114. """
  115. if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
  116. return
  117. ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
  118. ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
  119. if len(ctx.dest_ls) > 2:
  120. ctx.dest_ls[1] = ctx.dest_ls[1].strip()
  121. if ctx.dest_ls[2][0] in "aeiou":
  122. ctx.dest_ls[1] += "'"
  123. ctx.dest_ls[1] += ctx_ls[2]
  124. del(ctx_ls[2])