# trans.py — core transliteration engine.
  1. import logging
  2. import re
  3. from scriptshifter.exceptions import BREAK, CONT
  4. from scriptshifter.tables import WORD_BOUNDARY, load_table
  5. # Match multiple spaces.
  6. MULTI_WS_RE = re.compile(r"\s{2,}")
  7. # Cursor bitwise flags.
  8. CUR_BOW = 1 << 0
  9. CUR_EOW = 1 << 1
  10. logger = logging.getLogger(__name__)
  11. class Context:
  12. """
  13. Context used within the transliteration and passed to hook functions.
  14. """
  15. def __init__(self, src, general, langsec, options={}):
  16. """
  17. Initialize a context.
  18. Args:
  19. src (str): The original text. This is meant to never change.
  20. general (dict): general section of the current config.
  21. langsec (dict): Language configuration section being used.
  22. options (dict): extra options as a dict.
  23. """
  24. self.src = src
  25. self.general = general
  26. self.options = options
  27. self.langsec = langsec
  28. self.dest_ls = []
  29. def transliterate(src, lang, t_dir="s2r", options={}, capitalize=False):
  30. """
  31. Transliterate a single string.
  32. Args:
  33. src (str): Source string.
  34. lang (str): Language name.
  35. t_dir (str): Transliteration direction. Either `s2r` for
  36. script-to-Roman (default) or `r2s` for Roman-to-script.
  37. capitalize: capitalize words: one of `False` (no change - default),
  38. `"first"` (only first letter), or `"all"` (first letter of each
  39. word).
  40. options: extra script-dependent options. Defaults to the empty map.
  41. Keyword args:
  42. r2s (bool): If False (the default), the source is considered to be a
  43. non-latin script in the language and script specified, and the output
  44. the Romanization thereof; if True, the source is considered to be
  45. romanized text to be transliterated into the specified script/language.
  46. Return:
  47. str: The transliterated string.
  48. """
  49. source_str = "Latin" if t_dir == "r2s" else lang
  50. target_str = lang if t_dir == "r2s" else "Latin"
  51. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  52. cfg = load_table(lang)
  53. logger.info(f"Loaded table for {lang}.")
  54. # General directives.
  55. general = cfg.get("general", {})
  56. if t_dir == "s2r" and "script_to_roman" not in cfg:
  57. raise NotImplementedError(
  58. f"Script-to-Roman transliteration not yet supported for {lang}."
  59. )
  60. elif t_dir == "r2s" and "roman_to_script" not in cfg:
  61. raise NotImplementedError(
  62. f"Roman-to-script transliteration not yet supported for {lang}."
  63. )
  64. langsec = (
  65. cfg["script_to_roman"] if t_dir == "s2r"
  66. else cfg["roman_to_script"])
  67. # langsec_dir = langsec.get("directives", {})
  68. langsec_hooks = langsec.get("hooks", {})
  69. src = src.strip()
  70. options["capitalize"] = capitalize
  71. ctx = Context(src, general, langsec, options)
  72. # This hook may take over the whole transliteration process or delegate it
  73. # to some external process, and return the output string directly.
  74. if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
  75. return getattr(ctx, "dest", ""), getattr(ctx, "warnings", [])
  76. # Loop through source characters. The increment of each loop depends on
  77. # the length of the token that eventually matches.
  78. ignore_list = langsec.get("ignore", []) # Only present in R2S
  79. ctx.cur = 0
  80. word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
  81. while ctx.cur < len(src):
  82. # Reset cursor position flags.
  83. # Carry over extended "beginning of word" flag.
  84. ctx.cur_flags = 0
  85. cur_char = src[ctx.cur]
  86. # Look for a word boundary and flag word beginning/end it if found.
  87. if (ctx.cur == 0 or src[ctx.cur - 1] in word_boundary) and (
  88. cur_char not in word_boundary):
  89. # Beginning of word.
  90. logger.debug(f"Beginning of word at position {ctx.cur}.")
  91. ctx.cur_flags |= CUR_BOW
  92. if (
  93. ctx.cur == len(src) - 1
  94. or src[ctx.cur + 1] in word_boundary
  95. ) and (cur_char not in word_boundary):
  96. # Beginning of word.
  97. # End of word.
  98. logger.debug(f"End of word at position {ctx.cur}.")
  99. ctx.cur_flags |= CUR_EOW
  100. # This hook may skip the parsing of the current
  101. # token or exit the scanning loop altogether.
  102. hret = _run_hook("begin_input_token", ctx, langsec_hooks)
  103. if hret == BREAK:
  104. logger.debug("Breaking text scanning from hook signal.")
  105. break
  106. if hret == CONT:
  107. logger.debug("Skipping scanning iteration from hook signal.")
  108. continue
  109. # Check ignore list. Find as many subsequent ignore tokens
  110. # as possible before moving on to looking for match tokens.
  111. ctx.tk = None
  112. while True:
  113. ctx.ignoring = False
  114. for ctx.tk in ignore_list:
  115. hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
  116. if hret == BREAK:
  117. break
  118. if hret == CONT:
  119. continue
  120. step = len(ctx.tk)
  121. if ctx.tk == src[ctx.cur:ctx.cur + step]:
  122. # The position matches an ignore token.
  123. hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
  124. if hret == BREAK:
  125. break
  126. if hret == CONT:
  127. continue
  128. logger.info(f"Ignored token: {ctx.tk}")
  129. ctx.dest_ls.append(ctx.tk)
  130. ctx.cur += step
  131. ctx.ignoring = True
  132. break
  133. # We looked through all ignore tokens, not found any. Move on.
  134. if not ctx.ignoring:
  135. break
  136. # Otherwise, if we found a match, check if the next position may be
  137. # ignored as well.
  138. delattr(ctx, "tk")
  139. delattr(ctx, "ignoring")
  140. # Begin transliteration token lookup.
  141. ctx.match = False
  142. for ctx.src_tk, ctx.dest_tk in langsec["map"]:
  143. hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
  144. if hret == BREAK:
  145. break
  146. if hret == CONT:
  147. continue
  148. step = len(ctx.src_tk)
  149. # If the first character of the token is greater (= higher code
  150. # point value) than the current character, then break the loop
  151. # without a match, because we know there won't be any more match
  152. # due to the alphabetical ordering.
  153. if ctx.src_tk[0] > cur_char:
  154. logger.debug(
  155. f"{ctx.src_tk} is after {src[ctx.cur:ctx.cur + step]}."
  156. " Breaking loop.")
  157. break
  158. # Longer tokens should be guaranteed to be scanned before their
  159. # substrings at this point.
  160. if ctx.src_tk == src[ctx.cur:ctx.cur + step]:
  161. ctx.match = True
  162. # This hook may skip this token or break out of the token
  163. # lookup for the current position.
  164. hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
  165. if hret == BREAK:
  166. break
  167. if hret == CONT:
  168. continue
  169. # A match is found. Stop scanning tokens, append result, and
  170. # proceed scanning the source.
  171. # Capitalization.
  172. if (
  173. (ctx.options["capitalize"] == "first" and ctx.cur == 0)
  174. or
  175. (
  176. ctx.options["capitalize"] == "all"
  177. and ctx.cur_flags & CUR_BOW
  178. )
  179. ):
  180. logger.info("Capitalizing token.")
  181. double_cap = False
  182. for dcap_rule in ctx.langsec.get("double_cap", []):
  183. if ctx.dest_tk == dcap_rule:
  184. ctx.dest_tk = ctx.dest_tk.upper()
  185. double_cap = True
  186. break
  187. if not double_cap:
  188. ctx.dest_tk = ctx.dest_tk[0].upper() + ctx.dest_tk[1:]
  189. ctx.dest_ls.append(ctx.dest_tk)
  190. ctx.cur += step
  191. break
  192. if ctx.match is False:
  193. delattr(ctx, "match")
  194. hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
  195. if hret == BREAK:
  196. break
  197. if hret == CONT:
  198. continue
  199. # No match found. Copy non-mapped character (one at a time).
  200. logger.info(
  201. f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
  202. f"at position {ctx.cur} is not mapped.")
  203. ctx.dest_ls.append(cur_char)
  204. ctx.cur += 1
  205. else:
  206. delattr(ctx, "match")
  207. delattr(ctx, "cur_flags")
  208. delattr(ctx, "cur")
  209. # This hook may take care of the assembly and cause the function to return
  210. # its own return value.
  211. hret = _run_hook("pre_assembly", ctx, langsec_hooks)
  212. if hret is not None:
  213. return hret, getattr(ctx, "warnings", [])
  214. logger.debug(f"Output list: {ctx.dest_ls}")
  215. ctx.dest = "".join(ctx.dest_ls)
  216. # This hook may reassign the output string and/or cause the function to
  217. # return it immediately.
  218. hret = _run_hook("post_assembly", ctx, langsec_hooks)
  219. if hret == "ret":
  220. return ctx.dest, getattr(ctx, "warnings", [])
  221. # Strip multiple spaces and leading/trailing whitespace.
  222. ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
  223. return ctx.dest, getattr(ctx, "warnings", [])
  224. def _run_hook(hname, ctx, hooks):
  225. ret = None
  226. for hook_def in hooks.get(hname, []):
  227. kwargs = hook_def[1] if len(hook_def) > 1 else {}
  228. ret = hook_def[0](ctx, **kwargs)
  229. if ret in (BREAK, CONT):
  230. # This will stop parsing hooks functions and tell the caller to
  231. # break out of the outer loop or skip iteration.
  232. return ret
  233. return ret