trans.py

import logging

from re import compile

from scriptshifter.exceptions import BREAK, CONT
from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table


# Match multiple spaces.
MULTI_WS_RE = compile(r"(\s){2,}")

logger = logging.getLogger(__name__)


class Context:
    """
    Context used within the transliteration and passed to hook functions.
    """
    @property
    def src(self):
        return self._src

    @src.setter
    def src(self, value):
        # A setter must accept the assigned value, even though it only raises.
        raise NotImplementedError("Attribute is read-only.")

    @src.deleter
    def src(self):
        raise NotImplementedError("Attribute is read-only.")

    def __init__(self, src, general, langsec, options={}):
        """
        Initialize a context.

        Args:
            src (str): The original text. Read-only.
            general (dict): General section of the current config.
            langsec (dict): Language configuration section being used.
            options (dict): Extra options as a dict.
        """
        self._src = src
        self.general = general
        self.options = options
        self.langsec = langsec
        self.dest_ls = []
        self.warnings = []
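

# A minimal sketch of a hook function, based only on how _run_hook (defined
# below) invokes hooks: each hook is called as fn(ctx, **kwargs) and may
# return BREAK or CONT to alter the calling loop. The name and keyword
# argument here are illustrative, not part of the library:
#
#   def trace_position(ctx, label="token"):
#       logger.debug(f"{label}: cursor at position {ctx.cur}")
#       return None  # Returning BREAK or CONT short-circuits the caller.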


def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        lang (str): Language name.
        t_dir (str): Transliteration direction. Either `s2r` for
            script-to-Roman (default), in which case the source is considered
            to be non-Latin script in the specified language and the output
            its Romanization; or `r2s` for Roman-to-script, in which case the
            source is considered to be Romanized text to be transliterated
            into the specified script/language.
        capitalize: Capitalize words: one of `False` (no change, default),
            `"first"` (only first letter), or `"all"` (first letter of each
            word).
        options (dict): Extra script-dependent options. Defaults to the
            empty map.

    Return:
        tuple(str, list): The transliterated string and a list of warnings.
    """
    source_str = "Latin" if t_dir == "r2s" else lang
    target_str = lang if t_dir == "r2s" else "Latin"
    logger.info(f"Transliteration is from {source_str} to {target_str}.")

    cfg = load_table(lang)
    logger.info(f"Loaded table for {lang}.")

    # General directives.
    general = cfg.get("general", {})

    if t_dir == "s2r" and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif t_dir == "r2s" and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = (
            cfg["script_to_roman"] if t_dir == "s2r"
            else cfg["roman_to_script"])
    # langsec_dir = langsec.get("directives", {})
    langsec_hooks = langsec.get("hooks", {})

    src = src.strip()
    # Copy before adding keys, so that neither the caller's dict nor the
    # shared default argument is mutated across calls.
    options = dict(options, capitalize=capitalize)
    ctx = Context(src, general, langsec, options)

    # This hook may take over the whole transliteration process or delegate it
    # to some external process, and return the output string directly.
    if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
        return getattr(ctx, "dest", ""), ctx.warnings

    if "normalize" in ctx.langsec:
        _normalize_src(ctx)

    if _run_hook("post_normalize", ctx, langsec_hooks) == BREAK:
        return getattr(ctx, "dest", ""), ctx.warnings

    # Loop through source characters. The increment of each loop depends on
    # the length of the token that eventually matches.
    ignore_list = langsec.get("ignore", [])  # Only present in R2S
    ctx.cur = 0
    word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)

    while ctx.cur < len(ctx.src):
        # Reset cursor position flags.
        # Carry over extended "beginning of word" flag.
        ctx.cur_flags = 0
        cur_char = ctx.src[ctx.cur]

        # Look for a word boundary and flag word beginning/end if found.
        if _is_bow(ctx.cur, ctx, word_boundary):
            # Beginning of word.
            logger.debug(f"Beginning of word at position {ctx.cur}.")
            ctx.cur_flags |= BOW
        if _is_eow(ctx.cur, ctx, word_boundary):
            # End of word.
            logger.debug(f"End of word at position {ctx.cur}.")
            ctx.cur_flags |= EOW

        # This hook may skip the parsing of the current
        # token or exit the scanning loop altogether.
        hret = _run_hook("begin_input_token", ctx, langsec_hooks)
        if hret == BREAK:
            logger.debug("Breaking text scanning from hook signal.")
            break
        if hret == CONT:
            logger.debug("Skipping scanning iteration from hook signal.")
            continue

        # Check ignore list. Find as many subsequent ignore tokens
        # as possible before moving on to looking for match tokens.
        ctx.tk = None
        while True:
            ctx.ignoring = False
            for ctx.tk in ignore_list:
                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                step = len(ctx.tk)
                if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
                    # The position matches an ignore token.
                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
                    if hret == BREAK:
                        break
                    if hret == CONT:
                        continue

                    logger.info(f"Ignored token: {ctx.tk}")
                    ctx.dest_ls.append(ctx.tk)
                    ctx.cur += step
                    # Guard against the ignore token ending the string.
                    if ctx.cur < len(ctx.src):
                        cur_char = ctx.src[ctx.cur]
                    ctx.ignoring = True
                    break
            # We looked through all ignore tokens and found no match. Move on.
            if not ctx.ignoring:
                break
            # Otherwise, if we found a match, check if the next position may
            # be ignored as well.

        delattr(ctx, "tk")
        delattr(ctx, "ignoring")

        # Ignore tokens may have consumed the rest of the string.
        if ctx.cur >= len(ctx.src):
            break

        # Begin transliteration token lookup.
        ctx.match = False

        for ctx.src_tk, ctx.dest_str in langsec["map"]:
            hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
            if hret == BREAK:
                break
            if hret == CONT:
                continue

            step = len(ctx.src_tk.content)
            # If the token is longer than the remainder of the string,
            # it surely won't match.
            if ctx.cur + step > len(ctx.src):
                continue

            # If the first character of the token is greater (= higher code
            # point value) than the current character, then break the loop
            # without a match, because we know there won't be any more match
            # due to the alphabetical ordering.
            if ctx.src_tk.content[0] > cur_char:
                logger.debug(
                        f"{ctx.src_tk.content} is after "
                        f"{ctx.src[ctx.cur:ctx.cur + step]}. Breaking loop.")
                break

            # If src_tk has a WB flag but the token is not at WB, skip.
            if (
                (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
                or
                # Can't rely on the EOW flag, we must check on the last
                # character of the potential match.
                (ctx.src_tk.flags & EOW and not _is_eow(
                    ctx.cur + step - 1, ctx, word_boundary))
            ):
                continue

            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            # Similarly, flagged tokens are evaluated first.
            if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
                ctx.match = True
                # This hook may skip this token or break out of the token
                # lookup for the current position.
                hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.

                # Capitalization.
                if (
                    (ctx.options["capitalize"] == "first" and ctx.cur == 0)
                    or
                    (
                        ctx.options["capitalize"] == "all"
                        and ctx.cur_flags & BOW
                    )
                ):
                    logger.info("Capitalizing token.")
                    double_cap = False
                    for dcap_rule in ctx.langsec.get("double_cap", []):
                        if ctx.dest_str == dcap_rule:
                            ctx.dest_str = ctx.dest_str.upper()
                            double_cap = True
                            break
                    if not double_cap:
                        ctx.dest_str = (
                                ctx.dest_str[0].upper() + ctx.dest_str[1:])

                ctx.dest_ls.append(ctx.dest_str)
                ctx.cur += step
                break

        if ctx.match is False:
            delattr(ctx, "match")
            hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
            if hret == BREAK:
                break
            if hret == CONT:
                continue

            # No match found. Copy non-mapped character (one at a time).
            logger.info(
                    f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
                    f"at position {ctx.cur} is not mapped.")
            ctx.dest_ls.append(cur_char)
            ctx.cur += 1
        else:
            delattr(ctx, "match")
        delattr(ctx, "cur_flags")

    delattr(ctx, "cur")

    # This hook may take care of the assembly and cause the function to return
    # its own return value.
    hret = _run_hook("pre_assembly", ctx, langsec_hooks)
    if hret is not None:
        return hret, ctx.warnings

    logger.debug(f"Output list: {ctx.dest_ls}")
    ctx.dest = "".join(ctx.dest_ls)

    # This hook may reassign the output string and/or cause the function to
    # return it immediately.
    hret = _run_hook("post_assembly", ctx, langsec_hooks)
    if hret is not None:
        return hret, ctx.warnings

    # Strip multiple spaces and leading/trailing whitespace.
    ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())

    return ctx.dest, ctx.warnings


def _normalize_src(ctx):
    for nk, nv in ctx.langsec.get("normalize", {}).items():
        ctx._src = ctx.src.replace(nk, nv)
    logger.debug(f"Normalized source: {ctx.src}")
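

# Illustrative shape of the "normalize" mapping consumed above; the key and
# value here are assumptions for demonstration, not taken from any actual
# language table. Each key found in the source is replaced with its value
# before the scanning loop runs:
#
#   langsec["normalize"] = {
#       "\u00e9": "e\u0301",  # fold precomposed é into e + combining acute
#   }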


def _is_bow(cur, ctx, word_boundary):
    return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
            ctx.src[cur] not in word_boundary)


def _is_eow(cur, ctx, word_boundary):
    return (
        cur == len(ctx.src) - 1
        or ctx.src[cur + 1] in word_boundary
    ) and (ctx.src[cur] not in word_boundary)


def _run_hook(hname, ctx, hooks):
    ret = None
    for hook_def in hooks.get(hname, []):
        kwargs = hook_def[1] if len(hook_def) > 1 else {}
        ret = hook_def[0](ctx, **kwargs)
        if ret in (BREAK, CONT):
            # This will stop parsing hook functions and tell the caller to
            # break out of the outer loop or skip the iteration.
            return ret

    return ret
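

# A minimal usage sketch, assuming a language table named "example" can be
# resolved by load_table(); the table name and input string are placeholders,
# not part of the library. transliterate() returns the output string along
# with a list of warnings, as shown above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    dest, warnings = transliterate("Пример", "example", t_dir="s2r")
    print(dest)
    for w in warnings:
        print(f"Warning: {w}")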