trans.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. import logging
  2. from importlib import import_module
  3. from re import Pattern, compile
  4. from scriptshifter.exceptions import BREAK, CONT
  5. from scriptshifter.tables import (
  6. BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
  7. get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
  8. get_lang_ignore, get_lang_map, get_lang_normalize)
# Match runs of two or more whitespace characters (capturing the first one),
# used to collapse them into a single character in the final output.
MULTI_WS_RE = compile(r"(\s){2,}")

# Module-level logger.
logger = logging.getLogger(__name__)
  12. class Context:
  13. """
  14. Context used within the transliteration and passed to hook functions.
  15. Use within a `with` block for proper cleanup.
  16. """
  17. @property
  18. def src(self):
  19. return self._src
  20. @src.setter
  21. def src(self):
  22. raise NotImplementedError("Attribute is read-only.")
  23. @src.deleter
  24. def src(self):
  25. raise NotImplementedError("Attribute is read-only.")
  26. def __init__(self, lang, src, t_dir, options={}):
  27. """
  28. Initialize a context.
  29. Args:
  30. src (str): The original text. Read-only.
  31. t_dir (int): the direction of transliteration.
  32. Either FEAT_R2S or FEAT_S2R.
  33. options (dict): extra options as a dict.
  34. """
  35. self.lang = lang
  36. self._src = src
  37. self.t_dir = t_dir
  38. self.conn = get_connection()
  39. with self.conn as conn:
  40. general = get_lang_general(conn, self.lang)
  41. self.general = general["data"]
  42. self.lang_id = general["id"]
  43. self.options = options
  44. self.hooks = get_lang_hooks(self.conn, self.lang_id, self.t_dir)
  45. self.dest_ls = []
  46. self.warnings = []
  47. def __enter__(self):
  48. return self
  49. def __exit__(self, exc_type, exc_value, traceback):
  50. self.conn.close()
  51. def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
  52. """
  53. Transliterate a single string.
  54. Args:
  55. src (str): Source string.
  56. lang (str): Language name.
  57. t_dir (str): Transliteration direction. Either `s2r` for
  58. script-to-Roman (default) or `r2s` for Roman-to-script.
  59. capitalize: capitalize words: one of `False` (no change - default),
  60. `"first"` (only first letter), or `"all"` (first letter of each
  61. word).
  62. options: extra script-dependent options. Defaults to the empty map.
  63. Keyword args:
  64. r2s (bool): If False (the default), the source is considered to be a
  65. non-latin script in the language and script specified, and the output
  66. the Romanization thereof; if True, the source is considered to be
  67. romanized text to be transliterated into the specified script/language.
  68. Return:
  69. str: The transliterated string.
  70. """
  71. # Map t_dir to constant.
  72. t_dir = FEAT_S2R if t_dir == "s2r" else FEAT_R2S
  73. source_str = "Roman" if t_dir == FEAT_R2S else lang
  74. target_str = lang if t_dir == FEAT_R2S else "Roman"
  75. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  76. src = src.strip()
  77. options["capitalize"] = capitalize
  78. with Context(lang, src, t_dir, options) as ctx:
  79. if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
  80. raise NotImplementedError(
  81. f"Script-to-Roman not yet supported for {lang}."
  82. )
  83. if t_dir == FEAT_R2S and not ctx.general["has_r2s"]:
  84. raise NotImplementedError(
  85. f"Roman-to-script not yet supported for {lang}."
  86. )
  87. # Normalize case before post_config and rule-based normalization.
  88. if not ctx.general["case_sensitive"]:
  89. ctx._src = ctx.src.lower()
  90. # This hook may take over the whole transliteration process or delegate
  91. # it to some external process, and return the output string directly.
  92. if _run_hook("post_config", ctx) == BREAK:
  93. return getattr(ctx, "dest", ""), ctx.warnings
  94. # _normalize_src returns the results of the post_normalize hook.
  95. if _normalize_src(
  96. ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
  97. return getattr(ctx, "dest", ""), ctx.warnings
  98. logger.debug(f"Normalized source: {ctx.src}")
  99. lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
  100. # Loop through source characters. The increment of each loop depends on
  101. # the length of the token that eventually matches.
  102. ctx.cur = 0
  103. while ctx.cur < len(ctx.src):
  104. # Reset cursor position flags.
  105. # Carry over extended "beginning of word" flag.
  106. ctx.cur_flags = 0
  107. cur_char = ctx.src[ctx.cur]
  108. # Look for a word boundary and flag word beginning/end it if found.
  109. if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
  110. # Beginning of word.
  111. logger.debug(f"Beginning of word at position {ctx.cur}.")
  112. ctx.cur_flags |= BOW
  113. if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
  114. # End of word.
  115. logger.debug(f"End of word at position {ctx.cur}.")
  116. ctx.cur_flags |= EOW
  117. # This hook may skip the parsing of the current
  118. # token or exit the scanning loop altogether.
  119. hret = _run_hook("begin_input_token", ctx)
  120. if hret == BREAK:
  121. logger.debug("Breaking text scanning from hook signal.")
  122. break
  123. if hret == CONT:
  124. logger.debug("Skipping scanning iteration from hook signal.")
  125. continue
  126. # Check ignore list. Find as many subsequent ignore tokens
  127. # as possible before moving on to looking for match tokens.
  128. ctx.tk = None
  129. while True:
  130. ctx.ignoring = False
  131. for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
  132. hret = _run_hook("pre_ignore_token", ctx)
  133. if hret == BREAK:
  134. break
  135. if hret == CONT:
  136. continue
  137. _matching = False
  138. if type(ctx.tk) is Pattern:
  139. # Seach RE pattern beginning at cursor.
  140. if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]):
  141. ctx.tk = _ptn_match[0]
  142. logger.debug(f"Matched regex: {ctx.tk}")
  143. step = len(ctx.tk)
  144. _matching = True
  145. else:
  146. # Search exact match.
  147. step = len(ctx.tk)
  148. if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
  149. _matching = True
  150. if _matching:
  151. # The position matches an ignore token.
  152. hret = _run_hook("on_ignore_match", ctx)
  153. if hret == BREAK:
  154. break
  155. if hret == CONT:
  156. continue
  157. logger.info(f"Ignored token: {ctx.tk}")
  158. ctx.dest_ls.append(ctx.tk)
  159. ctx.cur += step
  160. if ctx.cur >= len(ctx.src):
  161. # reached end of string. Stop ignoring.
  162. # The outer loop will exit imediately after.
  163. ctx.ignoring = False
  164. break
  165. cur_char = ctx.src[ctx.cur]
  166. ctx.ignoring = True
  167. break
  168. # We looked through all ignore tokens, not found any. Move on.
  169. if not ctx.ignoring:
  170. break
  171. # Otherwise, if we found a match, check if the next position
  172. # may be ignored as well.
  173. delattr(ctx, "tk")
  174. delattr(ctx, "ignoring")
  175. if ctx.cur >= len(ctx.src):
  176. break
  177. # Begin transliteration token lookup.
  178. ctx.match = False
  179. for ctx.src_tk, ctx.dest_str in lang_map:
  180. hret = _run_hook("pre_tx_token", ctx)
  181. if hret == BREAK:
  182. break
  183. if hret == CONT:
  184. continue
  185. step = len(ctx.src_tk.content)
  186. # If the token is longer than the remaining of the string,
  187. # it surely won't match.
  188. if ctx.cur + step > len(ctx.src):
  189. continue
  190. # If the first character of the token is greater (= higher code
  191. # point value) than the current character, then break the loop
  192. # without a match, because we know there won't be any more
  193. # match due to the alphabetical ordering.
  194. if ctx.src_tk.content[0] > cur_char:
  195. logger.debug(
  196. f"{ctx.src_tk.content} is after "
  197. f"{ctx.src[ctx.cur:ctx.cur + step]}. "
  198. "Breaking loop.")
  199. break
  200. # If src_tk has a WB flag but the token is not at WB, skip.
  201. if (
  202. (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
  203. or
  204. # Can't rely on EOW flag, we must check on the last
  205. # character of the potential match.
  206. (ctx.src_tk.flags & EOW and not _is_eow(
  207. ctx.cur + step - 1, ctx, WORD_BOUNDARY))
  208. ):
  209. continue
  210. # Longer tokens should be guaranteed to be scanned before their
  211. # substrings at this point.
  212. # Similarly, flagged tokens are evaluated first.
  213. if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
  214. ctx.match = True
  215. # This hook may skip this token or break out of the token
  216. # lookup for the current position.
  217. hret = _run_hook("on_tx_token_match", ctx)
  218. if hret == BREAK:
  219. break
  220. if hret == CONT:
  221. continue
  222. # A match is found. Stop scanning tokens, append result,
  223. # and proceed scanning the source.
  224. # Capitalization.
  225. if (
  226. (ctx.options["capitalize"] == "first" and ctx.cur == 0)
  227. or
  228. (
  229. ctx.options["capitalize"] == "all"
  230. and ctx.cur_flags & BOW
  231. )
  232. ):
  233. logger.info("Capitalizing token.")
  234. double_cap = False
  235. for dcap_rule in get_lang_dcap(ctx.conn, ctx.lang_id):
  236. if ctx.dest_str == dcap_rule:
  237. ctx.dest_str = ctx.dest_str.upper()
  238. double_cap = True
  239. break
  240. if not double_cap:
  241. ctx.dest_str = (
  242. ctx.dest_str[0].upper() + ctx.dest_str[1:])
  243. ctx.dest_ls.append(ctx.dest_str)
  244. ctx.cur += step
  245. break
  246. if ctx.match is False:
  247. delattr(ctx, "match")
  248. hret = _run_hook("on_no_tx_token_match", ctx)
  249. if hret == BREAK:
  250. break
  251. if hret == CONT:
  252. continue
  253. # No match found. Copy non-mapped character (one at a time).
  254. logger.info(
  255. f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
  256. f"at position {ctx.cur} is not mapped.")
  257. ctx.dest_ls.append(cur_char)
  258. ctx.cur += 1
  259. else:
  260. delattr(ctx, "match")
  261. delattr(ctx, "cur_flags")
  262. delattr(ctx, "cur")
  263. # This hook may take care of the assembly and cause the function to
  264. # return its own return value.
  265. hret = _run_hook("pre_assembly", ctx)
  266. if hret is not None:
  267. return hret, ctx.warnings
  268. logger.debug(f"Output list: {ctx.dest_ls}")
  269. ctx.dest = "".join(ctx.dest_ls)
  270. # This hook may reassign the output string and/or cause the function to
  271. # return it immediately.
  272. hret = _run_hook("post_assembly", ctx)
  273. if hret is not None:
  274. return hret, ctx.warnings
  275. # Strip multiple spaces and leading/trailing whitespace.
  276. ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
  277. return ctx.dest, ctx.warnings
  278. def _normalize_src(ctx, norm_rules):
  279. """
  280. Normalize source text according to rules.
  281. NOTE: this manipluates the protected source attribute so it may not
  282. correspond to the originally provided source.
  283. """
  284. for nk, nv in norm_rules.items():
  285. ctx._src = ctx.src.replace(nk, nv)
  286. return _run_hook("post_normalize", ctx)
  287. def _is_bow(cur, ctx, word_boundary):
  288. return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
  289. ctx.src[cur] not in word_boundary)
  290. def _is_eow(cur, ctx, word_boundary):
  291. return (
  292. cur == len(ctx.src) - 1
  293. or ctx.src[cur + 1] in word_boundary
  294. ) and (ctx.src[cur] not in word_boundary)
  295. def _run_hook(hname, ctx):
  296. ret = None
  297. for hook_def in ctx.hooks.get(hname, []):
  298. fn = getattr(
  299. import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
  300. hook_def["fn_name"])
  301. ret = fn(ctx, **hook_def["kwargs"])
  302. if ret in (BREAK, CONT):
  303. # This will stop parsing hooks functions and tell the caller to
  304. # break out of the outer loop or skip iteration.
  305. return ret
  306. return ret