# trans.py

import logging

from importlib import import_module
from re import Pattern, compile
from unicodedata import normalize as precomp_normalize

from scriptshifter.exceptions import BREAK, CONT
from scriptshifter.hooks.general import normalize_spacing_post_assembly
from scriptshifter.tables import (
        BOW, EOW, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
        get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
        get_lang_ignore, get_lang_map, get_lang_normalize)


logger = logging.getLogger(__name__)

WORD_PTN = compile(r"\w")
WB_PTN = compile(r"\W")
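
# `\w` matches a single word character and `\W` a single non-word character:
# e.g. WORD_PTN.match("a") and WB_PTN.match(" ") both succeed, while
# WORD_PTN.match(" ") does not. These drive the word boundary checks in
# Transliterator.cur_at_bow and cur_at_eow below.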


class Transliterator:
    """
    Context carrying the state of the transliteration process.

    Use within a `with` block for proper cleanup.
    """
    @property
    def orig(self):
        return self._orig

    @orig.setter
    def orig(self, v):
        raise NotImplementedError("Attribute is read-only.")

    @orig.deleter
    def orig(self):
        raise NotImplementedError("Attribute is read-only.")

    @property
    def cur_char(self):
        return self.src[self.cur]

    def __init__(self, lang, src, t_dir, options=None):
        """
        Initialize a context.

        Args:
            lang (str): Language name.
            src (str): The original text. Read-only.
            t_dir (int): The direction of transliteration.
                Either FEAT_R2S or FEAT_S2R.
            options (dict): Extra options as a dict.
        """
        self.lang = lang
        self._orig = src
        self.src = src
        self.t_dir = t_dir
        self.conn = get_connection()

        with self.conn as conn:
            general = get_lang_general(conn, self.lang)
        self.general = general["data"]
        self.lang_id = general["id"]

        # Avoid a shared mutable default argument.
        self.options = options or {}

        self.hooks = get_lang_hooks(self.conn, self.lang_id, self.t_dir)

        self.dest_ls = []
        self.warnings = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.conn.close()

    def run_hook(self, hname):
        ret = None
        for hook_def in self.hooks.get(hname, []):
            fn = getattr(
                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
                hook_def["fn_name"]
            )
            ret = fn(self, **hook_def["kwargs"])
            if ret in (BREAK, CONT):
                # This stops processing hook functions and tells the caller
                # to break out of the outer loop or skip the iteration.
                return ret

        return ret
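
    # A minimal sketch of the hook contract implied by the call above
    # (`fn(self, **hook_def["kwargs"])`): a hook receives this context plus
    # its configured keyword arguments and may return BREAK or CONT to steer
    # the caller's control flow. The name `sample_hook` and its `prefix`
    # kwarg are hypothetical, not part of any shipped hook package:
    #
    #     def sample_hook(ctx, prefix=""):
    #         ctx.dest_ls.append(prefix)
    #         return CONT  # or BREAK, or None to fall through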

    def normalize_src(self):
        """
        Normalize source text according to rules.

        NOTE: this manipulates the protected source attribute so it may not
        correspond to the originally provided source.
        """
        # Normalize precomposed Unicode characters.
        #
        # In using diacritics, LC standards prefer the decomposed form
        # (combining diacritic + base character) to the pre-composed form
        # (single Unicode symbol for the letter with diacritic).
        #
        # Note: only safe for R2S.
        if self.t_dir == FEAT_R2S:
            logger.debug("Normalizing pre-composed symbols.")
            self.src = precomp_normalize("NFD", self.src)

        norm_rules = get_lang_normalize(self.conn, self.lang_id)
        for nk, nv in norm_rules.items():
            self.src = self.src.replace(nk, nv)

        return self.run_hook("post_normalize")
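
    # For example, NFD decomposition turns the precomposed "é" (U+00E9) into
    # "e" followed by the combining acute accent (U+0301):
    #
    #     precomp_normalize("NFD", "\u00e9") == "e\u0301"  # True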

    def cur_at_bow(self, cur=None):
        """
        Check if cursor is at the beginning of a word.

        @param cur(int): Position to check. By default, the current cursor.
        """
        if cur is None:
            cur = self.cur

        # Check `cur`, not `self.cur`, so that positions other than the
        # current cursor can be tested.
        return (
            cur == 0
            or WB_PTN.match(self.src[cur - 1])
        ) and WORD_PTN.match(self.src[cur])

    def cur_at_eow(self, cur=None):
        """
        Check if cursor is at the end of a word.

        @param cur(int): Position to check. By default, the current cursor.
        """
        if cur is None:
            cur = self.cur

        return (
            cur == len(self.src) - 1
            or WB_PTN.match(self.src[cur + 1])
        ) and WORD_PTN.match(self.src[cur])
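
    # Worked example for the two boundary checks: with src = "ab cd",
    # cur_at_bow() is true at positions 0 and 3, cur_at_eow() is true at
    # positions 1 and 4, and both are false on the space at position 2,
    # which is not a word character.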


def transliterate(src, lang, t_dir="s2r", capitalize=False, options=None):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        lang (str): Language name.
        t_dir (str): Transliteration direction. Either `s2r` for
            script-to-Roman (default), in which case the source is taken to
            be text in the specified language and script and the output is
            the Romanization thereof, or `r2s` for Roman-to-script, in which
            case the source is taken to be romanized text to be
            transliterated into the specified script/language.
        capitalize: Capitalize words: one of `False` (no change - default),
            `"first"` (only first letter), or `"all"` (first letter of each
            word).
        options: Extra script-dependent options. Defaults to the empty map.

    Return:
        tuple: The transliterated string and a list of warnings.
    """
    # Map t_dir to constant.
    t_dir = FEAT_S2R if t_dir == "s2r" else FEAT_R2S

    source_str = "Roman" if t_dir == FEAT_R2S else lang
    target_str = lang if t_dir == FEAT_R2S else "Roman"
    logger.info(f"Transliteration is from {source_str} to {target_str}.")

    src = src.strip()

    # Work on a copy so the caller's options dict is not mutated.
    options = dict(options or {})
    options["capitalize"] = capitalize

    with Transliterator(lang, src, t_dir, options) as ctx:
        if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
            raise NotImplementedError(
                f"Script-to-Roman not yet supported for {lang}."
            )
        if t_dir == FEAT_R2S and not ctx.general["has_r2s"]:
            raise NotImplementedError(
                f"Roman-to-script not yet supported for {lang}."
            )

        # Normalize case before post_config and rule-based normalization.
        if t_dir == FEAT_R2S and not ctx.general["case_sensitive"]:
            ctx.src = ctx.src.lower()

        # This hook may take over the whole transliteration process or
        # delegate it to some external process, and return the output string
        # directly.
        if ctx.run_hook("post_config") == BREAK:
            return getattr(ctx, "dest", ""), ctx.warnings

        # ctx.normalize_src returns the results of the post_normalize hook.
        if ctx.normalize_src() == BREAK:
            return getattr(ctx, "dest", ""), ctx.warnings

        logger.debug(f"Normalized source: {ctx.src}")

        lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))

        # Loop through source characters. The increment of each loop depends
        # on the length of the token that eventually matches.
        ctx.cur = 0
        while ctx.cur < len(ctx.src):
            # Reset cursor position flags.
            # Carry over extended "beginning of word" flag.
            ctx.cur_flags = 0

            # Look for a word boundary and flag word beginning/end if found.
            if ctx.cur_at_bow():
                # Beginning of word.
                logger.debug(f"Beginning of word at position {ctx.cur}.")
                ctx.cur_flags |= BOW
            if ctx.cur_at_eow():
                # End of word.
                logger.debug(f"End of word at position {ctx.cur}.")
                ctx.cur_flags |= EOW

            # This hook may skip the parsing of the current
            # token or exit the scanning loop altogether.
            hret = ctx.run_hook("begin_input_token")
            if hret == BREAK:
                logger.debug("Breaking text scanning from hook signal.")
                break
            if hret == CONT:
                logger.debug("Skipping scanning iteration from hook signal.")
                continue

            # Check the ignore list. Find as many subsequent ignore tokens
            # as possible before moving on to looking for match tokens.
            ctx.tk = None
            while True:
                ctx.ignoring = False
                for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
                    hret = ctx.run_hook("pre_ignore_token")
                    if hret == BREAK:
                        break
                    if hret == CONT:
                        continue

                    _matching = False
                    if type(ctx.tk) is Pattern:
                        # Search for the RE pattern beginning at the cursor.
                        if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]):
                            ctx.tk = _ptn_match[0]
                            logger.debug(f"Matched regex: {ctx.tk}")
                            step = len(ctx.tk)
                            _matching = True
                    else:
                        # Search for an exact match.
                        step = len(ctx.tk)
                        if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
                            _matching = True

                    if _matching:
                        # The position matches an ignore token.
                        hret = ctx.run_hook("on_ignore_match")
                        if hret == BREAK:
                            break
                        if hret == CONT:
                            continue

                        logger.info(f"Ignored token: {ctx.tk}")
                        ctx.dest_ls.append(ctx.tk)
                        ctx.cur += step
                        if ctx.cur >= len(ctx.src):
                            # Reached the end of the string. Stop ignoring.
                            # The outer loop will exit immediately after.
                            ctx.ignoring = False
                            break
                        ctx.ignoring = True
                        break

                # We looked through all ignore tokens and found none.
                # Move on.
                if not ctx.ignoring:
                    break
                # Otherwise, if we found a match, check if the next position
                # may be ignored as well.

            delattr(ctx, "tk")
            delattr(ctx, "ignoring")

            if ctx.cur >= len(ctx.src):
                break
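
            # For instance, if the ignore list held a Pattern compiled from
            # r"\d+" (hypothetical; actual tokens come from
            # get_lang_ignore), a digit run at the cursor would be copied to
            # the output verbatim and the cursor advanced past it before
            # token lookup resumes.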

            # Begin transliteration token lookup.
            ctx.match = False

            for ctx.src_tk, ctx.dest_str in lang_map:
                hret = ctx.run_hook("pre_tx_token")
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                step = len(ctx.src_tk.content)

                # If the token is longer than the remainder of the string,
                # it surely won't match.
                if ctx.cur + step > len(ctx.src):
                    continue

                # If the first character of the token is greater (= higher
                # code point value) than the current character, then break
                # the loop without a match, because we know there won't be
                # any more matches due to the alphabetical ordering.
                if ctx.src_tk.content[0] > ctx.cur_char:
                    logger.debug(
                        f"{ctx.src_tk.content} is after "
                        f"{ctx.src[ctx.cur:ctx.cur + step]}. "
                        "Breaking loop.")
                    break
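                # Illustrative values (actual tokens come from the language
                # map): with tokens sorted as ("a", "ab", "b", "d") and the
                # cursor on "c", the scan can stop as soon as "d" comes up,
                # since no later token can start with "c".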

                # If src_tk has a WB flag but the token is not at a WB, skip.
                if (
                    (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
                    or (
                        # Can't rely on the EOW flag; we must check the last
                        # character of the potential match.
                        ctx.src_tk.flags & EOW
                        and not ctx.cur_at_eow(ctx.cur + step - 1)
                    )
                ):
                    continue

                # Longer tokens should be guaranteed to be scanned before
                # their substrings at this point.
                # Similarly, flagged tokens are evaluated first.
                if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
                    ctx.match = True
                    # This hook may skip this token or break out of the token
                    # lookup for the current position.
                    hret = ctx.run_hook("on_tx_token_match")
                    if hret == BREAK:
                        break
                    if hret == CONT:
                        continue

                    # A match is found. Stop scanning tokens, append the
                    # result, and proceed scanning the source.

                    # Capitalization. This applies double capitalization
                    # rules. The external function in
                    # scriptshifter.tools.capitalize used for non-table
                    # languages does not.
                    if (
                        (ctx.options["capitalize"] == "first" and ctx.cur == 0)
                        or (
                            ctx.options["capitalize"] == "all"
                            and ctx.cur_flags & BOW
                        )
                    ):
                        logger.info("Capitalizing token.")
                        double_cap = False
                        for dcap_rule in get_lang_dcap(ctx.conn, ctx.lang_id):
                            if ctx.dest_str == dcap_rule:
                                ctx.dest_str = ctx.dest_str.upper()
                                double_cap = True
                                break
                        if not double_cap:
                            ctx.dest_str = (
                                ctx.dest_str[0].upper() + ctx.dest_str[1:])

                    ctx.dest_ls.append(ctx.dest_str)
                    ctx.cur += step
                    break
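
                    # For example, if the destination token were "ij" and
                    # the language defined "ij" as a double capitalization
                    # rule (hypothetical; actual rules come from
                    # get_lang_dcap), capitalizing would yield "IJ" rather
                    # than "Ij".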

            if ctx.match is False:
                delattr(ctx, "match")
                hret = ctx.run_hook("on_no_tx_token_match")
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                # No match found. Copy the non-mapped character (one at a
                # time).
                logger.info(
                    f"Token {ctx.cur_char} "
                    f"(\\u{hex(ord(ctx.cur_char))[2:]}) "
                    f"at position {ctx.cur} is not mapped.")
                ctx.dest_ls.append(ctx.cur_char)
                ctx.cur += 1
            else:
                delattr(ctx, "match")

        delattr(ctx, "cur_flags")
        delattr(ctx, "cur")

        # This hook may take care of the assembly and cause the function to
        # return its own return value.
        if ctx.run_hook("pre_assembly") == BREAK:
            return ctx.dest, ctx.warnings

        logger.debug(f"Output list: {ctx.dest_ls}")
        ctx.dest = "".join(ctx.dest_ls)

        # This hook may reassign the output string and/or cause the function
        # to return it immediately.
        if ctx.run_hook("post_assembly") == BREAK:
            return ctx.dest, ctx.warnings

        normalize_spacing_post_assembly(ctx)

        return ctx.dest, ctx.warnings
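

# A minimal usage sketch. The language name below is hypothetical; valid
# names depend on the transliteration tables installed with scriptshifter,
# and the call requires the table database to be available:
#
#     dest, warnings = transliterate(
#             "sample text", lang="some_language", t_dir="r2s",
#             capitalize="first")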