# trans.py -- transliteration core (recovered from a line-numbered listing).
import logging
import re
from scriptshifter.exceptions import BREAK, CONT
from scriptshifter.tables import WORD_BOUNDARY, load_table

# Match multiple (two or more) whitespace characters; used to collapse
# runs of spaces in the assembled output into a single space.
MULTI_WS_RE = re.compile(r"\s{2,}")

# Cursor bitwise flags marking the scanning position within a word:
# beginning-of-word and end-of-word. Both may be set for a standalone token.
CUR_BOW = 1 << 0
CUR_EOW = 1 << 1

logger = logging.getLogger(__name__)
  11. class Context:
  12. """
  13. Context used within the transliteration and passed to hook functions.
  14. """
  15. @property
  16. def src(self):
  17. return self._src
  18. @src.setter
  19. def src(self):
  20. raise NotImplementedError("Attribute is read-only.")
  21. @src.deleter
  22. def src(self):
  23. raise NotImplementedError("Attribute is read-only.")
  24. def __init__(self, src, general, langsec, options={}):
  25. """
  26. Initialize a context.
  27. Args:
  28. src (str): The original text. Read-only.
  29. general (dict): general section of the current config.
  30. langsec (dict): Language configuration section being used.
  31. options (dict): extra options as a dict.
  32. """
  33. self._src = src
  34. self.general = general
  35. self.options = options
  36. self.langsec = langsec
  37. self.dest_ls = []
  38. self.warnings = []
  39. def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
  40. """
  41. Transliterate a single string.
  42. Args:
  43. src (str): Source string.
  44. lang (str): Language name.
  45. t_dir (str): Transliteration direction. Either `s2r` for
  46. script-to-Roman (default) or `r2s` for Roman-to-script.
  47. capitalize: capitalize words: one of `False` (no change - default),
  48. `"first"` (only first letter), or `"all"` (first letter of each
  49. word).
  50. options: extra script-dependent options. Defaults to the empty map.
  51. Keyword args:
  52. r2s (bool): If False (the default), the source is considered to be a
  53. non-latin script in the language and script specified, and the output
  54. the Romanization thereof; if True, the source is considered to be
  55. romanized text to be transliterated into the specified script/language.
  56. Return:
  57. str: The transliterated string.
  58. """
  59. source_str = "Latin" if t_dir == "r2s" else lang
  60. target_str = lang if t_dir == "r2s" else "Latin"
  61. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  62. cfg = load_table(lang)
  63. logger.info(f"Loaded table for {lang}.")
  64. # General directives.
  65. general = cfg.get("general", {})
  66. if t_dir == "s2r" and "script_to_roman" not in cfg:
  67. raise NotImplementedError(
  68. f"Script-to-Roman transliteration not yet supported for {lang}."
  69. )
  70. elif t_dir == "r2s" and "roman_to_script" not in cfg:
  71. raise NotImplementedError(
  72. f"Roman-to-script transliteration not yet supported for {lang}."
  73. )
  74. langsec = (
  75. cfg["script_to_roman"] if t_dir == "s2r"
  76. else cfg["roman_to_script"])
  77. # langsec_dir = langsec.get("directives", {})
  78. langsec_hooks = langsec.get("hooks", {})
  79. src = src.strip()
  80. options["capitalize"] = capitalize
  81. ctx = Context(src, general, langsec, options)
  82. # This hook may take over the whole transliteration process or delegate it
  83. # to some external process, and return the output string directly.
  84. if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
  85. return getattr(ctx, "dest", ""), ctx.warnings
  86. if "normalize" in ctx.langsec:
  87. _normalize_src(ctx)
  88. if _run_hook("post_normalize", ctx, langsec_hooks) == BREAK:
  89. return getattr(ctx, "dest", ""), ctx.warnings
  90. # Loop through source characters. The increment of each loop depends on
  91. # the length of the token that eventually matches.
  92. ignore_list = langsec.get("ignore", []) # Only present in R2S
  93. ctx.cur = 0
  94. word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
  95. map_default = langsec["map"]
  96. map_initial = (
  97. langsec["map_initial"] + map_default
  98. if "map_initial" in langsec else None)
  99. map_final = (
  100. langsec["map_final"] + map_default
  101. if "map_final" in langsec else None)
  102. # TODO unused
  103. map_standalone = (
  104. langsec["map_standalone"] + map_default
  105. if "map_standalone" in langsec else None)
  106. while ctx.cur < len(ctx.src):
  107. # Reset cursor position flags.
  108. # Carry over extended "beginning of word" flag.
  109. ctx.cur_flags = 0
  110. cur_char = ctx.src[ctx.cur]
  111. # Look for a word boundary and flag word beginning/end it if found.
  112. if (ctx.cur == 0 or ctx.src[ctx.cur - 1] in word_boundary) and (
  113. cur_char not in word_boundary):
  114. # Beginning of word.
  115. logger.debug(f"Beginning of word at position {ctx.cur}.")
  116. ctx.cur_flags |= CUR_BOW
  117. if (
  118. ctx.cur == len(ctx.src) - 1
  119. or ctx.src[ctx.cur + 1] in word_boundary
  120. ) and (cur_char not in word_boundary):
  121. # Beginning of word.
  122. # End of word.
  123. logger.debug(f"End of word at position {ctx.cur}.")
  124. ctx.cur_flags |= CUR_EOW
  125. # This hook may skip the parsing of the current
  126. # token or exit the scanning loop altogether.
  127. hret = _run_hook("begin_input_token", ctx, langsec_hooks)
  128. if hret == BREAK:
  129. logger.debug("Breaking text scanning from hook signal.")
  130. break
  131. if hret == CONT:
  132. logger.debug("Skipping scanning iteration from hook signal.")
  133. continue
  134. # Check ignore list. Find as many subsequent ignore tokens
  135. # as possible before moving on to looking for match tokens.
  136. ctx.tk = None
  137. while True:
  138. ctx.ignoring = False
  139. for ctx.tk in ignore_list:
  140. hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
  141. if hret == BREAK:
  142. break
  143. if hret == CONT:
  144. continue
  145. step = len(ctx.tk)
  146. if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
  147. # The position matches an ignore token.
  148. hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
  149. if hret == BREAK:
  150. break
  151. if hret == CONT:
  152. continue
  153. logger.info(f"Ignored token: {ctx.tk}")
  154. ctx.dest_ls.append(ctx.tk)
  155. ctx.cur += step
  156. ctx.ignoring = True
  157. break
  158. # We looked through all ignore tokens, not found any. Move on.
  159. if not ctx.ignoring:
  160. break
  161. # Otherwise, if we found a match, check if the next position may be
  162. # ignored as well.
  163. delattr(ctx, "tk")
  164. delattr(ctx, "ignoring")
  165. # Begin transliteration token lookup.
  166. ctx.match = False
  167. # Assign special maps based on token position.
  168. # Standalone has precedence, then initial, then final, then medial.
  169. # This is totally arbitrary and amy change if special cases arise.
  170. if (
  171. ctx.cur_flags & CUR_BOW and ctx.cur_flags & CUR_EOW
  172. and map_standalone):
  173. map_ = map_standalone
  174. elif ctx.cur_flags & CUR_BOW and map_initial:
  175. map_ = map_initial
  176. elif ctx.cur_flags & CUR_EOW and map_final:
  177. map_ = map_final
  178. else:
  179. map_ = map_default
  180. for ctx.src_tk, ctx.dest_tk in map_:
  181. hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
  182. if hret == BREAK:
  183. break
  184. if hret == CONT:
  185. continue
  186. step = len(ctx.src_tk)
  187. # If the first character of the token is greater (= higher code
  188. # point value) than the current character, then break the loop
  189. # without a match, because we know there won't be any more match
  190. # due to the alphabetical ordering.
  191. if ctx.src_tk[0] > cur_char:
  192. logger.debug(
  193. f"{ctx.src_tk} is after "
  194. f"{ctx.src[ctx.cur:ctx.cur + step]}. Breaking loop.")
  195. break
  196. # Longer tokens should be guaranteed to be scanned before their
  197. # substrings at this point.
  198. if ctx.src_tk == ctx.src[ctx.cur:ctx.cur + step]:
  199. ctx.match = True
  200. # This hook may skip this token or break out of the token
  201. # lookup for the current position.
  202. hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
  203. if hret == BREAK:
  204. break
  205. if hret == CONT:
  206. continue
  207. # A match is found. Stop scanning tokens, append result, and
  208. # proceed scanning the source.
  209. # Capitalization.
  210. if (
  211. (ctx.options["capitalize"] == "first" and ctx.cur == 0)
  212. or
  213. (
  214. ctx.options["capitalize"] == "all"
  215. and ctx.cur_flags & CUR_BOW
  216. )
  217. ):
  218. logger.info("Capitalizing token.")
  219. double_cap = False
  220. for dcap_rule in ctx.langsec.get("double_cap", []):
  221. if ctx.dest_tk == dcap_rule:
  222. ctx.dest_tk = ctx.dest_tk.upper()
  223. double_cap = True
  224. break
  225. if not double_cap:
  226. ctx.dest_tk = ctx.dest_tk[0].upper() + ctx.dest_tk[1:]
  227. ctx.dest_ls.append(ctx.dest_tk)
  228. ctx.cur += step
  229. break
  230. if ctx.match is False:
  231. delattr(ctx, "match")
  232. hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
  233. if hret == BREAK:
  234. break
  235. if hret == CONT:
  236. continue
  237. # No match found. Copy non-mapped character (one at a time).
  238. logger.info(
  239. f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
  240. f"at position {ctx.cur} is not mapped.")
  241. ctx.dest_ls.append(cur_char)
  242. ctx.cur += 1
  243. else:
  244. delattr(ctx, "match")
  245. delattr(ctx, "cur_flags")
  246. delattr(ctx, "cur")
  247. # This hook may take care of the assembly and cause the function to return
  248. # its own return value.
  249. hret = _run_hook("pre_assembly", ctx, langsec_hooks)
  250. if hret is not None:
  251. return hret, ctx.warnings
  252. logger.debug(f"Output list: {ctx.dest_ls}")
  253. ctx.dest = "".join(ctx.dest_ls)
  254. # This hook may reassign the output string and/or cause the function to
  255. # return it immediately.
  256. hret = _run_hook("post_assembly", ctx, langsec_hooks)
  257. if hret == "ret":
  258. return ctx.dest, ctx.warnings
  259. # Strip multiple spaces and leading/trailing whitespace.
  260. ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
  261. return ctx.dest, ctx.warnings
  262. def _normalize_src(ctx):
  263. for nk, nv in ctx.langsec.get("normalize", {}).items():
  264. ctx._src = ctx.src.replace(nk, nv)
  265. logger.debug(f"Normalized source: {ctx.src}")
  266. def _run_hook(hname, ctx, hooks):
  267. ret = None
  268. for hook_def in hooks.get(hname, []):
  269. kwargs = hook_def[1] if len(hook_def) > 1 else {}
  270. ret = hook_def[0](ctx, **kwargs)
  271. if ret in (BREAK, CONT):
  272. # This will stop parsing hooks functions and tell the caller to
  273. # break out of the outer loop or skip iteration.
  274. return ret
  275. return ret