# trans.py

  1. import logging
  2. import re
  3. from transliterator.exceptions import BREAK, CONT
  4. from transliterator.tables import load_table
  5. # Match multiple spaces.
  6. MULTI_WS_RE = re.compile(r"\s{2,}")
  7. # Default characters defining a word boundary. TODO Make this configurable
  8. # per-table.
  9. WORD_BOUNDARY = " \n\t:;.,\"'"
  10. # Cursor flags.
  11. CUR_BOW = 1
  12. CUR_EOW = 2
  13. logger = logging.getLogger(__name__)
  14. class Context:
  15. """
  16. Context used within the transliteration and passed to hook functions.
  17. """
  18. def __init__(self, src, general, langsec):
  19. """
  20. Initialize a context.
  21. Args:
  22. src (str): The original text. This is meant to never change.
  23. general (dict): general section of the current config.
  24. langsec (dict): Language configuration section being used.
  25. """
  26. self.src = src
  27. self.general = general
  28. self.langsec = langsec
  29. self.dest_ls = []
  30. def transliterate(src, lang, r2s=False):
  31. """
  32. Transliterate a single string.
  33. Args:
  34. src (str): Source string.
  35. lang (str): Language name.
  36. Keyword args:
  37. r2s (bool): If False (the default), the source is considered to be a
  38. non-latin script in the language and script specified, and the output
  39. the Romanization thereof; if True, the source is considered to be
  40. romanized text to be transliterated into the specified script/language.
  41. Return:
  42. str: The transliterated string.
  43. """
  44. source_str = "Latin" if r2s else lang
  45. target_str = lang if r2s else "Latin"
  46. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  47. cfg = load_table(lang)
  48. logger.info(f"Loaded table for {lang}.")
  49. # General directives.
  50. general = cfg.get("general", {})
  51. if not r2s and "script_to_roman" not in cfg:
  52. raise NotImplementedError(
  53. f"Script-to-Roman transliteration not yet supported for {lang}."
  54. )
  55. elif r2s and "roman_to_script" not in cfg:
  56. raise NotImplementedError(
  57. f"Roman-to-script transliteration not yet supported for {lang}."
  58. )
  59. langsec = cfg["script_to_roman"] if not r2s else cfg["roman_to_script"]
  60. langsec_dir = langsec.get("directives", {})
  61. langsec_hooks = langsec.get("hooks", {})
  62. ctx = Context(src, general, langsec)
  63. # This hook may take over the whole transliteration process or delegate it
  64. # to some external process, and return the output string directly.
  65. if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
  66. return getattr(ctx, "dest", "")
  67. # Loop through source characters. The increment of each loop depends on
  68. # the length of the token that eventually matches.
  69. ignore_list = langsec.get("ignore", []) # Only present in R2S
  70. ignore_ptn_list = langsec.get("ignore_ptn", []) # Only present in R2S
  71. ctx.cur = 0
  72. while ctx.cur < len(src):
  73. # Reset cursor position flags.
  74. ctx.cur_flags = 0
  75. # Look for a word boundary and flag word beginning/end it if found.
  76. if ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY:
  77. # Beginning of word.
  78. logger.debug(f"Beginning of word at position {ctx.cur}.")
  79. ctx.cur_flags |= CUR_BOW
  80. if ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY:
  81. # End of word.
  82. logger.debug(f"End of word at position {ctx.cur}.")
  83. ctx.cur_flags |= CUR_EOW
  84. # This hook may skip the parsing of the current
  85. # token or exit the scanning loop altogether.
  86. hret = _run_hook("begin_input_token", ctx, langsec_hooks)
  87. if hret == BREAK:
  88. logger.debug("Breaking text scanning from hook signal.")
  89. break
  90. if hret == CONT:
  91. logger.debug("Skipping scanning iteration from hook signal.")
  92. continue
  93. # Check ignore lists. Find as many subsequent ignore tokens
  94. # as possible before moving on to looking for match tokens.
  95. ctx.tk = None
  96. while True:
  97. ctx.ignoring = False
  98. # Ignore patterns.
  99. for ctx.tk in ignore_ptn_list:
  100. hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
  101. if hret == BREAK:
  102. break
  103. if hret == CONT:
  104. continue
  105. step = len(ctx.tk)
  106. # FIXME This is an issue if we want to specify
  107. # beginning-of-word matches, as we aren't reading the
  108. # previous token and we only know that from the CUR_BOW.
  109. # Which means we would have to analyze the regexp to find if
  110. # it's looking for BOW. Messy.
  111. match = re.match(src[ctx.cur:])
  112. if match:
  113. # The position matches an ignore token.
  114. hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
  115. if hret == BREAK:
  116. break
  117. if hret == CONT:
  118. continue
  119. logger.info(f"Ignored token: {ctx.tk}")
  120. ctx.dest_ls.append(ctx.tk)
  121. ctx.cur += step
  122. ctx.ignoring = True
  123. break
  124. # Ignore plain strings.
  125. for ctx.tk in ignore_list:
  126. hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
  127. if hret == BREAK:
  128. break
  129. if hret == CONT:
  130. continue
  131. step = len(ctx.tk)
  132. if ctx.tk == src[ctx.cur:ctx.cur + step]:
  133. # The position matches an ignore token.
  134. hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
  135. if hret == BREAK:
  136. break
  137. if hret == CONT:
  138. continue
  139. logger.info(f"Ignored token: {ctx.tk}")
  140. ctx.dest_ls.append(ctx.tk)
  141. ctx.cur += step
  142. ctx.ignoring = True
  143. break
  144. # We looked through all ignore tokens, not found any. Move on.
  145. if not ctx.ignoring:
  146. break
  147. # Otherwise, if we found a match, check if the next position may be
  148. # ignored as well.
  149. delattr(ctx, "tk")
  150. delattr(ctx, "ignoring")
  151. # Begin transliteration token lookup.
  152. ctx.match = False
  153. for ctx.src_tk, ctx.dest_tk in langsec["map"]:
  154. hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
  155. if hret == BREAK:
  156. break
  157. if hret == CONT:
  158. continue
  159. # Longer tokens should be guaranteed to be scanned before their
  160. # substrings at this point.
  161. step = len(ctx.src_tk)
  162. if ctx.src_tk == src[ctx.cur:ctx.cur + step]:
  163. ctx.match = True
  164. # This hook may skip this token or break out of the token
  165. # lookup for the current position.
  166. hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
  167. if hret == BREAK:
  168. break
  169. if hret == CONT:
  170. continue
  171. # A match is found. Stop scanning tokens, append result, and
  172. # proceed scanning the source.
  173. ctx.dest_ls.append(ctx.dest_tk)
  174. ctx.cur += step
  175. break
  176. if ctx.match is False:
  177. delattr(ctx, "match")
  178. hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
  179. if hret == BREAK:
  180. break
  181. if hret == CONT:
  182. continue
  183. # No match found. Copy non-mapped character (one at a time).
  184. logger.info(
  185. f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]})"
  186. f"at position {ctx.cur} is not mapped.")
  187. ctx.dest_ls.append(src[ctx.cur])
  188. ctx.cur += 1
  189. else:
  190. delattr(ctx, "match")
  191. delattr(ctx, "cur_flags")
  192. delattr(ctx, "cur")
  193. # This hook may take care of the assembly and cause the function to return
  194. # its own return value.
  195. hret = _run_hook("pre_assembly", ctx, langsec_hooks)
  196. if hret is not None:
  197. return hret
  198. if langsec_dir.get("capitalize", False):
  199. ctx.dest_ls[0] = ctx.dest_ls[0].capitalize()
  200. logger.debug(f"Output list: {ctx.dest_ls}")
  201. ctx.dest = "".join(ctx.dest_ls)
  202. # This hook may reassign the output string and/or cause the function to
  203. # return it immediately.
  204. hret = _run_hook("post_assembly", ctx, langsec_hooks)
  205. if hret == "ret":
  206. return ctx.dest
  207. # Strip multiple spaces and leading/trailing whitespace.
  208. ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
  209. return ctx.dest
  210. def _run_hook(hname, ctx, hooks):
  211. ret = None
  212. for hook_def in hooks.get(hname, []):
  213. kwargs = hook_def[1] if len(hook_def) > 1 else {}
  214. ret = hook_def[0](ctx, **kwargs)
  215. if ret in (BREAK, CONT):
  216. # This will stop parsing hooks functions and tell the caller to
  217. # break out of the outer loop or skip iteration.
  218. return ret
  219. return ret