trans.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. import logging
  2. import re
  3. from transliterator.tables import load_table
# Match runs of two or more consecutive whitespace characters (anything
# matched by ``\s``, not only the space character).
MULTI_WS_RE = re.compile(r"\s{2,}")

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  7. class Context:
  8. """
  9. Context used within the transliteration and passed to hook functions.
  10. """
  11. def __init__(self, src, general, langsec):
  12. """
  13. Initialize a context.
  14. Args:
  15. src (str): The original text. This is meant to never change.
  16. general (dict): general section of the current config.
  17. langsec (dict): Language configuration section being used.
  18. """
  19. self.src = src
  20. self.general = general
  21. self.langsec = langsec
  22. self.dest_ls = []
  23. def transliterate(src, lang, r2s=False):
  24. """
  25. Transliterate a single string.
  26. Args:
  27. src (str): Source string.
  28. lang (str): Language name.
  29. Keyword args:
  30. r2s (bool): If False (the default), the source is considered to be a
  31. non-latin script in the language and script specified, and the output
  32. the Romanization thereof; if True, the source is considered to be
  33. romanized text to be transliterated into the specified script/language.
  34. Return:
  35. str: The transliterated string.
  36. """
  37. source_str = "Latin" if r2s else lang
  38. target_str = lang if r2s else "Latin"
  39. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  40. cfg = load_table(lang)
  41. logger.info(f"Loaded table for {lang}.")
  42. # General directives.
  43. general = cfg.get("general", {})
  44. if not r2s and "script_to_roman" not in cfg:
  45. raise NotImplementedError(
  46. f"Script-to-Roman transliteration not yet supported for {lang}."
  47. )
  48. elif r2s and "roman_to_script" not in cfg:
  49. raise NotImplementedError(
  50. f"Roman-to-script transliteration not yet supported for {lang}."
  51. )
  52. langsec = cfg["script_to_roman"] if not r2s else cfg["roman_to_script"]
  53. langsec_dir = langsec.get("directives", {})
  54. langsec_hooks = langsec.get("hooks", {})
  55. ctx = Context(src, general, langsec)
  56. _run_hook("post_config", ctx, langsec_hooks)
  57. # Loop through source characters. The increment of each loop depends on
  58. # the length of the token that eventually matches.
  59. ignore_list = langsec.get("ignore", []) # Only present in R2S
  60. ctx.cur = 0
  61. while ctx.cur < len(src):
  62. # This hook may skip the parsing of the current
  63. # token or exit the scanning loop altogether.
  64. hret = _run_hook("begin_input_token", ctx, langsec_hooks)
  65. if hret == "break":
  66. logger.debug("Breaking text scanning from hook signal.")
  67. break
  68. if hret == "continue":
  69. logger.debug("Skipping scanning iteration from hook signal.")
  70. continue
  71. # Check ignore list first. Find as many subsequent ignore tokens
  72. # as possible before moving on to looking for match tokens.
  73. ctx.tk = None
  74. while True:
  75. ctx.ignoring = False
  76. for ctx.tk in ignore_list:
  77. hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
  78. if hret == "break":
  79. break
  80. if hret == "continue":
  81. continue
  82. step = len(ctx.tk)
  83. if ctx.tk == src[ctx.cur:ctx.cur + step]:
  84. # The position matches an ignore token.
  85. hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
  86. if hret == "break":
  87. break
  88. if hret == "continue":
  89. continue
  90. logger.info(f"Ignored token: {ctx.tk}")
  91. ctx.dest_ls.append(ctx.tk)
  92. ctx.cur += step
  93. ctx.ignoring = True
  94. break
  95. # We looked through all ignore tokens, not found any. Move on.
  96. if not ctx.ignoring:
  97. break
  98. # Otherwise, if we found a match, check if the next position may be
  99. # ignored as well.
  100. delattr(ctx, "tk")
  101. delattr(ctx, "ignoring")
  102. # Begin transliteration token lookup.
  103. ctx.match = False
  104. for ctx.src_tk, ctx.dest_tk in langsec["map"]:
  105. hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
  106. if hret == "break":
  107. break
  108. if hret == "continue":
  109. continue
  110. # Longer tokens should be guaranteed to be scanned before their
  111. # substrings at this point.
  112. step = len(ctx.src_tk)
  113. if ctx.src_tk == src[ctx.cur:ctx.cur + step]:
  114. ctx.match = True
  115. # This hook may skip this token or break out of the token
  116. # lookup for the current position.
  117. hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
  118. if hret == "break":
  119. break
  120. if hret == "continue":
  121. continue
  122. # A match is found. Stop scanning tokens, append result, and
  123. # proceed scanning the source.
  124. ctx.dest_ls.append(ctx.dest_tk)
  125. ctx.cur += step
  126. break
  127. if ctx.match is False:
  128. delattr(ctx, "match")
  129. hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
  130. if hret == "break":
  131. break
  132. if hret == "continue":
  133. continue
  134. # No match found. Copy non-mapped character (one at a time).
  135. logger.info(
  136. f"Token {src[ctx.cur]} at position {ctx.cur} is not mapped."
  137. )
  138. ctx.dest_ls.append(src[ctx.cur])
  139. ctx.cur += 1
  140. else:
  141. delattr(ctx, "match")
  142. delattr(ctx, "cur")
  143. # This hook may take care of the assembly and cause the function to return
  144. # its own return value.
  145. hret = _run_hook("pre_assembly", ctx, langsec_hooks)
  146. if hret is not None:
  147. return hret
  148. if langsec_dir.get("capitalize", False):
  149. ctx.dest_ls[0] = ctx.dest_ls[0].capitalize()
  150. logger.debug(f"Output list: {ctx.dest_ls}")
  151. ctx.dest = "".join(ctx.dest_ls)
  152. # This hook may reassign the output string and/or cause the function to
  153. # return it immediately.
  154. hret = _run_hook("post_assembly", ctx, langsec_hooks)
  155. if hret == "ret":
  156. return ctx.dest
  157. # Strip multiple spaces and leading/trailing whitespace.
  158. ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())
  159. return ctx.dest
  160. def _run_hook(hname, ctx, hooks):
  161. ret = None
  162. for hook_def in hooks.get(hname, []):
  163. kwargs = hook_def[1] if len(hook_def) > 1 else {}
  164. ret = hook_def[0](ctx, **kwargs)
  165. if ret in ("break", "cont"):
  166. # This will stop parsing hooks functions and tell the caller to
  167. # break out of the outer loop or skip iteration.
  168. return ret
  169. return ret