trans.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. import logging
  2. import re
  3. from transliterator.tables import load_table
  4. # Match runs of two or more consecutive whitespace characters (not only spaces).
  5. MULTI_WS_RE = re.compile(r"\s{2,}")
  6. logger = logging.getLogger(__name__)
  class Context:
      """
      Context used within the transliteration; its fields are handed to hook
      functions.

      Attributes:
          src (str): The original text. This is meant to never change.
          general (dict): general section of the current config.
          langsec (dict): Language configuration section being used.
          cur (int): Input text cursor.
          dest_ls (list): Token list making up the output string.
      """

      def __init__(self, src, general, langsec):
          """
          Initialize a context.

          Args:
              src (str): The original text. This is meant to never change.
              general (dict): general section of the current config.
              langsec (dict): Language configuration section being used.
          """
          self.src = src
          self.general = general
          self.langsec = langsec

          # Mutable state must live on the instance: a list defined at class
          # level would be shared by every Context, so tokens appended during
          # one transliteration would show up in the next one.
          self.cur = 0
          self.dest_ls = []
  def transliterate(src, lang, r2s=False):
      """
      Transliterate a single string.

      The source is scanned left to right. At each cursor position the ignore
      list is tried first (matched tokens are copied verbatim), then the
      token map of the selected direction; characters that match nothing are
      copied one at a time. Hooks configured for the language section are
      run at each stage and may skip a step ("continue"), stop a scan
      ("break"), or, for the assembly hooks, supply the return value.

      Args:
          src (str): Source string.
          lang (str): Language name.

      Keyword args:
          r2s (bool): If False (the default), the source is considered to be a
          non-latin script in the language and script specified, and the
          output the Romanization thereof; if True, the source is considered
          to be romanized text to be transliterated into the specified
          script/language.

      Return:
          str: The transliterated string.

      Raises:
          NotImplementedError: If the loaded table has no section for the
          requested direction.
      """
      source_str = "Latin" if r2s else lang
      target_str = lang if r2s else "Latin"
      logger.info(
          "Transliteration is from %s to %s.", source_str, target_str)

      cfg = load_table(lang)
      logger.info("Loaded table for %s.", lang)

      # General directives.
      general = cfg.get("general", {})

      if not r2s and "script_to_roman" not in cfg:
          raise NotImplementedError(
              f"Script-to-Roman transliteration not yet supported for {lang}."
          )
      elif r2s and "roman_to_script" not in cfg:
          raise NotImplementedError(
              f"Roman-to-script transliteration not yet supported for {lang}."
          )

      langsec = cfg["roman_to_script"] if r2s else cfg["script_to_roman"]
      langsec_dir = langsec.get("directives", {})
      langsec_hooks = langsec.get("hooks", {})

      ctx = Context(src, general, langsec)
      # Make sure this call owns a fresh output list, independent of any
      # default that may be defined at class level on Context.
      ctx.dest_ls = []

      _run_hook("post_config", ctx, langsec_hooks)

      # Loop through source characters. The increment of each loop depends on
      # the length of the token that eventually matches.
      ignore_list = langsec.get("ignore", [])  # Only present in R2S
      src_len = len(src)

      while ctx.cur < src_len:
          # This hook may skip the parsing of the current
          # token or exit the scanning loop altogether.
          # NOTE(review): hooks receive the cursor by value, so a hook that
          # always answers "continue" here would never let the loop advance.
          hret = _run_hook("begin_input_token", ctx, langsec_hooks)
          if hret == "break":
              break
          if hret == "continue":
              continue

          # Check ignore list first. Find as many subsequent ignore tokens
          # as possible before moving on to looking for match tokens.
          while True:
              ctx.ignoring = False
              for tk in ignore_list:
                  hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
                  if hret == "break":
                      break
                  if hret == "continue":
                      continue

                  step = len(tk)
                  # A zero-length token would match everywhere without
                  # advancing the cursor, hence the `step` guard.
                  if step and tk == src[ctx.cur:ctx.cur + step]:
                      hret = _run_hook(
                          "on_ignore_match", ctx, langsec_hooks)
                      if hret == "break":
                          break
                      if hret == "continue":
                          continue

                      logger.info("Ignored token: %s", tk)
                      ctx.dest_ls.append(tk)
                      ctx.cur += step
                      ctx.ignoring = True
                      break

              # We looked through all ignore tokens, not found any. Move on.
              if not ctx.ignoring:
                  break

          # Ignored tokens may have consumed the rest of the input; there is
          # nothing left to match or copy in that case.
          if ctx.cur >= src_len:
              break

          ctx.match = False
          for src_tk, dest_tk in langsec["map"]:
              hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
              if hret == "break":
                  break
              if hret == "continue":
                  continue

              # Longer tokens should be guaranteed to be scanned before their
              # substrings at this point.
              step = len(src_tk)
              # Same zero-length guard as for the ignore list.
              if step and src_tk == src[ctx.cur:ctx.cur + step]:
                  # This hook may skip this token or break out of the token
                  # lookup for the current position.
                  hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
                  if hret == "break":
                      break
                  if hret == "continue":
                      continue

                  # A match is found. Stop scanning tokens, append result,
                  # and proceed scanning the source.
                  ctx.dest_ls.append(dest_tk)
                  ctx.match = True
                  ctx.cur += step
                  break

          if not ctx.match:
              # These break/continue statements act on the outer scan loop.
              hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
              if hret == "break":
                  break
              if hret == "continue":
                  continue

              # No match found. Copy non-mapped character (one at a time).
              logger.info(
                  "Token %s at position %s is not mapped.",
                  src[ctx.cur], ctx.cur)
              ctx.dest_ls.append(src[ctx.cur])
              ctx.cur += 1

      # Nothing to capitalize when no output token was produced.
      if langsec_dir.get("capitalize", False) and ctx.dest_ls:
          ctx.dest_ls[0] = ctx.dest_ls[0].capitalize()

      # This hook may take care of the assembly and cause the function to
      # return its own return value.
      hret = _run_hook("pre_assembly", ctx, langsec_hooks)
      if hret is not None:
          return hret

      logger.debug("Output list: %s", ctx.dest_ls)
      ctx.dest = "".join(ctx.dest_ls)

      # This hook may manipulate the output string and cause the function to
      # return that.
      hret = _run_hook("post_assembly", ctx, langsec_hooks)
      if hret is not None:
          return hret

      # Strip multiple spaces and leading/trailing whitespace.
      ctx.dest = MULTI_WS_RE.sub(' ', ctx.dest.strip())

      return ctx.dest
  146. def _run_hook(hname, ctx, hooks):
  147. for hook_def in hooks.get(hname, []):
  148. kwargs = hook_def[1] if len(hook_def > 1) else {}
  149. ret = hook_def[0](ctx.src, ctx.cur, ctx.dest_ls, **kwargs)
  150. if ret in ("break", "cont"):
  151. # This will stop parsing hooks functions and tell the caller to
  152. # break out of the outer loop or skip iteration.
  153. return ret
  154. return ret