__init__.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. import logging
  2. from functools import cache
  3. from importlib import import_module
  4. from os import path, access, R_OK
  5. from yaml import load
  6. try:
  7. from yaml import CLoader as Loader
  8. except ImportError:
  9. from yaml import Loader
  10. from transliterator.exceptions import ConfigError
__doc__ = """
Transliteration tables.
These tables contain all transliteration information, grouped by script and
language (or language and script? TBD)
"""

# Directory holding the table files: the `data` folder located next to this
# module. The module path is resolved with `realpath` first, so a symlinked
# module still points at the real data directory.
TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")

# Available hook names. A hook name used in a table's `hooks` section that is
# not listed here is rejected with a ConfigError by `load_hook_fn`.
HOOKS = (
    "post_config",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)

# Package path where hook functions are kept. Hook modules named in a table
# are imported relative to this package.
HOOK_PKG_PATH = "transliterator.hooks"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  32. class Token(str):
  33. """
  34. Token class: minimal unit of text parsing.
  35. This class overrides the `<` operator for strings, so that sorting is done
  36. in a way that prioritizes a longer string over a shorter one with identical
  37. root.
  38. """
  39. def __init__(self, content):
  40. self.content = content
  41. def __lt__(self, other):
  42. """
  43. Operator to sort tokens.
  44. E.g:
  45. - ABCD
  46. - AB
  47. - A
  48. - BCDE
  49. - BCD
  50. - BEFGH
  51. - B
  52. """
  53. logger.debug(f"a: {self.content}, b: {other.content}")
  54. self_len = len(self.content)
  55. other_len = len(other.content)
  56. min_len = min(self_len, other_len)
  57. # If one of the strings is entirely contained in the other string...
  58. if self.content[:min_len] == other.content[:min_len]:
  59. logger.debug("Roots match.")
  60. # ...then the longer one takes precedence (is "less")
  61. return self_len > other_len
  62. # If the root strings are different, perform a normal comparison.
  63. return self.content < other.content
  64. def __hash__(self):
  65. return hash(self.content)
  66. @cache
  67. def list_tables():
  68. """
  69. List all the available tables.
  70. """
  71. with open(path.join(TABLE_DIR, "index.yml")) as fh:
  72. tdata = load(fh, Loader=Loader)
  73. return tdata
  74. @cache
  75. def load_table(tname):
  76. """
  77. Load one transliteration table and possible parents.
  78. The table file is parsed into an in-memory configuration that contains
  79. the language & script metadata and parsing rules.
  80. """
  81. fname = path.join(TABLE_DIR, tname + ".yml")
  82. if not access(fname, R_OK):
  83. raise ValueError(f"No transliteration table for {tname}!")
  84. with open(fname) as fh:
  85. tdata = load(fh, Loader=Loader)
  86. # NOTE Only one level of inheritance. No need for recursion for now.
  87. parents = tdata.get("general", {}).get("parents", [])
  88. if "script_to_roman" in tdata:
  89. tokens = {}
  90. for parent in parents:
  91. parent_tdata = load_table(parent)
  92. # Merge parent tokens. Child overrides parents, and a parent listed
  93. # later override ones listed earlier.
  94. tokens |= {
  95. Token(k): v for k, v in parent_tdata.get(
  96. "script_to_roman", {}).get("map", {})
  97. }
  98. tokens |= {
  99. Token(k): v
  100. for k, v in tdata["script_to_roman"].get("map", {}).items()}
  101. tdata["script_to_roman"]["map"] = tuple(
  102. (k.content, tokens[k]) for k in sorted(tokens))
  103. if "hooks" in tdata["script_to_roman"]:
  104. tdata["script_to_roman"]["hooks"] = load_hook_fn(
  105. tname, tdata["script_to_roman"])
  106. if "roman_to_script" in tdata:
  107. tokens = {}
  108. for parent in parents:
  109. parent_tdata = load_table(parent)
  110. # Merge parent tokens. Child overrides parents, and a parent listed
  111. # later override ones listed earlier.
  112. tokens |= {
  113. Token(k): v for k, v in parent_tdata.get(
  114. "roman_to_script", {}).get("map", {})
  115. }
  116. tokens |= {
  117. Token(k): v
  118. for k, v in tdata["roman_to_script"].get("map", {}).items()
  119. }
  120. tdata["roman_to_script"]["map"] = tuple(
  121. (k.content, tokens[k]) for k in sorted(tokens))
  122. ignore = {
  123. Token(t)
  124. for t in tdata["roman_to_script"].get("ignore", [])
  125. }
  126. for parent in parents:
  127. parent_tdata = load_table(parent)
  128. # No overriding occurs with the ignore list, only de-duplication.
  129. ignore |= {
  130. Token(t) for t in parent_tdata.get(
  131. "roman_to_script", {}).get("ignore", [])
  132. }
  133. tdata["roman_to_script"]["ignore"] = [
  134. t.content for t in sorted(ignore)]
  135. if "hooks" in tdata["roman_to_script"]:
  136. tdata["roman_to_script"]["hooks"] = load_hook_fn(
  137. tname, tdata["script_to_roman"])
  138. return tdata
  139. def load_hook_fn(cname, sec):
  140. """
  141. Load hook functions from configuration file.
  142. Args:
  143. lang (str): The language key for the configuration.
  144. sec (dict): The `script_to_roman` or `roman_to_script` section
  145. that may contain the `hooks` key to be parsed.
  146. Return:
  147. dict: Dictionary of hook name and list of hook functions pairs.
  148. """
  149. hook_fn = {}
  150. for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
  151. if cfg_hook not in HOOKS:
  152. raise ConfigError(f"{cfg_hook} is not a valid hook name!")
  153. hook_fn[cfg_hook] = []
  154. # There may be more than one function in each hook. They are
  155. # executed in the order they are found.
  156. for cfg_hook_fn in cfg_hook_fns:
  157. modname, fnname = path.splitext(cfg_hook_fn[0])
  158. fnname = fnname.lstrip(".")
  159. fn_kwargs = cfg_hook_fn[1]
  160. try:
  161. fn = getattr(import_module(
  162. "." + modname, HOOK_PKG_PATH), fnname)
  163. except NameError:
  164. raise ConfigError(
  165. f"Hook function {fnname} defined in {cname} configuration "
  166. f"not found in module {HOOK_PKG_PATH}.{modname}!"
  167. )
  168. hook_fn[cfg_hook].append((fn, fn_kwargs))
  169. return hook_fn