import logging
import re

from functools import cache
from importlib import import_module
from os import environ, path, access, R_OK

from yaml import load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader

from scriptshifter.exceptions import ConfigError


__doc__ = """
Transliteration tables.

These tables contain all transliteration information, grouped by script and
language (or language and script? TBD)
"""


DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)

# Available hook names.
HOOKS = (
    "post_config",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)

# Package path where hook functions are kept.
HOOK_PKG_PATH = "scriptshifter.hooks"

# Default characters defining a word boundary. This is configurable per-table.
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"

logger = logging.getLogger(__name__)


class Token(str):
    """
    Token class: minimal unit of text parsing.

    This class overrides the `<` operator for strings, so that sorting
    prioritizes a longer string over a shorter one with an identical root.
    """
    def __init__(self, content):
        self.content = content

    def __lt__(self, other):
        """
        Operator to sort tokens.

        E.g., the following tokens are sorted in this order:

        - ABCD
        - AB
        - A
        - BCDE
        - BCD
        - BEFGH
        - B
        """
        logger.debug(f"a: {self.content}, b: {other.content}")
        self_len = len(self.content)
        other_len = len(other.content)
        min_len = min(self_len, other_len)

        # If one of the strings is entirely contained in the other string...
        if self.content[:min_len] == other.content[:min_len]:
            logger.debug("Roots match.")
            # ...then the longer one takes precedence (is "less").
            return self_len > other_len

        # If the root strings differ, perform a normal comparison.
        return self.content < other.content

    def __hash__(self):
        return hash(self.content)
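

# A sketch of the resulting sort order (illustrative, not executed here):
# tokens sharing a root sort longest-first, so greedy matching tries the
# longest candidate first, while unrelated tokens sort alphabetically.
#
#   >>> sorted([Token("A"), Token("AB"), Token("ABCD"), Token("B")])
#   ['ABCD', 'AB', 'A', 'B']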


@cache
def list_tables():
    """
    List all the available tables.
    """
    with open(path.join(TABLE_DIR, "index.yml")) as fh:
        tdata = load(fh, Loader=Loader)

    return tdata
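

# Usage sketch: the parsed index is returned as-is, so its shape mirrors
# index.yml (the `name` key below is an assumption for illustration):
#
#   for tname, tmeta in list_tables().items():
#       print(tname, tmeta.get("name"))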


@cache
def load_table(tname):
    """
    Load one transliteration table and possible parents.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")
    with open(fname) as fh:
        tdata = load(fh, Loader=Loader)

    # NOTE Only one level of inheritance. No need for recursion for now.
    parents = tdata.get("general", {}).get("parents", [])

    if "script_to_roman" in tdata:
        if "double_cap" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["double_cap"] = tuple(
                    tdata["script_to_roman"]["double_cap"])
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "script_to_roman", {}).get("map", {})
            }
            # Merge and/or remove double cap rules.
            tdata["script_to_roman"]["double_cap"] = tuple((
                set(parent_tdata.get(
                        "script_to_roman", {}).get("double_cap", ())) |
                set(tdata["script_to_roman"].get("double_cap", ()))
            ) - set(tdata["script_to_roman"].get("no_double_cap", ())))
            if "no_double_cap" in tdata["script_to_roman"]:
                del tdata["script_to_roman"]["no_double_cap"]
        tokens |= {
            Token(k): v
            for k, v in tdata["script_to_roman"].get("map", {}).items()}

        tdata["script_to_roman"]["map"] = tuple(
                (k.content, tokens[k]) for k in sorted(tokens))

        if "hooks" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["hooks"] = load_hook_fn(
                    tname, tdata["script_to_roman"])

    if "roman_to_script" in tdata:
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "roman_to_script", {}).get("map", {})
            }
        tokens |= {
            Token(k): v
            for k, v in tdata["roman_to_script"].get("map", {}).items()
        }
        tdata["roman_to_script"]["map"] = tuple(
                (k.content, tokens[k]) for k in sorted(tokens))

        # Ignore regular expression patterns.
        # Patterns are evaluated in the order they are listed in the config.
        ignore_ptn = [
                re.compile(ptn)
                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
        for parent in parents:
            parent_tdata = load_table(parent)
            # NOTE: duplicates are not removed.
            ignore_ptn = [
                re.compile(ptn)
                for ptn in parent_tdata.get(
                        "roman_to_script", {}).get("ignore_ptn", [])
            ] + ignore_ptn
        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn

        # Ignore plain strings.
        ignore = {
            Token(t)
            for t in tdata["roman_to_script"].get("ignore", [])
        }
        for parent in parents:
            parent_tdata = load_table(parent)
            # No overriding occurs with the ignore list, only de-duplication.
            ignore |= {
                Token(t) for t in parent_tdata.get(
                        "roman_to_script", {}).get("ignore", [])
            }
        tdata["roman_to_script"]["ignore"] = [
                t.content for t in sorted(ignore)]

        # Hooks.
        if "hooks" in tdata["roman_to_script"]:
            tdata["roman_to_script"]["hooks"] = load_hook_fn(
                    tname, tdata["roman_to_script"])

    return tdata
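

# Usage sketch (the table name is hypothetical; any readable
# `<TABLE_DIR>/<tname>.yml` file works):
#
#   cfg = load_table("my_table")
#   for src, dest in cfg.get("roman_to_script", {}).get("map", ()):
#       print(src, "->", dest)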


def load_hook_fn(cname, sec):
    """
    Load hook functions from a configuration file.

    Args:
        cname (str): The configuration (table) name, used in error messages.
        sec (dict): The `script_to_roman` or `roman_to_script` section
            that may contain the `hooks` key to be parsed.

    Return:
        dict: Dictionary of hook name and list of hook function pairs.
    """
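    # Expected shape of the `hooks` key, inferred from the parsing below
    # (module, function, and kwarg names are hypothetical):
    #
    #   hooks:
    #     post_config:
    #       - ["my_module.my_hook_fn", {"some_arg": "value"}]
    #
    # Each entry yields a (function, kwargs) tuple in the returned dict.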
    hook_fn = {}
    for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
        if cfg_hook not in HOOKS:
            raise ConfigError(f"{cfg_hook} is not a valid hook name!")
        hook_fn[cfg_hook] = []
        # There may be more than one function in each hook. They are
        # executed in the order they are found.
        for cfg_hook_fn in cfg_hook_fns:
            modname, fnname = path.splitext(cfg_hook_fn[0])
            fnname = fnname.lstrip(".")
            fn_kwargs = cfg_hook_fn[1]
            try:
                # getattr raises AttributeError if fnname is missing from
                # the module.
                fn = getattr(import_module(
                        "." + modname, HOOK_PKG_PATH), fnname)
            except AttributeError:
                raise ConfigError(
                    f"Hook function {fnname} defined in {cname} configuration "
                    f"not found in module {HOOK_PKG_PATH}.{modname}!"
                )
            hook_fn[cfg_hook].append((fn, fn_kwargs))

    return hook_fn