__init__.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
import logging
import re

from functools import cache
from importlib import import_module
from os import environ, path, access, R_OK

from yaml import load
try:
    # Prefer the C-accelerated YAML loader; fall back to the pure-Python
    # one when libyaml is not available.
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader

from scriptshifter.exceptions import BREAK, ConfigError


__doc__ = """
Transliteration tables.

These tables contain all transliteration information, grouped by script and
language (or language and script? TBD)
"""


DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)

# Available hook names. Hook names found in table configurations are
# validated against this tuple (see load_hook_fn).
# NOTE(review): "pre_config", which load_table looks up, is not listed
# here — confirm whether it should be a valid hook name.
HOOKS = (
    "post_config",
    "post_normalize",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)

# Package path where hook functions are kept.
HOOK_PKG_PATH = "scriptshifter.hooks"

# Default characters defining a word boundary. This is configurable per-table.
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"

# Token word boundary marker. Used in maps to distinguish special
# transliterations for initial, final, and standalone tokens.
TOKEN_WB_MARKER = "%"

# Word boundary bitwise flags: beginning-of-word (bit 1) and
# end-of-word (bit 0). Both bits set marks a standalone token.
BOW = 1 << 1
EOW = 1 << 0

logger = logging.getLogger(__name__)
  44. class Token(str):
  45. """
  46. Token class: minimal unit of text parsing.
  47. This class overrides the `<` operator for strings, so that sorting is done
  48. in a way that prioritizes a longer string over a shorter one with identical
  49. root.
  50. """
  51. flags = 0
  52. def __init__(self, content):
  53. self.content = content
  54. # Assign special precedence based on token position.
  55. # Standalone has precedence, then initial, then final, then medial.
  56. # This is somewhat arbitrary and may change if special cases arise.
  57. # WB markers are moved to flags to allow default comparison.
  58. if self.content.endswith(TOKEN_WB_MARKER):
  59. self.flags |= BOW
  60. self.content = self.content.rstrip(TOKEN_WB_MARKER)
  61. if self.content.startswith(TOKEN_WB_MARKER):
  62. self.flags |= EOW
  63. self.content = self.content.lstrip(TOKEN_WB_MARKER)
  64. def __lt__(self, other):
  65. """
  66. Operator to sort tokens.
  67. E.g:
  68. - ABCD
  69. - AB
  70. - A
  71. - BCDE
  72. - BCD
  73. - BEFGH
  74. - B
  75. """
  76. # logger.debug(f"a: {self.content}, b: {other.content}")
  77. self_len = len(self.content)
  78. other_len = len(other.content)
  79. min_len = min(self_len, other_len)
  80. # Check word boundary flags only if tokens are identical.
  81. # Higher flag value has precedence.
  82. if (
  83. (self.flags > 0 or other.flags > 0)
  84. and self.content == other.content):
  85. logger.debug(f"{self.content} flags: {self.flags}")
  86. logger.debug(f"{other.content} flags: {other.flags}")
  87. logger.debug("Performing flags comparison.")
  88. return self.flags > other.flags
  89. # If one of the strings is entirely contained in the other string...
  90. if self.content[:min_len] == other.content[:min_len]:
  91. # logger.debug("Roots match.")
  92. # ...then the longer one takes precedence (is "less")
  93. return self_len > other_len
  94. # If the root strings are different, perform a normal comparison.
  95. return self.content < other.content
  96. def __hash__(self):
  97. return hash(self.content)
  98. @cache
  99. def list_tables():
  100. """
  101. List all the indexed tables.
  102. Note that this may not correspond to all the table files in the data
  103. folder, but only those exposed in the index.
  104. """
  105. with open(path.join(TABLE_DIR, "index.yml")) as fh:
  106. tdata = load(fh, Loader=Loader)
  107. return tdata
@cache
def load_table(tname):
    """
    Load one transliteration table and possible parents.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.

    Args:
        tname (str): Table name, i.e. the file name under TABLE_DIR without
            the ``.yml`` extension.

    Returns:
        dict: Parsed and post-processed table configuration.

    Raises:
        ValueError: If no readable table file exists for `tname`.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")
    with open(fname) as fh:
        tdata = load(fh, Loader=Loader)

    # Resolve configured hook names into (function, kwargs) pairs.
    if "hooks" in tdata:
        tdata["hooks"] = load_hook_fn(tname, tdata)

    # Pre-config hooks.
    # If any of these hooks returns BREAK, interrupt the configuration
    # parsing and return whatever is obtained so far.
    # NOTE(review): "pre_config" is not in HOOKS, so load_hook_fn above
    # would reject it — confirm whether this branch is reachable.
    pre_cfg_hooks = tdata.get("hooks", {}).get("pre_config", [])
    for hook_def in pre_cfg_hooks:
        kwargs = hook_def[1] if len(hook_def) > 1 else {}
        ret = hook_def[0](tdata, **kwargs)
        if ret == BREAK:
            return tdata

    parents = tdata.get("general", {}).get("parents", [])

    if "script_to_roman" in tdata:
        if "double_cap" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["double_cap"] = tuple(
                    tdata["script_to_roman"]["double_cap"])
        tokens = {}
        for parent in parents:
            # Recursive load; @cache memoizes repeated parent loads.
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent listed
            # later override ones listed earlier.
            # The parent's "map" is already post-processed (below) into a
            # tuple of (key, value) pairs, hence the bare iteration.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "script_to_roman", {}).get("map", {})
            }
            # Merge and/or remove double cap rules.
            tdata["script_to_roman"]["double_cap"] = tuple((
                set(parent_tdata.get(
                    "script_to_roman", {}
                ).get("double_cap", set())) |
                set(tdata["script_to_roman"].get("double_cap", set()))
            ) - set(tdata["script_to_roman"].get("no_double_cap", set())))
        if "no_double_cap" in tdata["script_to_roman"]:
            # Only used to subtract inherited rules; not kept in the output.
            del tdata["script_to_roman"]["no_double_cap"]

        # The child's own map merges last so it overrides all parents.
        tokens |= {
            Token(k): v
            for k, v in tdata["script_to_roman"].get("map", {}).items()}
        # Sorted per Token.__lt__: longer/flagged tokens first.
        tdata["script_to_roman"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Normalization.
        normalize = {}
        # Inherit normalization rules.
        for parent in parents:
            # NOTE(review): assumes every parent defines "script_to_roman";
            # a parent without it would raise KeyError — confirm.
            parent_langsec = load_table(parent)["script_to_roman"]
            normalize |= parent_langsec.get("normalize", {})
        # Config maps a replacement to a list of source strings; invert it
        # into {source Token: replacement}.
        for k, v in tdata["script_to_roman"].get("normalize", {}).items():
            for vv in v:
                normalize[Token(vv)] = k
        tdata["script_to_roman"]["normalize"] = dict(sorted(normalize.items()))

        # Hook function.
        if "hooks" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["hooks"] = load_hook_fn(
                    tname, tdata["script_to_roman"])

    if "roman_to_script" in tdata:
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent listed
            # later override ones listed earlier.
            # Parent "map" is already a tuple of pairs (see above).
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "roman_to_script", {}).get("map", {})
            }
        tokens |= {
            Token(k): v
            for k, v in tdata["roman_to_script"].get("map", {}).items()
        }
        tdata["roman_to_script"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Ignore regular expression patterns.
        # Patterns are evaluated in the order they are listed in the config.
        ignore_ptn = [
                re.compile(ptn)
                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
        for parent in parents:
            parent_tdata = load_table(parent)
            # Parent patterns are prepended, so they evaluate first.
            # NOTE: duplicates are not removed.
            ignore_ptn = [
                    re.compile(ptn)
                    for ptn in parent_tdata.get(
                            "roman_to_script", {}).get("ignore_ptn", [])
            ] + ignore_ptn
        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn

        # Ignore plain strings.
        ignore = {
                Token(t)
                for t in tdata["roman_to_script"].get("ignore", [])
        }
        for parent in parents:
            parent_tdata = load_table(parent)
            # No overriding occurs with the ignore list, only de-duplication.
            ignore |= {
                    Token(t) for t in parent_tdata.get(
                            "roman_to_script", {}).get("ignore", [])
            }
        tdata["roman_to_script"]["ignore"] = [
                t.content for t in sorted(ignore)]

        # Hooks.
        if "hooks" in tdata["roman_to_script"]:
            tdata["roman_to_script"]["hooks"] = load_hook_fn(
                    tname, tdata["roman_to_script"])

    return tdata
  222. def load_hook_fn(cname, sec):
  223. """
  224. Load hook functions from configuration file.
  225. Args:
  226. lang (str): The language key for the configuration.
  227. sec (dict): The `script_to_roman` or `roman_to_script` section
  228. that may contain the `hooks` key to be parsed.
  229. Return:
  230. dict: Dictionary of hook name and list of hook functions pairs.
  231. """
  232. hook_fn = {}
  233. for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
  234. if cfg_hook not in HOOKS:
  235. raise ConfigError(f"{cfg_hook} is not a valid hook name!")
  236. hook_fn[cfg_hook] = []
  237. # There may be more than one function in each hook. They are
  238. # executed in the order they are found.
  239. for cfg_hook_fn in cfg_hook_fns:
  240. modname, fnname = path.splitext(cfg_hook_fn[0])
  241. fnname = fnname.lstrip(".")
  242. fn_kwargs = cfg_hook_fn[1] if len(cfg_hook_fn) > 1 else {}
  243. try:
  244. fn = getattr(import_module(
  245. "." + modname, HOOK_PKG_PATH), fnname)
  246. except NameError:
  247. raise ConfigError(
  248. f"Hook function {fnname} defined in {cname} configuration "
  249. f"not found in module {HOOK_PKG_PATH}.{modname}!"
  250. )
  251. hook_fn[cfg_hook].append((fn, fn_kwargs))
  252. return hook_fn