import logging
import re

from functools import cache
from importlib import import_module
from os import environ, path, access, R_OK

from yaml import load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader

from scriptshifter.exceptions import ConfigError


__doc__ = """
Transliteration tables.

These tables contain all transliteration information, grouped by script and
language (or language and script? TBD)
"""


DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)

# Available hook names.
HOOKS = (
    "post_config",
    "post_normalize",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)
# Package path where hook functions are kept.
HOOK_PKG_PATH = "scriptshifter.hooks"

# Default characters defining a word boundary. This is configurable per-table.
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"

# Token word boundary marker. Used in maps to distinguish special
# transliterations for initial, final, and standalone tokens.
TOKEN_WB_MARKER = "%"

# Word boundary bitwise flags.
BOW = 1 << 1
EOW = 1 << 0
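# Flag arithmetic, for illustration: a standalone token, with both word
# boundaries, carries BOW | EOW == 3; a word-initial token BOW == 2; a
# word-final token EOW == 1; a medial token 0. Higher values win in the
# flag comparison performed by Token.__lt__ below.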


logger = logging.getLogger(__name__)


class Token(str):
    """
    Token class: minimal unit of text parsing.

    This class overrides the `<` operator for strings, so that sorting is
    done in a way that prioritizes a longer string over a shorter one with
    identical root.
    """
    flags = 0

    def __init__(self, content):
        self.content = content

        # Assign special precedence based on token position.
        # Standalone has precedence, then initial, then final, then medial.
        # This is somewhat arbitrary and may change if special cases arise.
        # WB markers are moved to flags to allow default comparison.
        if self.content.endswith(TOKEN_WB_MARKER):
            self.flags |= BOW
            self.content = self.content.rstrip(TOKEN_WB_MARKER)
        if self.content.startswith(TOKEN_WB_MARKER):
            self.flags |= EOW
            self.content = self.content.lstrip(TOKEN_WB_MARKER)
    def __lt__(self, other):
        """
        Operator to sort tokens.

        E.g. the following tokens are listed in their sort order:

        - ABCD
        - AB
        - A
        - BCDE
        - BCD
        - BEFGH
        - B
        """
        # logger.debug(f"a: {self.content}, b: {other.content}")
        self_len = len(self.content)
        other_len = len(other.content)
        min_len = min(self_len, other_len)

        # Check word boundary flags only if tokens are identical.
        # Higher flag value has precedence.
        if (
                (self.flags > 0 or other.flags > 0)
                and self.content == other.content):
            logger.debug(f"{self.content} flags: {self.flags}")
            logger.debug(f"{other.content} flags: {other.flags}")
            logger.debug("Performing flags comparison.")

            return self.flags > other.flags

        # If one of the strings is entirely contained in the other string...
        if self.content[:min_len] == other.content[:min_len]:
            # logger.debug("Roots match.")
            # ...then the longer one takes precedence (is "less").
            return self_len > other_len

        # If the root strings are different, perform a normal comparison.
        return self.content < other.content
    def __hash__(self):
        return hash(self.content)
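

# Illustrative sorting behavior (a sketch, not part of the public API):
# tokens sharing a root sort longest first, so that greedy matching can
# try the longest candidate first:
#
#   >>> sorted([Token("A"), Token("ABCD"), Token("AB")])
#   ['ABCD', 'AB', 'A']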


@cache
def list_tables():
    """
    List all the available tables.
    """
    with open(path.join(TABLE_DIR, "index.yml")) as fh:
        tdata = load(fh, Loader=Loader)

    return tdata
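
# A minimal usage sketch (illustrative; the returned structure mirrors
# whatever data/index.yml contains, so any keys are assumptions):
#
#   for tname in list_tables():
#       print(tname)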


@cache
def load_table(tname):
    """
    Load one transliteration table and possible parents.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")

    with open(fname) as fh:
        tdata = load(fh, Loader=Loader)

    # NOTE Only one level of inheritance. No need for recursion for now.
    parents = tdata.get("general", {}).get("parents", [])

    if "script_to_roman" in tdata:
        if "double_cap" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["double_cap"] = tuple(
                tdata["script_to_roman"]["double_cap"])

        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier. NOTE: a parent's
            # map has already been converted by this function into a tuple
            # of (key, value) pairs, so it unpacks without .items().
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "script_to_roman", {}).get("map", ())
            }
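            # Hypothetical illustration of the precedence rule: if a parent
            # maps "x" to "a" and this table's own map (merged further
            # down) maps "x" to "b", the child's "b" wins because it is
            # merged into `tokens` last.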
            # Merge and/or remove double cap rules.
            tdata["script_to_roman"]["double_cap"] = tuple((
                set(parent_tdata.get(
                    "script_to_roman", {}
                ).get("double_cap", set())) |
                set(tdata["script_to_roman"].get("double_cap", set()))
            ) - set(tdata["script_to_roman"].get("no_double_cap", set())))

        if "no_double_cap" in tdata["script_to_roman"]:
            del tdata["script_to_roman"]["no_double_cap"]

        tokens |= {
            Token(k): v
            for k, v in tdata["script_to_roman"].get("map", {}).items()}
        tdata["script_to_roman"]["map"] = tuple(
            (k, tokens[k]) for k in sorted(tokens))
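
        # Since Token.__lt__ sorts longer tokens before shorter ones with
        # the same root, this tuple is ordered for greedy longest-match
        # scanning: e.g. a hypothetical "sch" entry sorts before "sc",
        # which sorts before "s".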

        # Normalization.
        normalize = {}

        # Inherit normalization rules.
        for parent in parents:
            parent_langsec = load_table(parent)["script_to_roman"]
            normalize |= parent_langsec.get("normalize", {})

        for k, v in tdata["script_to_roman"].get("normalize", {}).items():
            for vv in v:
                normalize[Token(vv)] = k

        tdata["script_to_roman"]["normalize"] = dict(
            sorted(normalize.items()))
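
        # For illustration (hypothetical values): a config entry such as
        #
        #   normalize:
        #     "a": ["á", "à"]
        #
        # is inverted here into {Token("á"): "a", Token("à"): "a"}, i.e. a
        # variant-to-canonical lookup.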

        # Hook functions.
        if "hooks" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["hooks"] = load_hook_fn(
                tname, tdata["script_to_roman"])

    if "roman_to_script" in tdata:
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier. As above, a
            # parent's map is already a tuple of (key, value) pairs.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "roman_to_script", {}).get("map", ())
            }
        tokens |= {
            Token(k): v
            for k, v in tdata["roman_to_script"].get("map", {}).items()
        }
        tdata["roman_to_script"]["map"] = tuple(
            (k, tokens[k]) for k in sorted(tokens))

        # Ignore regular expression patterns.
        # Patterns are evaluated in the order they are listed in the config.
        ignore_ptn = [
            re.compile(ptn)
            for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
        for parent in parents:
            parent_tdata = load_table(parent)
            # NOTE: duplicates are not removed.
            ignore_ptn = [
                re.compile(ptn)
                for ptn in parent_tdata.get(
                        "roman_to_script", {}).get("ignore_ptn", [])
            ] + ignore_ptn
        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
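
        # Since parent patterns are prepended above, inherited patterns are
        # evaluated before the table's own patterns.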

        # Ignore plain strings.
        ignore = {
            Token(t)
            for t in tdata["roman_to_script"].get("ignore", [])
        }
        for parent in parents:
            parent_tdata = load_table(parent)
            # No overriding occurs with the ignore list, only de-duplication.
            ignore |= {
                Token(t) for t in parent_tdata.get(
                        "roman_to_script", {}).get("ignore", [])
            }
        tdata["roman_to_script"]["ignore"] = [
            t.content for t in sorted(ignore)]

        # Hooks.
        if "hooks" in tdata["roman_to_script"]:
            tdata["roman_to_script"]["hooks"] = load_hook_fn(
                tname, tdata["roman_to_script"])

    return tdata
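
# A minimal usage sketch ("example_lang" is a hypothetical table name;
# available names depend on the data directory):
#
#   tdata = load_table("example_lang")
#   for token, repl in tdata.get("roman_to_script", {}).get("map", ()):
#       print(token, "->", repl)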


def load_hook_fn(cname, sec):
    """
    Load hook functions from configuration file.

    Args:
        cname (str): The table (configuration) name, used in error
            messages.
        sec (dict): The `script_to_roman` or `roman_to_script` section
            that may contain the `hooks` key to be parsed.

    Return:
        dict: Dictionary of hook name and list of hook functions pairs.
    """
    hook_fn = {}
    for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
        if cfg_hook not in HOOKS:
            raise ConfigError(f"{cfg_hook} is not a valid hook name!")

        hook_fn[cfg_hook] = []
        # There may be more than one function in each hook. They are
        # executed in the order they are found.
        for cfg_hook_fn in cfg_hook_fns:
            # Each entry is a "module.function" dotted path, optionally
            # followed by a dict of keyword arguments.
            modname, fnname = path.splitext(cfg_hook_fn[0])
            fnname = fnname.lstrip(".")
            fn_kwargs = cfg_hook_fn[1] if len(cfg_hook_fn) > 1 else {}
            try:
                fn = getattr(import_module(
                        "." + modname, HOOK_PKG_PATH), fnname)
            except (ImportError, AttributeError):
                # A missing module raises ImportError; a missing function,
                # AttributeError.
                raise ConfigError(
                    f"Hook function {fnname} defined in {cname} configuration "
                    f"not found in module {HOOK_PKG_PATH}.{modname}!"
                )
            hook_fn[cfg_hook].append((fn, fn_kwargs))

    return hook_fn
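
# Illustrative shape of a `hooks` config section as consumed above (module
# and function names are hypothetical; each entry is a dotted
# "module.function" path, optionally followed by keyword arguments):
#
#   hooks:
#     post_normalize:
#       - ["my_hooks.capitalize_names", {"strict": true}]
#       - ["my_hooks.strip_marks"]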