import logging
import re
import sqlite3

from functools import cache
from importlib import import_module
from json import dumps
from os import R_OK, access, environ, makedirs, path, unlink
from shutil import move

from yaml import load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader

from scriptshifter import DB_PATH
from scriptshifter.exceptions import BREAK, ConfigError


__doc__ = """
Transliteration tables.

These tables contain all transliteration information. The static YML files are
transformed and loaded into a database, which is the effective data source at
runtime.
"""


TMP_DB_PATH = path.join(
        path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))

DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)

# Available hook names.
HOOKS = (
    "post_config",
    "post_normalize",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)

# Package path where hook functions are kept.
HOOK_PKG_PATH = "scriptshifter.hooks"

# Default characters defining a word boundary. This is configurable per-table.
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"

# Token word boundary marker. Used in maps to distinguish special
# transliterations for initial, final, and standalone tokens.
TOKEN_WB_MARKER = "%"

# Word boundary bitwise flags.
BOW = 1 << 1
EOW = 1 << 0

# Feature flags used in database tables.
FEAT_S2R = 1 << 0    # Has S2R.
FEAT_R2S = 1 << 1    # Has R2S.
FEAT_CASEI = 1 << 2  # Case-insensitive script.
FEAT_RE = 1 << 3     # Regular expression.
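# For illustration: a language defined in both directions with a
# case-insensitive script would carry features = FEAT_S2R | FEAT_R2S |
# FEAT_CASEI (i.e. 0b0111 == 7); list_tables() below decodes these
# with bitwise AND checks.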
logger = logging.getLogger(__name__)


class Token(str):
    """
    Token class: minimal unit of text parsing.

    This class overrides the `<` operator for strings, so that sorting is done
    in a way that prioritizes a longer string over a shorter one with an
    identical root.
    """
    flags = 0

    def __init__(self, content):
        self.content = str(content)  # Normalize in case a token is passed.
        # Assign special precedence based on token position.
        # Standalone has precedence, then initial, then final, then medial.
        # This is somewhat arbitrary and may change if special cases arise.
        # WB markers are moved to flags to allow default comparison.
        if self.content.endswith(TOKEN_WB_MARKER):
            self.flags |= BOW
            self.content = self.content.rstrip(TOKEN_WB_MARKER)
        if self.content.startswith(TOKEN_WB_MARKER):
            self.flags |= EOW
            self.content = self.content.lstrip(TOKEN_WB_MARKER)

    def __lt__(self, other):
        """
        Operator to sort tokens.

        E.g.:

        - ABCD
        - AB
        - A
        - BCDE
        - BCD
        - BEFGH
        - B
        """
        # logger.debug(f"a: {self.content}, b: {other.content}")
        self_len = len(self.content)
        other_len = len(other.content)
        min_len = min(self_len, other_len)

        # Check word boundary flags only if tokens are identical.
        # Higher flag value has precedence.
        if (
                (self.flags > 0 or other.flags > 0)
                and self.content == other.content):
            logger.debug(f"{self.content} flags: {self.flags}")
            logger.debug(f"{other.content} flags: {other.flags}")
            logger.debug("Performing flags comparison.")
            return self.flags > other.flags

        # If one of the strings is entirely contained in the other string...
        if self.content[:min_len] == other.content[:min_len]:
            # logger.debug("Roots match.")
            # ...then the longer one takes precedence (is "less").
            return self_len > other_len

        # If the root strings are different, perform a normal comparison.
        return self.content < other.content

    def __hash__(self):
        return hash(self.content)
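# A minimal sketch of the resulting sort order (editor's illustration; the
# token strings are arbitrary). Longer tokens sharing a root sort first, and
# word-boundary markers are folded into flags rather than compared as text:
#
#     >>> [t.content for t in sorted(Token(s) for s in ("A", "ABCD", "AB"))]
#     ['ABCD', 'AB', 'A']
#     >>> t = Token("kh%")
#     >>> t.content, bool(t.flags & BOW)
#     ('kh', True)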
def init_db():
    """
    Populate database with language data.

    This operation removes any preexisting database.

    All tables in the index file (`./data/index.yml`) will be parsed
    (including inheritance rules) and loaded into the designated DB.

    This must be done only once at bootstrap. To update individual tables,
    see populate_table(), which this function calls iteratively.
    """
    # Create parent directories if necessary.
    makedirs(path.dirname(TMP_DB_PATH), exist_ok=True)
    conn = sqlite3.connect(TMP_DB_PATH)

    # Initialize schema.
    with open(path.join(path.dirname(DEFAULT_TABLE_DIR), "init.sql")) as fh:
        with conn:
            conn.executescript(fh.read())

    # Populate tables.
    try:
        with conn:
            for tname, tdata in list_tables().items():
                res = conn.execute(
                        """INSERT INTO tbl_language (
                            name, label, marc_code, description
                        ) VALUES (?, ?, ?, ?)""",
                        (
                            tname, tdata.get("name"), tdata.get("marc_code"),
                            tdata.get("description"),
                        )
                )
                populate_table(conn, res.lastrowid, tname)

        # If the DB already exists, it will be overwritten ONLY on success at
        # this point.
        move(TMP_DB_PATH, DB_PATH)
    finally:
        conn.close()
        if path.isfile(TMP_DB_PATH):
            # Remove leftover temp file from a failed operation.
            unlink(TMP_DB_PATH)
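# Typical bootstrap (editor's sketch): run once before serving requests, e.g.
# from a CLI entry point. Because data is written to TMP_DB_PATH first and
# only moved into place on success, a failed run leaves an existing DB
# untouched.
#
#     >>> init_db()   # rebuilds DB_PATH from the YML sources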
def populate_table(conn, tid, tname):
    """
    Parse one table and insert its data into the database.

    Args:
        conn: Open SQLite connection.
        tid (int): `tbl_language` row ID for this table.
        tname (str): Name of the table to load.
    """
    data = load_table(tname)
    flags = 0
    if "script_to_roman" in data:
        flags |= FEAT_S2R
    if "roman_to_script" in data:
        flags |= FEAT_R2S

    conn.execute(
            "UPDATE tbl_language SET features = ? WHERE id = ?",
            (flags, tid))

    for t_dir in (FEAT_S2R, FEAT_R2S):
        # BEGIN per-section loop.
        sec_name = (
                "script_to_roman" if t_dir == FEAT_S2R else "roman_to_script")
        sec = data.get(sec_name)
        if not sec:
            continue

        # Transliteration map.
        for k, v in sec.get("map", {}):
            conn.execute(
                    """INSERT INTO tbl_trans_map (
                        lang_id, dir, src, dest
                    ) VALUES (?, ?, ?, ?)""",
                    (tid, t_dir, k, v))

        # Hooks.
        for k, v in sec.get("hooks", {}).items():
            for i, hook_data in enumerate(v, start=1):
                conn.execute(
                        """INSERT INTO tbl_hook (
                            lang_id, dir, name, sort, fn, signature
                        ) VALUES (?, ?, ?, ?, ?, ?)""",
                        (
                            tid, t_dir, k, i,
                            hook_data[0].__name__, dumps(hook_data[1:])))

        # Ignore rules (R2S only).
        for row in sec.get("ignore", []):
            if isinstance(row, dict):
                if "re" in row:
                    flags = FEAT_RE
                    rule = row["re"]
            else:
                flags = 0
                rule = row
            conn.execute(
                    """INSERT INTO tbl_ignore (
                        lang_id, rule, features
                    ) VALUES (?, ?, ?)""",
                    (tid, rule, flags))

        # Double caps (S2R only).
        for rule in sec.get("double_cap", []):
            conn.execute(
                    """INSERT INTO tbl_double_cap (
                        lang_id, rule
                    ) VALUES (?, ?)""",
                    (tid, rule))

        # Normalize (S2R only).
        for src, dest in sec.get("normalize", {}).items():
            conn.execute(
                    """INSERT INTO tbl_normalize (lang_id, src, dest)
                    VALUES (?, ?, ?)""",
                    (tid, src, dest))
        # END per-section loop.

    # UI options.
    for opt in data.get("options", []):
        conn.execute(
                """INSERT INTO tbl_option (
                    lang_id, name, label, description, dtype, default_v
                ) VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    tid, opt["id"], opt["label"], opt["description"],
                    opt["type"], opt["default"]))
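# For reference, an ``options`` entry in the YML source is expected to carry
# the keys accessed above (editor's sketch; the id and values are
# illustrative, not taken from an actual table):
#
#     options:
#       - id: capitalize
#         label: Capitalization
#         description: Capitalize the first letter of each word.
#         type: boolean
#         default: false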
@cache
def list_tables():
    """
    List all the indexed tables.

    Note that this may not correspond to all the table files in the data
    folder, but only those exposed in the index.
    """
    conn = sqlite3.connect(DB_PATH)
    with conn:
        data = conn.execute(
                """SELECT name, label, features, marc_code, description
                FROM tbl_language""")
        tdata = {
            row[0]: {
                "label": row[1],
                "has_s2r": bool(row[2] & FEAT_S2R),
                "has_r2s": bool(row[2] & FEAT_R2S),
                "case_sensitive": not (row[2] & FEAT_CASEI),
                "marc_code": row[3],
                "description": row[4],
            } for row in data
        }

    return tdata
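# Shape of the returned mapping (editor's sketch; the table name and values
# are illustrative):
#
#     {
#         "belarusian": {
#             "label": "Belarusian",
#             "has_s2r": True,
#             "has_r2s": False,
#             "case_sensitive": True,
#             "marc_code": "bel",
#             "description": "...",
#         },
#     }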
@cache
def load_table(tname):
    """
    Load one transliteration table and its parents, if any.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")
    with open(fname) as fh:
        tdata = load(fh, Loader=Loader)

    # Pre-config hooks.
    # If any of these hooks returns BREAK, interrupt the configuration
    # parsing and return whatever is obtained so far.
    if "hooks" in tdata:
        tdata["hooks"] = load_hook_fn(tname, tdata)
        pre_cfg_hooks = tdata.get("hooks", {}).get("pre_config", [])
        for hook_def in pre_cfg_hooks:
            kwargs = hook_def[1] if len(hook_def) > 1 else {}
            ret = hook_def[0](tdata, **kwargs)
            if ret == BREAK:
                return tdata

    parents = tdata.get("general", {}).get("parents", [])

    if "script_to_roman" in tdata:
        if "double_cap" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["double_cap"] = tuple(
                    tdata["script_to_roman"]["double_cap"])
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "script_to_roman", {}).get("map", {})
            }
            # Merge and/or remove double cap rules.
            tdata["script_to_roman"]["double_cap"] = tuple((
                set(parent_tdata.get(
                    "script_to_roman", {}
                ).get("double_cap", set())) |
                set(tdata["script_to_roman"].get("double_cap", set()))
            ) - set(tdata["script_to_roman"].get("no_double_cap", set())))
        if "no_double_cap" in tdata["script_to_roman"]:
            del tdata["script_to_roman"]["no_double_cap"]
        tokens |= {
            Token(k): v
            for k, v in tdata["script_to_roman"].get("map", {}).items()}
        tdata["script_to_roman"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Normalization.
        normalize = {}
        # Inherit normalization rules.
        for parent in parents:
            parent_langsec = load_table(parent)["script_to_roman"]
            normalize |= parent_langsec.get("normalize", {})
        for k, v in tdata["script_to_roman"].get("normalize", {}).items():
            for vv in v:
                normalize[Token(vv)] = k
        tdata["script_to_roman"]["normalize"] = dict(
                sorted(normalize.items()))

        # Hook functions.
        if "hooks" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["hooks"] = load_hook_fn(
                    tname, tdata["script_to_roman"])

    if "roman_to_script" in tdata:
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "roman_to_script", {}).get("map", {})
            }
        tokens |= {
            Token(k): v
            for k, v in tdata["roman_to_script"].get("map", {}).items()
        }
        tdata["roman_to_script"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Ignore regular expression patterns.
        # Patterns are evaluated in the order they are listed in the config.
        ignore_ptn = [
            re.compile(ptn)
            for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
        for parent in parents:
            parent_tdata = load_table(parent)
            # NOTE: duplicates are not removed.
            ignore_ptn = [
                re.compile(ptn)
                for ptn in parent_tdata.get(
                        "roman_to_script", {}).get("ignore_ptn", [])
            ] + ignore_ptn
        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn

        # Ignore plain strings.
        ignore = {
            Token(t)
            for t in tdata["roman_to_script"].get("ignore", [])
        }
        for parent in parents:
            parent_tdata = load_table(parent)
            # No overriding occurs with the ignore list, only de-duplication.
            ignore |= {
                Token(t) for t in parent_tdata.get(
                        "roman_to_script", {}).get("ignore", [])
            }
        tdata["roman_to_script"]["ignore"] = [
            t.content for t in sorted(ignore)]

        # Hooks.
        if "hooks" in tdata["roman_to_script"]:
            tdata["roman_to_script"]["hooks"] = load_hook_fn(
                    tname, tdata["roman_to_script"])

    return tdata
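# After loading, each ``map`` is a tuple of (token, replacement) pairs sorted
# by Token order, so longer tokens are matched first. A sketch with made-up
# romanization pairs ("some_lang" is a hypothetical table name):
#
#     >>> load_table("some_lang")["roman_to_script"]["map"]
#     (('shch', 'щ'), ('sh', 'ш'), ('s', 'с'), ...)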
def load_hook_fn(cname, sec):
    """
    Load hook functions from configuration file.

    Args:
        cname (str): The language key for the configuration.
        sec (dict): The `script_to_roman` or `roman_to_script` section
            that may contain the `hooks` key to be parsed.

    Return:
        dict: Dictionary of hook name and list of hook functions pairs.
    """
    hook_fn = {}
    for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
        if cfg_hook not in HOOKS:
            raise ConfigError(f"{cfg_hook} is not a valid hook name!")
        hook_fn[cfg_hook] = []
        # There may be more than one function in each hook. They are
        # executed in the order they are found.
        for cfg_hook_fn in cfg_hook_fns:
            modname, fnname = path.splitext(cfg_hook_fn[0])
            fnname = fnname.lstrip(".")
            fn_kwargs = cfg_hook_fn[1] if len(cfg_hook_fn) > 1 else {}
            try:
                fn = getattr(import_module(
                        "." + modname, HOOK_PKG_PATH), fnname)
            except AttributeError:
                # getattr raises AttributeError (not NameError) when the
                # function is missing from the module.
                raise ConfigError(
                    f"Hook function {fnname} defined in {cname} configuration "
                    f"not found in module {HOOK_PKG_PATH}.{modname}!"
                )
            hook_fn[cfg_hook].append((fn, fn_kwargs))

    return hook_fn
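# A sketch of a ``hooks`` section this function consumes (the module and
# function names below are illustrative, not actual scriptshifter hooks):
#
#     hooks:
#       post_normalize:
#         - ["general.trim_spaces"]
#         - ["general.replace_chars", {"src": "x", "dest": "y"}]
#
# Each entry resolves to a function in scriptshifter.hooks.<module>, yielding:
#
#     {"post_normalize": [
#         (trim_spaces, {}),
#         (replace_chars, {"src": "x", "dest": "y"}),
#     ]}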