__init__.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666
  1. import logging
  2. import sqlite3
  3. from collections import defaultdict
  4. from functools import cache
  5. from importlib import import_module
  6. from json import dumps as jdumps, loads as jloads
  7. from os import R_OK, access, environ, makedirs, path, unlink
  8. from re import compile
  9. from shutil import move
  10. from yaml import load
  11. try:
  12. from yaml import CLoader as Loader
  13. except ImportError:
  14. from yaml import Loader
  15. from scriptshifter import DB_PATH
  16. from scriptshifter.exceptions import BREAK, ApiError, ConfigError
__doc__ = """
Transliteration tables.

These tables contain all transliteration information. The static YML files are
transformed and loaded into a database, which is the effective data source at
runtime.
"""

# Directory holding the static YML table definitions shipped with the package.
DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
# Can be overridden for tests.
TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)

# Available hook names.
# NOTE(review): load_table() also looks up a "pre_config" hook, which is not
# listed here and would therefore be rejected by load_hook_fn() — confirm
# whether it should be added.
HOOKS = (
    "post_config",
    "post_normalize",
    "begin_input_token",
    "pre_ignore_token",
    "on_ignore_match",
    "pre_tx_token",
    "on_tx_token_match",
    "on_no_tx_token_match",
    "pre_assembly",
    "post_assembly",
)

# Package path where hook functions are kept.
HOOK_PKG_PATH = "scriptshifter.hooks"

# Default characters defining a word boundary. This is configurable per-table.
WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"

# Token word boundary marker. Used in maps to distinguish special
# transliterations for initial, final, and standalone tokens.
TOKEN_WB_MARKER = "%"

# Word boundary bitwise flags.
BOW = 1 << 1  # Token is at the beginning of a word.
EOW = 1 << 0  # Token is at the end of a word.

# Feature flags used in database tables.
FEAT_S2R = 1 << 0    # Has S2R.
FEAT_R2S = 1 << 1    # Has R2S.
FEAT_CASEI = 1 << 2  # Case-insensitive script.
FEAT_RE = 1 << 3     # Regular expression.

logger = logging.getLogger(__name__)
  55. class Token(str):
  56. """
  57. Token class: minimal unit of text parsing.
  58. This class overrides the `<` operator for strings, so that sorting is done
  59. in a way that prioritizes a longer string over a shorter one with identical
  60. root.
  61. """
  62. flags = 0
  63. def __init__(self, content):
  64. self.content = str(content) # Normalize in case a token is passed.
  65. # Assign special precedence based on token position.
  66. # Standalone has precedence, then initial, then final, then medial.
  67. # This is somewhat arbitrary and may change if special cases arise.
  68. # WB markers are moved to flags to allow default comparison.
  69. if self.content.startswith(TOKEN_WB_MARKER):
  70. self.flags |= BOW
  71. self.content = self.content.lstrip(TOKEN_WB_MARKER)
  72. if self.content.endswith(TOKEN_WB_MARKER):
  73. self.flags |= EOW
  74. self.content = self.content.rstrip(TOKEN_WB_MARKER)
  75. def __lt__(self, other):
  76. """
  77. Operator to sort tokens.
  78. E.g:
  79. - ABCD
  80. - AB
  81. - A
  82. - BCDE
  83. - BCD
  84. - BEFGH
  85. - B
  86. """
  87. # logger.debug(f"a: {self.content}, b: {other.content}")
  88. self_len = len(self.content)
  89. other_len = len(other.content)
  90. min_len = min(self_len, other_len)
  91. # Check word boundary flags only if tokens are identical.
  92. # Higher flag value has precedence.
  93. if (
  94. (self.flags > 0 or other.flags > 0)
  95. and self.content == other.content):
  96. # logger.debug(f"{self.content} flags: {self.flags}")
  97. # logger.debug(f"{other.content} flags: {other.flags}")
  98. # logger.debug("Performing flags comparison.")
  99. return self.flags > other.flags
  100. # If one of the strings is entirely contained in the other string...
  101. if self.content[:min_len] == other.content[:min_len]:
  102. # logger.debug("Roots match.")
  103. # ...then the longer one takes precedence (is "less")
  104. return self_len > other_len
  105. # If the root strings are different, perform a normal comparison.
  106. return self.content < other.content
  107. def __hash__(self):
  108. return hash(self.content)
def init_db():
    """
    Populate database with language data.

    This operation removes any preexisting database.

    All tables in the index file (`./index.yml`) will be parsed
    (including inheritance rules) and loaded into the designated DB.

    This must be done only once at bootstrap. To update individual tables,
    see populate_table(), which this function calls iteratively.
    """
    # Build into a temporary file next to the destination; the live DB at
    # DB_PATH is only replaced on success.
    TMP_DB_PATH = path.join(
            path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
    if path.isfile(TMP_DB_PATH):
        # Remove previous temp file (possibly from failed attempt).
        unlink(TMP_DB_PATH)
    else:
        # Create parent directories if necessary.
        makedirs(path.dirname(TMP_DB_PATH), exist_ok=True)

    conn = sqlite3.connect(TMP_DB_PATH)

    # Initialize schema.
    with open(path.join(path.dirname(DEFAULT_TABLE_DIR), "init.sql")) as fh:
        with conn:
            conn.executescript(fh.read())

    # Populate tables.
    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
        tlist = load(fh, Loader=Loader)
    try:
        with conn:
            for tname, tdata in tlist.items():
                populate_table(conn, tname, tdata)

        # If the DB already exists, it will be overwritten ONLY on success
        # at this point.
        move(TMP_DB_PATH, DB_PATH)
        logger.info(f"Database initialized at {DB_PATH}.")
    finally:
        conn.close()
        if path.isfile(TMP_DB_PATH):
            # Remove leftover temp file from a failed operation.
            unlink(TMP_DB_PATH)
  149. def get_connection():
  150. """
  151. Get the default DB connection object.
  152. To be closed by the caller or used as a context.
  153. """
  154. return sqlite3.connect(DB_PATH)
def populate_table(conn, tname, tdata):
    """
    Populate an individual table with data from a configuration.

    @param conn: SQLite connection.

    @param tname(str): Table name.

    @param tdata(dict): Table data (index entry: name, marc_code,
        description).
    """
    logger.info(f"Populating table: {tname}")
    res = conn.execute(
            """INSERT INTO tbl_language (
                name, label, marc_code, description
            ) VALUES (?, ?, ?, ?)""",
            (
                tname, tdata.get("name"), tdata.get("marc_code"),
                tdata.get("description"),
            )
    )
    tid = res.lastrowid

    # Full parsed configuration, including inherited rules.
    data = load_table(tname)

    # Feature bitmask stored on the language row.
    flags = 0
    if "script_to_roman" in data:
        flags |= FEAT_S2R
    if "roman_to_script" in data:
        flags |= FEAT_R2S
    if not data.get("general", {}).get("case_sensitive", True):
        flags |= FEAT_CASEI

    conn.execute(
            "UPDATE tbl_language SET features = ? WHERE id = ?",
            (flags, tid))

    for t_dir in (FEAT_S2R, FEAT_R2S):
        # BEGIN per-section loop.
        sec_name = (
                "script_to_roman" if t_dir == FEAT_S2R else "roman_to_script")
        sec = data.get(sec_name)
        if not sec:
            continue

        # Transliteration map.
        # `map` is a tuple of (src, dest) pairs as produced by load_table();
        # insertion order is preserved via the `sort` column.
        sort = 1
        for k, v in sec.get("map", {}):
            conn.execute(
                    """INSERT INTO tbl_trans_map (
                        lang_id, dir, src, dest, sort
                    ) VALUES (?, ?, ?, ?, ?)""",
                    (tid, t_dir, k, v, sort))
            sort += 1

        # Hooks. Each value is a list of (module, fn, kwargs) triples as
        # produced by load_hook_fn(); kwargs are serialized to JSON.
        for k, v in sec.get("hooks", {}).items():
            for i, hook_data in enumerate(v, start=1):
                conn.execute(
                        """INSERT INTO tbl_hook (
                            lang_id, dir, name, sort, module, fn, kwargs
                        ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                        (
                            tid, t_dir, k, i, hook_data[0],
                            hook_data[1].__name__, jdumps(hook_data[2])))

        # Ignore rules (R2S only).
        for rule in sec.get("ignore", []):
            conn.execute(
                    """INSERT INTO tbl_ignore (
                        lang_id, rule, features
                    ) VALUES (?, ?, ?)""",
                    (tid, rule, 0))
        # Regex ignore rules carry the FEAT_RE flag so that readers compile
        # them back into patterns.
        for rule in sec.get("ignore_ptn", []):
            conn.execute(
                    """INSERT INTO tbl_ignore (
                        lang_id, rule, features
                    ) VALUES (?, ?, ?)""",
                    (tid, rule, FEAT_RE))

        # Double caps (S2R only).
        for rule in sec.get("double_cap", []):
            conn.execute(
                    """INSERT INTO tbl_double_cap (
                        lang_id, rule
                    ) VALUES (?, ?)""",
                    (tid, rule))

        # Normalize (S2R only).
        for src, dest in sec.get("normalize", {}).items():
            conn.execute(
                    """INSERT INTO tbl_normalize (lang_id, src, dest)
                    VALUES (?, ?, ?)""",
                    (tid, src, dest))
        # END per-section loop.

    # UI options.
    for opt in data.get("options", []):
        conn.execute(
                """INSERT INTO tbl_option (
                    lang_id, name, label, description, dtype,
                    options, default_v
                ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                (
                    tid, opt["id"], opt["label"], opt["description"],
                    opt["type"], jdumps(opt.get("options")),
                    opt["default"]))
  248. @cache
  249. def list_tables():
  250. """
  251. List all the indexed tables.
  252. Note that this may not correspond to all the table files in the data
  253. folder, but only those exposed in the index.
  254. """
  255. conn = get_connection()
  256. with conn:
  257. data = conn.execute(
  258. """SELECT name, label, features, marc_code, description
  259. FROM tbl_language""")
  260. tdata = {
  261. row[0]: {
  262. "label": row[1],
  263. "has_s2r": bool(row[2] & FEAT_S2R),
  264. "has_r2s": bool(row[2] & FEAT_R2S),
  265. "case_sensitive": not (row[2] & FEAT_CASEI),
  266. "marc_code": row[3],
  267. "description": row[4],
  268. } for row in data
  269. }
  270. return tdata
def load_table(tname):
    """
    Parse one transliteration table and possible parents from YML files.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.

    @param tname(str): Table name (YML file basename without extension).

    @return dict: Parsed configuration; `map` entries are converted to
        sorted tuples of (Token, str) pairs.

    @raise ValueError: If no readable YML file exists for `tname`.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")
    with open(fname) as fh:
        tdata = load(fh, Loader=Loader)

    # Pre-config hooks.
    # If any of these hooks returns BREAK, interrupt the configuration
    # parsing and return whatever is obtained so far.
    # NOTE(review): "pre_config" is not in HOOKS, so load_hook_fn() would
    # raise ConfigError on it; this loop looks unreachable — confirm.
    if "hooks" in tdata:
        tdata["hooks"] = load_hook_fn(tname, tdata)
    pre_cfg_hooks = tdata.get("hooks", {}).get("pre_config", [])
    for hook_def in pre_cfg_hooks:
        # NOTE(review): load_hook_fn() produces (module, fn, kwargs)
        # triples, so hook_def[0] would be the module name here — verify.
        kwargs = hook_def[1] if len(hook_def) > 1 else {}
        ret = hook_def[0](tdata, **kwargs)
        if ret == BREAK:
            return tdata

    # Parent tables to inherit rules from, in listed order.
    parents = tdata.get("general", {}).get("parents", [])

    if "script_to_roman" in tdata:
        if "double_cap" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["double_cap"] = tuple(
                    tdata["script_to_roman"]["double_cap"])
        tokens = {}
        for parent in parents:
            # NOTE(review): assumes each parent table has a
            # "script_to_roman" section further down — confirm.
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            # Parent maps are already tuples of (src, dest) pairs.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "script_to_roman", {}).get("map", {})
            }
            # Merge and/or remove double cap rules.
            tdata["script_to_roman"]["double_cap"] = tuple((
                set(parent_tdata.get(
                    "script_to_roman", {}
                ).get("double_cap", set())) |
                set(tdata["script_to_roman"].get("double_cap", set()))
            ) - set(tdata["script_to_roman"].get("no_double_cap", set())))
        # `no_double_cap` is only a merge directive; drop it from the
        # final configuration.
        if "no_double_cap" in tdata["script_to_roman"]:
            del tdata["script_to_roman"]["no_double_cap"]
        # The child's own map is still a raw YAML dict at this point.
        tokens |= {
            Token(k): v
            for k, v in tdata["script_to_roman"].get("map", {}).items()}
        # Sort by Token precedence (longest root first, WB flags break
        # ties) and freeze into a tuple of pairs.
        tdata["script_to_roman"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Normalization.
        normalize = {}
        # Inherit normalization rules.
        for parent in parents:
            parent_langsec = load_table(parent)["script_to_roman"]
            normalize |= parent_langsec.get("normalize", {})
        # Child rules are inverted: YAML maps a destination to a list of
        # source variants; the DB/runtime form maps each variant to the
        # destination.
        for k, v in tdata["script_to_roman"].get("normalize", {}).items():
            for vv in v:
                normalize[Token(vv)] = k
        tdata["script_to_roman"]["normalize"] = dict(sorted(normalize.items()))

        # Hook functions.
        if "hooks" in tdata["script_to_roman"]:
            tdata["script_to_roman"]["hooks"] = load_hook_fn(
                    tname, tdata["script_to_roman"])

    if "roman_to_script" in tdata:
        tokens = {}
        for parent in parents:
            parent_tdata = load_table(parent)
            # Merge parent tokens. Child overrides parents, and a parent
            # listed later overrides ones listed earlier.
            tokens |= {
                Token(k): v for k, v in parent_tdata.get(
                        "roman_to_script", {}).get("map", {})
            }
        tokens |= {
            Token(k): v
            for k, v in tdata["roman_to_script"].get("map", {}).items()
        }
        tdata["roman_to_script"]["map"] = tuple(
                (k, tokens[k]) for k in sorted(tokens))

        # Ignore regular expression patterns.
        # Patterns are evaluated in the order they are listed in the config,
        # with parents' patterns prepended.
        ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", [])
        for parent in parents:
            parent_tdata = load_table(parent)
            # NOTE: duplicates are not removed.
            ignore_ptn = parent_tdata.get(
                    "roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn
        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn

        # Ignore plain strings.
        ignore = set(tdata["roman_to_script"].get("ignore", []))
        for parent in parents:
            parent_tdata = load_table(parent)
            # No overriding occurs with the ignore list, only de-duplication.
            ignore |= set(parent_tdata.get(
                    "roman_to_script", {}).get("ignore", []))
        tdata["roman_to_script"]["ignore"] = sorted(ignore)

        # Hooks.
        if "hooks" in tdata["roman_to_script"]:
            tdata["roman_to_script"]["hooks"] = load_hook_fn(
                    tname, tdata["roman_to_script"])

    return tdata
  373. def load_hook_fn(cname, sec):
  374. """
  375. Load hook functions from configuration file.
  376. Args:
  377. lang (str): The language key for the configuration.
  378. sec (dict): The `script_to_roman` or `roman_to_script` section
  379. that may contain the `hooks` key to be parsed.
  380. Return:
  381. dict: Dictionary of hook name and list of hook functions pairs.
  382. """
  383. hook_fn = {}
  384. for cfg_hook, cfg_hook_fns in sec.get("hooks", {}).items():
  385. if cfg_hook not in HOOKS:
  386. raise ConfigError(f"{cfg_hook} is not a valid hook name!")
  387. hook_fn[cfg_hook] = []
  388. # There may be more than one function in each hook. They are
  389. # executed in the order they are found.
  390. for cfg_hook_fn in cfg_hook_fns:
  391. modname, fnname = path.splitext(cfg_hook_fn[0])
  392. fnname = fnname.lstrip(".")
  393. fn_kwargs = cfg_hook_fn[1] if len(cfg_hook_fn) > 1 else {}
  394. try:
  395. fn = getattr(import_module(
  396. "." + modname, HOOK_PKG_PATH), fnname)
  397. except NameError:
  398. raise ConfigError(
  399. f"Hook function {fnname} defined in {cname} configuration "
  400. f"not found in module {HOOK_PKG_PATH}.{modname}!"
  401. )
  402. hook_fn[cfg_hook].append((modname, fn, fn_kwargs))
  403. return hook_fn
@cache
def get_language(lang):
    """
    Get all language options from the DB.

    @param lang(str): Language (table) name.

    @return dict: Language metadata, maps, hooks, ignore lists, and UI
        options; empty sub-structures are omitted.
    """
    conn = get_connection()
    with conn:
        general = get_lang_general(conn, lang)
        lang_id = general["id"]
        data = general["data"]

        # Normalization.
        norm_data = get_lang_normalize(conn, lang_id)
        if len(norm_data):
            data["normalize"] = norm_data

        # Script to Roman map and hooks.
        if data["has_s2r"]:
            data["script_to_roman"] = {}
            s2r_map = tuple(
                    row for row in get_lang_map(conn, lang_id, FEAT_S2R))
            if len(s2r_map):
                data["script_to_roman"]["map"] = s2r_map
            s2r_hooks = get_lang_hooks(conn, lang_id, FEAT_S2R)
            if len(s2r_hooks):
                data["script_to_roman"]["hooks"] = s2r_hooks
            double_cap = get_lang_dcap(conn, lang_id)
            if len(double_cap):
                data["script_to_roman"]["double_cap"] = double_cap

        # Roman to script map, ignore list, and hooks.
        if data["has_r2s"]:
            data["roman_to_script"] = {}
            r2s_map = tuple(
                    row for row in get_lang_map(conn, lang_id, FEAT_R2S))
            if len(r2s_map):
                data["roman_to_script"]["map"] = r2s_map
            r2s_ignore = get_lang_ignore(conn, lang_id)
            if len(r2s_ignore):
                data["roman_to_script"]["ignore"] = r2s_ignore
            r2s_hooks = get_lang_hooks(conn, lang_id, FEAT_R2S)
            if len(r2s_hooks):
                data["roman_to_script"]["hooks"] = r2s_hooks

        # UI options.
        opt_data = get_lang_options(conn, lang_id)
        if len(opt_data):
            data["options"] = opt_data
    conn.close()
    return data
  447. def get_lang_general(conn, lang):
  448. """ Language general attributes. """
  449. lang_q = conn.execute(
  450. """SELECT id, name, label, features, marc_code, description
  451. FROM tbl_language WHERE name = ?""", (lang,))
  452. lang_data = lang_q.fetchone()
  453. if not lang_data:
  454. raise ApiError(f"No language data found for {lang}", 404)
  455. return {
  456. "id": lang_data[0],
  457. "data": {
  458. "name": lang_data[1],
  459. "label": lang_data[2],
  460. "has_s2r": bool(lang_data[3] & FEAT_S2R),
  461. "has_r2s": bool(lang_data[3] & FEAT_R2S),
  462. "case_sensitive": not (lang_data[3] & FEAT_CASEI),
  463. "marc_code": lang_data[4],
  464. "description": lang_data[5],
  465. },
  466. }
  467. def get_lang_normalize(conn, lang_id):
  468. qry = conn.execute(
  469. """SELECT src, dest FROM tbl_normalize
  470. WHERE lang_id = ?""",
  471. (lang_id,))
  472. return {row[0]: row[1] for row in qry}
  473. def get_lang_ignore(conn, lang_id):
  474. """
  475. Ignore list as a tuple.
  476. """
  477. qry = conn.execute(
  478. """SELECT rule, features FROM tbl_ignore
  479. WHERE lang_id = ?""",
  480. (lang_id,))
  481. return tuple(
  482. compile(row[0]) if row[1] & FEAT_RE else row[0]
  483. for row in qry)
  484. @cache
  485. def get_lang_map(conn, lang_id, t_dir):
  486. """
  487. S2R or R2S map.
  488. Generator of tuples (source, destination).
  489. """
  490. qry = conn.execute(
  491. """SELECT src, dest FROM tbl_trans_map
  492. WHERE lang_id = ? AND dir = ?
  493. ORDER BY sort ASC""",
  494. (lang_id, t_dir))
  495. for row in qry:
  496. yield (Token(row[0]), row[1])
  497. def get_lang_options(conn, lang_id):
  498. """ Language options as a tuple of dictionaries. """
  499. qry = conn.execute(
  500. """SELECT name, label, description, dtype, options, default_v
  501. FROM tbl_option
  502. WHERE lang_id = ?""",
  503. (lang_id,))
  504. return tuple(
  505. {
  506. "id": row[0],
  507. "label": row[1],
  508. "description": row[2],
  509. "type": row[3],
  510. "options": jloads(row[4]) if row[4] else None,
  511. "default": row[5],
  512. }
  513. for row in qry
  514. )
  515. def get_lang_hooks(conn, lang_id, t_dir):
  516. """ Language hooks in sorting order. """
  517. hooks = defaultdict(list)
  518. qry = conn.execute(
  519. """SELECT name, module, fn, kwargs
  520. FROM tbl_hook WHERE lang_id = ? AND dir = ?
  521. ORDER BY name, sort""",
  522. (lang_id, t_dir))
  523. for row in qry:
  524. hooks[row[0]].append(
  525. {
  526. "module_name": row[1],
  527. "fn_name": row[2],
  528. "kwargs": jloads(row[3]),
  529. }
  530. )
  531. return dict(hooks)
  532. def get_lang_dcap(conn, lang_id):
  533. qry = conn.execute(
  534. """SELECT rule
  535. FROM tbl_double_cap WHERE lang_id = ?""",
  536. (lang_id,))
  537. return tuple(row[0] for row in qry)