
Use DB for transliterate function [untested].

scossu committed 9 months ago · commit 6cac2dcbd1
5 changed files with 371 additions and 292 deletions
  1. .gitignore (+1 -0)
  2. scriptshifter/data/.keep (+0 -0)
  3. scriptshifter/tables/__init__.py (+148 -86)
  4. scriptshifter/tables/init.sql (+8 -2)
  5. scriptshifter/trans.py (+214 -204)

+ 1 - 0
.gitignore

@@ -137,5 +137,6 @@ tags.temp
 
 # Local
 ext/arabic_rom/data
+scriptshifter/data/*.db
 !.keep
 VERSION

+ 0 - 0
scriptshifter/data/.keep


+ 148 - 86
scriptshifter/tables/__init__.py

@@ -2,6 +2,7 @@ import logging
 import re
 import sqlite3
 
+from collections import defaultdict
 from importlib import import_module
 from json import dumps as jdumps, loads as jloads
 from os import R_OK, access, environ, makedirs, path, unlink
@@ -184,6 +185,15 @@ def init_db():
             unlink(TMP_DB_PATH)
 
 
+def get_connection():
+    """
+    Get the default DB connection object.
+
+    To be closed by the caller or used as a context.
+    """
+    return sqlite3.connect(DB_PATH)
+
+
 def populate_table(conn, tid, tname):
     data = load_table(tname)
     flags = 0
@@ -206,23 +216,25 @@ def populate_table(conn, tid, tname):
             continue
 
         # Transliteration map.
+        sort = 1
         for k, v in sec.get("map", {}):
             conn.execute(
                     """INSERT INTO tbl_trans_map (
-                        lang_id, dir, src, dest
-                    ) VALUES (?, ?, ?, ?)""",
-                    (tid, t_dir, k, v))
+                        lang_id, dir, src, dest, sort
+                    ) VALUES (?, ?, ?, ?, ?)""",
+                    (tid, t_dir, k, v, sort))
+            sort += 1
 
         # hooks.
         for k, v in sec.get("hooks", {}).items():
             for i, hook_data in enumerate(v, start=1):
                 conn.execute(
                         """INSERT INTO tbl_hook (
-                            lang_id, dir, name, sort, fn, signature
-                        ) VALUES (?, ?, ?, ?, ?, ?)""",
+                            lang_id, dir, name, sort, module, fn, kwargs
+                        ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                         (
                         (
-                            tid, t_dir, k, i,
-                            hook_data[0].__name__, jdumps(hook_data[1:])))
+                            tid, t_dir, k, i, hook_data[0],
+                            hook_data[1].__name__, jdumps(hook_data[2])))
 
         # Ignore rules (R2S only).
         for row in sec.get("ignore", []):
@@ -277,7 +289,7 @@ def list_tables():
     Note that this may not correspond to all the table files in the data
     folder, but only those exposed in the index.
     """
-    conn = sqlite3.connect(DB_PATH)
+    conn = get_connection()
 
     with conn:
         data = conn.execute(
@@ -463,7 +475,7 @@ def load_hook_fn(cname, sec):
                     f"Hook function {fnname} defined in {cname} configuration "
                     f"Hook function {fnname} defined in {cname} configuration "
                     f"not found in module {HOOK_PKG_PATH}.{modname}!"
                     f"not found in module {HOOK_PKG_PATH}.{modname}!"
                 )
                 )
-            hook_fn[cfg_hook].append((fn, fn_kwargs))
+            hook_fn[cfg_hook].append((modname, fn, fn_kwargs))
 
     return hook_fn
 
@@ -471,32 +483,16 @@ def load_hook_fn(cname, sec):
 def get_language(lang):
     """ Get all language options from the DB. """
 
-    conn = sqlite3.connect(DB_PATH)
+    conn = get_connection()
 
     with conn:
-        lang_q = conn.execute(
-                """SELECT id, name, label, features, marc_code, description
-                FROM tbl_language WHERE name = ?""", (lang,))
-        lang_data = lang_q.fetchone()
-        lang_id = lang_data[0]
-
-        data = {
-            "name": lang_data[1],
-            "label": lang_data[2],
-            "has_s2r": bool(lang_data[3] & FEAT_S2R),
-            "has_r2s": bool(lang_data[3] & FEAT_R2S),
-            "case_sensitive": not (lang_data[3] & FEAT_CASEI),
-            "marc_code": lang_data[4],
-            "description": lang_data[5],
-        }
+        general = get_lang_general(conn, lang)
+        lang_id = general["id"]
+        data = general["data"]
 
         # Normalization.
 
-        norm_q = conn.execute(
-                """SELECT src, dest FROM tbl_normalize
-                WHERE lang_id = ?""",
-                (lang_id,))
-        norm_data = {row[0]: row[1] for row in norm_q}
+        norm_data = get_lang_normalize(conn, lang_id)
         if len(norm_data):
             data["normalize"] = norm_data
 
@@ -504,26 +500,12 @@ def get_language(lang):
 
         if data["has_s2r"]:
             data["script_to_roman"] = {}
-            s2r_q = conn.execute(
-                    """SELECT src, dest FROM tbl_trans_map
-                    WHERE lang_id = ? AND dir = ?""",
-                    (lang_id, FEAT_S2R))
-            s2r_map = tuple((row[0], row[1]) for row in s2r_q)
+            s2r_map = tuple(
+                    row for row in get_lang_map(conn, lang_id, FEAT_S2R))
             if len(s2r_map):
                 data["script_to_roman"]["map"] = s2r_map
 
-            hooks_q = conn.execute(
-                    """SELECT name, fn, signature
-                    FROM tbl_hook WHERE lang_id = ? AND dir = ?
-                    ORDER BY sort""",
-                    (lang_id, FEAT_S2R))
-            s2r_hooks = [
-                {
-                    "name": row[0],
-                    "fn": row[1],
-                    "signature": jloads(row[2]),
-                } for row in hooks_q
-            ]
+            s2r_hooks = get_lang_hooks(conn, lang_id, FEAT_S2R)
             if len(s2r_hooks):
                 data["script_to_roman"]["hooks"] = s2r_hooks
 
@@ -531,56 +513,136 @@ def get_language(lang):
 
         if data["has_r2s"]:
             data["roman_to_script"] = {}
-            r2s_q = conn.execute(
-                    """SELECT src, dest FROM tbl_trans_map
-                    WHERE lang_id = ? AND dir = ?""",
-                    (lang_id, FEAT_R2S))
-            r2s_map = tuple((row[0], row[1]) for row in r2s_q)
+            r2s_map = tuple(
+                    row for row in get_lang_map(conn, lang_id, FEAT_R2S))
             if len(r2s_map):
                 data["roman_to_script"]["map"] = r2s_map
 
-            ignore_q = conn.execute(
-                    """SELECT rule, features FROM tbl_ignore
-                    WHERE lang_id = ?""",
-                    (lang_id,))
-            # Features (regular expressions) not implemented yet.
-            r2s_ignore = tuple(row[0] for row in ignore_q)
+            r2s_ignore = get_lang_ignore(conn, lang_id)
             if len(r2s_ignore):
                 data["roman_to_script"]["ignore"] = r2s_ignore
 
-            hooks_q = conn.execute(
-                    """SELECT name, fn, signature
-                    FROM tbl_hook WHERE lang_id = ? AND dir = ?
-                    ORDER BY sort""",
-                    (lang_id, FEAT_R2S))
-            r2s_hooks = [
-                {
-                    "name": row[0],
-                    "fn": row[1],
-                    "signature": jloads(row[2]),
-                } for row in hooks_q
-            ]
+            r2s_hooks = get_lang_hooks(conn, lang_id, FEAT_R2S)
             if len(r2s_hooks):
                 data["roman_to_script"]["hooks"] = r2s_hooks
 
-        options_q = conn.execute(
-                """SELECT name, label, description, dtype, options, default_v
-                FROM tbl_option
-                WHERE lang_id = ?""",
-                (lang_id,))
+        opt_data = get_lang_options(conn, lang_id)
+        if len(opt_data):
+            data["options"] = opt_data
+
+        double_cap = get_lang_dcap(conn, lang_id)
+        if len(double_cap):
+            data["double_cap"] = double_cap
+
+    conn.close()
+
+    return data
 
-        opt_data = tuple(
+
+def get_lang_general(conn, lang):
+    """ Language general attributes. """
+    lang_q = conn.execute(
+            """SELECT id, name, label, features, marc_code, description
+            FROM tbl_language WHERE name = ?""", (lang,))
+    lang_data = lang_q.fetchone()
+
+    return {
+        "id": lang_data[0],
+        "data": {
+            "name": lang_data[1],
+            "label": lang_data[2],
+            "has_s2r": bool(lang_data[3] & FEAT_S2R),
+            "has_r2s": bool(lang_data[3] & FEAT_R2S),
+            "case_sensitive": not (lang_data[3] & FEAT_CASEI),
+            "marc_code": lang_data[4],
+            "description": lang_data[5],
+        },
+    }
+
+
+def get_lang_normalize(conn, lang_id):
+    qry = conn.execute(
+            """SELECT src, dest FROM tbl_normalize
+            WHERE lang_id = ?""",
+            (lang_id,))
+    return {row[0]: row[1] for row in qry}
+
+
+def get_lang_ignore(conn, lang_id):
+    """
+    Ignore list as a tuple.
+    """
+    qry = conn.execute(
+            """SELECT rule, features FROM tbl_ignore
+            WHERE lang_id = ?""",
+            (lang_id,))
+    # Features (regular expressions) not implemented yet.
+    return tuple(row[0] for row in qry)
+
+
+def get_lang_map(conn, lang_id, t_dir):
+    """
+    S2R or R2S map.
+
+    Generator of tuples (source, destination).
+    """
+    qry = conn.execute(
+            """SELECT src, dest FROM tbl_trans_map
+            WHERE lang_id = ? AND dir = ?
+            ORDER BY sort ASC""",
+            (lang_id, t_dir))
+
+    for row in qry:
+        yield (Token(row[0]), row[1])
+
+
+def get_lang_options(conn, lang_id):
+    """ Language options as a tuple of dictionaries. """
+    qry = conn.execute(
+            """SELECT name, label, description, dtype, options, default_v
+            FROM tbl_option
+            WHERE lang_id = ?""",
+            (lang_id,))
+
+    return tuple(
+        {
+            "id": row[0],
+            "label": row[1],
+            "description": row[2],
+            "type": row[3],
+            "options": jloads(row[4]) if row[4] else None,
+            "default": row[5],
+        }
+        for row in qry
+    )
+
+
+def get_lang_hooks(conn, lang_id, t_dir):
+    """ Language hooks in sorting order. """
+    hooks = defaultdict(list)
+
+    qry = conn.execute(
+            """SELECT name, module, fn, kwargs
+            FROM tbl_hook WHERE lang_id = ? AND dir = ?
+            ORDER BY name, sort""",
+            (lang_id, t_dir))
+
+    for row in qry:
+        hooks[row[0]].append(
             {
-                "id": row[0],
-                "label": row[1],
-                "description": row[2],
-                "type": row[3],
-                "options": jloads(row[4]) if row[4] else None,
-                "default": row[5],
+                "module_name": row[1],
+                "fn_name": row[2],
+                "kwargs": jloads(row[3]),
             }
-            for row in options_q
         )
-        if len(opt_data):
-            data["options"] = opt_data
 
-        return data
+    return hooks
+
+
+def get_lang_dcap(conn, lang_id):
+    qry = conn.execute(
+            """SELECT rule
+            FROM tbl_double_cap WHERE lang_id = ?""",
+            (lang_id,))
+
+    return tuple(row[0] for row in qry)

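The helper functions added above split get_language()'s monolithic query logic into per-concern accessors. A minimal usage sketch follows (hedged: untested, like the commit itself, and the language name "hebrew" is a placeholder), showing how a caller might drive the new API directly:

    from scriptshifter.tables import (
            FEAT_S2R, get_connection, get_lang_general, get_lang_hooks,
            get_lang_map)

    # Open the default DB connection; per get_connection()'s docstring,
    # closing it is the caller's responsibility.
    conn = get_connection()
    try:
        general = get_lang_general(conn, "hebrew")  # placeholder language
        lang_id = general["id"]
        # get_lang_map() yields (Token, str) pairs ordered by the new `sort`
        # column, so token priority from the source table is preserved.
        for src_tk, dest in get_lang_map(conn, lang_id, FEAT_S2R):
            print(src_tk.content, "->", dest)
        # Hooks come back grouped by hook name; each entry carries the module
        # name, function name, and JSON-decoded kwargs.
        hooks = get_lang_hooks(conn, lang_id, FEAT_S2R)
    finally:
        conn.close()
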
+ 8 - 2
scriptshifter/tables/init.sql

@@ -23,22 +23,28 @@ CREATE TABLE tbl_trans_map (
     dir TINYINT NOT NULL DEFAULT 0,  /* 1 = S2R; 2 = R2S */
     src TEXT NOT NULL,
     dest TEXT,
+    sort INT NOT NULL,  /* Smaller values have higher priority. */
 
     FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
 );
 CREATE UNIQUE INDEX idx_trans_lookup ON tbl_trans_map (lang_id, dir, src);
+CREATE INDEX idx_trans_map_sort ON tbl_trans_map (sort ASC);
 
 /*
  * Processing hooks.
+ *
+ * Note that multiple functions may be grouped under the same hook, lang, and
+ * direction. These are ordered by `sort`.
  */
 CREATE TABLE tbl_hook (
     id INTEGER PRIMARY KEY,
     lang_id INTEGER NOT NULL,
     dir TINYINT NOT NULL DEFAULT 0,  /* 1 = S2R; 2 = R2S */
-    name TEXT NOT NULL,  /* Hook name. */
+    name TEXT NOT NULL, /* Hook name. */
     sort INT NOT NULL,  /* Function sorting order within the hook. */
+    module TEXT NOT NULL, /* Module name. */
     fn TEXT NOT NULL,   /* Function name. */
-    signature TEXT,     /* Arguments as JSON blob. */
+    kwargs TEXT,        /* KW arguments as JSON blob. */
 
     FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
 );

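To illustrate the revised hook storage, here is a self-contained sketch (the tbl_hook DDL is copied from the diff above, minus the foreign key for brevity; the module and function names are made up). Multiple functions registered under one hook name round-trip with their kwargs as a JSON blob, ordered by `sort`:

    import json
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("""CREATE TABLE tbl_hook (
        id INTEGER PRIMARY KEY,
        lang_id INTEGER NOT NULL,
        dir TINYINT NOT NULL DEFAULT 0,  /* 1 = S2R; 2 = R2S */
        name TEXT NOT NULL, /* Hook name. */
        sort INT NOT NULL,  /* Function sorting order within the hook. */
        module TEXT NOT NULL, /* Module name. */
        fn TEXT NOT NULL,   /* Function name. */
        kwargs TEXT         /* KW arguments as JSON blob. */
    )""")
    # Two functions under the same (lang, dir, hook name); `sort` decides
    # their execution order.
    for i, (fn, kw) in enumerate([("fn_a", {"x": 1}), ("fn_b", {})], start=1):
        conn.execute(
                "INSERT INTO tbl_hook (lang_id, dir, name, sort, module, fn, kwargs)"
                " VALUES (?, ?, ?, ?, ?, ?, ?)",
                (1, 2, "post_config", i, "some_module", fn, json.dumps(kw)))
    for module, fn, kwargs in conn.execute(
            "SELECT module, fn, kwargs FROM tbl_hook"
            " WHERE lang_id = 1 AND dir = 2 ORDER BY name, sort"):
        print(module, fn, json.loads(kwargs))  # kwargs round-trip via JSON
    conn.close()
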
+ 214 - 204
scriptshifter/trans.py

@@ -1,9 +1,13 @@
 import logging
 
+from importlib import import_module
 from re import compile
 
 from scriptshifter.exceptions import BREAK, CONT
-from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table
+from scriptshifter.tables import (
+        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
+        get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
+        get_lang_ignore, get_lang_map, get_lang_normalize)
 
 
 # Match multiple spaces.
@@ -15,6 +19,8 @@ logger = logging.getLogger(__name__)
 class Context:
     """
     Context used within the transliteration and passed to hook functions.
+
+    Use within a `with` block for proper cleanup.
     """
     """
     @property
     @property
     def src(self):
     def src(self):
@@ -28,23 +34,35 @@ class Context:
     def src(self):
         raise NotImplementedError("Attribute is read-only.")
 
-    def __init__(self, src, general, langsec, options={}):
+    def __init__(self, lang, src, t_dir, options={}):
         """
         """
         Initialize a context.
         Initialize a context.
 
 
         Args:
         Args:
             src (str): The original text. Read-only.
             src (str): The original text. Read-only.
-            general (dict): general section of the current config.
-            langsec (dict): Language configuration section being used.
+            t_dir (int): the direction of transliteration.
+                    Either FEAT_R2S or FEAT_S2R.
             options (dict): extra options as a dict.
         """
+        self.lang = lang
         self._src = src
-        self.general = general
+        self.t_dir = t_dir
+        self.conn = get_connection()
+        with self.conn as conn:
+            general = get_lang_general(conn, self.lang)
+        self.general = general["data"]
+        self.lang_id = general["id"]
         self.options = options
-        self.langsec = langsec
+        self.hooks = get_lang_hooks(self.conn, self.lang_id, self.t_dir)
         self.dest_ls = []
         self.warnings = []
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
 
 def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     """
@@ -73,234 +91,224 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     Return:
         str: The transliterated string.
     """
-    source_str = "Latin" if t_dir == "r2s" else lang
-    target_str = lang if t_dir == "r2s" else "Latin"
-    logger.info(f"Transliteration is from {source_str} to {target_str}.")
-
-    cfg = load_table(lang)
-    logger.info(f"Loaded table for {lang}.")
-
-    # General directives.
-    general = cfg.get("general", {})
-
-    if t_dir == "s2r" and "script_to_roman" not in cfg:
-        raise NotImplementedError(
-            f"Script-to-Roman transliteration not yet supported for {lang}."
-        )
-    elif t_dir == "r2s" and "roman_to_script" not in cfg:
-        raise NotImplementedError(
-            f"Roman-to-script transliteration not yet supported for {lang}."
-        )
+    # Map t_dir to constant.
+    t_dir = FEAT_S2R if t_dir == "s2r" else FEAT_R2S
 
-    langsec = (
-            cfg["script_to_roman"] if t_dir == "s2r"
-            else cfg["roman_to_script"])
-    # langsec_dir = langsec.get("directives", {})
-    langsec_hooks = langsec.get("hooks", {})
+    source_str = "Roman" if t_dir == FEAT_S2R else lang
+    target_str = lang if t_dir == FEAT_R2S else "Roman"
+    logger.info(f"Transliteration is from {source_str} to {target_str}.")
 
     src = src.strip()
     options["capitalize"] = capitalize
-    ctx = Context(src, general, langsec, options)
-
-    # This hook may take over the whole transliteration process or delegate it
-    # to some external process, and return the output string directly.
-    if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
-        return getattr(ctx, "dest", ""), ctx.warnings
-
-    if "normalize" in ctx.langsec:
-        _normalize_src(ctx)
-
-    if _run_hook("post_normalize", ctx, langsec_hooks) == BREAK:
-        return getattr(ctx, "dest", ""), ctx.warnings
-
-    # Loop through source characters. The increment of each loop depends on
-    # the length of the token that eventually matches.
-    ignore_list = langsec.get("ignore", [])  # Only present in R2S
-    ctx.cur = 0
-    word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
-
-    while ctx.cur < len(ctx.src):
-        # Reset cursor position flags.
-        # Carry over extended "beginning of word" flag.
-        ctx.cur_flags = 0
-        cur_char = ctx.src[ctx.cur]
-
-        # Look for a word boundary and flag word beginning/end it if found.
-        if _is_bow(ctx.cur, ctx, word_boundary):
-            # Beginning of word.
-            logger.debug(f"Beginning of word at position {ctx.cur}.")
-            ctx.cur_flags |= BOW
-        if _is_eow(ctx.cur, ctx, word_boundary):
-            # End of word.
-            logger.debug(f"End of word at position {ctx.cur}.")
-            ctx.cur_flags |= EOW
-
-        # This hook may skip the parsing of the current
-        # token or exit the scanning loop altogether.
-        hret = _run_hook("begin_input_token", ctx, langsec_hooks)
-        if hret == BREAK:
-            logger.debug("Breaking text scanning from hook signal.")
-            break
-        if hret == CONT:
-            logger.debug("Skipping scanning iteration from hook signal.")
-            continue
-
-        # Check ignore list. Find as many subsequent ignore tokens
-        # as possible before moving on to looking for match tokens.
-        ctx.tk = None
-        while True:
-            ctx.ignoring = False
-            for ctx.tk in ignore_list:
-                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
-                if hret == BREAK:
-                    break
-                if hret == CONT:
-                    continue
+    with Context(lang, src, t_dir, options) as ctx:
+
+        if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
+            raise NotImplementedError(
+                f"Script-to-Roman not yet supported for {lang}."
+            )
+        if t_dir == FEAT_R2S and not ctx.general["has_r2s"]:
+            raise NotImplementedError(
+                f"Roman-to-script not yet supported for {lang}."
+            )
+
+        # This hook may take over the whole transliteration process or delegate
+        # it to some external process, and return the output string directly.
+        if _run_hook("post_config", ctx) == BREAK:
+            return getattr(ctx, "dest", ""), ctx.warnings
+
+        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
+
+        if _run_hook("post_normalize", ctx) == BREAK:
+            return getattr(ctx, "dest", ""), ctx.warnings
+
+        # Loop through source characters. The increment of each loop depends on
+        # the length of the token that eventually matches.
+        ctx.cur = 0
+
+        while ctx.cur < len(ctx.src):
+            # Reset cursor position flags.
+            # Carry over extended "beginning of word" flag.
+            ctx.cur_flags = 0
+            cur_char = ctx.src[ctx.cur]
+
+            # Look for a word boundary and flag word beginning/end it if found.
+            if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
+                # Beginning of word.
+                logger.debug(f"Beginning of word at position {ctx.cur}.")
+                ctx.cur_flags |= BOW
+            if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
+                # End of word.
+                logger.debug(f"End of word at position {ctx.cur}.")
+                ctx.cur_flags |= EOW
+
+            # This hook may skip the parsing of the current
+            # token or exit the scanning loop altogether.
+            hret = _run_hook("begin_input_token", ctx)
+            if hret == BREAK:
+                logger.debug("Breaking text scanning from hook signal.")
+                break
+            if hret == CONT:
+                logger.debug("Skipping scanning iteration from hook signal.")
+                continue
 
 
-                step = len(ctx.tk)
-                if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
-                    # The position matches an ignore token.
-                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
+            # Check ignore list. Find as many subsequent ignore tokens
+            # as possible before moving on to looking for match tokens.
+            ctx.tk = None
+            while True:
+                ctx.ignoring = False
+                for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
+                    hret = _run_hook("pre_ignore_token", ctx)
                     if hret == BREAK:
                     if hret == BREAK:
                         break
                     if hret == CONT:
                         continue
 
-                    ctx.dest_ls.append(ctx.tk)
-                    ctx.cur += step
-                    cur_char = ctx.src[ctx.cur]
-                    ctx.ignoring = True
+                    step = len(ctx.tk)
+                    if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                        # The position matches an ignore token.
+                        hret = _run_hook("on_ignore_match", ctx)
+                        if hret == BREAK:
+                            break
+                        if hret == CONT:
+                            continue
+
+                        logger.info(f"Ignored token: {ctx.tk}")
+                        ctx.dest_ls.append(ctx.tk)
+                        ctx.cur += step
+                        cur_char = ctx.src[ctx.cur]
+                        ctx.ignoring = True
+                        break
+                # We looked through all ignore tokens, not found any. Move on.
+                if not ctx.ignoring:
                     break
                     break
-            if not ctx.ignoring:
-                break
-            # Otherwise, if we found a match, check if the next position may be
-            # ignored as well.
-
-        delattr(ctx, "tk")
-        delattr(ctx, "ignoring")
-
-        # Begin transliteration token lookup.
-        ctx.match = False
-
-        for ctx.src_tk, ctx.dest_str in langsec["map"]:
-            hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
-            if hret == BREAK:
-                break
-            if hret == CONT:
-                continue
+                # Otherwise, if we found a match, check if the next position
+                # may be ignored as well.
 
-            step = len(ctx.src_tk.content)
-            # If the token is longer than the remaining of the string,
-            # it surely won't match.
-            if ctx.cur + step > len(ctx.src):
-                continue
+            delattr(ctx, "tk")
+            delattr(ctx, "ignoring")
 
 
-            # If the first character of the token is greater (= higher code
-            # point value) than the current character, then break the loop
-            # without a match, because we know there won't be any more match
-            # due to the alphabetical ordering.
-            if ctx.src_tk.content[0] > cur_char:
-                logger.debug(
-                        f"{ctx.src_tk.content} is after "
-                        f"{ctx.src[ctx.cur:ctx.cur + step]}. Breaking loop.")
-                break
+            # Begin transliteration token lookup.
+            ctx.match = False
 
 
-            # If src_tk has a WB flag but the token is not at WB, skip.
-            if (
-                (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
-                or
-                # Can't rely on EOW flag, we must check on the last character
-                # of the potential match.
-                (ctx.src_tk.flags & EOW and not _is_eow(
-                        ctx.cur + step - 1, ctx, word_boundary))
-            ):
-                continue
-
-            # Longer tokens should be guaranteed to be scanned before their
-            # substrings at this point.
-            # Similarly, flagged tokens are evaluated first.
-            if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
-                ctx.match = True
-                # This hook may skip this token or break out of the token
-                # lookup for the current position.
-                hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
+            for ctx.src_tk, ctx.dest_str in get_lang_map(
+                    ctx.conn, ctx.lang_id, ctx.t_dir):
+                hret = _run_hook("pre_tx_token", ctx)
                 if hret == BREAK:
                     break
                 if hret == CONT:
                     continue
 
-                # A match is found. Stop scanning tokens, append result, and
-                # proceed scanning the source.
+                step = len(ctx.src_tk.content)
+                # If the token is longer than the remaining of the string,
+                # it surely won't match.
+                if ctx.cur + step > len(ctx.src):
+                    continue
 
-                # Capitalization.
+                # If the first character of the token is greater (= higher code
+                # point value) than the current character, then break the loop
+                # without a match, because we know there won't be any more
+                # match due to the alphabetical ordering.
+                if ctx.src_tk.content[0] > cur_char:
+                    logger.debug(
+                            f"{ctx.src_tk.content} is after "
+                            f"{ctx.src[ctx.cur:ctx.cur + step]}. "
+                            "Breaking loop.")
+                    break
+
+                # If src_tk has a WB flag but the token is not at WB, skip.
                 if (
-                    (ctx.options["capitalize"] == "first" and ctx.cur == 0)
+                    (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
                     or
-                    (
-                        ctx.options["capitalize"] == "all"
-                        and ctx.cur_flags & BOW
-                    )
+                    # Can't rely on EOW flag, we must check on the last
+                    # character of the potential match.
+                    (ctx.src_tk.flags & EOW and not _is_eow(
+                            ctx.cur + step - 1, ctx, WORD_BOUNDARY))
                 ):
-                    logger.info("Capitalizing token.")
-                    double_cap = False
-                    for dcap_rule in ctx.langsec.get("double_cap", []):
-                        if ctx.dest_str == dcap_rule:
-                            ctx.dest_str = ctx.dest_str.upper()
-                            double_cap = True
-                            break
-                    if not double_cap:
-                        ctx.dest_str = (
-                                ctx.dest_str[0].upper() + ctx.dest_str[1:])
+                    continue
 
-                ctx.dest_ls.append(ctx.dest_str)
-                ctx.cur += step
-                break
+                # Longer tokens should be guaranteed to be scanned before their
+                # substrings at this point.
+                # Similarly, flagged tokens are evaluated first.
+                if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
+                    ctx.match = True
+                    # This hook may skip this token or break out of the token
+                    # lookup for the current position.
+                    hret = _run_hook("on_tx_token_match", ctx)
+                    if hret == BREAK:
+                        break
+                    if hret == CONT:
+                        continue
 
-        if ctx.match is False:
-            delattr(ctx, "match")
-            hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
-            if hret == BREAK:
-                break
-            if hret == CONT:
-                continue
+                    # A match is found. Stop scanning tokens, append result,
+                    # and proceed scanning the source.
+
+                    # Capitalization.
+                    if (
+                        (ctx.options["capitalize"] == "first" and ctx.cur == 0)
+                        or
+                        (
+                            ctx.options["capitalize"] == "all"
+                            and ctx.cur_flags & BOW
+                        )
+                    ):
+                        logger.info("Capitalizing token.")
+                        double_cap = False
+                        for dcap_rule in get_lang_dcap(ctx.conn, ctx.lang_id):
+                            if ctx.dest_str == dcap_rule:
+                                ctx.dest_str = ctx.dest_str.upper()
+                                double_cap = True
+                                break
+                        if not double_cap:
+                            ctx.dest_str = (
+                                    ctx.dest_str[0].upper() + ctx.dest_str[1:])
+
+                    ctx.dest_ls.append(ctx.dest_str)
+                    ctx.cur += step
+                    break
+
+            if ctx.match is False:
+                delattr(ctx, "match")
+                hret = _run_hook("on_no_tx_token_match", ctx)
+                if hret == BREAK:
+                    break
+                if hret == CONT:
+                    continue
 
-            # No match found. Copy non-mapped character (one at a time).
-            logger.info(
-                    f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
-                    f"at position {ctx.cur} is not mapped.")
-            ctx.dest_ls.append(cur_char)
-            ctx.cur += 1
-        else:
-            delattr(ctx, "match")
-        delattr(ctx, "cur_flags")
+                # No match found. Copy non-mapped character (one at a time).
+                logger.info(
+                        f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
+                        f"at position {ctx.cur} is not mapped.")
+                ctx.dest_ls.append(cur_char)
+                ctx.cur += 1
+            else:
+                delattr(ctx, "match")
+            delattr(ctx, "cur_flags")
 
 
-    delattr(ctx, "cur")
+        delattr(ctx, "cur")
 
 
-    # This hook may take care of the assembly and cause the function to return
-    # its own return value.
-    hret = _run_hook("pre_assembly", ctx, langsec_hooks)
-    if hret is not None:
-        return hret, ctx.warnings
+        # This hook may take care of the assembly and cause the function to
+        # return its own return value.
+        hret = _run_hook("pre_assembly", ctx)
+        if hret is not None:
+            return hret, ctx.warnings
 
-    logger.debug(f"Output list: {ctx.dest_ls}")
-    ctx.dest = "".join(ctx.dest_ls)
+        logger.debug(f"Output list: {ctx.dest_ls}")
+        ctx.dest = "".join(ctx.dest_ls)
 
-    # This hook may reassign the output string and/or cause the function to
-    # return it immediately.
-    hret = _run_hook("post_assembly", ctx, langsec_hooks)
-    if hret is not None:
-        return hret, ctx.warnings
+        # This hook may reassign the output string and/or cause the function to
+        # return it immediately.
+        hret = _run_hook("post_assembly", ctx)
+        if hret is not None:
+            return hret, ctx.warnings
 
-    # Strip multiple spaces and leading/trailing whitespace.
-    ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+        # Strip multiple spaces and leading/trailing whitespace.
+        ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
 
-    return ctx.dest, ctx.warnings
+        return ctx.dest, ctx.warnings
 
 
-def _normalize_src(ctx):
-    for nk, nv in ctx.langsec.get("normalize", {}).items():
+def _normalize_src(ctx, norm_rules):
+    for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
     logger.debug(f"Normalized source: {ctx.src}")
 
@@ -317,11 +325,13 @@ def _is_eow(cur, ctx, word_boundary):
     ) and (ctx.src[cur] not in word_boundary)
 
 
-def _run_hook(hname, ctx, hooks):
+def _run_hook(hname, ctx):
     ret = None
-    for hook_def in hooks.get(hname, []):
-        kwargs = hook_def[1] if len(hook_def) > 1 else {}
-        ret = hook_def[0](ctx, **kwargs)
+    for hook_def in ctx.hooks.get(hname, []):
+        fn = getattr(
+                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
+                hook_def["fn_name"])
+        ret = fn(ctx, **hook_def["kwargs"])
         if ret in (BREAK, CONT):
             # This will stop parsing hooks functions and tell the caller to
             # break out of the outer loop or skip iteration.
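
The public entry point is unchanged for callers: transliterate() now opens a DB-backed Context internally (closed when its `with` block exits), and _run_hook() resolves each hook function by module and function name at call time via import_module(). A hedged usage sketch, with a placeholder language name and input string:

    from scriptshifter.trans import transliterate

    # Placeholder language and input; per the commit message, the DB-backed
    # path is untested.
    dest, warnings = transliterate("sample input", "hebrew", t_dir="s2r")
    print(dest)
    for w in warnings:
        print("warning:", w)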