Bladeren bron

Merge pull request #130 from lcnetdev/test

DB back end.
Stefano Cossu 8 maanden geleden
bovenliggende
commit
f5b1e37594

+ 1 - 0
.gitignore

@@ -137,5 +137,6 @@ tags.temp
 
 
 # Local
 # Local
 ext/arabic_rom/data
 ext/arabic_rom/data
+scriptshifter/data/*.db
 !.keep
 !.keep
 VERSION
 VERSION

+ 2 - 1
Dockerfile

@@ -3,8 +3,9 @@ ARG WORKROOT "/usr/local/scriptshifter/src"
 
 
 # Copy core application files.
 # Copy core application files.
 WORKDIR ${WORKROOT}
 WORKDIR ${WORKROOT}
-COPY entrypoint.sh uwsgi.ini wsgi.py VERSION ./
+COPY VERSION entrypoint.sh sscli uwsgi.ini wsgi.py ./
 COPY scriptshifter ./scriptshifter/
 COPY scriptshifter ./scriptshifter/
+COPY tests ./tests/
 COPY requirements.txt ./
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 

+ 37 - 2
README.md

@@ -18,6 +18,21 @@ Currently, the following environment variables are defined:
 - `TXL_DICTA_EP`: Endpoint for the Dicta Hebrew transliteration service. This
 - `TXL_DICTA_EP`: Endpoint for the Dicta Hebrew transliteration service. This
   is mandatory for using the Hebrew module.
   is mandatory for using the Hebrew module.
 
 
+## Initial setup
+
+In order to run Scriptshifter, a local SQLite database must be created. The
+simplest way to do that is via command-line:
+
+```bash
+./sscli admin init-db
+```
+
+This step is already included in the `entrypoint.sh` script that gets executed
+by Docker, so no additional action is necessary.
+
+Note that the DB must be recreated every time any of the configuration tables
+in `scriptshifter/tables/data` changes.
+
 ## Local development server
 ## Local development server
 
 
 For local development, it is easiest to run Flask without the WSGI wrapper,
 For local development, it is easiest to run Flask without the WSGI wrapper,
@@ -73,11 +88,12 @@ string in a production environment.
 
 
 `TXL_LOGLEVEL`: Logging level. Use Python notation. The default is `WARN`.
 `TXL_LOGLEVEL`: Logging level. Use Python notation. The default is `WARN`.
 
 
-`TXL_SMTP_HOST`: SMTP host to send feedback messages through. Defaults to
-`localhost`.
+`TXL_SMTP_HOST`: SMTP host to send feedback messages through. If not defined,
+the feedback form will not be shown in the UI.
 
 
 `TXL_SMTP_PORT`: Port of the SMTP server. Defaults to `1025`.
 `TXL_SMTP_PORT`: Port of the SMTP server. Defaults to `1025`.
 
 
+
 ## Web UI
 ## Web UI
 
 
 `/` renders a simple HTML form to test the transliteration service.
 `/` renders a simple HTML form to test the transliteration service.
@@ -88,6 +104,25 @@ the drop-down automatically. The value must be one of the keys found in
 `/languages`.
 `/languages`.
 
 
 
 
+## Command-line interface
+
+Various Scriptshifter commands can be accessed via the shell command `sscli`.
+At the moment only a few essential admin and testing tools are available. More
+commands can be made available on an as-needed basis.
+
+Help menu:
+
+```
+/path/to/sscli --help
+```
+
+Section help:
+
+```
+/path/to/sscli admin --help
+```
+
+
 ## Contributing
 ## Contributing
 
 
 See the [contributing guide](./doc/contributing.md).
 See the [contributing guide](./doc/contributing.md).

+ 2 - 0
entrypoint.sh

@@ -12,6 +12,8 @@ fi
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 
 
+./sscli admin init-db
+
 if [ "${FLASK_ENV}" == "development" ]; then
 if [ "${FLASK_ENV}" == "development" ]; then
     exec flask run -h $host -p $port
     exec flask run -h $host -p $port
 else
 else

+ 8 - 0
scriptshifter/__init__.py

@@ -9,6 +9,14 @@ env = load_dotenv()
 
 
 APP_ROOT = path.dirname(path.realpath(__file__))
 APP_ROOT = path.dirname(path.realpath(__file__))
 
 
+"""
+SQLite database path.
+
+This DB stores all the runtime transliteration data.
+"""
+DB_PATH = environ.get(
+        "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
+
 """
 """
 SMTP server for sending email. For a dummy server that just echoes the
 SMTP server for sending email. For a dummy server that just echoes the
 messages, run: `python -m smtpd -n -c DebuggingServer localhost:1025`
 messages, run: `python -m smtpd -n -c DebuggingServer localhost:1025`

+ 0 - 0
scriptshifter/data/.keep


+ 2 - 2
scriptshifter/hooks/greek/__init__.py

@@ -6,9 +6,9 @@ from logging import getLogger
 from scriptshifter.exceptions import CONT
 from scriptshifter.exceptions import CONT
 
 
 
 
-# Suffixed by ʹ
 # Indices are positions in the numeric string from the right
 # Indices are positions in the numeric string from the right
 DIGITS = {
 DIGITS = {
+    # Suffixed by ʹ (U+0374)
     1: {  # Units
     1: {  # Units
         "α": 1,
         "α": 1,
         "β": 2,
         "β": 2,
@@ -45,7 +45,7 @@ DIGITS = {
         "ω": 8,
         "ω": 8,
         "ϡ": 9,
         "ϡ": 9,
     },
     },
-    # Prefixed by ͵
+    # Prefixed by ͵ (U+0375)
     4: {
     4: {
         "α": 1,
         "α": 1,
         "β": 2,
         "β": 2,

+ 4 - 12
scriptshifter/rest_api.py

@@ -1,7 +1,6 @@
 import logging
 import logging
 
 
 from base64 import b64encode
 from base64 import b64encode
-from copy import deepcopy
 from email.message import EmailMessage
 from email.message import EmailMessage
 from json import dumps
 from json import dumps
 from os import environ, urandom
 from os import environ, urandom
@@ -15,7 +14,7 @@ from scriptshifter import (
         GIT_COMMIT, GIT_TAG,
         GIT_COMMIT, GIT_TAG,
         SMTP_HOST, SMTP_PORT)
         SMTP_HOST, SMTP_PORT)
 from scriptshifter.exceptions import ApiError
 from scriptshifter.exceptions import ApiError
-from scriptshifter.tables import list_tables, load_table
+from scriptshifter.tables import list_tables, get_language
 from scriptshifter.trans import transliterate
 from scriptshifter.trans import transliterate
 
 
 
 
@@ -89,16 +88,9 @@ def list_languages():
 @app.route("/table/<lang>")
 @app.route("/table/<lang>")
 def dump_table(lang):
 def dump_table(lang):
     """
     """
-    Dump parsed transliteration table for a language.
+    Dump a language configuration from the DB.
     """
     """
-    tbl = deepcopy(load_table(lang))
-    for sec_name in ("roman_to_script", "script_to_roman"):
-        if sec_name in tbl:
-            for hname, fn_defs in tbl[sec_name].get("hooks", {}).items():
-                tbl[sec_name]["hooks"][hname] = [
-                        (fn.__name__, kw) for (fn, kw) in fn_defs]
-
-    return jsonify(tbl)
+    return get_language(lang)
 
 
 
 
 @app.route("/options/<lang>", methods=["GET"])
 @app.route("/options/<lang>", methods=["GET"])
@@ -106,7 +98,7 @@ def get_options(lang):
     """
     """
     Get extra options for a table.
     Get extra options for a table.
     """
     """
-    tbl = load_table(lang)
+    tbl = get_language(lang)
 
 
     return jsonify(tbl.get("options", []))
     return jsonify(tbl.get("options", []))
 
 

+ 357 - 8
scriptshifter/tables/__init__.py

@@ -1,9 +1,13 @@
 import logging
 import logging
 import re
 import re
+import sqlite3
 
 
+from collections import defaultdict
 from functools import cache
 from functools import cache
 from importlib import import_module
 from importlib import import_module
-from os import environ, path, access, R_OK
+from json import dumps as jdumps, loads as jloads
+from os import R_OK, access, environ, makedirs, path, unlink
+from shutil import move
 
 
 from yaml import load
 from yaml import load
 try:
 try:
@@ -11,17 +15,22 @@ try:
 except ImportError:
 except ImportError:
     from yaml import Loader
     from yaml import Loader
 
 
+from scriptshifter import DB_PATH
 from scriptshifter.exceptions import BREAK, ConfigError
 from scriptshifter.exceptions import BREAK, ConfigError
 
 
 
 
 __doc__ = """
 __doc__ = """
 Transliteration tables.
 Transliteration tables.
 
 
-These tables contain all transliteration information, grouped by script and
-language (or language and script? TBD)
+These tables contain all transliteration information. The static YML files are
+transformed and loaded into a database, which is the effective data source at
+runtime.
 """
 """
 
 
 
 
+TMP_DB_PATH = path.join(
+        path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
+
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 # Can be overridden for tests.
 # Can be overridden for tests.
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
@@ -52,6 +61,11 @@ TOKEN_WB_MARKER = "%"
 BOW = 1 << 1
 BOW = 1 << 1
 EOW = 1 << 0
 EOW = 1 << 0
 
 
+# Feature flags used in database tables.
+FEAT_S2R = 1 << 0       # Has S2R.
+FEAT_R2S = 1 << 1       # Has R2S.
+FEAT_CASEI = 1 << 2     # Case-insensitive script.
+FEAT_RE = 1 << 3        # Regular expression.
 
 
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
 
 
@@ -123,6 +137,158 @@ class Token(str):
         return hash(self.content)
         return hash(self.content)
 
 
 
 
+def init_db():
+    """
+    Populate database with language data.
+
+    This operation removes any preexisting database.
+
+    All tables in the index file (`./data/index.yml`) will be parsed
+    (including inheritance rules) and loaded into the designated DB.
+
+    This must be done only once at bootstrap. To update individual tables,
+    see populate_table(), which this function calls iteratively.
+    """
+    # Create parent directories if necessary.
+    # If the DB already exists, it will be overwritten ONLY on success at
+    # this point.
+    if path.isfile(TMP_DB_PATH):
+        # Remove previous temp file (possibly from failed attempt)
+        unlink(TMP_DB_PATH)
+    else:
+        makedirs(path.dirname(TMP_DB_PATH), exist_ok=True)
+
+    conn = sqlite3.connect(TMP_DB_PATH)
+
+    # Initialize schema.
+    with open(path.join(path.dirname(DEFAULT_TABLE_DIR), "init.sql")) as fh:
+        with conn:
+            conn.executescript(fh.read())
+
+    # Populate tables.
+    with open(path.join(TABLE_DIR, "index.yml")) as fh:
+        tlist = load(fh, Loader=Loader)
+    try:
+        with conn:
+            for tname, tdata in tlist.items():
+                res = conn.execute(
+                    """INSERT INTO tbl_language (
+                        name, label, marc_code, description
+                    ) VALUES (?, ?, ?, ?)""",
+                    (
+                        tname, tdata.get("name"), tdata.get("marc_code"),
+                        tdata.get("description"),
+                    )
+                )
+                populate_table(conn, res.lastrowid, tname)
+
+        # If the DB already exists, it will be overwritten ONLY on success at
+        # this point.
+        move(TMP_DB_PATH, DB_PATH)
+    finally:
+        conn.close()
+        if path.isfile(TMP_DB_PATH):
+            # Remove leftover temp files from bungled up operation.
+            unlink(TMP_DB_PATH)
+
+
+def get_connection():
+    """
+    Get the default DB connection object.
+
+    To be closed by the caller or used as a context.
+    """
+    return sqlite3.connect(DB_PATH)
+
+
+def populate_table(conn, tid, tname):
+    data = load_table(tname)
+    flags = 0
+    if "script_to_roman" in data:
+        flags |= FEAT_S2R
+    if "roman_to_script" in data:
+        flags |= FEAT_R2S
+
+    conn.execute(
+            "UPDATE tbl_language SET features = ? WHERE id = ?",
+            (flags, tid))
+
+    for t_dir in (FEAT_S2R, FEAT_R2S):
+        # BEGIN per-section loop.
+
+        sec_name = (
+                "script_to_roman" if t_dir == FEAT_S2R else "roman_to_script")
+        sec = data.get(sec_name)
+        if not sec:
+            continue
+
+        # Transliteration map.
+        sort = 1
+        for k, v in sec.get("map", {}):
+            conn.execute(
+                    """INSERT INTO tbl_trans_map (
+                        lang_id, dir, src, dest, sort
+                    ) VALUES (?, ?, ?, ?, ?)""",
+                    (tid, t_dir, k, v, sort))
+            sort += 1
+
+        # hooks.
+        for k, v in sec.get("hooks", {}).items():
+            for i, hook_data in enumerate(v, start=1):
+                conn.execute(
+                        """INSERT INTO tbl_hook (
+                            lang_id, dir, name, sort, module, fn, kwargs
+                        ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
+                        (
+                            tid, t_dir, k, i, hook_data[0],
+                            hook_data[1].__name__, jdumps(hook_data[2])))
+
+        # Ignore rules (R2S only).
+        for row in sec.get("ignore", []):
+            if isinstance(row, dict):
+                if "re" in row:
+                    flags = FEAT_RE
+                    rule = row["re"]
+            else:
+                flags = 0
+                rule = row
+
+            conn.execute(
+                    """INSERT INTO tbl_ignore (
+                        lang_id, rule, features
+                    ) VALUES (?, ?, ?)""",
+                    (tid, rule, flags))
+
+        # Double caps (S2R only).
+        for rule in sec.get("double_cap", []):
+            conn.execute(
+                    """INSERT INTO tbl_double_cap (
+                        lang_id, rule
+                    ) VALUES (?, ?)""",
+                    (tid, rule))
+
+        # Normalize (S2R only).
+        for src, dest in sec.get("normalize", {}).items():
+            conn.execute(
+                    """INSERT INTO tbl_normalize (lang_id, src, dest)
+                    VALUES (?, ?, ?)""",
+                    (tid, src, dest))
+
+        # END per-section loop.
+
+    # UI options
+    for opt in data.get("options", []):
+        conn.execute(
+                """INSERT INTO tbl_option (
+                    lang_id, name, label, description, dtype,
+                    options, default_v
+                ) VALUES (?, ?, ?, ?, ?, ?, ?)""",
+                (
+                    tid, opt["id"], opt["label"], opt["description"],
+                    opt["type"], jdumps(opt.get("options")),
+                    opt["default"]))
+
+
 @cache
 @cache
 def list_tables():
 def list_tables():
     """
     """
@@ -131,16 +297,29 @@ def list_tables():
     Note that this may not correspond to all the table files in the data
     Note that this may not correspond to all the table files in the data
     folder, but only those exposed in the index.
     folder, but only those exposed in the index.
     """
     """
-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
-        tdata = load(fh, Loader=Loader)
+    conn = get_connection()
+
+    with conn:
+        data = conn.execute(
+                """SELECT name, label, features, marc_code, description
+                FROM tbl_language""")
+        tdata = {
+            row[0]: {
+                "label": row[1],
+                "has_s2r": bool(row[2] & FEAT_S2R),
+                "has_r2s": bool(row[2] & FEAT_R2S),
+                "case_sensitive": not (row[2] & FEAT_CASEI),
+                "marc_code": row[3],
+                "description": row[4],
+            } for row in data
+        }
 
 
     return tdata
     return tdata
 
 
 
 
-@cache
 def load_table(tname):
 def load_table(tname):
     """
     """
-    Load one transliteration table and possible parents.
+    Parse one transliteration table and possible parents from YML files.
 
 
     The table file is parsed into an in-memory configuration that contains
     The table file is parsed into an in-memory configuration that contains
     the language & script metadata and parsing rules.
     the language & script metadata and parsing rules.
@@ -304,6 +483,176 @@ def load_hook_fn(cname, sec):
                     f"Hook function {fnname} defined in {cname} configuration "
                     f"Hook function {fnname} defined in {cname} configuration "
                     f"not found in module {HOOK_PKG_PATH}.{modname}!"
                     f"not found in module {HOOK_PKG_PATH}.{modname}!"
                 )
                 )
-            hook_fn[cfg_hook].append((fn, fn_kwargs))
+            hook_fn[cfg_hook].append((modname, fn, fn_kwargs))
 
 
     return hook_fn
     return hook_fn
+
+
+@cache
+def get_language(lang):
+    """ Get all language options from the DB. """
+
+    conn = get_connection()
+
+    with conn:
+        general = get_lang_general(conn, lang)
+        lang_id = general["id"]
+        data = general["data"]
+
+        # Normalization.
+
+        norm_data = get_lang_normalize(conn, lang_id)
+        if len(norm_data):
+            data["normalize"] = norm_data
+
+        # Script to Roman map and hooks.
+
+        if data["has_s2r"]:
+            data["script_to_roman"] = {}
+            s2r_map = tuple(
+                    row for row in get_lang_map(conn, lang_id, FEAT_S2R))
+            if len(s2r_map):
+                data["script_to_roman"]["map"] = s2r_map
+
+            s2r_hooks = get_lang_hooks(conn, lang_id, FEAT_S2R)
+            if len(s2r_hooks):
+                data["script_to_roman"]["hooks"] = s2r_hooks
+
+        # Roman to script map, ignore list, and hooks.
+
+        if data["has_r2s"]:
+            data["roman_to_script"] = {}
+            r2s_map = tuple(
+                    row for row in get_lang_map(conn, lang_id, FEAT_R2S))
+            if len(r2s_map):
+                data["roman_to_script"]["map"] = r2s_map
+
+            r2s_ignore = get_lang_ignore(conn, lang_id)
+            if len(r2s_ignore):
+                data["roman_to_script"]["ignore"] = r2s_ignore
+
+            r2s_hooks = get_lang_hooks(conn, lang_id, FEAT_R2S)
+            if len(r2s_hooks):
+                data["roman_to_script"]["hooks"] = r2s_hooks
+
+        opt_data = get_lang_options(conn, lang_id)
+        if len(opt_data):
+            data["options"] = opt_data
+
+        double_cap = get_lang_dcap(conn, lang_id)
+        if len(double_cap):
+            data["double_cap"] = double_cap
+
+    conn.close()
+
+    return data
+
+
+def get_lang_general(conn, lang):
+    """ Language general attributes. """
+    lang_q = conn.execute(
+            """SELECT id, name, label, features, marc_code, description
+            FROM tbl_language WHERE name = ?""", (lang,))
+    lang_data = lang_q.fetchone()
+
+    return {
+        "id": lang_data[0],
+        "data": {
+            "name": lang_data[1],
+            "label": lang_data[2],
+            "has_s2r": bool(lang_data[3] & FEAT_S2R),
+            "has_r2s": bool(lang_data[3] & FEAT_R2S),
+            "case_sensitive": not (lang_data[3] & FEAT_CASEI),
+            "marc_code": lang_data[4],
+            "description": lang_data[5],
+        },
+    }
+
+
+def get_lang_normalize(conn, lang_id):
+    qry = conn.execute(
+            """SELECT src, dest FROM tbl_normalize
+            WHERE lang_id = ?""",
+            (lang_id,))
+    return {row[0]: row[1] for row in qry}
+
+
+def get_lang_ignore(conn, lang_id):
+    """
+    Ignore list as a tuple.
+    """
+    qry = conn.execute(
+            """SELECT rule, features FROM tbl_ignore
+            WHERE lang_id = ?""",
+            (lang_id,))
+    # Features (regular expressions) not implemented yet.
+    return tuple(row[0] for row in qry)
+
+
+@cache
+def get_lang_map(conn, lang_id, t_dir):
+    """
+    S2R or R2S map.
+
+    Generator of tuples (source, destination).
+    """
+    qry = conn.execute(
+            """SELECT src, dest FROM tbl_trans_map
+            WHERE lang_id = ? AND dir = ?
+            ORDER BY sort ASC""",
+            (lang_id, t_dir))
+
+    for row in qry:
+        yield (Token(row[0]), row[1])
+
+
+def get_lang_options(conn, lang_id):
+    """ Language options as a tuple of dictionaries. """
+    qry = conn.execute(
+            """SELECT name, label, description, dtype, options, default_v
+            FROM tbl_option
+            WHERE lang_id = ?""",
+            (lang_id,))
+
+    return tuple(
+        {
+            "id": row[0],
+            "label": row[1],
+            "description": row[2],
+            "type": row[3],
+            "options": jloads(row[4]) if row[4] else None,
+            "default": row[5],
+        }
+        for row in qry
+    )
+
+
+def get_lang_hooks(conn, lang_id, t_dir):
+    """ Language hooks in sorting order. """
+    hooks = defaultdict(list)
+
+    qry = conn.execute(
+            """SELECT name, module, fn, kwargs
+            FROM tbl_hook WHERE lang_id = ? AND dir = ?
+            ORDER BY name, sort""",
+            (lang_id, t_dir))
+
+    for row in qry:
+        hooks[row[0]].append(
+            {
+                "module_name": row[1],
+                "fn_name": row[2],
+                "kwargs": jloads(row[3]),
+            }
+        )
+
+    return hooks
+
+
+def get_lang_dcap(conn, lang_id):
+    qry = conn.execute(
+            """SELECT rule
+            FROM tbl_double_cap WHERE lang_id = ?""",
+            (lang_id,))
+
+    return tuple(row[0] for row in qry)

+ 2 - 2
scriptshifter/tables/data/asian_cyrillic.yml

@@ -391,8 +391,8 @@ roman_to_script:
 script_to_roman:
 script_to_roman:
   map:
   map:
     
     
-    "\u00AB": """
-    "\u00BB": """
+    "\u00AB": "\""
+    "\u00BB": "\""
     "\u2116": "No\u0332"
     "\u2116": "No\u0332"
     "\u0400": "E\u0300"
     "\u0400": "E\u0300"
     "\u0401": "E\u0308"
     "\u0401": "E\u0308"

+ 1 - 1
scriptshifter/tables/data/bashkir_cyrillic.yml

@@ -23,7 +23,7 @@ roman_to_script:
     "U\u0307": "\u04AE"
     "U\u0307": "\u04AE"
     "u\u0307": "\u04AF"
     "u\u0307": "\u04AF"
     "TH": "\u04AA"
     "TH": "\u04AA"
-    "Th": "\u04AA"s
+    "Th": "\u04AA"
     "th": "\u04AB"
     "th": "\u04AB"
     "J": "\u04B8"
     "J": "\u04B8"
     "j": "\u04B9"
     "j": "\u04B9"

+ 2 - 2
scriptshifter/tables/data/index.yml

@@ -47,7 +47,7 @@ bengali:
 bulgarian:
 bulgarian:
   marc_code: bul
   marc_code: bul
   name: Bulgarian
   name: Bulgarian
-buriat:
+buriat_cyrillic:
   marc_code: bua
   marc_code: bua
   name: Buriat (Cyrillic)
   name: Buriat (Cyrillic)
 burmese:
 burmese:
@@ -111,7 +111,7 @@ kannada:
 kara-kalpak_cyrillic:
 kara-kalpak_cyrillic:
   marc_code: kaa
   marc_code: kaa
   name: Kara-Kalpak (Cyrillic)
   name: Kara-Kalpak (Cyrillic)
-karachai-balkar_cyrillic:
+karachay-balkar_cyrillic:
   marc_code: krc
   marc_code: krc
   name: Karachay-Balkar  (Cyrillic)
   name: Karachay-Balkar  (Cyrillic)
 karelian_cyrillic:
 karelian_cyrillic:

+ 2 - 2
scriptshifter/tables/data/kara-kalpak_cyrillic.yml

@@ -27,11 +27,11 @@ roman_to_script:
 script_to_roman:
 script_to_roman:
   map:
   map:
     "\u040E": "W"
     "\u040E": "W"
-    "\u045E"" "w"
+    "\u045E": "w"
     "\u0492": "Gh"
     "\u0492": "Gh"
     "\u0493": "gh"
     "\u0493": "gh"
     "\u049A": "Q"
     "\u049A": "Q"
-    "\u-49B": "q"
+    "\u049B": "q"
     "\u04A2": "N\uFE20G\uFE21"
     "\u04A2": "N\uFE20G\uFE21"
     "\u04A3": "n\uFE20g\uFE21"
     "\u04A3": "n\uFE20g\uFE21"
     "\u04AE": "U\u0307"
     "\u04AE": "U\u0307"

+ 4 - 4
scriptshifter/tables/data/komi_cyrillic.yml

@@ -5,10 +5,10 @@ general:
 
 
 roman_to_script:
 roman_to_script:
   map:
   map:
-    "D\u0320Z\u0320\H\u\0320": "\u0496"
-    "D\u0320Z\u0320\h\u\0320": "\u0496"
-    "D\u0320z\u0320\h\u\0320": "\u0496"
-    "d\u0320z\u0320\h\u\0320": "\u0497"
+    "D\u0320Z\u0320H\u0320": "\u0496"
+    "D\u0320Z\u0320h\u0320": "\u0496"
+    "D\u0320z\u0320h\u0320": "\u0496"
+    "d\u0320z\u0320h\u0320": "\u0497"
     "D\uFE20Z\uFE21": "\u0506"
     "D\uFE20Z\uFE21": "\u0506"
     "D\uFE20z\uFE21": "\u0506"
     "D\uFE20z\uFE21": "\u0506"
     "d\uFE20z\uFE21": "\u0507"
     "d\uFE20z\uFE21": "\u0507"

+ 2 - 2
scriptshifter/tables/data/mongolian_mongol_bichig.yml

@@ -6,12 +6,12 @@ general:
 roman_to_script:
 roman_to_script:
 
 
   map:
   map:
-    "\u002Daca": "\u202F\u1820\u1834\u1820
+    "\u002Daca": "\u202F\u1820\u1834\u1820"
     "\u002DA": "\u180E\u1820"
     "\u002DA": "\u180E\u1820"
     "\u002Da": "\u180E\u1820"
     "\u002Da": "\u180E\u1820"
     "A": "\u1820"
     "A": "\u1820"
     "a": "\u1820"
     "a": "\u1820"
-    "\u002Dece": "\u202F\u1821\u1834\u1821
+    "\u002Dece": "\u202F\u1821\u1834\u1821"
     "\u002DE": "\u180E\u1821"
     "\u002DE": "\u180E\u1821"
     "\u002De": "\u180E\u1821"
     "\u002De": "\u180E\u1821"
     "\u002D": "\u202F"
     "\u002D": "\u202F"

+ 1 - 1
scriptshifter/tables/data/yiddish.yml

@@ -4,7 +4,7 @@ general:
 options:
 options:
   - id: loshn_koydesh
   - id: loshn_koydesh
     label: Loshn Koydesh
     label: Loshn Koydesh
-    description: [TODO]
+    description: "Apply Loshn Koydesh vocalization."
     type: boolean
     type: boolean
     default: false
     default: false
 
 

+ 107 - 0
scriptshifter/tables/init.sql

@@ -0,0 +1,107 @@
+/*
+ * Master language table.
+ *
+ * Overview of languages available in Scriptshifter.
+ */
+CREATE TABLE tbl_language (
+    id INTEGER PRIMARY KEY,
+    name TEXT UNIQUE,
+    label TEXT,
+    marc_code TEXT,
+    description TEXT,
+    features TINYINT DEFAULT 0
+);
+
+/*
+ * Transliteration maps.
+ *
+ * Each row is a S2R or R2S pair associated with a language ID.
+ */
+CREATE TABLE tbl_trans_map (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    dir TINYINT NOT NULL DEFAULT 0,  /* 1 = S2R; 2 = R2S */
+    src TEXT NOT NULL,
+    dest TEXT,
+    sort INT NOT NULL,  /* Smaller values have higher priority. */
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+CREATE UNIQUE INDEX idx_trans_lookup ON tbl_trans_map (lang_id, dir, src);
+CREATE INDEX idx_trans_map_sort ON tbl_trans_map (sort ASC);
+
+/*
+ * Processing hooks.
+ *
+ * Note that multiple functions may be grouped under the same hook, lang, and
+ * direction. These are ordered by `sort`.
+ */
+CREATE TABLE tbl_hook (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    dir TINYINT NOT NULL DEFAULT 0,  /* 1 = S2R; 2 = R2S */
+    name TEXT NOT NULL, /* Hook name. */
+    sort INT NOT NULL,  /* Function sorting order within the hook. */
+    module TEXT NOT NULL, /* Module name. */
+    fn TEXT NOT NULL,   /* Function name. */
+    kwargs TEXT,        /* KW arguments as JSON blob. */
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+CREATE INDEX idx_hook_lookup ON tbl_hook (lang_id, dir);
+CREATE INDEX idx_hookname_lookup ON tbl_hook (name);
+CREATE INDEX idx_hook_sort ON tbl_hook (sort ASC);
+
+/*
+ * Ignore lists for R2S.
+ */
+CREATE TABLE tbl_ignore (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    rule TEXT NOT NULL,
+    features TINYINT,  /* 1 = case insensitive; 2 = regular expression. */
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+
+/*
+ * Double capitals.
+ */
+CREATE TABLE tbl_double_cap (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    rule TEXT NOT NULL,
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+
+/*
+ * Normalization rules.
+ */
+CREATE TABLE tbl_normalize (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    src TEXT NOT NULL,
+    dest TEXT NOT NULL,
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+
+/*
+ * Input options.
+ */
+CREATE TABLE tbl_option (
+    id INTEGER PRIMARY KEY,
+    lang_id INTEGER NOT NULL,
+    name TEXT NOT NULL,
+    label TEXT NOT NULL,
+    description TEXT,
+    dtype TEXT,
+    options TEXT,
+    default_v TEXT,
+
+    FOREIGN KEY (lang_id) REFERENCES tbl_language(id) ON DELETE CASCADE
+);
+CREATE UNIQUE INDEX idx_option_lookup ON tbl_option (lang_id, name);
+
+

+ 2 - 2
scriptshifter/templates/index.html

@@ -60,7 +60,7 @@
             <label for="lang">Language</label>
             <label for="lang">Language</label>
             <select id="lang" name="lang">
             <select id="lang" name="lang">
                 {% for k, v in languages.items() %}
                 {% for k, v in languages.items() %}
-                    <option value="{{ k }}">{{ v["name"] }}</option>
+                    <option value="{{ k }}">{{ v["label"] }}</option>
                 {% endfor %}
                 {% endfor %}
             </select>
             </select>
         </fieldset>
         </fieldset>
@@ -176,4 +176,4 @@
     {% endif %}
     {% endif %}
 
 
     <script type="text/javascript" src="/static/ss.js"></script>
     <script type="text/javascript" src="/static/ss.js"></script>
-{% endblock %}
+{% endblock %}

+ 215 - 204
scriptshifter/trans.py

@@ -1,9 +1,13 @@
 import logging
 import logging
 
 
+from importlib import import_module
 from re import compile
 from re import compile
 
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.exceptions import BREAK, CONT
-from scriptshifter.tables import BOW, EOW, WORD_BOUNDARY, load_table
+from scriptshifter.tables import (
+        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
+        get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
+        get_lang_ignore, get_lang_map, get_lang_normalize)
 
 
 
 
 # Match multiple spaces.
 # Match multiple spaces.
@@ -15,6 +19,8 @@ logger = logging.getLogger(__name__)
 class Context:
 class Context:
     """
     """
     Context used within the transliteration and passed to hook functions.
     Context used within the transliteration and passed to hook functions.
+
+    Use within a `with` block for proper cleanup.
     """
     """
     @property
     @property
     def src(self):
     def src(self):
@@ -28,23 +34,35 @@ class Context:
     def src(self):
     def src(self):
         raise NotImplementedError("Attribute is read-only.")
         raise NotImplementedError("Attribute is read-only.")
 
 
-    def __init__(self, src, general, langsec, options={}):
+    def __init__(self, lang, src, t_dir, options={}):
         """
         """
         Initialize a context.
         Initialize a context.
 
 
         Args:
         Args:
             src (str): The original text. Read-only.
             src (str): The original text. Read-only.
-            general (dict): general section of the current config.
-            langsec (dict): Language configuration section being used.
+            t_dir (int): the direction of transliteration.
+                    Either FEAT_R2S or FEAT_S2R.
             options (dict): extra options as a dict.
             options (dict): extra options as a dict.
         """
         """
+        self.lang = lang
         self._src = src
         self._src = src
-        self.general = general
+        self.t_dir = t_dir
+        self.conn = get_connection()
+        with self.conn as conn:
+            general = get_lang_general(conn, self.lang)
+        self.general = general["data"]
+        self.lang_id = general["id"]
         self.options = options
         self.options = options
-        self.langsec = langsec
+        self.hooks = get_lang_hooks(self.conn, self.lang_id, self.t_dir)
         self.dest_ls = []
         self.dest_ls = []
         self.warnings = []
         self.warnings = []
 
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.conn.close()
+
 
 
 def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     """
     """
@@ -73,234 +91,225 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
     Return:
     Return:
         str: The transliterated string.
         str: The transliterated string.
     """
     """
-    source_str = "Latin" if t_dir == "r2s" else lang
-    target_str = lang if t_dir == "r2s" else "Latin"
-    logger.info(f"Transliteration is from {source_str} to {target_str}.")
-
-    cfg = load_table(lang)
-    logger.info(f"Loaded table for {lang}.")
-
-    # General directives.
-    general = cfg.get("general", {})
-
-    if t_dir == "s2r" and "script_to_roman" not in cfg:
-        raise NotImplementedError(
-            f"Script-to-Roman transliteration not yet supported for {lang}."
-        )
-    elif t_dir == "r2s" and "roman_to_script" not in cfg:
-        raise NotImplementedError(
-            f"Roman-to-script transliteration not yet supported for {lang}."
-        )
+    # Map t_dir to constant.
+    t_dir = FEAT_S2R if t_dir == "s2r" else FEAT_R2S
 
 
-    langsec = (
-            cfg["script_to_roman"] if t_dir == "s2r"
-            else cfg["roman_to_script"])
-    # langsec_dir = langsec.get("directives", {})
-    langsec_hooks = langsec.get("hooks", {})
+    source_str = "Roman" if t_dir == FEAT_R2S else lang
+    target_str = lang if t_dir == FEAT_R2S else "Roman"
+    logger.info(f"Transliteration is from {source_str} to {target_str}.")
 
 
     src = src.strip()
     src = src.strip()
     options["capitalize"] = capitalize
     options["capitalize"] = capitalize
-    ctx = Context(src, general, langsec, options)
-
-    # This hook may take over the whole transliteration process or delegate it
-    # to some external process, and return the output string directly.
-    if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
-        return getattr(ctx, "dest", ""), ctx.warnings
-
-    if "normalize" in ctx.langsec:
-        _normalize_src(ctx)
-
-    if _run_hook("post_normalize", ctx, langsec_hooks) == BREAK:
-        return getattr(ctx, "dest", ""), ctx.warnings
-
-    # Loop through source characters. The increment of each loop depends on
-    # the length of the token that eventually matches.
-    ignore_list = langsec.get("ignore", [])  # Only present in R2S
-    ctx.cur = 0
-    word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
-
-    while ctx.cur < len(ctx.src):
-        # Reset cursor position flags.
-        # Carry over extended "beginning of word" flag.
-        ctx.cur_flags = 0
-        cur_char = ctx.src[ctx.cur]
-
-        # Look for a word boundary and flag word beginning/end it if found.
-        if _is_bow(ctx.cur, ctx, word_boundary):
-            # Beginning of word.
-            logger.debug(f"Beginning of word at position {ctx.cur}.")
-            ctx.cur_flags |= BOW
-        if _is_eow(ctx.cur, ctx, word_boundary):
-            # End of word.
-            logger.debug(f"End of word at position {ctx.cur}.")
-            ctx.cur_flags |= EOW
-
-        # This hook may skip the parsing of the current
-        # token or exit the scanning loop altogether.
-        hret = _run_hook("begin_input_token", ctx, langsec_hooks)
-        if hret == BREAK:
-            logger.debug("Breaking text scanning from hook signal.")
-            break
-        if hret == CONT:
-            logger.debug("Skipping scanning iteration from hook signal.")
-            continue
-
-        # Check ignore list. Find as many subsequent ignore tokens
-        # as possible before moving on to looking for match tokens.
-        ctx.tk = None
-        while True:
-            ctx.ignoring = False
-            for ctx.tk in ignore_list:
-                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
-                if hret == BREAK:
-                    break
-                if hret == CONT:
-                    continue
+    with Context(lang, src, t_dir, options) as ctx:
+
+        if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
+            raise NotImplementedError(
+                f"Script-to-Roman not yet supported for {lang}."
+            )
+        if t_dir == FEAT_R2S and not ctx.general["has_r2s"]:
+            raise NotImplementedError(
+                f"Roman-to-script not yet supported for {lang}."
+            )
+
+        # This hook may take over the whole transliteration process or delegate
+        # it to some external process, and return the output string directly.
+        if _run_hook("post_config", ctx) == BREAK:
+            return getattr(ctx, "dest", ""), ctx.warnings
+
+        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
+
+        if _run_hook("post_normalize", ctx) == BREAK:
+            return getattr(ctx, "dest", ""), ctx.warnings
+
+        lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
+
+        # Loop through source characters. The increment of each loop depends on
+        # the length of the token that eventually matches.
+        ctx.cur = 0
+
+        while ctx.cur < len(ctx.src):
+            # Reset cursor position flags.
+            # Carry over extended "beginning of word" flag.
+            ctx.cur_flags = 0
+            cur_char = ctx.src[ctx.cur]
+
+            # Look for a word boundary and flag word beginning/end it if found.
+            if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
+                # Beginning of word.
+                logger.debug(f"Beginning of word at position {ctx.cur}.")
+                ctx.cur_flags |= BOW
+            if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
+                # End of word.
+                logger.debug(f"End of word at position {ctx.cur}.")
+                ctx.cur_flags |= EOW
+
+            # This hook may skip the parsing of the current
+            # token or exit the scanning loop altogether.
+            hret = _run_hook("begin_input_token", ctx)
+            if hret == BREAK:
+                logger.debug("Breaking text scanning from hook signal.")
+                break
+            if hret == CONT:
+                logger.debug("Skipping scanning iteration from hook signal.")
+                continue
 
 
-                step = len(ctx.tk)
-                if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
-                    # The position matches an ignore token.
-                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
+            # Check ignore list. Find as many subsequent ignore tokens
+            # as possible before moving on to looking for match tokens.
+            ctx.tk = None
+            while True:
+                ctx.ignoring = False
+                for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
+                    hret = _run_hook("pre_ignore_token", ctx)
                     if hret == BREAK:
                     if hret == BREAK:
                         break
                         break
                     if hret == CONT:
                     if hret == CONT:
                         continue
                         continue
 
 
-                    logger.info(f"Ignored token: {ctx.tk}")
-                    ctx.dest_ls.append(ctx.tk)
-                    ctx.cur += step
-                    cur_char = ctx.src[ctx.cur]
-                    ctx.ignoring = True
+                    step = len(ctx.tk)
+                    if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                        # The position matches an ignore token.
+                        hret = _run_hook("on_ignore_match", ctx)
+                        if hret == BREAK:
+                            break
+                        if hret == CONT:
+                            continue
+
+                        logger.info(f"Ignored token: {ctx.tk}")
+                        ctx.dest_ls.append(ctx.tk)
+                        ctx.cur += step
+                        cur_char = ctx.src[ctx.cur]
+                        ctx.ignoring = True
+                        break
+                # We looked through all ignore tokens, not found any. Move on.
+                if not ctx.ignoring:
                     break
                     break
-            # We looked through all ignore tokens, not found any. Move on.
-            if not ctx.ignoring:
-                break
-            # Otherwise, if we found a match, check if the next position may be
-            # ignored as well.
-
-        delattr(ctx, "tk")
-        delattr(ctx, "ignoring")
-
-        # Begin transliteration token lookup.
-        ctx.match = False
-
-        for ctx.src_tk, ctx.dest_str in langsec["map"]:
-            hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
-            if hret == BREAK:
-                break
-            if hret == CONT:
-                continue
+                # Otherwise, if we found a match, check if the next position
+                # may be ignored as well.
 
 
-            step = len(ctx.src_tk.content)
-            # If the token is longer than the remaining of the string,
-            # it surely won't match.
-            if ctx.cur + step > len(ctx.src):
-                continue
+            delattr(ctx, "tk")
+            delattr(ctx, "ignoring")
 
 
-            # If the first character of the token is greater (= higher code
-            # point value) than the current character, then break the loop
-            # without a match, because we know there won't be any more match
-            # due to the alphabetical ordering.
-            if ctx.src_tk.content[0] > cur_char:
-                logger.debug(
-                        f"{ctx.src_tk.content} is after "
-                        f"{ctx.src[ctx.cur:ctx.cur + step]}. Breaking loop.")
-                break
+            # Begin transliteration token lookup.
+            ctx.match = False
 
 
-            # If src_tk has a WB flag but the token is not at WB, skip.
-            if (
-                (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
-                or
-                # Can't rely on EOW flag, we must check on the last character
-                # of the potential match.
-                (ctx.src_tk.flags & EOW and not _is_eow(
-                        ctx.cur + step - 1, ctx, word_boundary))
-            ):
-                continue
-
-            # Longer tokens should be guaranteed to be scanned before their
-            # substrings at this point.
-            # Similarly, flagged tokens are evaluated first.
-            if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
-                ctx.match = True
-                # This hook may skip this token or break out of the token
-                # lookup for the current position.
-                hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
+            for ctx.src_tk, ctx.dest_str in lang_map:
+                hret = _run_hook("pre_tx_token", ctx)
                 if hret == BREAK:
                 if hret == BREAK:
                     break
                     break
                 if hret == CONT:
                 if hret == CONT:
                     continue
                     continue
 
 
-                # A match is found. Stop scanning tokens, append result, and
-                # proceed scanning the source.
+                step = len(ctx.src_tk.content)
+                # If the token is longer than the remaining of the string,
+                # it surely won't match.
+                if ctx.cur + step > len(ctx.src):
+                    continue
 
 
-                # Capitalization.
+                # If the first character of the token is greater (= higher code
+                # point value) than the current character, then break the loop
+                # without a match, because we know there won't be any more
+                # match due to the alphabetical ordering.
+                if ctx.src_tk.content[0] > cur_char:
+                    logger.debug(
+                            f"{ctx.src_tk.content} is after "
+                            f"{ctx.src[ctx.cur:ctx.cur + step]}. "
+                            "Breaking loop.")
+                    break
+
+                # If src_tk has a WB flag but the token is not at WB, skip.
                 if (
                 if (
-                    (ctx.options["capitalize"] == "first" and ctx.cur == 0)
+                    (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
                     or
                     or
-                    (
-                        ctx.options["capitalize"] == "all"
-                        and ctx.cur_flags & BOW
-                    )
+                    # Can't rely on EOW flag, we must check on the last
+                    # character of the potential match.
+                    (ctx.src_tk.flags & EOW and not _is_eow(
+                            ctx.cur + step - 1, ctx, WORD_BOUNDARY))
                 ):
                 ):
-                    logger.info("Capitalizing token.")
-                    double_cap = False
-                    for dcap_rule in ctx.langsec.get("double_cap", []):
-                        if ctx.dest_str == dcap_rule:
-                            ctx.dest_str = ctx.dest_str.upper()
-                            double_cap = True
-                            break
-                    if not double_cap:
-                        ctx.dest_str = (
-                                ctx.dest_str[0].upper() + ctx.dest_str[1:])
+                    continue
 
 
-                ctx.dest_ls.append(ctx.dest_str)
-                ctx.cur += step
-                break
+                # Longer tokens should be guaranteed to be scanned before their
+                # substrings at this point.
+                # Similarly, flagged tokens are evaluated first.
+                if ctx.src_tk.content == ctx.src[ctx.cur:ctx.cur + step]:
+                    ctx.match = True
+                    # This hook may skip this token or break out of the token
+                    # lookup for the current position.
+                    hret = _run_hook("on_tx_token_match", ctx)
+                    if hret == BREAK:
+                        break
+                    if hret == CONT:
+                        continue
 
 
-        if ctx.match is False:
-            delattr(ctx, "match")
-            hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
-            if hret == BREAK:
-                break
-            if hret == CONT:
-                continue
+                    # A match is found. Stop scanning tokens, append result,
+                    # and proceed scanning the source.
+
+                    # Capitalization.
+                    if (
+                        (ctx.options["capitalize"] == "first" and ctx.cur == 0)
+                        or
+                        (
+                            ctx.options["capitalize"] == "all"
+                            and ctx.cur_flags & BOW
+                        )
+                    ):
+                        logger.info("Capitalizing token.")
+                        double_cap = False
+                        for dcap_rule in get_lang_dcap(ctx.conn, ctx.lang_id):
+                            if ctx.dest_str == dcap_rule:
+                                ctx.dest_str = ctx.dest_str.upper()
+                                double_cap = True
+                                break
+                        if not double_cap:
+                            ctx.dest_str = (
+                                    ctx.dest_str[0].upper() + ctx.dest_str[1:])
+
+                    ctx.dest_ls.append(ctx.dest_str)
+                    ctx.cur += step
+                    break
+
+            if ctx.match is False:
+                delattr(ctx, "match")
+                hret = _run_hook("on_no_tx_token_match", ctx)
+                if hret == BREAK:
+                    break
+                if hret == CONT:
+                    continue
 
 
-            # No match found. Copy non-mapped character (one at a time).
-            logger.info(
-                    f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
-                    f"at position {ctx.cur} is not mapped.")
-            ctx.dest_ls.append(cur_char)
-            ctx.cur += 1
-        else:
-            delattr(ctx, "match")
-        delattr(ctx, "cur_flags")
+                # No match found. Copy non-mapped character (one at a time).
+                logger.info(
+                        f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
+                        f"at position {ctx.cur} is not mapped.")
+                ctx.dest_ls.append(cur_char)
+                ctx.cur += 1
+            else:
+                delattr(ctx, "match")
+            delattr(ctx, "cur_flags")
 
 
-    delattr(ctx, "cur")
+        delattr(ctx, "cur")
 
 
-    # This hook may take care of the assembly and cause the function to return
-    # its own return value.
-    hret = _run_hook("pre_assembly", ctx, langsec_hooks)
-    if hret is not None:
-        return hret, ctx.warnings
+        # This hook may take care of the assembly and cause the function to
+        # return its own return value.
+        hret = _run_hook("pre_assembly", ctx)
+        if hret is not None:
+            return hret, ctx.warnings
 
 
-    logger.debug(f"Output list: {ctx.dest_ls}")
-    ctx.dest = "".join(ctx.dest_ls)
+        logger.debug(f"Output list: {ctx.dest_ls}")
+        ctx.dest = "".join(ctx.dest_ls)
 
 
-    # This hook may reassign the output string and/or cause the function to
-    # return it immediately.
-    hret = _run_hook("post_assembly", ctx, langsec_hooks)
-    if hret is not None:
-        return hret, ctx.warnings
+        # This hook may reassign the output string and/or cause the function to
+        # return it immediately.
+        hret = _run_hook("post_assembly", ctx)
+        if hret is not None:
+            return hret, ctx.warnings
 
 
-    # Strip multiple spaces and leading/trailing whitespace.
-    ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
+        # Strip multiple spaces and leading/trailing whitespace.
+        ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
 
 
-    return ctx.dest, ctx.warnings
+        return ctx.dest, ctx.warnings
 
 
 
 
def _normalize_src(ctx, norm_rules):
    """
    Normalize the source string in place.

    Args:
        ctx (Context): Current context. Its private `_src` member is
                rewritten with each substitution rule applied in turn.
        norm_rules (dict): Mapping of source substrings to their
                normalized replacements.
    """
    for nk, nv in norm_rules.items():
        ctx._src = ctx.src.replace(nk, nv)
    logger.debug(f"Normalized source: {ctx.src}")
 
 
@@ -317,11 +326,13 @@ def _is_eow(cur, ctx, word_boundary):
     ) and (ctx.src[cur] not in word_boundary)
     ) and (ctx.src[cur] not in word_boundary)
 
 
 
 
-def _run_hook(hname, ctx, hooks):
+def _run_hook(hname, ctx):
     ret = None
     ret = None
-    for hook_def in hooks.get(hname, []):
-        kwargs = hook_def[1] if len(hook_def) > 1 else {}
-        ret = hook_def[0](ctx, **kwargs)
+    for hook_def in ctx.hooks.get(hname, []):
+        fn = getattr(
+                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
+                hook_def["fn_name"])
+        ret = fn(ctx, **hook_def["kwargs"])
         if ret in (BREAK, CONT):
         if ret in (BREAK, CONT):
             # This will stop parsing hooks functions and tell the caller to
             # This will stop parsing hooks functions and tell the caller to
             # break out of the outer loop or skip iteration.
             # break out of the outer loop or skip iteration.

+ 74 - 0
sscli

@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+__doc__ = """ Scriptshifter command line interface. """
+
+
+import click
+
+from glob import glob
+from os import path
+
+from scriptshifter import DB_PATH
+from scriptshifter.tables import init_db as _init_db
+from tests import test_sample
+
+
@click.group()
def cli():
    """ Scriptshifter CLI. """
    # Root command group; subgroups (admin, test, trans) attach to this.
    pass
+
+
@cli.group(name="admin")
def admin_grp():
    """ Admin operations. """
    # Group container only; commands are registered via @admin_grp.command().
    pass
+
+
@admin_grp.command()
def init_db():
    """
    Initialize SS database.

    Builds the SQLite database at DB_PATH from the configuration tables.
    NOTE(review): per the README this must be re-run whenever the table
    configuration data change.
    """
    _init_db()

    click.echo(f"Initialized Scriptshifter DB in {DB_PATH}")
+
+
@cli.group(name="test")
def test_grp():
    """ Test operations. """
    # Group container only; commands are registered via @test_grp.command().
    pass
+
+
@test_grp.command()
def list_samples():
    """ List string sample sets that can be tested. """
    # Sample CSV files live under tests/data/script_samples next to this
    # script; one file per sample set.
    samples_dir = path.join(
            path.dirname(path.realpath(__file__)),
            "tests", "data", "script_samples")

    click.echo("Sample string sets available for batch testing:")
    for csv_fn in glob(path.join(samples_dir, "*.csv")):
        # The set name is the file name without directory and extension.
        base_name = path.basename(csv_fn)
        click.echo(path.splitext(base_name)[0])
+
+
@test_grp.command()
@click.argument("lang")
def samples(lang):
    """
    Test sample strings for language LANG.

    LANG must match one of the names obtained with `test list-samples` command.

    The command will generate a test report file.
    """
    # Click ignores the return value; it is kept for programmatic callers.
    return test_sample(lang)
+
+
@cli.group(name="trans")
def trans_grp():
    """ Transliteration and transcription operations. """
    # Placeholder group: no subcommands are registered here yet.
    pass


if __name__ == "__main__":
    cli()

+ 2 - 1
tests/__init__.py

@@ -20,7 +20,8 @@ def reload_tables():
     reload(scriptshifter.tables)  # Reload new config dir.
     reload(scriptshifter.tables)  # Reload new config dir.
     from scriptshifter import tables
     from scriptshifter import tables
     tables.list_tables.cache_clear()
     tables.list_tables.cache_clear()
-    tables.load_table.cache_clear()
+    tables.get_language.cache_clear()
+    tables.get_lang_map.cache_clear()
 
 
     return tables
     return tables
 
 

+ 3 - 0
tests/data/script_samples/arabic.csv

@@ -0,0 +1,3 @@
+arabic,نظام الحكم في عمان : من إمامة الإنتخاب الى السلطنة الوراثية,Niẓām al-ḥukm fī ʻUmān : min imāmat al-intikhāb ilá al-salṭanah al-wirāthīyah,,
+arabic,ندوة علاقات مصر بدول حوض النيل في ظل رئاسة مصر للاتحاد الإفريقي,Nadwat ʻAlāqāt Miṣr bi-Duwal Ḥawḍ al-Nīl fī ẓill Riʼāsat Miṣr lil-Ittiḥād al-Ifrīqī,,
+arabic,تهذيب البيان والجمع في الفرق بين التكليف والوضع,Tahdhīb al-bayān wa-al-jamʻ fī al-farq bayna al-taklīf wa-al-waḍʻ,,

+ 94 - 0
tests/data/script_samples/arabic2.csv

@@ -0,0 +1,94 @@
+arabic,قضايا فكرية و سياسية باقلام كردية عراقية ,Qaḍāyā fikrīyah wa siyāsīyah bi-aqlām Kurdīyah ʻIrāqīyah,,
+arabic,‏الأستاذ الدكتور عماد الجواهري؛ مراجعة و تقديم الأستاذ الدكتور عبد الفتاح علي البوتاني,al-Ustādh al-Duktūr ʻImād al-Jawāhirī; murājaʻat wa taqdīm al-Ustādh al-Duktūr ʻAbd al-Fattāḥ ʻAlī al-Būtānī,,
+arabic,العلاقة الشيعية - الكوردية ومستقبلها,al-ʻAlāqah al-Shīʻīyah - al-Kūrdīyah wa-mustaqbaluhā,,
+arabic,مركز دراسات رووداو,Markaz Dirāsāt Rūwūdāw,,
+arabic,ماذا يخبئ الغربال في السياسة العراقية,Mādhā yukhabbiʼ al-ghurbāl fī al-siyāsah al-ʻIrāqīyah,,
+arabic,الحزب الشيوعي العراقي .. المكونات السياسية .. الحكومة,al-Ḥizb al-Shuyūʻī al-ʻIrāqī .. al-mukawwināt al-siyāsīyah .. al-ḥukūmah,,
+arabic,الدولة الأموية في الشام,al-Dawlah al-Umawīyah fī al-Shām,,
+arabic,تأليف أنيس زكريا النصولي,taʼlīf Anīs Zakarīyā al-Nuṣūlī.,,
+arabic,الدين وسياسة الدولة في بلاد الرافدين في ضوء النصوص المسمارية، (٢٨٠٠ ق.م-٥٣٩ ق.م)  ,"al-Dīn wa-siyāsat al-dawlah fī bilād al-Rāfidayn fī ḍawʼ al-nuṣūṣ al-mismārīyah, (2800 Q.M-539 Q.M)",,
+arabic,المدن والموانيء التجارية في شرق الجزيرة العربية منذ بداية الالف الثالث ق.م حتى نهاية الالف الاول ق.م ,al-Mudun wa-al-mawānīʼ al-tijārīyah fī sharq al-jazīrah al-ʻArabīyah mundhu bidāyat al-alf al-thālith Q.M ḥattá nihāyat al-alf al-awwal Q.M,,
+arabic,أمير الإنسانية وقائد الدبلماسية  ,Amīr al-insānīyah wa-qāʼid al-diblumāsīyah,,
+arabic,النقد الادبي واللغوي المعاصر ,al-Naqd al-adabī wa-al-lughawī al-muʻāṣir,,
+arabic,جدلية الاصالة والتجديد : المؤتمر النقدي الرابع والعشرون,Jadalīyat al-aṣālah wa-al-tajdīd : al-muʼtamar al-naqdī al-rābiʻ wa-al-ʻishrūn,,
+arabic,أماني سراج عبدالوهاب أبوزيد,Amānī Sirāj ʻAbd al-Wahhāb Abū Zayd,,
+arabic,المدن والموانيء التجارية في شرق الجزيرة العربية منذ بداية الالف الثالث ق.م حتى نهاية الالف الاول ق.م,al-Mudun wa-al-mawānīʼ al-tijārīyah fī sharq al-Jazīrah al-ʻArabīyah mundhu bidāyat al-alf al-thālith Q.M ḥattá nihāyat al-alf al-awwal Q.M,,
+arabic,محمد صوضان,Muḥammad Ṣawḍān,,
+arabic,كتاب سفينة السعادة لاهل الضعف والنجادة في مديح النبي، المعروفة، بالعشرينيات ‏ ," Kitāb Safīnat al-saʻādah li-ahl al-ḍaʻf wa-al-najādah fī madīḥ al-Nabī, al-maʻrūfah, bi-al-ʻIshrīniyāt",,
+arabic,من الشريعة الموروثة إلى الإنسان الخليفة ,Min al-sharīʻah al-mawrūthah ilá al-insān al-khalīfah,,
+arabic,إعداد محمد هشام بوعتور, iʻdād Muḥammad Hishām Bū ʻAttūr,,
+arabic,موسوعة الحكايات الخرافية الفلسطينية ,Mawsūʻat al-ḥikāyāt al-khurāfīyah al-Filasṭīnīyah,,
+arabic,مؤسسة تامر للتعليم المجتمعي,Muʼassasat Tāmir lil-Taʻlīm al-Mujtamaʻī,,
+arabic,نصوص ودراسة في ‌الحكاية الشعبية الفلسطينية, nuṣūṣ wa-dirāsah fi al-ḥikāyah al-shaʻbīyah al-Filasṭīnīyah,,
+arabic,تأليف إبراهيم مهوي و شريف كناعنه ,taʼlīf Ibrāhīm Muhawwī wa-Sharīf Kanāʻinah,,
+arabic,التراث الفلسطيني بين الطمس والاحياء, al-Turāth al-Filasṭīnī bayna al-ṭams wa-al-iḥyāʼ,,
+arabic,أشرف على تحريرها منعم حداد,ashrafa ʻalá taḥrīrihā Munʻim Ḥaddād,,
+arabic,من تراثنا الشعبي في السهل الساحلي الفلسطيني ,Min turāthinā al-shaʻbī fī al-sahl al-sāḥilī al-Filastīnī,,
+arabic,بقلم حسن محمد عوض,bi-qalam Ḥasan Muḥammad ʻAwaḍ,,
+arabic,تاريخ ما لم يذكره التاريخ,Tārīkh mā lam yadhkurhu al-tārīkh,,
+arabic,دراسة ميدانية فى التراث الشعبى الفلسطينى,dirāsah maydānīyah fī al-turāth al-shaʻbī al-Filasṭīnī ,,
+arabic,بيت الفلاح الفلسطيني, Bayt al-falāḥ al-Filasṭīnī,,
+arabic,معان ثقافية وعادات وتقاليد اجتماعية، اثاث وفراش وادوات,"maʻānin thaqāfīyah wa-ʻādāt wa-taqālīd ijtimāʻīyah, athāth wa-firāsh wa-adawāt",,
+arabic,الحزازير والألعاب الشعبية الفلسطينية,al-Ḥazāzīr wa-al-alʻāb al-shaʻbīyah al-Filasṭīnīyah,,
+arabic,المرأة في المثل الشعبي في الأردن وفلسطين,al-Marʼah fī al-mathal al-shaʻbī fī al-Urdun wa-Filasṭīn,,
+arabic,الأحاجي والالغاز الادبية ,al-Aḥājī wa-al-alghāz al-adabīyah,,
+arabic,فصول الحياة في قريتي,fuṣūl al-ḥayāh fī qaryatī,,
+arabic,قرية الدمينة الشرقية بين الماضي والحاضر,Qaryat al-Dumaynah al-Sharqīyah bayna al-māḍī wa-al-ḥāḍir,,
+arabic,الألعاب الشعبية في الجزيرة السورية,al-Alʻāb al-shaʻbīyah fī al-Jazīrah al-Sūrīyah,,
+arabic,وزارة الثقافة، منشورات الهيئه العامة السورية للكتاب,"Wizārat al-Thaqāfah, Manshūrāt al-Hayʼah al-ʻĀmmah al-Sūrīyah lil-Kitāb",,
+arabic,طرائف الأمس غرائب اليوم,Ṭarāʼif al-ams gharāʼib al-Yawm,,
+arabic,صور من حياة النبك وجبل القلمون في أواسط القرن التاسع عشر,ṣuwar min ḥayāt al-Nabk wa-Jabal al-Qalamūn fī awāsiṭ al-qarn al-tāsiʻ ʻashar,,
+arabic,ولدت مرتين,Wulidtu marratayn,,
+arabic,من حكايا الدمع في سوريا,min Ḥakāyā al-damʻ fī Sūriyā,,
+arabic,العين والماء والفخار في التراث الساحلي الريفي,al-ʻAyn wa-al-māʼ wa-al-fukhkhār fī al-turāth al-sāḥilī al-rīfī,,
+arabic,المواسم التقليدية بمنطقة الأبيض سيدي الشيخ، الوعدات,"al-Mawāsim al-taqlīdīyah bi-minṭaqat al-Abyaḍ Sīdī al-Shaykh, al-Waʻdāt",,
+arabic,فضاءات تلقي الادب الشعبي,Faḍāʼāt talaqqī al-adab al-shaʻbī,,
+arabic,المجتمع الجزائري وفعالياته في العهد العثماني,al-Mujtamaʻ al-Jazāʼirī wa-faʻʻālīyātuhu fī al-ʻahd al-ʻUthmānī,,
+arabic,بدو الطوارق بين الثبات والتغير ,Badw al-Ṭawāriq bayna al-thabāt wa-al-taghayyur,,
+arabic,النظم الإجتماعية والتغيرات المرافقة للمد العربي,al-nuẓum al-ijtimāʻīyah wa-al-taghayyurāt al-murāfiqah lil-madd al-ʻArabī,,
+arabic,لماذا يصحو مارد الهضبة ويغفو مارد السهل,Li-mādhā yaṣʹḥū mārid al-haḍabah wa-yaghfū mārid al-sahl,,
+arabic,رؤى الحداثة وآفاق التحولات في الخطاب الأدبي الأردني الحداثي,ruʼá al-ḥadāthah wa-āfāq al-taḥawwulāt fī al-khiṭāb al-Adabī al-Urdunī al-ḥadāthī,,
+arabic,الحقيبة الملكية على الطائر الميمون ‏ ,al-Ḥaqībah al-malakīyah ʻalá al-ṭāʼir al-maymūn,,
+arabic,عيسى الناعوري وجهوده في مجال الدراسات الادبية والنقدية,ʻĪsá al-Nāʻūrī wa-juhūduh fī majāl al-dirāsāt al-adabīyah wa-al-naqdīyah,,
+arabic,أقحوان على ضفاف النهر ,Uqḥuwān ʻalá ḍifāf al-nahr,,
+arabic,صورة المرأة في... السرد النسوي الأردني,Ṣūrat al-marʼah fī ... al-sard al-niswī al-Urdunī,,
+arabic,آراء ونصوص في تجربته الادبية,Ārāʼ wa-nuṣūṣ fī tajribatih al-adabīyah,,
+arabic,مدخل الى أدبنا المعاصر,Madkhal ilá adabinā al-muʻāṣir,,
+arabic,صاحب المئة كتاب والستين عاما في خدمة التربية والتعليم,ṣāḥib al-miʼat kitāb wa-al-sittīn ʻāman fī khidmat al-tarbiyah wa-al-taʻlīm,,
+arabic,خمسة رواد يحاورون العصر,khamsat rūwād yuḥāwirūn al-ʻaṣr,,
+arabic,حوار مع رواد النهضة العربية,Ḥiwār maʻa rūwād al-nahḍah al-ʻArabīyah,,
+arabic,أعلام الحركة الادبية في الرقة,Aʻlām al-ḥarakah al-adabīyah fī al-Raqqah,,
+arabic,دراسة تحليلية في أدب الأطفال لدى الكرد في سوريا وأبرز نماذجه المدونة,dirāsah taḥlīlīyah fī adab al-aṭfāl ladá al-Kurd fī Sūriyā wa-abraz namādhijihi al-mudawwanah,,
+arabic,دراسات ومقالات حول حياة الكتاب والكتاب, dirāsāt wa-maqālāt ḥawla ḥayāt al-kuttāb wa-al-kitāb,,
+arabic,القصص القرآني :  إيحاؤه ونفحاته ,al-Qaṣaṣ al-Qurʼānī : īḥāʼuhu wa-nafaḥātuh ,,
+arabic,للسائلين عن، أخلاق وطبائع بني إسرائيل في قصة يوسف عليه السلام,"Lil-sāʼilīn ʻan, Akhlāq wa-ṭabāʼiʻ Banī Isrāʼīl fī qiṣṣat Yūsuf ʻalayhi al-Salām",,
+arabic,إبراهيم الدسوقي عبد الرحمن,Ibrāhīm al-Dasūqī ʻAbd al-Raḥmān,,
+arabic,لا تكن كابني آدم,Lā takun ka-ibnay Ādam ,,
+arabic,لا قاتلا ولا مقتولا,lā qātilan wa-lā maqtūlan,,
+arabic,الجانب الفني في القصة القرآنية,al-jānib al-fannī fī al-qiṣṣah al-Qurʼānīyah,,
+arabic,منهجها، وأسس بنائها,"manhajuhā, wa-usus bināʼihā ",,
+arabic,المبادىء التربوية والأسس النفسية في القصص القرآني,al-Mabādiʼ al-tarbawīyah wa-al-usus al-nafsīyah fī al-qaṣaṣ al-Qurʼānī ,,
+arabic,الابتلاءات الشديدة عند مخالفة الشريعة,al-Ibtilāʼāt al-shadīdah ʻinda mukhālafat al-Sharīʻah,,
+arabic,للداعية الإسلامي الشيخ محمد ياسين أبو يحيى,lil-Dāʻiyah al-Islāmī al-Shaykh Muḥammad Yāsīn Abū Yaḥyá,,
+arabic,روضة المشتاقين في فضائل الأنبياء والمرسلين وشيء من أخبارهم,Rawḍat al-mushtāqīn fī faḍāʼil al-anbiyāʼ wa-al-mursalīn wa-shayʼ min akhbārihim,,
+arabic,يحيى خذ الكتاب بقوة,Yaḥyá khudh al-kitāb bi-qūwah,,
+arabic,خصائص التراكيب ودلالاتها في القصص القرآني,Khaṣāʼiṣ al-tarākīb wa-dalālātuhā fī al-qaṣaṣ al-Qurʼānī,,
+arabic,الخطيئة والصراع,al-khaṭīʼah wa-al-ṣirāʻ,,
+arabic,اللاموضوعية عند المفسرين :‏,al-Lāmawḍūʻīyah ʻinda al-mufassirīn ,,
+arabic,القصص القرآني بين الآباء والابناء :‏ ,al-Qaṣaṣ al-Qurʼānī bayna al-ābāʼ wa-al-abnāʼ ,,
+arabic,الاتساع النصي في القصص القرآني بين الاستباق والاسترجاع ‏ ,al-Ittisāʻ al-naṣṣī fī al-qaṣaṣ al-Qurʼānī bayna al-istibāq wa-al-istirjāʻ,,
+arabic,فأزلهما الشيطان عنها فأخرجهما مما كانا فيه,fa-azallahumā al-Shayṭān ʻanhā fa-akhrajahumā mimmā kānā fīhi,,
+arabic,آثار المشتق البليغ من قصة يوسف الصديق ,Āthār al-mushtaqq al-balīgh min qiṣṣat Yūsuf al-Ṣiddīq,,
+arabic,الجامع الصحيح في القصص النبوي,al-Jāmiʻ al-ṣaḥīḥ fī al-qaṣaṣ al-Nabawī,,
+arabic,يطبع لاول مرة محققا عا نسخة الحافظ الذهبي التي كتبها بخطة,Yuṭbaʻu li-awwal marrah muḥaqqiqan ʻan nuskhah al-Ḥāfiẓ al-Dhahabī allatī katabahā bi-khuṭṭat.,,
+arabic,,Yuṭbaʻu li-awwal marrah ʻan nuskhah Nafīsah manqūlah bi-khaṭṭ al-muʼallif bi-khaṭṭ al-muʼallif,,
+arabic,العربية ولهجاتها ,al-ʻArabīyah wa-lahajātuhā,,
+arabic,اللغة المهرية المعاصرة بين عربيتين,al-Lughah al-Mahrīyah al-muʻāṣirah bayna ʻArabīyatayn,,
+arabic,نحو عربية ميسرة‏, Naḥwa ʻArabīyah muyassarah,,
+arabic,لغات القبائل في كتب إعراب القرآن ومعانيه,Lughāt al-qabāʼil fī kutub iʻrāb al-Qurʼān wa-maʻānīh,,
+arabic,الأدب الجاهلي بين لهجات القبائل وللغة الموحدة,al-adab al-Jāhilī bayna Lahajāt al-qabāʼil wa-al-lughah al-muwaḥḥadah,,
+arabic,التحليل العام للغة العوام,al-Tahḷīl al-ʻāmm li-lughat al-ʻawāmm,,
+arabic,تاريخ الدعوة إلى العامية وآثارها في مصر  ‏ ,Tārīkh al-Daʻwah ilá al-ʻāmmīyah wa-āthāruhā fī Miṣr,,
+arabic,الفصيح الذي حفظته العامية العراقية بين الدراسة والتطبيق,al-faṣīḥ alladhī ḥafiẓatʹhu al-ʻāmmīyah al-ʻIrāqīyah bayna al-dirāsah wa-al-taṭbīq,,
+arabic,ويلي ذلك معجم بألفاظ اللهجة الشائعة في العراق,wa-yalī dhālika Muʻjam bi-alfāẓ al-lahjah al-shāʼiʻah fī al-ʻIrāq,,
+arabic,كلمات فارسية مستعملة في عامية الموصل وفي انحاء العراق,Kalimāt Fārisīyah mustaʻmalah fī ʻāmmīyat al-Mūṣil wa-fī anḥāʼ al-ʻIrāq,,

+ 1 - 1
tests/test01_cfg.py

@@ -113,7 +113,7 @@ class TestHooks(TestCase):
                 tbl["script_to_roman"]["hooks"],
                 tbl["script_to_roman"]["hooks"],
                 {
                 {
                     "begin_input_token": [
                     "begin_input_token": [
-                        (scriptshifter.hooks.test.rotate, {"n": -3})
+                        ("test", scriptshifter.hooks.test.rotate, {"n": -3})
                     ]
                     ]
                 })
                 })