Browse Source

Merge pull request #152 from lcnetdev/re

Use regular expressions for ignore patterns.
Stefano Cossu 6 months ago
parent
commit
b57fa6ef97

+ 3 - 0
Dockerfile

@@ -9,6 +9,9 @@ COPY tests ./tests/
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 
+ENV HF_DATASETS_CACHE /data/hf/datasets
+RUN ./sscli admin init-db
+
 RUN chmod +x ./entrypoint.sh
 #RUN chown -R www:www ${WORKROOT} .
 

+ 1 - 1
doc/rest_api.md

@@ -73,7 +73,7 @@ MIME type: `application/json`
 
 Content: JSON object with the following keys:
 
-- `lang`: Language code as given by the `/languages` endpoint. 
+- `lang`: Language code as given by the `/languages` endpoint.
 - `text`: Input text to be transliterated.
 - `capitalize`: One of `first` (capitalize the first letter of the input),
   `all` (capitalize all words separated by spaces), or null (default: apply no

+ 3 - 2
entrypoint.sh

@@ -9,11 +9,12 @@ else
     export FLASK_ENV="production"
 fi
 
+# Preload Thai model.
+python -c 'from esupar import load; load("th")'
+
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 
-./sscli admin init-db
-
 if [ "${FLASK_ENV}" == "development" ]; then
     exec flask run -h $host -p $port
 else

+ 2 - 0
example.env

@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 TXL_DICTA_EP="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_LOGLEVEL="INFO"
+TXL_EMAIL_FROM="me@loc.gov"
+TXL_EMAIL_TO="me@loc.gov"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"

+ 144 - 0
legacy/processNumbers.ts

@@ -0,0 +1,144 @@
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    let useNumVersion = false;
+    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
+    if ((tag == "245" || tag == "830") && code == "n") {
+       useNumVersion = true;
+    }
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
+     * values of j.
+     */
+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
+    let numToken_re = new RegExp(numTokenPattern);
+    let n = tokens.length
+    //this.alert.info(tokens.join("|"),{autoClose: false})
+    for (let i = 0; i < n; i++) {
+        let toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                let tokj = tokens[j];
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
+                    //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
+                    let m = tokj.match(numToken_re);
+                    if (m) {
+                        textVersion += m[1]
+                        if (m[2] == "") {
+                            numVersion += m[1];
+                        } else {
+                            numVersion += m[2];
+                        }
+                    } else if (j == n - 1) {
+                    //if last token is non-numerical, just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
+                    //(outer loop will pick up at this point)
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    //evaluate numerical string that has been constructed so far
+                    //use num version for ordinals and date strings
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, string may contain literal translations of Chinese numerals
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (m) {
+                                let sum = Number(m[1]) * Number(m[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
+                            } else {
+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (mb)
+                                {
+                                    let sumb = Number(mb[1]) + Number(mb[2]);
+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+                        }
+
+                        //A few other tweaks
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if ((tag == "245" || tag == "830") && code == "n") {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    if (useNumVersion)
+                    {
+                        outputString += numVersion;
+                    }
+                    else
+                    {
+                        outputString += textVersion;
+                    }
+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
+                    //(i.e. two tokens ago)
+                    if (j < n - 1)
+                    {
+                        i = j - 2;
+                    }
+                    else //we are at the end of the string, so we are done!
+                    {
+                        i = j;
+                    }
+                    break;
+                }
+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
+                //This is identical to the code that is run above when the last token is numeric.
+                if (j % 2 == 0)
+                {
+                    let m = tokj.match(numToken_re);
+                    textVersion += m[1];
+                    if (m[2]== "")
+                    {
+                        numVersion += m[1];
+                    }
+                    else
+                    {
+                        numVersion += m[2];
+                    }
+                }
+                else //a delimiter, just tack it on.
+                {
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        }
+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
+        {
+            outputString += toki;
+        }
+    }
+    return outputString;
+ }

+ 1 - 0
requirements.txt

@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2

+ 5 - 3
scriptshifter/__init__.py

@@ -15,7 +15,7 @@ SQLite database path.
 This DB stores all the runtime transliteration data.
 """
 DB_PATH = environ.get(
-        "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
+        "TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
 
 """
 SMTP server for sending email. For a dummy server that just echoes the
@@ -50,8 +50,10 @@ logger = logging.getLogger(__name__)
 if not env:
     logger.warn("No .env file found. Assuming env was passed externally.")
 
-EMAIL_FROM = environ["TXL_EMAIL_FROM"]
-EMAIL_TO = environ["TXL_EMAIL_TO"]
+if SMTP_HOST or FEEDBACK_PATH:
+    EMAIL_FROM = environ["TXL_EMAIL_FROM"]
+    EMAIL_TO = environ["TXL_EMAIL_TO"]
+
 try:
     SMTP_PORT = int(environ.get("TXL_SMTP_PORT", "1025"))
 except ValueError:

+ 8 - 0
scriptshifter/hooks/asian_tokenizer/__init__.py

@@ -0,0 +1,8 @@
+from esupar import load
+
+
+def s2r_tokenize(ctx, model):
+    nlp = load(model)
+    token_data = nlp(ctx.src)
+
+    ctx._src = " ".join(token_data.values[1])

+ 17 - 0
scriptshifter/hooks/chinese/__init__.py

@@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
     # Skip main transliterate function joining.
 
     return normalize_spacing_post_assembly(ctx)
+
+
+def person_name_pre_assembly(ctx):
+    """
+    Parse a personal name from a specific MARC field.
+    """
+    if not ctx.options.get("marc_field") in ("100", "600", "700", "800"):
+        return
+
+    ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
+    ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
+    if len(ctx.dest_ls) > 2:
+        ctx.dest_ls[1] = ctx.dest_ls[1].strip()
+        if ctx.dest_ls[2][0] in "aeiou":
+            ctx.dest_ls[1] += "'"
+        ctx.dest_ls[1] += ctx.dest_ls[2]
+        del(ctx.dest_ls[2])

+ 52 - 52
scriptshifter/tables/__init__.py

@@ -1,5 +1,4 @@
 import logging
-import re
 import sqlite3
 
 from collections import defaultdict
@@ -7,6 +6,7 @@ from functools import cache
 from importlib import import_module
 from json import dumps as jdumps, loads as jloads
 from os import R_OK, access, environ, makedirs, path, unlink
+from re import compile
 from shutil import move
 
 from yaml import load
@@ -28,9 +28,6 @@ runtime.
 """
 
 
-TMP_DB_PATH = path.join(
-        path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
-
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 # Can be overridden for tests.
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
@@ -143,7 +140,7 @@ def init_db():
 
     This operation removes any preexisting database.
 
-    All tables in the index file (`./data/index.yml`) will be parsed
+    All tables in the index file (`./index.yml`) will be parsed
     (including inheritance rules) and loaded into the designated DB.
 
     This must be done only once at bootstrap. To update individual tables,
@@ -151,7 +148,9 @@ def init_db():
     """
     # Create parent diretories if necessary.
     # If the DB already exists, it will be overwritten ONLY on success at
-    # hhis point.
+    # this point.
+    TMP_DB_PATH = path.join(
+            path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
     if path.isfile(TMP_DB_PATH):
         # Remove previous temp file (possibly from failed attempt)
         unlink(TMP_DB_PATH)
@@ -166,25 +165,17 @@ def init_db():
             conn.executescript(fh.read())
 
     # Populate tables.
-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
+    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
         tlist = load(fh, Loader=Loader)
     try:
         with conn:
             for tname, tdata in tlist.items():
-                res = conn.execute(
-                    """INSERT INTO tbl_language (
-                        name, label, marc_code, description
-                    ) VALUES (?, ?, ?, ?)""",
-                    (
-                        tname, tdata.get("name"), tdata.get("marc_code"),
-                        tdata.get("description"),
-                    )
-                )
-                populate_table(conn, res.lastrowid, tname)
+                populate_table(conn, tname, tdata)
 
         # If the DB already exists, it will be overwritten ONLY on success at
         # thhis point.
         move(TMP_DB_PATH, DB_PATH)
+        logger.info(f"Database initialized at {DB_PATH}.")
     finally:
         conn.close()
         if path.isfile(TMP_DB_PATH):
@@ -201,7 +192,27 @@ def get_connection():
     return sqlite3.connect(DB_PATH)
 
 
-def populate_table(conn, tid, tname):
+def populate_table(conn, tname, tdata):
+    """
+    Populate an individual table with data from a configuration.
+
+    @param conn: SQLite connection.
+
+    @param tname(str): Table name.
+
+    @param tdata(dict): Table data.
+    """
+    res = conn.execute(
+        """INSERT INTO tbl_language (
+            name, label, marc_code, description
+        ) VALUES (?, ?, ?, ?)""",
+        (
+            tname, tdata.get("name"), tdata.get("marc_code"),
+            tdata.get("description"),
+        )
+    )
+    tid = res.lastrowid
+
     data = load_table(tname)
     flags = 0
     if "script_to_roman" in data:
@@ -247,20 +258,19 @@ def populate_table(conn, tid, tname):
                             hook_data[1].__name__, jdumps(hook_data[2])))
 
         # Ignore rules (R2S only).
-        for row in sec.get("ignore", []):
-            if isinstance(row, dict):
-                if "re" in row:
-                    flags = FEAT_RE
-                    rule = row["re"]
-            else:
-                flags = 0
-                rule = row
+        for rule in sec.get("ignore", []):
+            conn.execute(
+                    """INSERT INTO tbl_ignore (
+                        lang_id, rule, features
+                    ) VALUES (?, ?, ?)""",
+                    (tid, rule, 0))
 
+        for rule in sec.get("ignore_ptn", []):
             conn.execute(
                     """INSERT INTO tbl_ignore (
                         lang_id, rule, features
                     ) VALUES (?, ?, ?)""",
-                    (tid, rule, flags))
+                    (tid, rule, FEAT_RE))
 
         # Double caps (S2R only).
         for rule in sec.get("double_cap", []):
@@ -417,33 +427,22 @@ def load_table(tname):
 
         # Ignore regular expression patterns.
         # Patterns are evaluated in the order they are listed in the config.
-        ignore_ptn = [
-                re.compile(ptn)
-                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
+        ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", [])
         for parent in parents:
             parent_tdata = load_table(parent)
             # NOTE: duplicates are not removed.
-            ignore_ptn = [
-                re.compile(ptn)
-                for ptn in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore_ptn", [])
-            ] + ignore_ptn
+            ignore_ptn = parent_tdata.get(
+                    "roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn
         tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
 
         # Ignore plain strings.
-        ignore = {
-            Token(t)
-            for t in tdata["roman_to_script"].get("ignore", [])
-        }
+        ignore = set(tdata["roman_to_script"].get("ignore", []))
         for parent in parents:
             parent_tdata = load_table(parent)
             # No overriding occurs with the ignore list, only de-duplication.
-            ignore |= {
-                Token(t) for t in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore", [])
-            }
-        tdata["roman_to_script"]["ignore"] = [
-                t.content for t in sorted(ignore)]
+            ignore |= set(parent_tdata.get(
+                        "roman_to_script", {}).get("ignore", []))
+        tdata["roman_to_script"]["ignore"] = sorted(ignore)
 
         # Hooks.
         if "hooks" in tdata["roman_to_script"]:
@@ -521,6 +520,10 @@ def get_language(lang):
             if len(s2r_hooks):
                 data["script_to_roman"]["hooks"] = s2r_hooks
 
+            double_cap = get_lang_dcap(conn, lang_id)
+            if len(double_cap):
+                data["script_to_roman"]["double_cap"] = double_cap
+
         # Roman to script map, ignore list, and hooks.
 
         if data["has_r2s"]:
@@ -542,10 +545,6 @@ def get_language(lang):
         if len(opt_data):
             data["options"] = opt_data
 
-        double_cap = get_lang_dcap(conn, lang_id)
-        if len(double_cap):
-            data["double_cap"] = double_cap
-
     conn.close()
 
     return data
@@ -591,8 +590,9 @@ def get_lang_ignore(conn, lang_id):
             """SELECT rule, features FROM tbl_ignore
             WHERE lang_id = ?""",
             (lang_id,))
-    # Features (regular expressions) not implemented yet.
-    return tuple(row[0] for row in qry)
+    return tuple(
+            compile(row[0]) if row[1] & FEAT_RE else row[0]
+            for row in qry)
 
 
 @cache
@@ -652,7 +652,7 @@ def get_lang_hooks(conn, lang_id, t_dir):
             }
         )
 
-    return hooks
+    return dict(hooks)
 
 
 def get_lang_dcap(conn, lang_id):

+ 13 - 98
scriptshifter/tables/data/_ignore_base.yml

@@ -16,106 +16,21 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
     - "and one other"
-    #- re: "and ([a-z0-9]+ )?others"
     - "et al."
+  ignore_ptn:
+    - "and ([a-z0-9]+ )?others"
+    - "I{2,3}"
+    - "I(V|X)"
+    - "LI{,3}"
+    - "LI?(V|X)"
+    - "L(V|X{1,3})I{,3}"
+    - "LX{1,3}I?V"
+    - "LX{1,3}VI{,3}"
+    - "(V|X{1,3})I{,3}"
+    - "X{1,3}I{,3}"
+    - "X{1,3}I(V|X)"
+    - "X{1,3}VI{,3}"
 
 script_to_roman:
   ignore:

+ 2 - 0
scriptshifter/tables/data/chinese.yml

@@ -31,6 +31,8 @@ script_to_roman:
     pre_assembly:
       -
         - chinese.parse_numerals_pre_assembly
+      -
+        - chinese.person_name_pre_assembly
 
   map:
     "〇": "ling#0 "

+ 3 - 0
scriptshifter/tables/data/thai.yml

@@ -33,6 +33,9 @@ options:
 script_to_roman:
   hooks:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
         - aksharamukha.romanizer.s2r_post_config
         - src_script: "Thai"

+ 5 - 0
scriptshifter/tables/data/thai_alt.yml

@@ -4,6 +4,11 @@ general:
   case_sensitive: false
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "th"
   map:
     # COMMON SPECIAL CHARACTERS
 

+ 5 - 3
scriptshifter/tables/data/uighur_arabic.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Uighur (Arabic)
+  case_sensitive: false
 
 roman_to_script:
   map:
@@ -118,7 +120,7 @@ roman_to_script:
     "%zh": "\uFB8A"
     "zh": "\uFB8B"
     "%zh%": "\uFB8A"
-    
+
 script_to_roman:
   map:
     "\u0626\u0627": "a"
@@ -157,9 +159,9 @@ script_to_roman:
     "\uFEEB": "h"
     "\uFEEC": "h"
     "\u0640\u0629": "h"
-    "\uFEEA": "h"
+    "%\uFEEA": "h"
     "\u0629": "h"
-    "\u0647": "h"
+    "%\u0647%": "h"
     "\uFE8C": "i"
     "\uFBE8": "i"
     "\uFBE9": "i"

+ 0 - 0
scriptshifter/tables/data/index.yml → scriptshifter/tables/index.yml


+ 34 - 7
scriptshifter/trans.py

@@ -1,7 +1,7 @@
 import logging
 
 from importlib import import_module
-from re import compile
+from re import Pattern, compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         if _run_hook("post_config", ctx) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
-        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-        if _run_hook("post_normalize", ctx) == BREAK:
+        # _normalize_src returns the results of the post_normalize hook.
+        if _normalize_src(
+                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
+        logger.debug(f"Normalized source: {ctx.src}")
         lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
         # Loop through source characters. The increment of each loop depends on
@@ -169,8 +170,21 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                     if hret == CONT:
                         continue
 
-                    step = len(ctx.tk)
-                    if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                    _matching = False
+                    if type(ctx.tk) is Pattern:
+                        # Seach RE pattern beginning at cursor.
+                        if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]):
+                            ctx.tk = _ptn_match[0]
+                            logger.debug(f"Matched regex: {ctx.tk}")
+                            step = len(ctx.tk)
+                            _matching = True
+                    else:
+                        # Search exact match.
+                        step = len(ctx.tk)
+                        if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                            _matching = True
+
+                    if _matching:
                         # The position matches an ignore token.
                         hret = _run_hook("on_ignore_match", ctx)
                         if hret == BREAK:
@@ -181,6 +195,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                         logger.info(f"Ignored token: {ctx.tk}")
                         ctx.dest_ls.append(ctx.tk)
                         ctx.cur += step
+                        if ctx.cur >= len(ctx.src):
+                            # reached end of string. Stop ignoring.
+                            # The outer loop will exit immediately after.
+                            ctx.ignoring = False
+                            break
+
                         cur_char = ctx.src[ctx.cur]
                         ctx.ignoring = True
                         break
@@ -193,6 +213,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             delattr(ctx, "tk")
             delattr(ctx, "ignoring")
 
+            if ctx.cur >= len(ctx.src):
+                break
+
             # Begin transliteration token lookup.
             ctx.match = False
 
@@ -315,10 +338,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 def _normalize_src(ctx, norm_rules):
     """
     Normalize source text according to rules.
+
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
     """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
-    logger.debug(f"Normalized source: {ctx.src}")
+
+    return _run_hook("post_normalize", ctx)
 
 
 def _is_bow(cur, ctx, word_boundary):

+ 2 - 0
scriptshifter_base.Dockerfile

@@ -10,6 +10,8 @@ RUN addgroup --system www
 RUN adduser --system www
 RUN gpasswd -a www www
 
+ENV HF_DATASETS_CACHE /data/hf/datasets
+
 # Copy external dependencies.
 WORKDIR ${WORKROOT}
 COPY ext ./ext/

+ 1 - 1
sscli

@@ -10,7 +10,7 @@ from os import path
 
 from scriptshifter import DB_PATH
 from scriptshifter.tables import init_db as _init_db
-from tests import test_sample
+from tests.integration_tests import test_sample
 
 
 @click.group()

+ 10 - 64
tests/__init__.py

@@ -1,71 +1,17 @@
-from csv import reader
-from difflib import ndiff
 from importlib import reload
-from json import loads as jloads
-from logging import getLogger
-from os import path
+from os import path, environ
+from tempfile import gettempdir
 
-import scriptshifter.tables
-
-from scriptshifter.trans import transliterate
+import scriptshifter
+from scriptshifter import tables
 
 
 TEST_DIR = path.dirname(path.realpath(__file__))
 TEST_DATA_DIR = path.join(TEST_DIR, "data")
+TEST_CONFIG_DIR = path.join(TEST_DIR, "tables", "data")
 
-logger = getLogger(__name__)
-
-
-def reload_tables():
-    reload(scriptshifter.tables)  # Reload new config dir.
-    from scriptshifter import tables
-    tables.list_tables.cache_clear()
-    tables.get_language.cache_clear()
-    tables.get_lang_map.cache_clear()
-
-    return tables
-
-
-def test_sample(dset):
-    """
-    Test an individual sample set and produce a human-readable report.
-
-    Used outside of automated tests.
-
-    @param dset (str): sample set name (without the .csv extension) found in
-    the `data/script_samples` directory.
-    """
-    deltas = []
-    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
-    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
-
-    with open(dset_fpath, newline="") as fh:
-        csv = reader(fh)
-        for row in csv:
-            lang, script, rom = row[:3]
-            if not lang:
-                continue
-            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
-            trans, warnings = transliterate(
-                    script, lang, t_dir="s2r",
-                    capitalize=opts.get("capitalize"), options=opts)
-            if (trans == rom):
-                print(".", end="")
-            else:
-                print("F", end="")
-                deltas.append((lang, script, ndiff([trans], [rom])))
-
-    with open(log_fpath, "w") as fh:
-        # If no deltas, just truncate the file.
-        for lang, script, delta in deltas:
-            fh.write(f"Language: {lang}\n")
-            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
-            for dline in delta:
-                fh.write(dline.strip() + "\n")
-            fh.write("\n\n")
-
-    ct = len(deltas)
-    if ct > 0:
-        print(f"{ct} failed tests. See report at {log_fpath}")
-    else:
-        print("All tests passed.")
+# Reload main SS modules after changing environment variables.
+environ["TXL_DB_PATH"] = path.join(gettempdir(), "scriptshifter_unittest.db")
+reload(scriptshifter)
+environ["TXL_CONFIG_TABLE_DIR"] = TEST_CONFIG_DIR
+reload(tables)

+ 0 - 6
tests/data/index.yml

@@ -1,6 +0,0 @@
-inherited:
-  name: Test inherited table
-ordering:
-  name: Test ordering
-rot3:
-  name: Test ROT3 hooks

+ 8 - 0
tests/data/script_samples/unittest.csv

@@ -0,0 +1,8 @@
+inherited,abcd,9078,,
+inherited,TUVX,tuvx,"{""t_dir"": ""r2s""}",
+rot3,defg,abcd,,
+rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}",
+rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}",
+rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}",
+regex,Hello abc,Hello 678,"{""t_dir"": ""r2s""}",
+regex,Hullo abc,5u22o 678,"{""t_dir"": ""r2s""}",

+ 58 - 0
tests/integration_tests.py

@@ -0,0 +1,58 @@
+from csv import reader
+from difflib import ndiff
+from json import loads as jloads
+from logging import getLogger
+from os import path
+
+from scriptshifter.trans import transliterate
+from tests import TEST_DATA_DIR
+
+logger = getLogger(__name__)
+
+
+def test_sample(dset):
+    """
+    Test an individual sample set and produce a human-readable report.
+
+    Used outside of automated tests.
+
+    @param dset (str): sample set name (without the .csv extension) found in
+    the `data/script_samples` directory.
+    """
+    deltas = []
+    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
+    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
+
+    with open(dset_fpath, newline="") as fh:
+        csv = reader(fh)
+        i = 1
+        for row in csv:
+            logger.info(f"CSV row #{i}")
+            i += 1
+            lang, script, rom = row[:3]
+            if not lang:
+                continue
+            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
+            trans, warnings = transliterate(
+                    script, lang, t_dir="s2r",
+                    capitalize=opts.get("capitalize"), options=opts)
+            if (trans == rom):
+                print(".", end="")
+            else:
+                print("F", end="")
+                deltas.append((lang, script, ndiff([trans], [rom])))
+
+    with open(log_fpath, "w") as fh:
+        # If no deltas, just truncate the file.
+        for lang, script, delta in deltas:
+            fh.write(f"Language: {lang}\n")
+            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
+            for dline in delta:
+                fh.write(dline.strip() + "\n")
+            fh.write("\n\n")
+
+    ct = len(deltas)
+    if ct > 0:
+        print(f"{ct} failed tests. See report at {log_fpath}")
+    else:
+        print("All tests passed.")

+ 0 - 0
tests/data/_base1.yml → tests/tables/data/_base1.yml


+ 0 - 0
tests/data/_base2.yml → tests/tables/data/_base2.yml


+ 0 - 0
tests/data/_base3.yml → tests/tables/data/_base3.yml


+ 0 - 0
tests/data/cap_base1.yml → tests/tables/data/cap_base1.yml


+ 0 - 0
tests/data/cap_base2.yml → tests/tables/data/cap_base2.yml


+ 0 - 0
tests/data/cap_inherited.yml → tests/tables/data/cap_inherited.yml


+ 0 - 0
tests/data/inherited.yml → tests/tables/data/inherited.yml


+ 0 - 0
tests/data/ordering.yml → tests/tables/data/ordering.yml


+ 19 - 0
tests/tables/data/regex.yml

@@ -0,0 +1,19 @@
+---
+# Test file for regex ignoring.
+
+general:
+  name: Test regex ignoring.
+  parents:
+    - inherited
+
+roman_to_script:
+  ignore_ptn:
+    - "[hH][ae]llo"
+
+  map:
+    "h": "1"
+    "H": "5"
+    "l": "2"
+    "a": "6"
+    "b": "7"
+    "c": "8"

+ 0 - 0
tests/data/rot3.yml → tests/tables/data/rot3.yml


+ 17 - 0
tests/tables/index.yml

@@ -0,0 +1,17 @@
+---
+inherited:
+  name: Test inheritance leaf file
+  marc_code: inh
+  description: Test description.
+cap_base1:
+  name: Test capitalization base 1
+cap_base2:
+  name: Test capitalization base 2
+cap_inherited:
+  name: Test capitalization
+ordering:
+  name: Test ordering
+regex:
+  name: inherited config + regex ignore.
+rot3:
+  name: Test ROT3 hooks

+ 29 - 33
tests/test01_cfg.py

@@ -1,20 +1,21 @@
+from os import environ, unlink
 from unittest import TestCase
 
-from os import environ
+from scriptshifter.tables import get_language, init_db
 
-import scriptshifter
 
-from tests import TEST_DATA_DIR, reload_tables
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 class TestConfig(TestCase):
     """ Test configuration parsing. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_ordering(self):
-        tbl = self.tables.load_table("ordering")
+        tbl = get_language("ordering")
         exp_order = ["ABCD", "AB", "A", "BCDE", "BCD", "BEFGH", "B"]
 
         self.assertEqual(
@@ -23,19 +24,17 @@ class TestConfig(TestCase):
 
 class TestOverride(TestCase):
     """ Test configuration overrides. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_override_map(self):
-        tbl = self.tables.load_table("inherited")
+        tbl = get_language("inherited")
 
-        self.assertEqual(tbl["general"]["name"], "Test inheritance leaf file")
+        self.assertEqual(tbl["label"], "Test inheritance leaf file")
+        self.assertEqual(tbl["marc_code"], "inh")
+        self.assertEqual(tbl["description"], "Test description.")
 
         # Entries are additive.
         self.assertEqual(
                 tbl["roman_to_script"]["ignore"],
-                ["Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"])
+                ("Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"))
         self.assertEqual(
                 tbl["roman_to_script"]["map"],
                 (
@@ -102,34 +101,31 @@ class TestOverride(TestCase):
 
 class TestHooks(TestCase):
     """ Test parsing of hook functions. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_rot3(self):
-        tbl = self.tables.load_table("rot3")
+        tbl = get_language("rot3")
 
         self.assertEqual(
-                tbl["script_to_roman"]["hooks"],
-                {
-                    "begin_input_token": [
-                        ("test", scriptshifter.hooks.test.rotate, {"n": -3})
-                    ]
-                })
+            tbl["script_to_roman"]["hooks"],
+            {
+                "begin_input_token": [
+                    {
+                        "module_name": "test",
+                        "fn_name": "rotate",
+                        "kwargs": {"n": -3},
+                    }
+                ]
+            }
+        )
 
 
 class TestDoubleCaps(TestCase):
     """ Test double capitalization configuration. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_dcaps_base1(self):
-        cap_base1 = self.tables.load_table("cap_base1")
+        cap_base1 = get_language("cap_base1")
         assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"]
 
     def test_dcaps_base2(self):
-        cap_base2 = self.tables.load_table("cap_base2")
+        cap_base2 = get_language("cap_base2")
         dcap = cap_base2["script_to_roman"]["double_cap"]
 
         assert len(dcap) == 2
@@ -137,7 +133,7 @@ class TestDoubleCaps(TestCase):
         assert "i︠o︡" in dcap
 
     def test_dcaps_inherited(self):
-        cap_inherited = self.tables.load_table("cap_inherited")
+        cap_inherited = get_language("cap_inherited")
         dcap = cap_inherited["script_to_roman"]["double_cap"]
 
         assert len(dcap) == 1

+ 33 - 47
tests/test02_transliteration.py

@@ -2,18 +2,25 @@ import logging
 
 from unittest import TestCase, TestSuite, TextTestRunner
 from csv import reader
-from glob import glob
 from json import loads as jloads
-from os import environ, path
+from os import environ, path, unlink
 
-from tests import TEST_DATA_DIR, reload_tables
 from scriptshifter.trans import transliterate
-import scriptshifter.tables
+from scriptshifter.tables import get_language, init_db
+from tests import TEST_DATA_DIR
 
 
 logger = logging.getLogger(__name__)
 
 
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestTrans(TestCase):
     """
     Test S2R transliteration.
@@ -23,20 +30,21 @@ class TestTrans(TestCase):
     TODO use a comprehensive sample table and report errors for unsupported
     languages.
     """
-
-    maxDiff = None
-
-    def sample_s2r(self):
+    def sample(self):
         """
-        Test S2R transliteration for one CSV sample.
+        Test transliteration for one CSV row.
 
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "script_to_roman" in config:
+        config = get_language(self.tbl)
+        t_dir = self.options.get("t_dir", "s2r")
+        if (
+                t_dir == "s2r" and config["has_s2r"]
+                or t_dir == "r2s" and config["has_r2s"]):
             txl = transliterate(
                     self.script, self.tbl,
+                    t_dir=t_dir,
                     capitalize=self.options.get("capitalize", False),
                     options=self.options)[0]
             self.assertEqual(
@@ -44,49 +52,27 @@ class TestTrans(TestCase):
                     f"S2R transliteration error for {self.tbl}!\n"
                     f"Original: {self.script}")
 
-    def sample_r2s(self):
-        """
-        Test R2S transliteration for one CSV sample.
-
-        This function name won't start with `test_` otherwise will be
-        automatically run without parameters.
-        """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "roman_to_script" in config:
-            txl = transliterate(
-                    self.roman, self.tbl,
-                    t_dir="r2s",
-                    capitalize=self.options.get("capitalize", False),
-                    options=self.options)[0]
-            self.assertEqual(
-                    txl, self.script,
-                    f"R2S transliteration error for {self.tbl}!\n"
-                    f"Original: {self.roman}")
-
 
 def make_suite():
     """
     Build parametrized test cases.
     """
-    if "TXL_CONFIG_TABLE_DIR" in environ:
-        del environ["TXL_CONFIG_TABLE_DIR"]
-    reload_tables()
-
     suite = TestSuite()
 
-    for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")):
-        with open(fpath, newline="") as fh:
-            csv = reader(fh)
-            for row in csv:
-                if len(row[0]):
-                    # Inject transliteration info in the test case.
-                    for tname in ("sample_s2r", "sample_r2s"):
-                        tcase = TestTrans(tname)
-                        tcase.tbl = row[0]
-                        tcase.script = row[1].strip()
-                        tcase.roman = row[2].strip()
-                        tcase.options = jloads(row[3]) if len(row[3]) else {}
-                        suite.addTest(tcase)
+    with open(path.join(
+        TEST_DATA_DIR, "script_samples", "unittest.csv"
+    ), newline="") as fh:
+        csv = reader(fh)
+        for row in csv:
+            if len(row[0]):
+                # Inject transliteration info in the test case.
+                tcase = TestTrans("sample")
+                tcase.tbl = row[0]
+                tcase.script = row[1].strip()
+                tcase.roman = row[2].strip()
+                tcase.options = jloads(row[3]) if len(row[3]) else {}
+
+                suite.addTest(tcase)
 
     return suite
 

+ 10 - 7
tests/test03_capitalization.py

@@ -1,19 +1,22 @@
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 
 from scriptshifter.trans import transliterate
-from tests import TEST_DATA_DIR, reload_tables
+from scriptshifter.tables import init_db
+
+
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 class TestCapitalization(TestCase):
     """
     Test capitalization.
     """
-
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_cap(self):
         tbl = "cap_inherited"
         in_str = "зг іо"

+ 21 - 15
tests/test04_rest_api.py

@@ -1,25 +1,28 @@
 import json
 
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 
 from scriptshifter.rest_api import app
-from tests import TEST_DATA_DIR, reload_tables
+from scriptshifter.tables import init_db
 
 
 EP = "http://localhost:8000"
 
 
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestRestAPI(TestCase):
     """ Test REST API interaction. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        # if "TXL_CONFIG_TABLE_DIR" in environ:
-        #     del environ["TXL_CONFIG_TABLE_DIR"]
-        reload_tables()
-
-        # Start webapp.
-        app.testing = True
+    # def setUp(self):
+    #     # Start webapp.
+    #     app.testing = True
 
     def test_health(self):
         with app.test_client() as c:
@@ -35,7 +38,7 @@ class TestRestAPI(TestCase):
 
         data = json.loads(rsp.get_data(as_text=True))
         self.assertIn("inherited", data)
-        self.assertIn("name", data["inherited"])
+        self.assertIn("label", data["inherited"])
         self.assertNotIn("_base1", data)
         self.assertNotIn("_base2", data)
         self.assertNotIn("_base3", data)
@@ -47,14 +50,17 @@ class TestRestAPI(TestCase):
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
 
-        self.assertIn("general", data)
+        self.assertIn("case_sensitive", data)
+        self.assertIn("description", data)
         self.assertIn("roman_to_script", data)
         self.assertIn("map", data["roman_to_script"])
+        self.assertEqual(data["has_r2s"], True)
+        self.assertEqual(data["has_s2r"], False)
         self.assertEqual(data["roman_to_script"]["map"][0], ["ABCD", ""])
 
     def test_trans_api_s2r(self):
         with app.test_client() as c:
-            rsp = c.post("/trans", data={"lang": "rot3", "text": "defg"})
+            rsp = c.post("/trans", json={"lang": "rot3", "text": "defg"})
 
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
@@ -64,7 +70,7 @@ class TestRestAPI(TestCase):
     def test_trans_api_r2s(self):
         with app.test_client() as c:
             rsp = c.post(
-                "/trans", data={
+                "/trans", json={
                     "lang": "rot3",
                     "text": "abcd",
                     "t_dir": "r2s"
@@ -80,7 +86,7 @@ class TestRestAPI(TestCase):
         with app.test_client() as c:
             rsp = c.post(
                 "/trans",
-                data={
+                json={
                     "lang": "rot3",
                     "capitalize": "first",
                     "text": "bcde",