Parcourir la source

Merge pull request #152 from lcnetdev/re

Use regular expressions for ignore patterns.
Stefano Cossu il y a 6 mois
Parent
commit
b57fa6ef97

+ 3 - 0
Dockerfile

@@ -9,6 +9,9 @@ COPY tests ./tests/
 COPY requirements.txt ./
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 
+ENV HF_DATASETS_CACHE /data/hf/datasets
+RUN ./sscli admin init-db
+
 RUN chmod +x ./entrypoint.sh
 RUN chmod +x ./entrypoint.sh
 #RUN chown -R www:www ${WORKROOT} .
 #RUN chown -R www:www ${WORKROOT} .
 
 

+ 1 - 1
doc/rest_api.md

@@ -73,7 +73,7 @@ MIME type: `application/json`
 
 
 Content: JSON object with the following keys:
 Content: JSON object with the following keys:
 
 
-- `lang`: Language code as given by the `/languages` endpoint. 
+- `lang`: Language code as given by the `/languages` endpoint.
 - `text`: Input text to be transliterated.
 - `text`: Input text to be transliterated.
 - `capitalize`: One of `first` (capitalize the first letter of the input),
 - `capitalize`: One of `first` (capitalize the first letter of the input),
   `all` (capitalize all words separated by spaces), or null (default: apply no
   `all` (capitalize all words separated by spaces), or null (default: apply no

+ 3 - 2
entrypoint.sh

@@ -9,11 +9,12 @@ else
     export FLASK_ENV="production"
     export FLASK_ENV="production"
 fi
 fi
 
 
+# Preload Thai model.
+python -c 'from esupar import load; load("th")'
+
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 host=${TXL_WEBAPP_HOST:-"0.0.0.0"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 port=${TXL_WEBAPP_PORT:-"8000"}
 
 
-./sscli admin init-db
-
 if [ "${FLASK_ENV}" == "development" ]; then
 if [ "${FLASK_ENV}" == "development" ]; then
     exec flask run -h $host -p $port
     exec flask run -h $host -p $port
 else
 else

+ 2 - 0
example.env

@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 TXL_DICTA_EP="changeme"
 TXL_DICTA_EP="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_LOGLEVEL="INFO"
 TXL_LOGLEVEL="INFO"
+TXL_EMAIL_FROM="me@loc.gov"
+TXL_EMAIL_TO="me@loc.gov"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"

+ 144 - 0
legacy/processNumbers.ts

@@ -0,0 +1,144 @@
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    let useNumVersion = false;
+    //useNumVersion is set in specific subfields where we definitely want to treat numbers as numbers
+    if ((tag == "245" || tag == "830") && code == "n") {
+       useNumVersion = true;
+    }
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters will be captured and included in the string of tokens.  Only the even-numbered
+     * array elements are the true 'tokens', so the code for processing tokens is run only for even
+     * values of j.
+     */
+    let tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
+    let numTokenPattern = "^([A-Za-z]+)#([0-9]*)$";
+    let numToken_re = new RegExp(numTokenPattern);
+    let n = tokens.length
+    //this.alert.info(tokens.join("|"),{autoClose: false})
+    for (let i = 0; i < n; i++) {
+        let toki = tokens[i];
+        if (toki.match(numToken_re)) {
+            /*
+             * When a numerical token (containing #) is reached, the inner loop consumes it and all consecutive numerical tokens
+             * found after it.  Two versions of the string are maintained.  The textVersion is the original pinyin (minus the
+             * # suffixes).  In the numVersion, characters representing numbers are converted to Arabic numerals.  When a
+             * non-numerical token (or end of string) is encountered, the string of numerical tokens is evaluated to determine
+             * which version should be used in the output string.  The outer loop then continues where the inner loop left off.
+             */
+            let textVersion = "";
+            let numVersion = "";
+            for (let j = i; j < n; j++) {
+                let tokj = tokens[j];
+                /* a token without # (or the end of string) is reached */
+                if ((j % 2 == 0 && !tokj.match(numToken_re)) || j == n - 1) {
+                    //If this runs, then we are on the last token and it is numeric. Add text after # (if present) to numerical version
+                    let m = tokj.match(numToken_re);
+                    if (m) {
+                        textVersion += m[1]
+                        if (m[2] == "") {
+                            numVersion += m[1];
+                        } else {
+                            numVersion += m[2];
+                        }
+                    } else if (j == n - 1) {
+                    //if last token is non-numerical, just tack it on.
+                        textVersion += tokj;
+                        numVersion += tokj;
+                    } else if (textVersion.length > 0 && numVersion.length > 0) {
+                    //if not at end of string yet and token is non-numerical, remove the last delimiter that was appended
+                    //(outer loop will pick up at this point)
+                        textVersion = textVersion.substring(0, textVersion.length - 1);
+                        numVersion = numVersion.substring(0, numVersion.length - 1);
+                    }
+                    //evaluate numerical string that has been constructed so far
+                    //use num version for ordinals and date strings
+                    if (numVersion.match(/^di [0-9]/i) ||
+                        numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
+                        numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
+                        numVersion.match(/"[0-9]+ yue [0-9]+ ri/i) ||
+                        useNumVersion
+                       ) {
+                        useNumVersion = true;
+                        /*
+                         * At this point, string may contain literal translations of Chinese numerals
+                         * Convert these to Arabic numerals (for example "2 10 7" = "27").
+                         */
+
+                        while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
+                            m = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                            if (m) {
+                                let sum = Number(m[1]) * Number(m[2]);
+                                numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(sum));
+                            } else {
+                                let mb = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                                if (mb)
+                                {
+                                    let sumb = Number(mb[1]) + Number(mb[2]);
+                                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(sumb));
+                                }
+                                else
+                                {
+                                    break;
+                                }
+                            }
+                        }
+
+                        //A few other tweaks
+                        numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                        if ((tag == "245" || tag == "830") && code == "n") {
+                            while (numVersion.match(/[0-9] [0-9]/)) {
+                                numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                            }
+                        }
+                    }
+                    if (useNumVersion)
+                    {
+                        outputString += numVersion;
+                    }
+                    else
+                    {
+                        outputString += textVersion;
+                    }
+                    //if the end of the string is not reached, backtrack to the delimiter after the last numerical token
+                    //(i.e. two tokens ago)
+                    if (j < n - 1)
+                    {
+                        i = j - 2;
+                    }
+                    else //we are at the end of the string, so we are done!
+                    {
+                        i = j;
+                    }
+                    break;
+                }
+                //this is run when we are not yet at the end of the string and have not yet reached a non-numerical token
+                //This is identical to the code that is run above when the last token is numeric.
+                if (j % 2 == 0)
+                {
+                    let m = tokj.match(numToken_re);
+                    textVersion += m[1];
+                    if (m[2]== "")
+                    {
+                        numVersion += m[1];
+                    }
+                    else
+                    {
+                        numVersion += m[2];
+                    }
+                }
+                else //a delimiter, just tack it on.
+                {
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+            }
+        }
+        else // the outer loop has encountered a non-numeric token or delimiter, just tack it on.
+        {
+            outputString += toki;
+        }
+    }
+    return outputString;
+ }

+ 1 - 0
requirements.txt

@@ -1,5 +1,6 @@
 # Core application dependencies.
 # Core application dependencies.
 aksharamukha>=2.2,<3
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask>=2.3,<3
 flask-cors>=4.0,<5
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2
 python-dotenv>=1.0,<2

+ 5 - 3
scriptshifter/__init__.py

@@ -15,7 +15,7 @@ SQLite database path.
 This DB stores all the runtime transliteration data.
 This DB stores all the runtime transliteration data.
 """
 """
 DB_PATH = environ.get(
 DB_PATH = environ.get(
-        "DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
+        "TXL_DB_PATH", path.join(APP_ROOT, "data", "scriptshifter.db"))
 
 
 """
 """
 SMTP server for sending email. For a dummy server that just echoes the
 SMTP server for sending email. For a dummy server that just echoes the
@@ -50,8 +50,10 @@ logger = logging.getLogger(__name__)
 if not env:
 if not env:
     logger.warn("No .env file found. Assuming env was passed externally.")
     logger.warn("No .env file found. Assuming env was passed externally.")
 
 
-EMAIL_FROM = environ["TXL_EMAIL_FROM"]
-EMAIL_TO = environ["TXL_EMAIL_TO"]
+if SMTP_HOST or FEEDBACK_PATH:
+    EMAIL_FROM = environ["TXL_EMAIL_FROM"]
+    EMAIL_TO = environ["TXL_EMAIL_TO"]
+
 try:
 try:
     SMTP_PORT = int(environ.get("TXL_SMTP_PORT", "1025"))
     SMTP_PORT = int(environ.get("TXL_SMTP_PORT", "1025"))
 except ValueError:
 except ValueError:

+ 8 - 0
scriptshifter/hooks/asian_tokenizer/__init__.py

@@ -0,0 +1,8 @@
+from esupar import load
+
+
def s2r_tokenize(ctx, model):
    """
    Script-to-Roman tokenization hook backed by an esupar model.

    Loads the requested esupar model, parses the context's source text,
    and overwrites the protected source string with the space-joined
    token forms.

    @param ctx: Transliteration context; its ``_src`` attribute is
    replaced in place.

    @param model (str): esupar model name (e.g. "th").
    """
    parser = load(model)
    parsed = parser(ctx.src)
    # parsed.values[1] appears to hold the token word forms —
    # TODO confirm against the esupar output format.
    ctx._src = " ".join(parsed.values[1])

+ 17 - 0
scriptshifter/hooks/chinese/__init__.py

@@ -127,3 +127,20 @@ def parse_numerals_pre_assembly(ctx):
     # Skip main transliterate function joining.
     # Skip main transliterate function joining.
 
 
     return normalize_spacing_post_assembly(ctx)
     return normalize_spacing_post_assembly(ctx)
+
+
def person_name_pre_assembly(ctx):
    """
    Parse a personal name from a specific MARC field.

    Only acts when the ``marc_field`` option is one of the personal-name
    fields (100, 600, 700, 800); otherwise this is a no-op.

    The first token becomes the capitalized surname followed by ", "; the
    second token becomes the capitalized forename. When a third token is
    present it is merged into the forename, with an apostrophe inserted
    before a vowel-initial syllable (e.g. "Yang" + "an" -> "Yang'an").

    @param ctx: Transliteration context with ``options`` (dict) and
    ``dest_ls`` (list of transliterated tokens); modified in place.
    Assumes ``dest_ls`` has at least two elements — TODO confirm callers
    guarantee this.
    """
    if ctx.options.get("marc_field") not in ("100", "600", "700", "800"):
        return

    # Surname: capitalized, trailing whitespace dropped, comma appended.
    ctx.dest_ls[0] = ctx.dest_ls[0].capitalize().strip() + ", "
    ctx.dest_ls[1] = ctx.dest_ls[1].capitalize()
    if len(ctx.dest_ls) > 2:
        ctx.dest_ls[1] = ctx.dest_ls[1].strip()
        # Disambiguate syllable boundary before a vowel-initial syllable.
        if ctx.dest_ls[2][0] in "aeiou":
            ctx.dest_ls[1] += "'"
        ctx.dest_ls[1] += ctx.dest_ls[2]
        del ctx.dest_ls[2]

+ 52 - 52
scriptshifter/tables/__init__.py

@@ -1,5 +1,4 @@
 import logging
 import logging
-import re
 import sqlite3
 import sqlite3
 
 
 from collections import defaultdict
 from collections import defaultdict
@@ -7,6 +6,7 @@ from functools import cache
 from importlib import import_module
 from importlib import import_module
 from json import dumps as jdumps, loads as jloads
 from json import dumps as jdumps, loads as jloads
 from os import R_OK, access, environ, makedirs, path, unlink
 from os import R_OK, access, environ, makedirs, path, unlink
+from re import compile
 from shutil import move
 from shutil import move
 
 
 from yaml import load
 from yaml import load
@@ -28,9 +28,6 @@ runtime.
 """
 """
 
 
 
 
-TMP_DB_PATH = path.join(
-        path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
-
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 DEFAULT_TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 # Can be overridden for tests.
 # Can be overridden for tests.
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
 TABLE_DIR = environ.get("TXL_CONFIG_TABLE_DIR", DEFAULT_TABLE_DIR)
@@ -143,7 +140,7 @@ def init_db():
 
 
     This operation removes any preexisting database.
     This operation removes any preexisting database.
 
 
-    All tables in the index file (`./data/index.yml`) will be parsed
+    All tables in the index file (`./index.yml`) will be parsed
     (including inheritance rules) and loaded into the designated DB.
     (including inheritance rules) and loaded into the designated DB.
 
 
     This must be done only once at bootstrap. To update individual tables,
     This must be done only once at bootstrap. To update individual tables,
@@ -151,7 +148,9 @@ def init_db():
     """
     """
     # Create parent directories if necessary.
     # Create parent directories if necessary.
     # If the DB already exists, it will be overwritten ONLY on success at
     # If the DB already exists, it will be overwritten ONLY on success at
-    # hhis point.
+    # this point.
+    TMP_DB_PATH = path.join(
+            path.dirname(DB_PATH), "~tmp." + path.basename(DB_PATH))
     if path.isfile(TMP_DB_PATH):
     if path.isfile(TMP_DB_PATH):
         # Remove previous temp file (possibly from failed attempt)
         # Remove previous temp file (possibly from failed attempt)
         unlink(TMP_DB_PATH)
         unlink(TMP_DB_PATH)
@@ -166,25 +165,17 @@ def init_db():
             conn.executescript(fh.read())
             conn.executescript(fh.read())
 
 
     # Populate tables.
     # Populate tables.
-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
+    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
         tlist = load(fh, Loader=Loader)
         tlist = load(fh, Loader=Loader)
     try:
     try:
         with conn:
         with conn:
             for tname, tdata in tlist.items():
             for tname, tdata in tlist.items():
-                res = conn.execute(
-                    """INSERT INTO tbl_language (
-                        name, label, marc_code, description
-                    ) VALUES (?, ?, ?, ?)""",
-                    (
-                        tname, tdata.get("name"), tdata.get("marc_code"),
-                        tdata.get("description"),
-                    )
-                )
-                populate_table(conn, res.lastrowid, tname)
+                populate_table(conn, tname, tdata)
 
 
         # If the DB already exists, it will be overwritten ONLY on success at
         # If the DB already exists, it will be overwritten ONLY on success at
         # this point.
         # this point.
         move(TMP_DB_PATH, DB_PATH)
         move(TMP_DB_PATH, DB_PATH)
+        logger.info(f"Database initialized at {DB_PATH}.")
     finally:
     finally:
         conn.close()
         conn.close()
         if path.isfile(TMP_DB_PATH):
         if path.isfile(TMP_DB_PATH):
@@ -201,7 +192,27 @@ def get_connection():
     return sqlite3.connect(DB_PATH)
     return sqlite3.connect(DB_PATH)
 
 
 
 
-def populate_table(conn, tid, tname):
+def populate_table(conn, tname, tdata):
+    """
+    Populate an individual table with data from a configuration.
+
+    @param conn: SQLite connection.
+
+    @param tname(str): Table name.
+
+    @param tdata(dict): Table data.
+    """
+    res = conn.execute(
+        """INSERT INTO tbl_language (
+            name, label, marc_code, description
+        ) VALUES (?, ?, ?, ?)""",
+        (
+            tname, tdata.get("name"), tdata.get("marc_code"),
+            tdata.get("description"),
+        )
+    )
+    tid = res.lastrowid
+
     data = load_table(tname)
     data = load_table(tname)
     flags = 0
     flags = 0
     if "script_to_roman" in data:
     if "script_to_roman" in data:
@@ -247,20 +258,19 @@ def populate_table(conn, tid, tname):
                             hook_data[1].__name__, jdumps(hook_data[2])))
                             hook_data[1].__name__, jdumps(hook_data[2])))
 
 
         # Ignore rules (R2S only).
         # Ignore rules (R2S only).
-        for row in sec.get("ignore", []):
-            if isinstance(row, dict):
-                if "re" in row:
-                    flags = FEAT_RE
-                    rule = row["re"]
-            else:
-                flags = 0
-                rule = row
+        for rule in sec.get("ignore", []):
+            conn.execute(
+                    """INSERT INTO tbl_ignore (
+                        lang_id, rule, features
+                    ) VALUES (?, ?, ?)""",
+                    (tid, rule, 0))
 
 
+        for rule in sec.get("ignore_ptn", []):
             conn.execute(
             conn.execute(
                     """INSERT INTO tbl_ignore (
                     """INSERT INTO tbl_ignore (
                         lang_id, rule, features
                         lang_id, rule, features
                     ) VALUES (?, ?, ?)""",
                     ) VALUES (?, ?, ?)""",
-                    (tid, rule, flags))
+                    (tid, rule, FEAT_RE))
 
 
         # Double caps (S2R only).
         # Double caps (S2R only).
         for rule in sec.get("double_cap", []):
         for rule in sec.get("double_cap", []):
@@ -417,33 +427,22 @@ def load_table(tname):
 
 
         # Ignore regular expression patterns.
         # Ignore regular expression patterns.
         # Patterns are evaluated in the order they are listed in the config.
         # Patterns are evaluated in the order they are listed in the config.
-        ignore_ptn = [
-                re.compile(ptn)
-                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
+        ignore_ptn = tdata["roman_to_script"].get("ignore_ptn", [])
         for parent in parents:
         for parent in parents:
             parent_tdata = load_table(parent)
             parent_tdata = load_table(parent)
             # NOTE: duplicates are not removed.
             # NOTE: duplicates are not removed.
-            ignore_ptn = [
-                re.compile(ptn)
-                for ptn in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore_ptn", [])
-            ] + ignore_ptn
+            ignore_ptn = parent_tdata.get(
+                    "roman_to_script", {}).get("ignore_ptn", []) + ignore_ptn
         tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
         tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
 
 
         # Ignore plain strings.
         # Ignore plain strings.
-        ignore = {
-            Token(t)
-            for t in tdata["roman_to_script"].get("ignore", [])
-        }
+        ignore = set(tdata["roman_to_script"].get("ignore", []))
         for parent in parents:
         for parent in parents:
             parent_tdata = load_table(parent)
             parent_tdata = load_table(parent)
             # No overriding occurs with the ignore list, only de-duplication.
             # No overriding occurs with the ignore list, only de-duplication.
-            ignore |= {
-                Token(t) for t in parent_tdata.get(
-                        "roman_to_script", {}).get("ignore", [])
-            }
-        tdata["roman_to_script"]["ignore"] = [
-                t.content for t in sorted(ignore)]
+            ignore |= set(parent_tdata.get(
+                        "roman_to_script", {}).get("ignore", []))
+        tdata["roman_to_script"]["ignore"] = sorted(ignore)
 
 
         # Hooks.
         # Hooks.
         if "hooks" in tdata["roman_to_script"]:
         if "hooks" in tdata["roman_to_script"]:
@@ -521,6 +520,10 @@ def get_language(lang):
             if len(s2r_hooks):
             if len(s2r_hooks):
                 data["script_to_roman"]["hooks"] = s2r_hooks
                 data["script_to_roman"]["hooks"] = s2r_hooks
 
 
+            double_cap = get_lang_dcap(conn, lang_id)
+            if len(double_cap):
+                data["script_to_roman"]["double_cap"] = double_cap
+
         # Roman to script map, ignore list, and hooks.
         # Roman to script map, ignore list, and hooks.
 
 
         if data["has_r2s"]:
         if data["has_r2s"]:
@@ -542,10 +545,6 @@ def get_language(lang):
         if len(opt_data):
         if len(opt_data):
             data["options"] = opt_data
             data["options"] = opt_data
 
 
-        double_cap = get_lang_dcap(conn, lang_id)
-        if len(double_cap):
-            data["double_cap"] = double_cap
-
     conn.close()
     conn.close()
 
 
     return data
     return data
@@ -591,8 +590,9 @@ def get_lang_ignore(conn, lang_id):
             """SELECT rule, features FROM tbl_ignore
             """SELECT rule, features FROM tbl_ignore
             WHERE lang_id = ?""",
             WHERE lang_id = ?""",
             (lang_id,))
             (lang_id,))
-    # Features (regular expressions) not implemented yet.
-    return tuple(row[0] for row in qry)
+    return tuple(
+            compile(row[0]) if row[1] & FEAT_RE else row[0]
+            for row in qry)
 
 
 
 
 @cache
 @cache
@@ -652,7 +652,7 @@ def get_lang_hooks(conn, lang_id, t_dir):
             }
             }
         )
         )
 
 
-    return hooks
+    return dict(hooks)
 
 
 
 
 def get_lang_dcap(conn, lang_id):
 def get_lang_dcap(conn, lang_id):

+ 13 - 98
scriptshifter/tables/data/_ignore_base.yml

@@ -16,106 +16,21 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
     # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
     - "and one other"
     - "and one other"
-    #- re: "and ([a-z0-9]+ )?others"
     - "et al."
     - "et al."
+  ignore_ptn:
+    - "and ([a-z0-9]+ )?others"
+    - "I{2,3}"
+    - "I(V|X)"
+    - "LI{,3}"
+    - "LI?(V|X)"
+    - "L(V|X{1,3})I{,3}"
+    - "LX{1,3}I?V"
+    - "LX{1,3}VI{,3}"
+    - "(V|X{1,3})I{,3}"
+    - "X{1,3}I{,3}"
+    - "X{1,3}I(V|X)"
+    - "X{1,3}VI{,3}"
 
 
 script_to_roman:
 script_to_roman:
   ignore:
   ignore:

+ 2 - 0
scriptshifter/tables/data/chinese.yml

@@ -31,6 +31,8 @@ script_to_roman:
     pre_assembly:
     pre_assembly:
       -
       -
         - chinese.parse_numerals_pre_assembly
         - chinese.parse_numerals_pre_assembly
+      -
+        - chinese.person_name_pre_assembly
 
 
   map:
   map:
     "〇": "ling#0 "
     "〇": "ling#0 "

+ 3 - 0
scriptshifter/tables/data/thai.yml

@@ -33,6 +33,9 @@ options:
 script_to_roman:
 script_to_roman:
   hooks:
   hooks:
     post_config:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
       -
         - aksharamukha.romanizer.s2r_post_config
         - aksharamukha.romanizer.s2r_post_config
         - src_script: "Thai"
         - src_script: "Thai"

+ 5 - 0
scriptshifter/tables/data/thai_alt.yml

@@ -4,6 +4,11 @@ general:
   case_sensitive: false
   case_sensitive: false
 
 
 script_to_roman:
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "th"
   map:
   map:
     # COMMON SPECIAL CHARACTERS
     # COMMON SPECIAL CHARACTERS
 
 

+ 5 - 3
scriptshifter/tables/data/uighur_arabic.yml

@@ -1,5 +1,7 @@
+---
 general:
 general:
   name: Uighur (Arabic)
   name: Uighur (Arabic)
+  case_sensitive: false
 
 
 roman_to_script:
 roman_to_script:
   map:
   map:
@@ -118,7 +120,7 @@ roman_to_script:
     "%zh": "\uFB8A"
     "%zh": "\uFB8A"
     "zh": "\uFB8B"
     "zh": "\uFB8B"
     "%zh%": "\uFB8A"
     "%zh%": "\uFB8A"
-    
+
 script_to_roman:
 script_to_roman:
   map:
   map:
     "\u0626\u0627": "a"
     "\u0626\u0627": "a"
@@ -157,9 +159,9 @@ script_to_roman:
     "\uFEEB": "h"
     "\uFEEB": "h"
     "\uFEEC": "h"
     "\uFEEC": "h"
     "\u0640\u0629": "h"
     "\u0640\u0629": "h"
-    "\uFEEA": "h"
+    "%\uFEEA": "h"
     "\u0629": "h"
     "\u0629": "h"
-    "\u0647": "h"
+    "%\u0647%": "h"
     "\uFE8C": "i"
     "\uFE8C": "i"
     "\uFBE8": "i"
     "\uFBE8": "i"
     "\uFBE9": "i"
     "\uFBE9": "i"

+ 0 - 0
scriptshifter/tables/data/index.yml → scriptshifter/tables/index.yml


+ 34 - 7
scriptshifter/trans.py

@@ -1,7 +1,7 @@
 import logging
 import logging
 
 
 from importlib import import_module
 from importlib import import_module
-from re import compile
+from re import Pattern, compile
 
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
 from scriptshifter.tables import (
@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         if _run_hook("post_config", ctx) == BREAK:
         if _run_hook("post_config", ctx) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
             return getattr(ctx, "dest", ""), ctx.warnings
 
 
-        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-        if _run_hook("post_normalize", ctx) == BREAK:
+        # _normalize_src returns the results of the post_normalize hook.
+        if _normalize_src(
+                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
             return getattr(ctx, "dest", ""), ctx.warnings
 
 
+        logger.debug(f"Normalized source: {ctx.src}")
         lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
         lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
 
         # Loop through source characters. The increment of each loop depends on
         # Loop through source characters. The increment of each loop depends on
@@ -169,8 +170,21 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                     if hret == CONT:
                     if hret == CONT:
                         continue
                         continue
 
 
-                    step = len(ctx.tk)
-                    if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                    _matching = False
+                    if type(ctx.tk) is Pattern:
+                        # Search RE pattern beginning at cursor.
+                        if _ptn_match := ctx.tk.match(ctx.src[ctx.cur:]):
+                            ctx.tk = _ptn_match[0]
+                            logger.debug(f"Matched regex: {ctx.tk}")
+                            step = len(ctx.tk)
+                            _matching = True
+                    else:
+                        # Search exact match.
+                        step = len(ctx.tk)
+                        if ctx.tk == ctx.src[ctx.cur:ctx.cur + step]:
+                            _matching = True
+
+                    if _matching:
                         # The position matches an ignore token.
                         # The position matches an ignore token.
                         hret = _run_hook("on_ignore_match", ctx)
                         hret = _run_hook("on_ignore_match", ctx)
                         if hret == BREAK:
                         if hret == BREAK:
@@ -181,6 +195,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                         logger.info(f"Ignored token: {ctx.tk}")
                         logger.info(f"Ignored token: {ctx.tk}")
                         ctx.dest_ls.append(ctx.tk)
                         ctx.dest_ls.append(ctx.tk)
                         ctx.cur += step
                         ctx.cur += step
+                        if ctx.cur >= len(ctx.src):
+                            # reached end of string. Stop ignoring.
+                            # The outer loop will exit immediately after.
+                            ctx.ignoring = False
+                            break
+
                         cur_char = ctx.src[ctx.cur]
                         cur_char = ctx.src[ctx.cur]
                         ctx.ignoring = True
                         ctx.ignoring = True
                         break
                         break
@@ -193,6 +213,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             delattr(ctx, "tk")
             delattr(ctx, "tk")
             delattr(ctx, "ignoring")
             delattr(ctx, "ignoring")
 
 
+            if ctx.cur >= len(ctx.src):
+                break
+
             # Begin transliteration token lookup.
             # Begin transliteration token lookup.
             ctx.match = False
             ctx.match = False
 
 
@@ -315,10 +338,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 def _normalize_src(ctx, norm_rules):
 def _normalize_src(ctx, norm_rules):
     """
     """
     Normalize source text according to rules.
     Normalize source text according to rules.
+
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
     """
     """
     for nk, nv in norm_rules.items():
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
         ctx._src = ctx.src.replace(nk, nv)
-    logger.debug(f"Normalized source: {ctx.src}")
+
+    return _run_hook("post_normalize", ctx)
 
 
 
 
 def _is_bow(cur, ctx, word_boundary):
 def _is_bow(cur, ctx, word_boundary):

+ 2 - 0
scriptshifter_base.Dockerfile

@@ -10,6 +10,8 @@ RUN addgroup --system www
 RUN adduser --system www
 RUN adduser --system www
 RUN gpasswd -a www www
 RUN gpasswd -a www www
 
 
+ENV HF_DATASETS_CACHE /data/hf/datasets
+
 # Copy external dependencies.
 # Copy external dependencies.
 WORKDIR ${WORKROOT}
 WORKDIR ${WORKROOT}
 COPY ext ./ext/
 COPY ext ./ext/

+ 1 - 1
sscli

@@ -10,7 +10,7 @@ from os import path
 
 
 from scriptshifter import DB_PATH
 from scriptshifter import DB_PATH
 from scriptshifter.tables import init_db as _init_db
 from scriptshifter.tables import init_db as _init_db
-from tests import test_sample
+from tests.integration_tests import test_sample
 
 
 
 
 @click.group()
 @click.group()

+ 10 - 64
tests/__init__.py

@@ -1,71 +1,17 @@
-from csv import reader
-from difflib import ndiff
 from importlib import reload
 from importlib import reload
-from json import loads as jloads
-from logging import getLogger
-from os import path
+from os import path, environ
+from tempfile import gettempdir
 
 
-import scriptshifter.tables
-
-from scriptshifter.trans import transliterate
+import scriptshifter
+from scriptshifter import tables
 
 
 
 
 TEST_DIR = path.dirname(path.realpath(__file__))
 TEST_DIR = path.dirname(path.realpath(__file__))
 TEST_DATA_DIR = path.join(TEST_DIR, "data")
 TEST_DATA_DIR = path.join(TEST_DIR, "data")
+TEST_CONFIG_DIR = path.join(TEST_DIR, "tables", "data")
 
 
-logger = getLogger(__name__)
-
-
-def reload_tables():
-    reload(scriptshifter.tables)  # Reload new config dir.
-    from scriptshifter import tables
-    tables.list_tables.cache_clear()
-    tables.get_language.cache_clear()
-    tables.get_lang_map.cache_clear()
-
-    return tables
-
-
-def test_sample(dset):
-    """
-    Test an individual sample set and produce a human-readable report.
-
-    Used outside of automated tests.
-
-    @param dset (str): sample set name (without the .csv extension) found in
-    the `data/script_samples` directory.
-    """
-    deltas = []
-    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
-    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
-
-    with open(dset_fpath, newline="") as fh:
-        csv = reader(fh)
-        for row in csv:
-            lang, script, rom = row[:3]
-            if not lang:
-                continue
-            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
-            trans, warnings = transliterate(
-                    script, lang, t_dir="s2r",
-                    capitalize=opts.get("capitalize"), options=opts)
-            if (trans == rom):
-                print(".", end="")
-            else:
-                print("F", end="")
-                deltas.append((lang, script, ndiff([trans], [rom])))
-
-    with open(log_fpath, "w") as fh:
-        # If no deltas, just truncate the file.
-        for lang, script, delta in deltas:
-            fh.write(f"Language: {lang}\n")
-            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
-            for dline in delta:
-                fh.write(dline.strip() + "\n")
-            fh.write("\n\n")
-
-    ct = len(deltas)
-    if ct > 0:
-        print(f"{ct} failed tests. See report at {log_fpath}")
-    else:
-        print("All tests passed.")
+# Reload main SS modules after changing environment variables.
+environ["TXL_DB_PATH"] = path.join(gettempdir(), "scriptshifter_unittest.db")
+reload(scriptshifter)
+environ["TXL_CONFIG_TABLE_DIR"] = TEST_CONFIG_DIR
+reload(tables)

+ 0 - 6
tests/data/index.yml

@@ -1,6 +0,0 @@
-inherited:
-  name: Test inherited table
-ordering:
-  name: Test ordering
-rot3:
-  name: Test ROT3 hooks

+ 8 - 0
tests/data/script_samples/unittest.csv

@@ -0,0 +1,8 @@
+inherited,abcd,9078,,
+inherited,TUVX,tuvx,"{""t_dir"": ""r2s""}",
+rot3,defg,abcd,,
+rot3,HIJK,KLMN,"{""t_dir"": ""r2s""}",
+rot3,st uv,Vw xy,"{""t_dir"": ""r2s"", ""capitalize"": ""first""}",
+rot3,st uv,Vw Xy,"{""t_dir"": ""r2s"", ""capitalize"": ""all""}",
+regex,Hello abc,Hello 678,"{""t_dir"": ""r2s""}",
+regex,Hullo abc,5u22o 678,"{""t_dir"": ""r2s""}",

+ 58 - 0
tests/integration_tests.py

@@ -0,0 +1,58 @@
+from csv import reader
+from difflib import ndiff
+from json import loads as jloads
+from logging import getLogger
+from os import path
+
+from scriptshifter.trans import transliterate
+from tests import TEST_DATA_DIR
+
+logger = getLogger(__name__)
+
+
+def test_sample(dset):
+    """
+    Test an individual sample set and produce a human-readable report.
+
+    Used outside of automated tests.
+
+    @param dset (str): sample set name (without the .csv extension) found in
+    the `data/script_samples` directory.
+    """
+    deltas = []
+    dset_fpath = path.join(TEST_DATA_DIR, "script_samples", dset + ".csv")
+    log_fpath = path.join(TEST_DATA_DIR, f"test_{dset}.log")
+
+    with open(dset_fpath, newline="") as fh:
+        csv = reader(fh)
+        i = 1
+        for row in csv:
+            logger.info(f"CSV row #{i}")
+            i += 1
+            lang, script, rom = row[:3]
+            if not lang:
+                continue
+            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
+            trans, warnings = transliterate(
+                    script, lang, t_dir="s2r",
+                    capitalize=opts.get("capitalize"), options=opts)
+            if (trans == rom):
+                print(".", end="")
+            else:
+                print("F", end="")
+                deltas.append((lang, script, ndiff([trans], [rom])))
+
+    with open(log_fpath, "w") as fh:
+        # If no deltas, just truncate the file.
+        for lang, script, delta in deltas:
+            fh.write(f"Language: {lang}\n")
+            fh.write(f"Original: {script}\nDiff (result vs. expected):\n")
+            for dline in delta:
+                fh.write(dline.strip() + "\n")
+            fh.write("\n\n")
+
+    ct = len(deltas)
+    if ct > 0:
+        print(f"{ct} failed tests. See report at {log_fpath}")
+    else:
+        print("All tests passed.")

+ 0 - 0
tests/data/_base1.yml → tests/tables/data/_base1.yml


+ 0 - 0
tests/data/_base2.yml → tests/tables/data/_base2.yml


+ 0 - 0
tests/data/_base3.yml → tests/tables/data/_base3.yml


+ 0 - 0
tests/data/cap_base1.yml → tests/tables/data/cap_base1.yml


+ 0 - 0
tests/data/cap_base2.yml → tests/tables/data/cap_base2.yml


+ 0 - 0
tests/data/cap_inherited.yml → tests/tables/data/cap_inherited.yml


+ 0 - 0
tests/data/inherited.yml → tests/tables/data/inherited.yml


+ 0 - 0
tests/data/ordering.yml → tests/tables/data/ordering.yml


+ 19 - 0
tests/tables/data/regex.yml

@@ -0,0 +1,19 @@
+---
+# Test file for regex ignoring.
+
+general:
+  name: Test regex ignoring.
+  parents:
+    - inherited
+
+roman_to_script:
+  ignore_ptn:
+    - "[hH][ae]llo"
+
+  map:
+    "h": "1"
+    "H": "5"
+    "l": "2"
+    "a": "6"
+    "b": "7"
+    "c": "8"

+ 0 - 0
tests/data/rot3.yml → tests/tables/data/rot3.yml


+ 17 - 0
tests/tables/index.yml

@@ -0,0 +1,17 @@
+---
+inherited:
+  name: Test inheritance leaf file
+  marc_code: inh
+  description: Test description.
+cap_base1:
+  name: Test capitalization base 1
+cap_base2:
+  name: Test capitalization base 2
+cap_inherited:
+  name: Test capitalization
+ordering:
+  name: Test ordering
+regex:
+  name: inherited config + regex ignore.
+rot3:
+  name: Test ROT3 hooks

+ 29 - 33
tests/test01_cfg.py

@@ -1,20 +1,21 @@
+from os import environ, unlink
 from unittest import TestCase
 from unittest import TestCase
 
 
-from os import environ
+from scriptshifter.tables import get_language, init_db
 
 
-import scriptshifter
 
 
-from tests import TEST_DATA_DIR, reload_tables
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 
 
 class TestConfig(TestCase):
 class TestConfig(TestCase):
     """ Test configuration parsing. """
     """ Test configuration parsing. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_ordering(self):
     def test_ordering(self):
-        tbl = self.tables.load_table("ordering")
+        tbl = get_language("ordering")
         exp_order = ["ABCD", "AB", "A", "BCDE", "BCD", "BEFGH", "B"]
         exp_order = ["ABCD", "AB", "A", "BCDE", "BCD", "BEFGH", "B"]
 
 
         self.assertEqual(
         self.assertEqual(
@@ -23,19 +24,17 @@ class TestConfig(TestCase):
 
 
 class TestOverride(TestCase):
 class TestOverride(TestCase):
     """ Test configuration overrides. """
     """ Test configuration overrides. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_override_map(self):
     def test_override_map(self):
-        tbl = self.tables.load_table("inherited")
+        tbl = get_language("inherited")
 
 
-        self.assertEqual(tbl["general"]["name"], "Test inheritance leaf file")
+        self.assertEqual(tbl["label"], "Test inheritance leaf file")
+        self.assertEqual(tbl["marc_code"], "inh")
+        self.assertEqual(tbl["description"], "Test description.")
 
 
         # Entries are additive.
         # Entries are additive.
         self.assertEqual(
         self.assertEqual(
                 tbl["roman_to_script"]["ignore"],
                 tbl["roman_to_script"]["ignore"],
-                ["Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"])
+                ("Fritter my wig", "Hi", "Ho", "Thing-um-a-jig"))
         self.assertEqual(
         self.assertEqual(
                 tbl["roman_to_script"]["map"],
                 tbl["roman_to_script"]["map"],
                 (
                 (
@@ -102,34 +101,31 @@ class TestOverride(TestCase):
 
 
 class TestHooks(TestCase):
 class TestHooks(TestCase):
     """ Test parsing of hook functions. """
     """ Test parsing of hook functions. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_rot3(self):
     def test_rot3(self):
-        tbl = self.tables.load_table("rot3")
+        tbl = get_language("rot3")
 
 
         self.assertEqual(
         self.assertEqual(
-                tbl["script_to_roman"]["hooks"],
-                {
-                    "begin_input_token": [
-                        ("test", scriptshifter.hooks.test.rotate, {"n": -3})
-                    ]
-                })
+            tbl["script_to_roman"]["hooks"],
+            {
+                "begin_input_token": [
+                    {
+                        "module_name": "test",
+                        "fn_name": "rotate",
+                        "kwargs": {"n": -3},
+                    }
+                ]
+            }
+        )
 
 
 
 
 class TestDoubleCaps(TestCase):
 class TestDoubleCaps(TestCase):
     """ Test double capitalization configuration. """
     """ Test double capitalization configuration. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_dcaps_base1(self):
     def test_dcaps_base1(self):
-        cap_base1 = self.tables.load_table("cap_base1")
+        cap_base1 = get_language("cap_base1")
         assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"]
         assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"]
 
 
     def test_dcaps_base2(self):
     def test_dcaps_base2(self):
-        cap_base2 = self.tables.load_table("cap_base2")
+        cap_base2 = get_language("cap_base2")
         dcap = cap_base2["script_to_roman"]["double_cap"]
         dcap = cap_base2["script_to_roman"]["double_cap"]
 
 
         assert len(dcap) == 2
         assert len(dcap) == 2
@@ -137,7 +133,7 @@ class TestDoubleCaps(TestCase):
         assert "i︠o︡" in dcap
         assert "i︠o︡" in dcap
 
 
     def test_dcaps_inherited(self):
     def test_dcaps_inherited(self):
-        cap_inherited = self.tables.load_table("cap_inherited")
+        cap_inherited = get_language("cap_inherited")
         dcap = cap_inherited["script_to_roman"]["double_cap"]
         dcap = cap_inherited["script_to_roman"]["double_cap"]
 
 
         assert len(dcap) == 1
         assert len(dcap) == 1

+ 33 - 47
tests/test02_transliteration.py

@@ -2,18 +2,25 @@ import logging
 
 
 from unittest import TestCase, TestSuite, TextTestRunner
 from unittest import TestCase, TestSuite, TextTestRunner
 from csv import reader
 from csv import reader
-from glob import glob
 from json import loads as jloads
 from json import loads as jloads
-from os import environ, path
+from os import environ, path, unlink
 
 
-from tests import TEST_DATA_DIR, reload_tables
 from scriptshifter.trans import transliterate
 from scriptshifter.trans import transliterate
-import scriptshifter.tables
+from scriptshifter.tables import get_language, init_db
+from tests import TEST_DATA_DIR
 
 
 
 
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
 
 
 
 
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestTrans(TestCase):
 class TestTrans(TestCase):
     """
     """
     Test S2R transliteration.
     Test S2R transliteration.
@@ -23,20 +30,21 @@ class TestTrans(TestCase):
     TODO use a comprehensive sample table and report errors for unsupported
     TODO use a comprehensive sample table and report errors for unsupported
     languages.
     languages.
     """
     """
-
-    maxDiff = None
-
-    def sample_s2r(self):
+    def sample(self):
         """
         """
-        Test S2R transliteration for one CSV sample.
+        Test transliteration for one CSV row.
 
 
         This function name won't start with `test_` otherwise will be
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         automatically run without parameters.
         """
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "script_to_roman" in config:
+        config = get_language(self.tbl)
+        t_dir = self.options.get("t_dir", "s2r")
+        if (
+                t_dir == "s2r" and config["has_s2r"]
+                or t_dir == "r2s" and config["has_r2s"]):
             txl = transliterate(
             txl = transliterate(
                     self.script, self.tbl,
                     self.script, self.tbl,
+                    t_dir=t_dir,
                     capitalize=self.options.get("capitalize", False),
                     capitalize=self.options.get("capitalize", False),
                     options=self.options)[0]
                     options=self.options)[0]
             self.assertEqual(
             self.assertEqual(
@@ -44,49 +52,27 @@ class TestTrans(TestCase):
                     f"S2R transliteration error for {self.tbl}!\n"
                     f"S2R transliteration error for {self.tbl}!\n"
                     f"Original: {self.script}")
                     f"Original: {self.script}")
 
 
-    def sample_r2s(self):
-        """
-        Test R2S transliteration for one CSV sample.
-
-        This function name won't start with `test_` otherwise will be
-        automatically run without parameters.
-        """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "roman_to_script" in config:
-            txl = transliterate(
-                    self.roman, self.tbl,
-                    t_dir="r2s",
-                    capitalize=self.options.get("capitalize", False),
-                    options=self.options)[0]
-            self.assertEqual(
-                    txl, self.script,
-                    f"R2S transliteration error for {self.tbl}!\n"
-                    f"Original: {self.roman}")
-
 
 
 def make_suite():
 def make_suite():
     """
     """
     Build parametrized test cases.
     Build parametrized test cases.
     """
     """
-    if "TXL_CONFIG_TABLE_DIR" in environ:
-        del environ["TXL_CONFIG_TABLE_DIR"]
-    reload_tables()
-
     suite = TestSuite()
     suite = TestSuite()
 
 
-    for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")):
-        with open(fpath, newline="") as fh:
-            csv = reader(fh)
-            for row in csv:
-                if len(row[0]):
-                    # Inject transliteration info in the test case.
-                    for tname in ("sample_s2r", "sample_r2s"):
-                        tcase = TestTrans(tname)
-                        tcase.tbl = row[0]
-                        tcase.script = row[1].strip()
-                        tcase.roman = row[2].strip()
-                        tcase.options = jloads(row[3]) if len(row[3]) else {}
-                        suite.addTest(tcase)
+    with open(path.join(
+        TEST_DATA_DIR, "script_samples", "unittest.csv"
+    ), newline="") as fh:
+        csv = reader(fh)
+        for row in csv:
+            if len(row[0]):
+                # Inject transliteration info in the test case.
+                tcase = TestTrans("sample")
+                tcase.tbl = row[0]
+                tcase.script = row[1].strip()
+                tcase.roman = row[2].strip()
+                tcase.options = jloads(row[3]) if len(row[3]) else {}
+
+                suite.addTest(tcase)
 
 
     return suite
     return suite
 
 

+ 10 - 7
tests/test03_capitalization.py

@@ -1,19 +1,22 @@
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 from unittest import TestCase
 
 
 from scriptshifter.trans import transliterate
 from scriptshifter.trans import transliterate
-from tests import TEST_DATA_DIR, reload_tables
+from scriptshifter.tables import init_db
+
+
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
 
 
 
 
 class TestCapitalization(TestCase):
 class TestCapitalization(TestCase):
     """
     """
     Test capitalization.
     Test capitalization.
     """
     """
-
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        self.tables = reload_tables()
-
     def test_cap(self):
     def test_cap(self):
         tbl = "cap_inherited"
         tbl = "cap_inherited"
         in_str = "зг іо"
         in_str = "зг іо"

+ 21 - 15
tests/test04_rest_api.py

@@ -1,25 +1,28 @@
 import json
 import json
 
 
-from os import environ
+from os import environ, unlink
 from unittest import TestCase
 from unittest import TestCase
 
 
 from scriptshifter.rest_api import app
 from scriptshifter.rest_api import app
-from tests import TEST_DATA_DIR, reload_tables
+from scriptshifter.tables import init_db
 
 
 
 
 EP = "http://localhost:8000"
 EP = "http://localhost:8000"
 
 
 
 
+def setUpModule():
+    init_db()
+
+
+def tearDownModule():
+    unlink(environ["TXL_DB_PATH"])
+
+
 class TestRestAPI(TestCase):
 class TestRestAPI(TestCase):
     """ Test REST API interaction. """
     """ Test REST API interaction. """
-    def setUp(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
-        # if "TXL_CONFIG_TABLE_DIR" in environ:
-        #     del environ["TXL_CONFIG_TABLE_DIR"]
-        reload_tables()
-
-        # Start webapp.
-        app.testing = True
+    # def setUp(self):
+    #     # Start webapp.
+    #     app.testing = True
 
 
     def test_health(self):
     def test_health(self):
         with app.test_client() as c:
         with app.test_client() as c:
@@ -35,7 +38,7 @@ class TestRestAPI(TestCase):
 
 
         data = json.loads(rsp.get_data(as_text=True))
         data = json.loads(rsp.get_data(as_text=True))
         self.assertIn("inherited", data)
         self.assertIn("inherited", data)
-        self.assertIn("name", data["inherited"])
+        self.assertIn("label", data["inherited"])
         self.assertNotIn("_base1", data)
         self.assertNotIn("_base1", data)
         self.assertNotIn("_base2", data)
         self.assertNotIn("_base2", data)
         self.assertNotIn("_base3", data)
         self.assertNotIn("_base3", data)
@@ -47,14 +50,17 @@ class TestRestAPI(TestCase):
         self.assertEqual(rsp.status_code, 200)
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
         data = json.loads(rsp.get_data(as_text=True))
 
 
-        self.assertIn("general", data)
+        self.assertIn("case_sensitive", data)
+        self.assertIn("description", data)
         self.assertIn("roman_to_script", data)
         self.assertIn("roman_to_script", data)
         self.assertIn("map", data["roman_to_script"])
         self.assertIn("map", data["roman_to_script"])
+        self.assertEqual(data["has_r2s"], True)
+        self.assertEqual(data["has_s2r"], False)
         self.assertEqual(data["roman_to_script"]["map"][0], ["ABCD", ""])
         self.assertEqual(data["roman_to_script"]["map"][0], ["ABCD", ""])
 
 
     def test_trans_api_s2r(self):
     def test_trans_api_s2r(self):
         with app.test_client() as c:
         with app.test_client() as c:
-            rsp = c.post("/trans", data={"lang": "rot3", "text": "defg"})
+            rsp = c.post("/trans", json={"lang": "rot3", "text": "defg"})
 
 
         self.assertEqual(rsp.status_code, 200)
         self.assertEqual(rsp.status_code, 200)
         data = json.loads(rsp.get_data(as_text=True))
         data = json.loads(rsp.get_data(as_text=True))
@@ -64,7 +70,7 @@ class TestRestAPI(TestCase):
     def test_trans_api_r2s(self):
     def test_trans_api_r2s(self):
         with app.test_client() as c:
         with app.test_client() as c:
             rsp = c.post(
             rsp = c.post(
-                "/trans", data={
+                "/trans", json={
                     "lang": "rot3",
                     "lang": "rot3",
                     "text": "abcd",
                     "text": "abcd",
                     "t_dir": "r2s"
                     "t_dir": "r2s"
@@ -80,7 +86,7 @@ class TestRestAPI(TestCase):
         with app.test_client() as c:
         with app.test_client() as c:
             rsp = c.post(
             rsp = c.post(
                 "/trans",
                 "/trans",
-                data={
+                json={
                     "lang": "rot3",
                     "lang": "rot3",
                     "capitalize": "first",
                     "capitalize": "first",
                     "text": "bcde",
                     "text": "bcde",