ソースを参照

WIP regexp and testing framework.

scossu 8 ヶ月 前
コミット
6c5cab4743

+ 2 - 0
example.env

@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 TXL_DICTA_EP="changeme"
 TXL_DICTA_EP="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_FLASK_SECRET="changeme"
 TXL_LOGLEVEL="INFO"
 TXL_LOGLEVEL="INFO"
+TXL_EMAIL_FROM="me@loc.gov"
+TXL_EMAIL_TO="me@loc.gov"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"

+ 144 - 0
legacy/processNumbers.ts

@@ -0,0 +1,144 @@
+/**
+ * Resolve number-tagged pinyin tokens into either plain pinyin or Arabic numerals.
+ *
+ * A "numerical token" has the form `letters#digits` (e.g. `yi#1`); the digits
+ * may be absent (`di#`). Each run of consecutive numerical tokens is rebuilt
+ * in two forms: a text version (the letters only) and a numeric version (the
+ * digits where present, otherwise the letters). The numeric version is used
+ * when the run looks like an ordinal or a date, or when the subfield is one
+ * where numbers are always treated as numbers (245$n / 830$n); otherwise the
+ * text version is emitted. Everything that is not a numerical token is copied
+ * through unchanged.
+ *
+ * @param pinyinString - Romanized text whose numeric syllables carry `#` suffixes.
+ * @param tag - Field tag; only "245" and "830" are treated specially.
+ * @param code - Subfield code; only "n" (together with the tags above) is treated specially.
+ * @returns The input with every `#` suffix resolved.
+ */
+private processNumbers(pinyinString: string, tag: string, code: string): string {
+    let outputString = "";
+    // Subfields where numbers are definitely to be treated as numbers.
+    const isNumberSubfield = (tag === "245" || tag === "830") && code === "n";
+    // NOTE(review): once this flag becomes true it is never reset, so every later
+    // numerical run in the same string also uses its numeric version. Kept as in
+    // the original; confirm that this is intended.
+    let useNumVersion = isNumberSubfield;
+
+    /*
+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
+     * The delimiters are captured and included in the token array, so only the even-numbered
+     * elements are true tokens; odd-numbered elements are single delimiter characters.
+     */
+    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)", "u"));
+    const numToken_re = /^([A-Za-z]+)#([0-9]*)$/;
+    const n = tokens.length;
+
+    for (let i = 0; i < n; i++) {
+        const toki = tokens[i];
+        if (!numToken_re.test(toki)) {
+            // Non-numerical token or delimiter: just tack it on.
+            outputString += toki;
+            continue;
+        }
+
+        /*
+         * A numerical token has been reached. The inner loop consumes it and all consecutive
+         * numerical tokens after it, building both versions. When a non-numerical token (or the
+         * end of the string) is encountered, the run is evaluated to decide which version goes
+         * into the output; the outer loop then continues where the inner loop left off.
+         */
+        let textVersion = "";
+        let numVersion = "";
+        for (let j = i; j < n; j++) {
+            const tokj = tokens[j];
+            const mj = tokj.match(numToken_re);
+            const atEnd = j === n - 1;
+            const runEnds = (j % 2 === 0 && !mj) || atEnd;
+
+            if (!runEnds) {
+                if (mj) {
+                    // Numerical token inside the run: letters to the text version,
+                    // digits (or letters when there are none) to the numeric version.
+                    textVersion += mj[1];
+                    numVersion += mj[2] === "" ? mj[1] : mj[2];
+                } else {
+                    // A delimiter inside the run, just tack it on.
+                    textVersion += tokj;
+                    numVersion += tokj;
+                }
+                continue;
+            }
+
+            if (mj) {
+                // Last token of the string and it is numerical.
+                textVersion += mj[1];
+                numVersion += mj[2] === "" ? mj[1] : mj[2];
+            } else if (atEnd) {
+                // Last token is non-numerical, just tack it on.
+                textVersion += tokj;
+                numVersion += tokj;
+            } else if (textVersion.length > 0 && numVersion.length > 0) {
+                // Not at the end yet: drop the delimiter appended last
+                // (the outer loop will pick it up again).
+                textVersion = textVersion.substring(0, textVersion.length - 1);
+                numVersion = numVersion.substring(0, numVersion.length - 1);
+            }
+
+            // Use the numeric version for ordinals and date strings.
+            if (
+                useNumVersion ||
+                /^di [0-9]/i.test(numVersion) ||
+                /[0-9] [0-9] [0-9] [0-9]/.test(numVersion) ||
+                /[0-9]+ nian [0-9]+ yue/i.test(numVersion) ||
+                // Fixed: the pattern used to start with a stray double quote and never matched.
+                /[0-9]+ yue [0-9]+ ri/i.test(numVersion)
+            ) {
+                useNumVersion = true;
+                /*
+                 * At this point the string may contain literal translations of Chinese numerals.
+                 * Convert these to Arabic numerals (for example "2 10 7" = "27"):
+                 * "<digits> <power of ten multiple>" is multiplied, and when no such pair is left,
+                 * "<power of ten multiple> <digits>" is added.
+                 */
+                while (/[0-9] 10+/.test(numVersion) || /[1-9]0+ [1-9]/.test(numVersion)) {
+                    const product = numVersion.match(/([0-9]+) ([1-9]0+)/);
+                    if (product) {
+                        const multiplied = Number(product[1]) * Number(product[2]);
+                        numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(multiplied));
+                        continue;
+                    }
+                    const addition = numVersion.match(/([1-9]0+) ([0-9]+)/);
+                    if (!addition) {
+                        break;
+                    }
+                    const added = Number(addition[1]) + Number(addition[2]);
+                    numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(added));
+                }
+
+                // Four space-separated single digits are joined into one number.
+                numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
+                if (isNumberSubfield) {
+                    // In 245$n / 830$n every remaining space between digits is removed.
+                    while (/[0-9] [0-9]/.test(numVersion)) {
+                        numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
+                    }
+                }
+            }
+
+            outputString += useNumVersion ? numVersion : textVersion;
+
+            // If the end of the string is not reached, backtrack to the delimiter after the
+            // last numerical token (i.e. two tokens ago); otherwise we are done.
+            i = atEnd ? j : j - 2;
+            break;
+        }
+    }
+    return outputString;
+}

+ 25 - 15
scriptshifter/tables/__init__.py

@@ -143,7 +143,7 @@ def init_db():
 
 
     This operation removes any preexisting database.
     This operation removes any preexisting database.
 
 
-    All tables in the index file (`./data/index.yml`) will be parsed
+    All tables in the index file (`./index.yml`) will be parsed
     (including inheritance rules) and loaded into the designated DB.
     (including inheritance rules) and loaded into the designated DB.
 
 
     This must be done only once at bootstrap. To update individual tables,
     This must be done only once at bootstrap. To update individual tables,
@@ -151,7 +151,7 @@ def init_db():
     """
     """
     # Create parent diretories if necessary.
     # Create parent diretories if necessary.
     # If the DB already exists, it will be overwritten ONLY on success at
     # If the DB already exists, it will be overwritten ONLY on success at
-    # hhis point.
+    # this point.
     if path.isfile(TMP_DB_PATH):
     if path.isfile(TMP_DB_PATH):
         # Remove previous temp file (possibly from failed attempt)
         # Remove previous temp file (possibly from failed attempt)
         unlink(TMP_DB_PATH)
         unlink(TMP_DB_PATH)
@@ -166,21 +166,12 @@ def init_db():
             conn.executescript(fh.read())
             conn.executescript(fh.read())
 
 
     # Populate tables.
     # Populate tables.
-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
+    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
         tlist = load(fh, Loader=Loader)
         tlist = load(fh, Loader=Loader)
     try:
     try:
         with conn:
         with conn:
             for tname, tdata in tlist.items():
             for tname, tdata in tlist.items():
-                res = conn.execute(
-                    """INSERT INTO tbl_language (
-                        name, label, marc_code, description
-                    ) VALUES (?, ?, ?, ?)""",
-                    (
-                        tname, tdata.get("name"), tdata.get("marc_code"),
-                        tdata.get("description"),
-                    )
-                )
-                populate_table(conn, res.lastrowid, tname)
+                populate_table(conn, tname, tdata)
 
 
         # If the DB already exists, it will be overwritten ONLY on success at
         # If the DB already exists, it will be overwritten ONLY on success at
         # thhis point.
         # thhis point.
@@ -201,7 +192,27 @@ def get_connection():
     return sqlite3.connect(DB_PATH)
     return sqlite3.connect(DB_PATH)
 
 
 
 
-def populate_table(conn, tid, tname):
+def populate_table(conn, tname, tdata):
+    """
+    Populate an individual table with data from a configuration.
+
+    @param conn: SQLite connection.
+
+    @param tname(str): Table name.
+
+    @param tdata(dict): Table data.
+    """
+    res = conn.execute(
+        """INSERT INTO tbl_language (
+            name, label, marc_code, description
+        ) VALUES (?, ?, ?, ?)""",
+        (
+            tname, tdata.get("name"), tdata.get("marc_code"),
+            tdata.get("description"),
+        )
+    )
+    tid = res.lastrowid
+
     data = load_table(tname)
     data = load_table(tname)
     flags = 0
     flags = 0
     if "script_to_roman" in data:
     if "script_to_roman" in data:
@@ -579,7 +590,6 @@ def get_lang_ignore(conn, lang_id):
             """SELECT rule, features FROM tbl_ignore
             """SELECT rule, features FROM tbl_ignore
             WHERE lang_id = ?""",
             WHERE lang_id = ?""",
             (lang_id,))
             (lang_id,))
-    # Features (regular expressions) not implemented yet.
     return tuple(
     return tuple(
             compile(row[0]) if row[1] & FEAT_RE else row[0]
             compile(row[0]) if row[1] & FEAT_RE else row[0]
             for row in qry)
             for row in qry)

+ 0 - 0
scriptshifter/tables/data/index.yml → scriptshifter/tables/index.yml


+ 18 - 4
tests/__init__.py

@@ -1,11 +1,10 @@
 from csv import reader
 from csv import reader
 from difflib import ndiff
 from difflib import ndiff
+from glob import glob
 from importlib import reload
 from importlib import reload
 from json import loads as jloads
 from json import loads as jloads
 from logging import getLogger
 from logging import getLogger
-from os import path
-
-import scriptshifter.tables
+from os import environ, path
 
 
 from scriptshifter.trans import transliterate
 from scriptshifter.trans import transliterate
 
 
@@ -17,8 +16,20 @@ logger = getLogger(__name__)
 
 
 
 
 def reload_tables():
 def reload_tables():
-    reload(scriptshifter.tables)  # Reload new config dir.
+    if "TXL_CONFIG_TABLE_DIR" in environ:
+        del environ["TXL_CONFIG_TABLE_DIR"]
+
+    # import here to set modified test config dir.
     from scriptshifter import tables
     from scriptshifter import tables
+
+    tables.init_db()
+
+    for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")):
+        tname = path.splitext(path.basename(filename))[1]
+        with tables.get_connection() as conn:
+            tables.populate_table(conn, tname, {"name": fname})
+
+
     tables.list_tables.cache_clear()
     tables.list_tables.cache_clear()
     tables.get_language.cache_clear()
     tables.get_language.cache_clear()
     tables.get_lang_map.cache_clear()
     tables.get_lang_map.cache_clear()
@@ -41,7 +52,10 @@ def test_sample(dset):
 
 
     with open(dset_fpath, newline="") as fh:
     with open(dset_fpath, newline="") as fh:
         csv = reader(fh)
         csv = reader(fh)
+        i = 1
         for row in csv:
         for row in csv:
+            logger.info(f"CSV row #{i}")
+            i += 1
             lang, script, rom = row[:3]
             lang, script, rom = row[:3]
             if not lang:
             if not lang:
                 continue
                 continue

+ 0 - 0
tests/data/_base1.yml → tests/data/config/_base1.yml


+ 0 - 0
tests/data/_base2.yml → tests/data/config/_base2.yml


+ 0 - 0
tests/data/_base3.yml → tests/data/config/_base3.yml


+ 0 - 0
tests/data/cap_base1.yml → tests/data/config/cap_base1.yml


+ 0 - 0
tests/data/cap_base2.yml → tests/data/config/cap_base2.yml


+ 0 - 0
tests/data/cap_inherited.yml → tests/data/config/cap_inherited.yml


+ 0 - 0
tests/data/index.yml → tests/data/config/index.yml


+ 0 - 0
tests/data/inherited.yml → tests/data/config/inherited.yml


+ 0 - 0
tests/data/ordering.yml → tests/data/config/ordering.yml


+ 0 - 0
tests/data/rot3.yml → tests/data/config/rot3.yml


+ 9 - 0
tests/data/script_samples/unittest.csv

@@ -0,0 +1,9 @@
+chinese,"從易經解維摩詰經,臺北市大塊文化出版股份有限公司。","cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",,
+chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,,
+chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,,
+belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,,
+greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,,
+korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee,
+korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. Lee,
+korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer,
+korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer,

+ 19 - 20
tests/test02_transliteration.py

@@ -8,7 +8,7 @@ from os import environ, path
 
 
 from tests import TEST_DATA_DIR, reload_tables
 from tests import TEST_DATA_DIR, reload_tables
 from scriptshifter.trans import transliterate
 from scriptshifter.trans import transliterate
-import scriptshifter.tables
+from scriptshifter.tables import get_language
 
 
 
 
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
@@ -33,8 +33,8 @@ class TestTrans(TestCase):
         This function name won't start with `test_` otherwise will be
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         automatically run without parameters.
         """
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "script_to_roman" in config:
+        config = get_language(self.tbl)
+        if config["has_s2r"]:
             txl = transliterate(
             txl = transliterate(
                     self.script, self.tbl,
                     self.script, self.tbl,
                     capitalize=self.options.get("capitalize", False),
                     capitalize=self.options.get("capitalize", False),
@@ -51,8 +51,8 @@ class TestTrans(TestCase):
         This function name won't start with `test_` otherwise will be
         This function name won't start with `test_` otherwise will be
         automatically run without parameters.
         automatically run without parameters.
         """
         """
-        config = scriptshifter.tables.load_table(self.tbl)
-        if "roman_to_script" in config:
+        config = get_language(self.tbl)
+        if config["has_r2s"]:
             txl = transliterate(
             txl = transliterate(
                     self.roman, self.tbl,
                     self.roman, self.tbl,
                     t_dir="r2s",
                     t_dir="r2s",
@@ -68,25 +68,24 @@ def make_suite():
     """
     """
     Build parametrized test cases.
     Build parametrized test cases.
     """
     """
-    if "TXL_CONFIG_TABLE_DIR" in environ:
-        del environ["TXL_CONFIG_TABLE_DIR"]
     reload_tables()
     reload_tables()
 
 
     suite = TestSuite()
     suite = TestSuite()
 
 
-    for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")):
-        with open(fpath, newline="") as fh:
-            csv = reader(fh)
-            for row in csv:
-                if len(row[0]):
-                    # Inject transliteration info in the test case.
-                    for tname in ("sample_s2r", "sample_r2s"):
-                        tcase = TestTrans(tname)
-                        tcase.tbl = row[0]
-                        tcase.script = row[1].strip()
-                        tcase.roman = row[2].strip()
-                        tcase.options = jloads(row[3]) if len(row[3]) else {}
-                        suite.addTest(tcase)
+    with open(path.join(
+        TEST_DATA_DIR, "script_samples", "unittest.csv"
+    ), newline="") as fh:
+        csv = reader(fh)
+        for row in csv:
+            if len(row[0]):
+                # Inject transliteration info in the test case.
+                for tname in ("sample_s2r", "sample_r2s"):
+                    tcase = TestTrans(tname)
+                    tcase.tbl = row[0]
+                    tcase.script = row[1].strip()
+                    tcase.roman = row[2].strip()
+                    tcase.options = jloads(row[3]) if len(row[3]) else {}
+                    suite.addTest(tcase)
 
 
     return suite
     return suite