6 mēneši atpakaļ · 6c5cab4743
--- a/example.env
+++ b/example.env
@@ -2,4 +2,6 @@ FLASK_DEBUG=true
 
				 TXL_DICTA_EP="changeme"
			
 
				 TXL_FLASK_SECRET="changeme"
			
 
				 TXL_LOGLEVEL="INFO"
			
 
				+TXL_EMAIL_FROM="me@loc.gov"
			
 
				+TXL_EMAIL_TO="me@loc.gov"
			
 
				 LOSHN_KOYDESH_O2P_SRC="${PWD}/scriptshifter/hooks/yiddish_/loshn_koydesh_o2p_override.tsv"
			
--- a/legacy/processNumbers.ts
+++ b/legacy/processNumbers.ts
@@ -0,0 +1,144 @@
 
				+/**
				+ * Resolve numeral tokens in a pinyin string.
				+ *
				+ * A numeral token has the form `<letters>#<digits>`: the pinyin reading,
				+ * a `#`, and optionally the Arabic value. Each run of consecutive numeral
				+ * tokens is emitted either as plain pinyin (the `#...` suffixes removed)
				+ * or with the Arabic values substituted, depending on the subfield and on
				+ * whether the run looks like an ordinal or a date.
				+ *
				+ * @param pinyinString Pinyin text in which numerals are marked with `#`.
				+ * @param tag Field tag; "245" and "830" get special handling with code "n".
				+ * @param code Subfield code.
				+ * @returns The input with every `#` marker resolved.
				+ */
				+private processNumbers(pinyinString: string, tag: string, code: string): string {
				+    let outputString = "";
				+
				+    // In these subfields numbers are always treated as numbers.
				+    const forceNumerals = (tag === "245" || tag === "830") && code === "n";
				+
				+    // NOTE(review): once this flag is set by a matching run it is never
				+    // reset, so every later run in the same string also uses numerals.
				+    let useNumVersion = forceNumerals;
				+
				+    /*
				+     * The input string is split, with any space or punctuation character (except for #) as the delimiter.
				+     * The delimiters are captured and kept in the token array, so only the even-numbered
				+     * elements are true tokens; the odd-numbered elements are delimiters.
				+     */
				+    const tokens: string[] = pinyinString.split(new RegExp("([^\\P{P}#]|\\s)","u"));
				+    const numToken_re = /^([A-Za-z]+)#([0-9]*)$/;
				+    const n = tokens.length;
				+
				+    for (let i = 0; i < n; i++) {
				+        const toki = tokens[i];
				+        if (!numToken_re.test(toki)) {
				+            // Non-numeric token or delimiter: copy it through unchanged.
				+            outputString += toki;
				+            continue;
				+        }
				+
				+        /*
				+         * A numeral token starts a run. The inner loop consumes it and all
				+         * consecutive numeral tokens after it, building two candidates:
				+         * textVersion (pinyin without the # suffixes) and numVersion (Arabic
				+         * values where given). The run ends at the first non-numeral token or
				+         * at the end of the string; one candidate is then appended to the output.
				+         */
				+        let textVersion = "";
				+        let numVersion = "";
				+        for (let j = i; j < n; j++) {
				+            const tokj = tokens[j];
				+            const m = tokj.match(numToken_re);
				+            const isToken = j % 2 === 0;
				+
				+            if ((isToken && !m) || j === n - 1) {
				+                if (m) {
				+                    // Last token of the string and it is a numeral.
				+                    textVersion += m[1];
				+                    numVersion += m[2] === "" ? m[1] : m[2];
				+                } else if (j === n - 1) {
				+                    // Last token of the string and it is not a numeral: just tack it on.
				+                    textVersion += tokj;
				+                    numVersion += tokj;
				+                } else if (textVersion.length > 0 && numVersion.length > 0) {
				+                    // Not at the end yet: drop the delimiter appended last, the
				+                    // outer loop will pick it up again.
				+                    textVersion = textVersion.substring(0, textVersion.length - 1);
				+                    numVersion = numVersion.substring(0, numVersion.length - 1);
				+                }
				+
				+                // Use the numeral version for ordinals and date strings.
				+                // The month/day pattern used to start with a stray double quote,
				+                // so it only matched when the date followed a literal quote.
				+                if (numVersion.match(/^di [0-9]/i) ||
				+                    numVersion.match(/[0-9] [0-9] [0-9] [0-9]/) ||
				+                    numVersion.match(/[0-9]+ nian [0-9]+ yue/i) ||
				+                    numVersion.match(/[0-9]+ yue [0-9]+ ri/i) ||
				+                    useNumVersion
				+                   ) {
				+                    useNumVersion = true;
				+
				+                    /*
				+                     * The string may contain literal translations of Chinese numerals.
				+                     * Convert these to Arabic numerals (for example "2 10 7" = "27"):
				+                     * a number followed by a power-of-ten style value is multiplied,
				+                     * otherwise such a value followed by a number is added.
				+                     */
				+                    while (numVersion.match(/[0-9] 10+/) || numVersion.match(/[1-9]0+ [1-9]/)) {
				+                        const product = numVersion.match(/([0-9]+) ([1-9]0+)/);
				+                        if (product) {
				+                            const multiplied = Number(product[1]) * Number(product[2]);
				+                            numVersion = numVersion.replace(/[0-9]+ [1-9]0+/, String(multiplied));
				+                            continue;
				+                        }
				+                        const addition = numVersion.match(/([1-9]0+) ([0-9]+)/);
				+                        if (!addition) {
				+                            break;
				+                        }
				+                        const added = Number(addition[1]) + Number(addition[2]);
				+                        numVersion = numVersion.replace(/[1-9]0+ [0-9]+/, String(added));
				+                    }
				+
				+                    // Join four space-separated digits into one number.
				+                    numVersion = numVersion.replace(/([0-9]) ([0-9]) ([0-9]) ([0-9])/g, "$1$2$3$4");
				+                    if (forceNumerals) {
				+                        // In 245/830 $n, join every remaining space-separated digit pair.
				+                        while (numVersion.match(/[0-9] [0-9]/)) {
				+                            numVersion = numVersion.replace(/([0-9]) ([0-9])/, "$1$2");
				+                        }
				+                    }
				+                }
				+
				+                outputString += useNumVersion ? numVersion : textVersion;
				+
				+                // Before the end of the string, backtrack so that the outer loop's
				+                // i++ lands on the delimiter after the last numeral token.
				+                // At the end of the string we are done.
				+                i = j < n - 1 ? j - 2 : j;
				+                break;
				+            }
				+
				+            // Still inside the run: append the numeral token or the delimiter.
				+            if (isToken && m) {
				+                textVersion += m[1];
				+                numVersion += m[2] === "" ? m[1] : m[2];
				+            } else {
				+                textVersion += tokj;
				+                numVersion += tokj;
				+            }
				+        }
				+    }
				+    return outputString;
				+}
			
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -143,7 +143,7 @@ def init_db():
 
				 
			
 
				     This operation removes any preexisting database.
			
 
				 
			
 
				-    All tables in the index file (`./data/index.yml`) will be parsed
			
 
				+    All tables in the index file (`./index.yml`) will be parsed
			
 
				     (including inheritance rules) and loaded into the designated DB.
			
 
				 
			
 
				     This must be done only once at bootstrap. To update individual tables,
			
@@ -151,7 +151,7 @@ def init_db():
 
				     """
			
 
				     # Create parent directories if necessary.
			
 
				     # If the DB already exists, it will be overwritten ONLY on success at
			
 
				-    # hhis point.
			
 
				+    # this point.
			
 
				     if path.isfile(TMP_DB_PATH):
			
 
				         # Remove previous temp file (possibly from failed attempt)
			
 
				         unlink(TMP_DB_PATH)
			
@@ -166,21 +166,12 @@ def init_db():
 
				             conn.executescript(fh.read())
			
 
				 
			
 
				     # Populate tables.
			
 
				-    with open(path.join(TABLE_DIR, "index.yml")) as fh:
			
 
				+    with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
			
 
				         tlist = load(fh, Loader=Loader)
			
 
				     try:
			
 
				         with conn:
			
 
				             for tname, tdata in tlist.items():
			
 
				-                res = conn.execute(
			
 
				-                    """INSERT INTO tbl_language (
			
 
				-                        name, label, marc_code, description
			
 
				-                    ) VALUES (?, ?, ?, ?)""",
			
 
				-                    (
			
 
				-                        tname, tdata.get("name"), tdata.get("marc_code"),
			
 
				-                        tdata.get("description"),
			
 
				-                    )
			
 
				-                )
			
 
				-                populate_table(conn, res.lastrowid, tname)
			
 
				+                populate_table(conn, tname, tdata)
			
 
				 
			
 
				         # If the DB already exists, it will be overwritten ONLY on success at
			
 
				         # this point.
			
@@ -201,7 +192,27 @@ def get_connection():
 
				     return sqlite3.connect(DB_PATH)
			
 
				 
			
 
				 
			
 
				-def populate_table(conn, tid, tname):
			
 
				+def populate_table(conn, tname, tdata):
			
 
				+    """
			
 
				+    Populate an individual table with data from a configuration.
			
 
				+
			
 
				+    @param conn: SQLite connection.
			
 
				+
			
 
				+    @param tname(str): Table name.
			
 
				+
			
 
				+    @param tdata(dict): Table data.
			
 
				+    """
			
 
				+    res = conn.execute(
			
 
				+        """INSERT INTO tbl_language (
			
 
				+            name, label, marc_code, description
			
 
				+        ) VALUES (?, ?, ?, ?)""",
			
 
				+        (
			
 
				+            tname, tdata.get("name"), tdata.get("marc_code"),
			
 
				+            tdata.get("description"),
			
 
				+        )
			
 
				+    )
			
 
				+    tid = res.lastrowid
			
 
				+
			
 
				     data = load_table(tname)
			
 
				     flags = 0
			
 
				     if "script_to_roman" in data:
			
@@ -579,7 +590,6 @@ def get_lang_ignore(conn, lang_id):
 
				             """SELECT rule, features FROM tbl_ignore
			
 
				             WHERE lang_id = ?""",
			
 
				             (lang_id,))
			
 
				-    # Features (regular expressions) not implemented yet.
			
 
				     return tuple(
			
 
				             compile(row[0]) if row[1] & FEAT_RE else row[0]
			
 
				             for row in qry)
			
--- a/scriptshifter/tables/data/index.yml
+++ b/scriptshifter/tables/data/index.yml
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,11 +1,10 @@
 
				 from csv import reader
			
 
				 from difflib import ndiff
			
 
				+from glob import glob
			
 
				 from importlib import reload
			
 
				 from json import loads as jloads
			
 
				 from logging import getLogger
			
 
				-from os import path
			
 
				-
			
 
				-import scriptshifter.tables
			
 
				+from os import environ, path
			
 
				 
			
 
				 from scriptshifter.trans import transliterate
			
 
				 
			
@@ -17,8 +16,20 @@ logger = getLogger(__name__)
 
				 
			
 
				 
			
 
				 def reload_tables():
			
 
				-    reload(scriptshifter.tables)  # Reload new config dir.
			
 
				+    if "TXL_CONFIG_TABLE_DIR" in environ:
			
 
				+        del environ["TXL_CONFIG_TABLE_DIR"]
			
 
				+
			
 
				+    # import here to set modified test config dir.
			
 
				     from scriptshifter import tables
			
 
				+
			
 
				+    tables.init_db()
			
 
				+
			
 
				+    for fname in glob(path.join(TEST_DATA_DIR, "config", ".yml")):
			
 
				+        tname = path.splitext(path.basename(filename))[1]
			
 
				+        with tables.get_connection() as conn:
			
 
				+            tables.populate_table(conn, tname, {"name": fname})
			
 
				+
			
 
				+
			
 
				     tables.list_tables.cache_clear()
			
 
				     tables.get_language.cache_clear()
			
 
				     tables.get_lang_map.cache_clear()
			
@@ -41,7 +52,10 @@ def test_sample(dset):
 
				 
			
 
				     with open(dset_fpath, newline="") as fh:
			
 
				         csv = reader(fh)
			
 
				+        i = 1
			
 
				         for row in csv:
			
 
				+            logger.info(f"CSV row #{i}")
			
 
				+            i += 1
			
 
				             lang, script, rom = row[:3]
			
 
				             if not lang:
			
 
				                 continue
			
--- a/tests/data/config/_base1.yml
+++ b/tests/data/config/_base1.yml
--- a/tests/data/config/_base2.yml
+++ b/tests/data/config/_base2.yml
--- a/tests/data/config/_base3.yml
+++ b/tests/data/config/_base3.yml
--- a/tests/data/config/cap_base1.yml
+++ b/tests/data/config/cap_base1.yml
--- a/tests/data/config/cap_base2.yml
+++ b/tests/data/config/cap_base2.yml
--- a/tests/data/config/cap_inherited.yml
+++ b/tests/data/config/cap_inherited.yml
--- a/tests/data/config/index.yml
+++ b/tests/data/config/index.yml
--- a/tests/data/config/inherited.yml
+++ b/tests/data/config/inherited.yml
--- a/tests/data/config/ordering.yml
+++ b/tests/data/config/ordering.yml
--- a/tests/data/config/rot3.yml
+++ b/tests/data/config/rot3.yml
--- a/tests/data/script_samples/unittest.csv
+++ b/tests/data/script_samples/unittest.csv
@@ -0,0 +1,9 @@
 
				+chinese,從易經解維摩詰經，臺北市大塊文化出版股份有限公司。,"cong yi jing jie wei mo jie jing, Taibei Shi da kuai wen hua chu ban gu fen you xian gong si.",,
			
 
				+chinese,廖忠俊. 著名狀元榜眼探花傳略,liao zhong jun. zhu ming zhuang yuan bang yan tan hua zhuan lüe,,
			
 
				+chinese,文學革命論 / 陳獨秀 -- 人的文學 / 周作人 -- 新文學運動的意義 / 張我軍.,wen xue ge ming lun / chen du xiu -- ren de wen xue / zhou zuo ren -- xin wen xue yun dong de yi yi / zhang wo jun.,,
			
 
				+belarusian,Пётр Клімук : жыццё і подзвіг касманаўта,Pi︠o︡tr Klimuk : z︠h︡ytstsi︠o︡ i podzvih kasmanaŭta,,
			
 
				+greek_classical,Ἡσιόδου τοῦ Ἀσκραίου Ἔργα καὶ ἡμέραι,Hēsiodou tou Askraiou Erga kai hēmerai,,
			
 
				+korean_names,간규찬,Kan Kyu-ch'an,,Hangul; from Y. Lee,
			
 
				+korean_names,강감찬,Kang Kam-ch'an,,Hangul; from Y. Lee,
			
 
				+korean_nonames,내 나름 대로 의 사랑,Nae narŭm taero ŭi sarang,"{""capitalize"": ""first""}",From K-Romanizer,
			
 
				+korean_nonames,내 마음 속 의 한국 문학,Nae maŭm sok ŭi Han'guk munhak,"{""capitalize"": ""first""}",From K-Romanizer,
			
--- a/tests/test02_transliteration.py
+++ b/tests/test02_transliteration.py
@@ -8,7 +8,7 @@ from os import environ, path
 
				 
			
 
				 from tests import TEST_DATA_DIR, reload_tables
			
 
				 from scriptshifter.trans import transliterate
			
 
				-import scriptshifter.tables
			
 
				+from scriptshifter.tables import get_language
			
 
				 
			
 
				 
			
 
				 logger = logging.getLogger(__name__)
			
@@ -33,8 +33,8 @@ class TestTrans(TestCase):
 
				         This function name won't start with `test_` otherwise will be
			
 
				         automatically run without parameters.
			
 
				         """
			
 
				-        config = scriptshifter.tables.load_table(self.tbl)
			
 
				-        if "script_to_roman" in config:
			
 
				+        config = get_language(self.tbl)
			
 
				+        if config["has_s2r"]:
			
 
				             txl = transliterate(
			
 
				                     self.script, self.tbl,
			
 
				                     capitalize=self.options.get("capitalize", False),
			
@@ -51,8 +51,8 @@ class TestTrans(TestCase):
 
				         This function name won't start with `test_` otherwise will be
			
 
				         automatically run without parameters.
			
 
				         """
			
 
				-        config = scriptshifter.tables.load_table(self.tbl)
			
 
				-        if "roman_to_script" in config:
			
 
				+        config = get_language(self.tbl)
			
 
				+        if config["has_r2s"]:
			
 
				             txl = transliterate(
			
 
				                     self.roman, self.tbl,
			
 
				                     t_dir="r2s",
			
@@ -68,25 +68,24 @@ def make_suite():
 
				     """
			
 
				     Build parametrized test cases.
			
 
				     """
			
 
				-    if "TXL_CONFIG_TABLE_DIR" in environ:
			
 
				-        del environ["TXL_CONFIG_TABLE_DIR"]
			
 
				     reload_tables()
			
 
				 
			
 
				     suite = TestSuite()
			
 
				 
			
 
				-    for fpath in glob(path.join(TEST_DATA_DIR, "script_samples", "*.csv")):
			
 
				-        with open(fpath, newline="") as fh:
			
 
				-            csv = reader(fh)
			
 
				-            for row in csv:
			
 
				-                if len(row[0]):
			
 
				-                    # Inject transliteration info in the test case.
			
 
				-                    for tname in ("sample_s2r", "sample_r2s"):
			
 
				-                        tcase = TestTrans(tname)
			
 
				-                        tcase.tbl = row[0]
			
 
				-                        tcase.script = row[1].strip()
			
 
				-                        tcase.roman = row[2].strip()
			
 
				-                        tcase.options = jloads(row[3]) if len(row[3]) else {}
			
 
				-                        suite.addTest(tcase)
			
 
				+    with open(path.join(
			
 
				+        TEST_DATA_DIR, "script_samples", "unittest.csv"
			
 
				+    ), newline="") as fh:
			
 
				+        csv = reader(fh)
			
 
				+        for row in csv:
			
 
				+            if len(row[0]):
			
 
				+                # Inject transliteration info in the test case.
			
 
				+                for tname in ("sample_s2r", "sample_r2s"):
			
 
				+                    tcase = TestTrans(tname)
			
 
				+                    tcase.tbl = row[0]
			
 
				+                    tcase.script = row[1].strip()
			
 
				+                    tcase.roman = row[2].strip()
			
 
				+                    tcase.options = jloads(row[3]) if len(row[3]) else {}
			
 
				+                    suite.addTest(tcase)
			
 
				 
			
 
				     return suite