浏览代码

Initial test of sample CSV (S2R only).

Stefano Cossu 2 年之前
父节点
当前提交
180541aa67

+ 1 - 1
tests/__init__.py

@@ -1,4 +1,4 @@
 from os import path
 
 TEST_DIR = path.dirname(path.realpath(__file__))
-TEST_CONFIG_DIR = path.join(TEST_DIR, "data")
+TEST_DATA_DIR = path.join(TEST_DIR, "data")

+ 0 - 0
tests/data/transliterator_sample_strings.csv → tests/data/sample_strings.csv


+ 2 - 2
tests/test01_cfg.py

@@ -3,7 +3,7 @@ from unittest import TestCase
 from importlib import reload
 from os import environ
 
-from tests import TEST_CONFIG_DIR
+from tests import TEST_DATA_DIR
 import transliterator.tables
 
 
@@ -11,7 +11,7 @@ class TestConfig(TestCase):
     """ Test configuration parsing. """
 
     def test_ordering(self):
-        environ["TXL_CONFIG_TABLE_DIR"] = TEST_CONFIG_DIR
+        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
         reload(transliterator.tables)  # Reload new config dir.
         from transliterator import tables
         tables.list_tables.cache_clear()

+ 39 - 2
tests/test02_transliteration.py

@@ -1,12 +1,19 @@
+import logging
+
 from unittest import TestCase
+from csv import reader
 
 from importlib import reload
-from os import environ
+from os import environ, path
 
+from tests import TEST_DATA_DIR
 from transliterator.trans import transliterate
 import transliterator.tables
 
 
+logger = logging.getLogger(__name__)
+
+
 class TestScriptToRoman(TestCase):
     """
     Test S2R transliteration.
@@ -21,7 +28,6 @@ class TestScriptToRoman(TestCase):
             # import transliterator.tables
 
     def test_basic_chinese(self):
-        breakpoint()
         src = "撞倒須彌 : 漢傳佛教青年學者論壇論文集"
         dest = (
                 "Zhuang dao Xumi : han zhuan Fo jiao qing nian xue zhe lun "
@@ -29,3 +35,34 @@ class TestScriptToRoman(TestCase):
 
         trans = transliterate(src, "chinese")
         assert trans == dest
+
+    def test_available_samples(self):
+        """
+        Check all available samples; mismatches are logged, not asserted.
+        """
+        for table_key, source, expected in _test_cases():
+            actual = transliterate(source, table_key)
+            if actual != expected:
+                warn_str = f"Mismatching transliteration in {table_key}!"
+                banner = "*" * len(warn_str)
+                for message in (banner, warn_str, banner):
+                    logger.warning(message)
+                logger.info(f"Transliterated string: {actual}")
+                logger.info(f"        Target string: {expected}")
+
+            # assert actual == expected
+
+
+def _test_cases():
+    """
+    Read the sample strings CSV in the test data directory.
+
+    Return a list of (table key, script, Roman) tuples, one per row whose
+    table key column (index 2) is not empty. The first row is skipped.
+    """
+    csv_path = path.join(TEST_DATA_DIR, "sample_strings.csv")
+    # Assumes the CSV is UTF-8; do not rely on the locale default encoding.
+    with open(csv_path, newline="", encoding="utf-8") as fh:
+        rows = reader(fh)
+        next(rows, None)  # Discard header row; an empty file yields [].
+        return [(row[2], row[3], row[4]) for row in rows if row[2]]

+ 0 - 1
transliterator/tables/data/uzbek.yml

@@ -2,7 +2,6 @@ general:
   name: uzbek (Cyrillic)
   parents:
     - _cyrillic_base
-    - _ignore_base
 
 roman_to_script:
   map:

+ 1 - 1
transliterator/trans.py

@@ -9,7 +9,7 @@ from transliterator.tables import load_table
 MULTI_WS_RE = re.compile(r"\s{2,}")
 # Default characters defining a word boundary. TODO Make this configurable
 # per-table.
-WORD_BOUNDARY = " \n\t:;.,\"'"
+WORD_BOUNDARY = " \n\t:;.,\"'-()[]{}"
 
 # Cursor bitwise flags.
 CUR_BOW = 1