Quellcode durchsuchen

ALMOST translate sample Chinese string.

Stefano Cossu vor 2 Jahren
Ursprung
Commit
87e73f696d

+ 6 - 1
transliterator/tables/data/chinese.yml

@@ -11,7 +11,11 @@ script_to_roman:
     #subfields_always_excluded: uvxy0123456789
     #other_subfields_excluded_by_tag: 100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i
     #uppercase_first_character_in_subfield: 260/b
-    personal_name_handling: true
+    # personal_name_handling: true  # This is handled by the text scanner.
+    # Capitalize only the first letter of the string.
+    # TODO Implement a list of all punctuation marks that require the
+    # following letter to be capitalized.
+    capitalize: true
   map: # Mapping section.
     # RDA boilerplate phrases not transliterated.
     # All characters not found in the mapping are copied verbatim. No need for
@@ -28,6 +32,7 @@ script_to_roman:
     # logic maybe.
     "\u4E2D\u56FD": "Zhongguo "
     "\u5317\u4EAC": "Beijing "
+    "\u9808\u5F4C": "Xumi "
 
     # Single character transliteration.
     "\u2018": "'"

+ 0 - 0
transliterator/tests/test01_cfg.py


+ 20 - 0
transliterator/tests/test02_transliteration.py

@@ -0,0 +1,20 @@
+import unittest
+
+from transliterator.trans import transliterate
+
+
+class TestScriptToRoman(unittest.TestCase):
+    """
+    Test S2R transliteration.
+
+    TODO use a comprehensive sample table and report errors for unsupported
+    languages.
+    """
+
+    def test_basic_chinese(self):
+        src = "撞倒須彌 : 漢傳佛教青年學者論壇論文集"
+        dest = (
+                "Zhuang dao Xumi : Han chuan Fo jiao qing nian xue zhe lun "
+                "tan lun wen ji ")
+
+        assert transliterate(src) == dest

+ 0 - 0
transliterator/tests/test03_rest_api.py


+ 77 - 0
transliterator/trans.py

@@ -0,0 +1,77 @@
+import logging
+
+from transliterator.tables import load_table
+
+
+logger = logging.getLogger(__name__)
+
+
+def transliterate(src, script, lang, s2r=True):
+    """
+    Transliterate a single string.
+
+    Args:
+        src (str): Source string.
+
+        lang (str): Language name.
+
+        script (str): Name of the script that the language is encoded in.
+
+    Keyword args:
+        s2r (bool): If True (the default), the source is considered to be a
+        non-latin script in the language and script specified, and the output
+        the Romanization thereof; if False, the source is considered to be
+        romanized text to be transliterated into the specified script/language.
+
+    Return:
+        str: The transliterated string.
+    """
+    # TODO script is ignored at the moment.
+    cfg = load_table(lang)
+    # General directives.
+    # general_dir = cfg.get("directives", {})
+
+    # We could be clever here but let's give the users a precise message.
+    if s2r and "script_to_roman" not in cfg:
+        raise NotImplementedError(
+            f"Script-to-Roman transliteration not yet supported for {lang}."
+        )
+    elif not s2r and "roman_to_script" not in cfg:
+        raise NotImplementedError(
+            f"Roman-to-script transliteration not yet supported for {lang}."
+        )
+
+    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
+    langsec_dir = langsec.get("directives", {})
+
+    i = 0
+    dest_ls = []
+    # Loop through source characters. The increment of each loop depends on the
+    # length of the token that eventually matches.
+    while i < len(src):
+        match = False
+        for src_tk, dest_tk in langsec["map"]:
+            # Longer tokens should be guaranteed to be scanned before their
+            # substrings at this point.
+            step = len(src_tk)
+            if src_tk == src[i:i + step]:
+                # A match is found. Stop scanning tokens, append result, and
+                # proceed scanning the source.
+                dest_ls.append(dest_tk)
+                match = True
+                i += step
+                break
+        if not match:
+            # Copy non-mapped character (one at a time).
+            logger.info(f"Token {src[i]} at position {i} is not mapped.")
+            dest_ls.append(src[i])
+            i += 1
+
+    breakpoint()
+    if langsec_dir.get("capitalize", False):
+        dest_ls[0] = dest_ls[0].capitalize()
+
+    logger.info(f"Output list: {dest_ls}")
+    dest = "".join(dest_ls)
+
+    return dest