Переглянути джерело

Refine config file parser; add notes to Chinese.

Stefano Cossu 2 роки тому
батько
коміт
02c87aeb23

+ 59 - 10
transliterator/tables/__init__.py

@@ -1,5 +1,7 @@
+import logging
+
 from functools import cache
-from glob import glob
+# from glob import glob
 from os import path, access, R_OK
 
 from yaml import load
@@ -20,6 +22,49 @@ language (or language and script? TBD)
 TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
 
 
+logger = logging.getLogger(__name__)
+
+
+class Token:
+    """
+    Token class: minimal unit of text parsing.
+
+    This class overrides the `<` operator for strings, so that sorting is done
+    in a way that prioritizes a longer string over a shorter one with identical
+    root.
+    """
+    def __init__(self, content):
+        self.content = content
+
+    def __lt__(self, other):
+        """
+        Operator to sort tokens.
+
+        E.g:
+
+        - ABCD
+        - AB
+        - A
+        - BCDE
+        - BCD
+        - BEFGH
+        - B
+        """
+        logger.debug(f"a: {self.content}, b: {other.content}")
+        self_len = len(self.content)
+        other_len = len(other.content)
+        min_len = min(self_len, other_len)
+
+        # If one of the strings is entirely contained in the other string...
+        if self.content[:min_len] == other.content[:min_len]:
+            logger.debug("Roots match.")
+            # ...then the longer one takes precedence (is "less")
+            return self_len > other_len
+
+        # If the root strings are different, perform a normal comparison.
+        return self.content < other.content
+
+
 @cache
 def load_table(tname):
     """
@@ -36,14 +81,18 @@ def load_table(tname):
     with open(fname) as fh:
         tdata = load(fh, Loader=Loader)
 
-    # TODO Rearrange parsing tokens alphabetically, but so that the longest
-    # ones come first. E.g.
-    # - ABCD
-    # - AB
-    # - A
-    # - BCDE
-    # - BCD
-    # - BEFGH
-    # - B
+    if "script_to_roman" in tdata:
+        tokens = {
+                Token(k): v
+                for k, v in tdata["script_to_roman"].get("map", {}).items()}
+        tdata["script_to_roman"]["map"] = tuple(
+                (k.content, tokens[k]) for k in sorted(tokens))
+
+    if "roman_to_script" in tdata:
+        tokens = {
+                Token(k): v
+                for k, v in tdata["roman_to_script"].get("map", {}).items()}
+        tdata["roman_to_script"]["map"] = tuple(
+                (k.content, tokens[k]) for k in sorted(tokens))
 
     return tdata

+ 3 - 1
transliterator/tables/data/chinese.yml

@@ -34,6 +34,8 @@ script_to_roman:
     "\u2019": "'"
     "\u201C": "\""
     "\u201D": "\""
+    # NOTE \u3000 is an ideographic space; shouldn't this translate into a
+    # space?
     "\u3000": ""
     "\u3001": ", " # Underscores are replaced by spaces directly in config.
     "\u3002": ". "
@@ -4131,7 +4133,7 @@ script_to_roman:
     "\u94C9": "Xuan "
     "\u94DB": "dang "
     "\u94DC": "tong "
-    "\u94DD": "l "
+    "\u94DD": "l\u00FC "
     "\u94E8": "quan "
     "\u94ED": "ming "
     "\u94EE": "zheng "