|
@@ -1,5 +1,7 @@
|
|
|
+import logging
|
|
|
+
|
|
|
from functools import cache
|
|
|
-from glob import glob
|
|
|
+# from glob import glob
|
|
|
from os import path, access, R_OK
|
|
|
|
|
|
from yaml import load
|
|
@@ -20,6 +22,49 @@ language (or language and script? TBD)
|
|
|
TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
|
|
|
|
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+
|
|
|
+class Token:
|
|
|
+ """
|
|
|
+ Token class: minimal unit of text parsing.
|
|
|
+
|
|
|
+ This class overrides the `<` operator for strings, so that sorting is done
|
|
|
+ in a way that prioritizes a longer string over a shorter one with identical
|
|
|
+ root.
|
|
|
+ """
|
|
|
+ def __init__(self, content):
|
|
|
+ self.content = content
|
|
|
+
|
|
|
+ def __lt__(self, other):
|
|
|
+ """
|
|
|
+ Operator to sort tokens.
|
|
|
+
|
|
|
+ E.g:
|
|
|
+
|
|
|
+ - ABCD
|
|
|
+ - AB
|
|
|
+ - A
|
|
|
+ - BCDE
|
|
|
+ - BCD
|
|
|
+ - BEFGH
|
|
|
+ - B
|
|
|
+ """
|
|
|
+ logger.debug(f"a: {self.content}, b: {other.content}")
|
|
|
+ self_len = len(self.content)
|
|
|
+ other_len = len(other.content)
|
|
|
+ min_len = min(self_len, other_len)
|
|
|
+
|
|
|
+ # If one of the strings is entirely contained in the other string...
|
|
|
+ if self.content[:min_len] == other.content[:min_len]:
|
|
|
+ logger.debug("Roots match.")
|
|
|
+ # ...then the longer one takes precedence (is "less")
|
|
|
+ return self_len > other_len
|
|
|
+
|
|
|
+ # If the root strings are different, perform a normal comparison.
|
|
|
+ return self.content < other.content
|
|
|
+
|
|
|
+
|
|
|
@cache
|
|
|
def load_table(tname):
|
|
|
"""
|
|
@@ -36,14 +81,18 @@ def load_table(tname):
|
|
|
with open(fname) as fh:
|
|
|
tdata = load(fh, Loader=Loader)
|
|
|
|
|
|
- # TODO Rearrange parsing tokens alphabetically, but so that the longest
|
|
|
- # ones come first. E.g.
|
|
|
- # - ABCD
|
|
|
- # - AB
|
|
|
- # - A
|
|
|
- # - BCDE
|
|
|
- # - BCD
|
|
|
- # - BEFGH
|
|
|
- # - B
|
|
|
+ if "script_to_roman" in tdata:
|
|
|
+ tokens = {
|
|
|
+ Token(k): v
|
|
|
+ for k, v in tdata["script_to_roman"].get("map", {}).items()}
|
|
|
+ tdata["script_to_roman"]["map"] = tuple(
|
|
|
+ (k.content, tokens[k]) for k in sorted(tokens))
|
|
|
+
|
|
|
+ if "roman_to_script" in tdata:
|
|
|
+ tokens = {
|
|
|
+ Token(k): v
|
|
|
+ for k, v in tdata["roman_to_script"].get("map", {}).items()}
|
|
|
+ tdata["roman_to_script"]["map"] = tuple(
|
|
|
+ (k.content, tokens[k]) for k in sorted(tokens))
|
|
|
|
|
|
return tdata
|