123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- import logging
- from functools import cache
- from os import path, access, R_OK
- from yaml import load
- try:
- from yaml import CLoader as Loader
- except ImportError:
- from yaml import Loader
__doc__ = """
Transliteration tables.
These tables contain all transliteration information, grouped by script and
language (or language and script? TBD)
"""

# Directory holding the table files: the "data" folder that sits next to this
# module. The module path goes through `realpath` before its directory is
# taken.
TABLE_DIR = path.join(
    path.dirname(path.realpath(__file__)),
    "data",
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Token:
    """
    Token class: minimal unit of text parsing.

    This class overrides the `<` operator for strings, so that sorting is done
    in a way that prioritizes a longer string over a shorter one with identical
    root.

    Tokens compare equal and hash by their content, so two tokens wrapping the
    same string count as one dictionary key or set member. Without this, a
    merge such as `{Token("a"): 1} | {Token("a"): 2}` keeps both entries
    instead of overriding the first. `content` should therefore be treated as
    immutable once a token is used as a key.
    """

    def __init__(self, content):
        self.content = content

    def __repr__(self):
        return f"{type(self).__name__}({self.content!r})"

    def __eq__(self, other):
        """Two tokens are equal when they wrap the same content."""
        if not isinstance(other, Token):
            return NotImplemented
        return self.content == other.content

    def __hash__(self):
        # Must agree with __eq__: equal content, equal hash.
        return hash(self.content)

    def __lt__(self, other):
        """
        Operator to sort tokens.

        E.g:

        - ABCD
        - AB
        - A
        - BCDE
        - BCD
        - BEFGH
        - B
        """
        # Lazy %-style arguments: the message is only built when DEBUG is
        # enabled, which matters because sorting calls this many times.
        logger.debug("a: %s, b: %s", self.content, other.content)

        self_len = len(self.content)
        other_len = len(other.content)
        min_len = min(self_len, other_len)

        # If one of the strings is entirely contained in the other string...
        if self.content[:min_len] == other.content[:min_len]:
            logger.debug("Roots match.")
            # ...then the longer one takes precedence (is "less"). Tokens with
            # identical content are not "less" than each other.
            return self_len > other_len

        # If the root strings are different, perform a normal comparison.
        return self.content < other.content
@cache
def list_tables():
    """
    List all the available tables.

    Parse `index.yml` from the table directory and return its content as
    loaded by the YAML parser. The result is cached, so every call returns
    the same object; callers should not modify it in place.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(path.join(TABLE_DIR, "index.yml"), encoding="utf-8") as fh:
        return load(fh, Loader=Loader)
def _merge_token_map(parent_map, own_map):
    """
    Merge a parent mapping with a table's own mapping and sort the result.

    Both arguments may be a dict or an iterable of `(key, value)` pairs.
    Entries in `own_map` override entries in `parent_map` with the same key.
    The result is a tuple of `(key, value)` pairs in `Token` order.
    """
    # Merge on the plain keys so that overriding works by value; Token is
    # only used to decide the ordering.
    merged = dict(parent_map) | dict(own_map)
    return tuple((k, merged[k]) for k in sorted(merged, key=Token))


def _merge_ignore_list(parent_ignore, own_ignore):
    """
    Combine two ignore lists into one duplicate-free list in `Token` order.
    """
    return sorted(set(own_ignore) | set(parent_ignore), key=Token)


@cache
def load_table(tname):
    """
    Load one transliteration table and possible parent.

    The table file is parsed into an in-memory configuration that contains
    the language & script metadata and parsing rules.

    If the table declares a parent under `general.inherits`, the parent is
    loaded through this same function and its `map` entries (and, for
    `roman_to_script`, its `ignore` entries) are merged in; the table's own
    entries win over the parent's. Each `map` becomes a tuple of
    `(key, value)` pairs and `ignore` becomes a list, both sorted by `Token`
    order.

    The result is cached per table name, so repeated calls return the same
    object; callers should not modify it in place.

    Raises:
        ValueError: if no readable table file exists for `tname`.
    """
    fname = path.join(TABLE_DIR, tname + ".yml")
    if not access(fname, R_OK):
        raise ValueError(f"No transliteration table for {tname}!")

    # NOTE(review): `Loader` is the full YAML loader, which must not be used
    # on untrusted documents, and `tname` is joined into the path unchecked.
    # This presumably only ever reads table files shipped with the package --
    # confirm before exposing `tname` to external input.
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(fname, encoding="utf-8") as fh:
        tdata = load(fh, Loader=Loader)

    # NOTE Only one level of inheritance. No need for recursion for now.
    parent = tdata.get("general", {}).get("inherits", None)
    parent_tdata = load_table(parent) if parent else {}

    for direction in ("script_to_roman", "roman_to_script"):
        if direction not in tdata:
            continue
        # The parent comes back from load_table() already processed, so its
        # map is a tuple of pairs (or missing altogether).
        tdata[direction]["map"] = _merge_token_map(
            parent_tdata.get(direction, {}).get("map", ()),
            tdata[direction].get("map", {}),
        )

    if "roman_to_script" in tdata:
        tdata["roman_to_script"]["ignore"] = _merge_ignore_list(
            parent_tdata.get("roman_to_script", {}).get("ignore", []),
            tdata["roman_to_script"].get("ignore", []),
        )

    return tdata
|