__init__.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. import logging
  2. from functools import cache
  3. from os import path, access, R_OK
  4. from yaml import load
  5. try:
  6. from yaml import CLoader as Loader
  7. except ImportError:
  8. from yaml import Loader
  9. __doc__ = """
  10. Transliteration tables.
  11. These tables contain all transliteration information, grouped by script and
  12. language (or language and script? TBD)
  13. """
  14. TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
  15. logger = logging.getLogger(__name__)
  16. class Token(str):
  17. """
  18. Token class: minimal unit of text parsing.
  19. This class overrides the `<` operator for strings, so that sorting is done
  20. in a way that prioritizes a longer string over a shorter one with identical
  21. root.
  22. """
  23. def __init__(self, content):
  24. self.content = content
  25. def __lt__(self, other):
  26. """
  27. Operator to sort tokens.
  28. E.g:
  29. - ABCD
  30. - AB
  31. - A
  32. - BCDE
  33. - BCD
  34. - BEFGH
  35. - B
  36. """
  37. logger.debug(f"a: {self.content}, b: {other.content}")
  38. self_len = len(self.content)
  39. other_len = len(other.content)
  40. min_len = min(self_len, other_len)
  41. # If one of the strings is entirely contained in the other string...
  42. if self.content[:min_len] == other.content[:min_len]:
  43. logger.debug("Roots match.")
  44. # ...then the longer one takes precedence (is "less")
  45. return self_len > other_len
  46. # If the root strings are different, perform a normal comparison.
  47. return self.content < other.content
  48. def __hash__(self):
  49. return hash(self.content)
  50. @cache
  51. def list_tables():
  52. """
  53. List all the available tables.
  54. """
  55. with open(path.join(TABLE_DIR, "index.yml")) as fh:
  56. tdata = load(fh, Loader=Loader)
  57. return tdata
  58. @cache
  59. def load_table(tname):
  60. """
  61. Load one transliteration table and possible parent.
  62. The table file is parsed into an in-memory configuration that contains
  63. the language & script metadata and parsing rules.
  64. """
  65. fname = path.join(TABLE_DIR, tname + ".yml")
  66. if not access(fname, R_OK):
  67. raise ValueError(f"No transliteration table for {tname}!")
  68. with open(fname) as fh:
  69. tdata = load(fh, Loader=Loader)
  70. # NOTE Only one level of inheritance. No need for recursion for now.
  71. parent = tdata.get("general", {}).get("inherits", None)
  72. if parent:
  73. parent_tdata = load_table(parent)
  74. if "script_to_roman" in tdata:
  75. tokens = {
  76. Token(k): v
  77. for k, v in tdata["script_to_roman"].get("map", {}).items()}
  78. if parent:
  79. # Merge (and override) parent values.
  80. tokens = {
  81. Token(k): v for k, v in parent_tdata.get(
  82. "script_to_roman", {}).get("map", {})
  83. } | tokens
  84. tdata["script_to_roman"]["map"] = tuple(
  85. (k.content, tokens[k]) for k in sorted(tokens))
  86. if "roman_to_script" in tdata:
  87. tokens = {
  88. Token(k): v
  89. for k, v in tdata["roman_to_script"].get("map", {}).items()}
  90. if parent:
  91. # Merge (and override) parent values.
  92. tokens = {
  93. Token(k): v for k, v in parent_tdata.get(
  94. "roman_to_script", {}).get("map", {})
  95. } | tokens
  96. tdata["roman_to_script"]["map"] = tuple(
  97. (k.content, tokens[k]) for k in sorted(tokens))
  98. if parent:
  99. p_ignore = {
  100. Token(t) for t in parent_tdata.get(
  101. "roman_to_script", {}).get("ignore", [])}
  102. else:
  103. p_ignore = set()
  104. ignore = {
  105. Token(t)
  106. for t in tdata["roman_to_script"].get("ignore", [])
  107. } | p_ignore
  108. tdata["roman_to_script"]["ignore"] = [
  109. t.content for t in sorted(ignore)]
  110. return tdata