__init__.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. import logging
  2. from functools import cache
  3. # from glob import glob
  4. from os import path, access, R_OK
  5. from yaml import load
  6. try:
  7. from yaml import CLoader as Loader
  8. except ImportError:
  9. from yaml import Loader
  10. __doc__ = """
  11. Transliteration tables.
  12. These tables contain all transliteration information, grouped by script and
  13. language (or language and script? TBD)
  14. """
  15. TABLE_DIR = path.join(path.dirname(path.realpath(__file__)), "data")
  16. logger = logging.getLogger(__name__)
  17. class Token:
  18. """
  19. Token class: minimal unit of text parsing.
  20. This class overrides the `<` operator for strings, so that sorting is done
  21. in a way that prioritizes a longer string over a shorter one with identical
  22. root.
  23. """
  24. def __init__(self, content):
  25. self.content = content
  26. def __lt__(self, other):
  27. """
  28. Operator to sort tokens.
  29. E.g:
  30. - ABCD
  31. - AB
  32. - A
  33. - BCDE
  34. - BCD
  35. - BEFGH
  36. - B
  37. """
  38. logger.debug(f"a: {self.content}, b: {other.content}")
  39. self_len = len(self.content)
  40. other_len = len(other.content)
  41. min_len = min(self_len, other_len)
  42. # If one of the strings is entirely contained in the other string...
  43. if self.content[:min_len] == other.content[:min_len]:
  44. logger.debug("Roots match.")
  45. # ...then the longer one takes precedence (is "less")
  46. return self_len > other_len
  47. # If the root strings are different, perform a normal comparison.
  48. return self.content < other.content
  49. @cache
  50. def load_table(tname):
  51. """
  52. Load one transliteration table.
  53. The table file is parsed into an in-memory configuration that contains
  54. the language & script metadata and parsing rules.
  55. """
  56. fname = path.join(TABLE_DIR, tname + ".yml")
  57. if not access(fname, R_OK):
  58. raise ValueError(f"No transliteration table for {tname}!")
  59. with open(fname) as fh:
  60. tdata = load(fh, Loader=Loader)
  61. if "script_to_roman" in tdata:
  62. tokens = {
  63. Token(k): v
  64. for k, v in tdata["script_to_roman"].get("map", {}).items()}
  65. tdata["script_to_roman"]["map"] = tuple(
  66. (k.content, tokens[k]) for k in sorted(tokens))
  67. if "roman_to_script" in tdata:
  68. tokens = {
  69. Token(k): v
  70. for k, v in tdata["roman_to_script"].get("map", {}).items()}
  71. tdata["roman_to_script"]["map"] = tuple(
  72. (k.content, tokens[k]) for k in sorted(tokens))
  73. return tdata