# trans.py — transliteration engine (table-driven script <-> Latin conversion).
  1. import logging
  2. import re
  3. from transliterator.tables import load_table
  4. # Match multiple spaces.
  5. MULTI_WS_RE = re.compile(r"\s{2,}")
  6. logger = logging.getLogger(__name__)
  7. def transliterate(src, lang, s2r=True):
  8. """
  9. Transliterate a single string.
  10. Args:
  11. src (str): Source string.
  12. lang (str): Language name.
  13. Keyword args:
  14. s2r (bool): If True (the default), the source is considered to be a
  15. non-latin script in the language and script specified, and the output
  16. the Romanization thereof; if False, the source is considered to be
  17. romanized text to be transliterated into the specified script/language.
  18. Return:
  19. str: The transliterated string.
  20. """
  21. source_str = "Latin" if s2r else lang
  22. target_str = lang if s2r else "Latin"
  23. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  24. cfg = load_table(lang)
  25. logger.info(f"Loaded table for {lang}.")
  26. # General directives.
  27. # general_dir = cfg.get("directives", {})
  28. if s2r and "script_to_roman" not in cfg:
  29. raise NotImplementedError(
  30. f"Script-to-Roman transliteration not yet supported for {lang}."
  31. )
  32. elif not s2r and "roman_to_script" not in cfg:
  33. raise NotImplementedError(
  34. f"Roman-to-script transliteration not yet supported for {lang}."
  35. )
  36. langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
  37. langsec_dir = langsec.get("directives", {})
  38. i = 0
  39. dest_ls = []
  40. # Loop through source characters. The increment of each loop depends on the
  41. # length of the token that eventually matches.
  42. ignore_list = langsec.get("ignore", []) # Only present in R2S
  43. while i < len(src):
  44. # Check ignore list first. Find as many subsequent ignore tokens
  45. # as possible before moving on to looking for match tokens.
  46. while True:
  47. ignoring = False
  48. for tk in ignore_list:
  49. step = len(tk)
  50. if tk == src[i:i + step]:
  51. logger.info(f"Ignored token: {tk}")
  52. dest_ls.append(tk)
  53. i += step
  54. ignoring = True
  55. break
  56. # We looked through all ignore tokens, not found any. Move on.
  57. if not ignoring:
  58. break
  59. match = False
  60. for src_tk, dest_tk in langsec["map"]:
  61. # Longer tokens should be guaranteed to be scanned before their
  62. # substrings at this point.
  63. step = len(src_tk)
  64. if src_tk == src[i:i + step]:
  65. # A match is found. Stop scanning tokens, append result, and
  66. # proceed scanning the source.
  67. dest_ls.append(dest_tk)
  68. match = True
  69. i += step
  70. break
  71. if not match:
  72. # No match found. Copy non-mapped character (one at a time).
  73. logger.info(f"Token {src[i]} at position {i} is not mapped.")
  74. dest_ls.append(src[i])
  75. i += 1
  76. if langsec_dir.get("capitalize", False):
  77. dest_ls[0] = dest_ls[0].capitalize()
  78. logger.debug(f"Output list: {dest_ls}")
  79. dest = "".join(dest_ls)
  80. dest = re.sub(MULTI_WS_RE, ' ', dest.strip())
  81. return dest