trans.py

import logging
import re

from transliterator.tables import load_table

# Match runs of two or more whitespace characters.
MULTI_WS_RE = re.compile(r"\s{2,}")

logger = logging.getLogger(__name__)


def transliterate(src, script, lang, s2r=True):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        script (str): Name of the script that the language is encoded in.
        lang (str): Language name.

    Keyword args:
        s2r (bool): If True (the default), the source is taken to be text in
            the specified non-Latin script and language, and the output is its
            Romanization; if False, the source is taken to be Romanized text
            to be transliterated into the specified script/language.

    Returns:
        str: The transliterated string.
    """
    # TODO: `script` is ignored at the moment.
    cfg = load_table(lang)

    # General directives.
    # general_dir = cfg.get("directives", {})

    # We could be clever here, but let's give the user a precise message.
    if s2r and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif not s2r and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})

    i = 0
    dest_ls = []
    # Loop through the source characters. The increment of each iteration
    # depends on the length of the token that eventually matches.
    while i < len(src):
        match = False
        for src_tk, dest_tk in langsec["map"]:
            # Longer tokens are guaranteed to be scanned before their
            # substrings at this point.
            step = len(src_tk)
            if src_tk == src[i:i + step]:
                # A match is found. Stop scanning tokens, append the result,
                # and proceed with scanning the source.
                dest_ls.append(dest_tk)
                match = True
                i += step
                break
        if not match:
            # Copy the non-mapped character (one at a time).
            logger.info(f"Token {src[i]} at position {i} is not mapped.")
            dest_ls.append(src[i])
            i += 1

    # Guard against an empty source before applying the capitalize directive.
    if dest_ls and langsec_dir.get("capitalize", False):
        dest_ls[0] = dest_ls[0].capitalize()

    logger.info(f"Output list: {dest_ls}")
    dest = "".join(dest_ls)
    # Collapse runs of whitespace into a single space.
    dest = MULTI_WS_RE.sub(" ", dest)

    return dest
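

# -----------------------------------------------------------------------------
# Usage sketch (an illustration, not part of the original module): a minimal
# command-line entry point showing how transliterate() is expected to be
# called. It assumes load_table() can resolve the language name given on the
# command line to a table whose selected section provides a "map" of
# (source_token, destination_token) pairs sorted longest-first, as the loop
# above relies on. `script` is passed as None because it is currently ignored.
if __name__ == "__main__":
    import sys

    # Example invocation: python trans.py "<source text>" <language>
    source_text, language = sys.argv[1], sys.argv[2]
    print(transliterate(source_text, None, language))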