trans.py

import logging

from transliterator.tables import load_table

logger = logging.getLogger(__name__)


def transliterate(src, script, lang, s2r=True):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        script (str): Name of the script that the language is encoded in.
        lang (str): Language name.

    Keyword args:
        s2r (bool): If True (the default), the source is considered to be
            non-Latin text in the specified language and script, and the
            output is the Romanization thereof; if False, the source is
            considered to be Romanized text to be transliterated into the
            specified script/language.

    Returns:
        str: The transliterated string.
    """
    # TODO: script is ignored at the moment.
    cfg = load_table(lang)

    # General directives.
    # general_dir = cfg.get("directives", {})

    # We could be clever here but let's give the users a precise message.
    if s2r and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif not s2r and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})
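
    # The code below relies on the table section having (at least) this shape.
    # The exact contents are defined by transliterator.tables; the example is
    # an illustrative sketch with placeholder tokens, not an actual table:
    #
    #   {
    #       "directives": {"capitalize": True},
    #       "map": [
    #           ["aa", "X"],   # longer source tokens must be listed before
    #           ["a", "Y"],    # their substrings for the greedy scan below
    #       ],
    #   }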
    i = 0
    dest_ls = []

    # Loop through source characters. The increment of each loop depends on
    # the length of the token that eventually matches.
    while i < len(src):
        match = False
        for src_tk, dest_tk in langsec["map"]:
            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            step = len(src_tk)
            if src_tk == src[i:i + step]:
                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.
                dest_ls.append(dest_tk)
                match = True
                i += step
                break
        if not match:
            # Copy non-mapped character (one at a time).
            logger.info(f"Token {src[i]} at position {i} is not mapped.")
            dest_ls.append(src[i])
            i += 1

    # Guard against an empty source string before applying the directive.
    if dest_ls and langsec_dir.get("capitalize", False):
        dest_ls[0] = dest_ls[0].capitalize()

    logger.info(f"Output list: {dest_ls}")
    dest = "".join(dest_ls)

    return dest
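

# A minimal usage sketch, not part of the original module. The language name
# and source text below are hypothetical placeholders; a real call needs a
# language for which transliterator.tables provides a table with the
# requested "script_to_roman" or "roman_to_script" section.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    try:
        print(transliterate("пример", script="Cyrillic", lang="russian"))
    except NotImplementedError as err:
        # Raised above when the loaded table lacks the requested direction.
        print(err)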