1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- import logging
- from transliterator.tables import load_table
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
def transliterate(src, script, lang, s2r=True):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        script (str): Name of the script that the language is encoded in.
            Currently unused; the mapping table is selected by ``lang`` alone.
        lang (str): Language name.

    Keyword args:
        s2r (bool): If True (the default), the source is considered to be a
            non-latin script in the language and script specified, and the
            output the Romanization thereof; if False, the source is
            considered to be romanized text to be transliterated into the
            specified script/language.

    Return:
        str: The transliterated string.

    Raises:
        NotImplementedError: If the language table does not define the
            requested transliteration direction.
    """
    # TODO script is ignored at the moment.
    cfg = load_table(lang)

    # General directives.
    # general_dir = cfg.get("directives", {})

    # We could be clever here but let's give the users a precise message.
    if s2r and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif not s2r and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})

    i = 0
    dest_ls = []
    # Loop through source characters. The increment of each loop depends on
    # the length of the token that eventually matches.
    while i < len(src):
        match = False
        for src_tk, dest_tk in langsec["map"]:
            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            step = len(src_tk)
            if src_tk == src[i:i + step]:
                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.
                dest_ls.append(dest_tk)
                match = True
                i += step
                break
        if not match:
            # Copy non-mapped character (one at a time). Lazy %-args so the
            # message is only formatted when INFO logging is enabled.
            logger.info("Token %s at position %d is not mapped.", src[i], i)
            dest_ls.append(src[i])
            i += 1

    # Fix: removed a stray `breakpoint()` debugging call left here, which
    # would drop any caller into pdb at runtime.
    # Guard against empty input: an empty `src` yields an empty `dest_ls`,
    # and `dest_ls[0]` would raise IndexError.
    if dest_ls and langsec_dir.get("capitalize", False):
        dest_ls[0] = dest_ls[0].capitalize()

    logger.info("Output list: %s", dest_ls)
    dest = "".join(dest_ls)

    return dest
|