import logging

from transliterator.tables import load_table


logger = logging.getLogger(__name__)


def transliterate(src, script, lang, s2r=True):
    """
    Transliterate a single string.

    The source is scanned left to right. At each position the tokens of the
    selected mapping are tried in the order the table provides them, and the
    first one that matches is replaced by its counterpart. Characters that
    match no token are copied to the output unchanged.

    Args:
        src (str): Source string.

        script (str): Name of the script that the language is encoded in.
        Currently ignored.

        lang (str): Language name, used to load the transliteration table.

    Keyword args:
        s2r (bool): If True (the default), the source is considered to be a
        non-latin script in the language and script specified, and the output
        the Romanization thereof; if False, the source is considered to be
        romanized text to be transliterated into the specified
        script/language.

    Return:
        str: The transliterated string.

    Raises:
        NotImplementedError: If the table loaded for ``lang`` has no section
        for the requested direction.
    """
    # TODO script is ignored at the moment.
    cfg = load_table(lang)
    # General directives.
    # general_dir = cfg.get("directives", {})

    # We could be clever here but let's give the users a precise message.
    if s2r and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif not s2r and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})
    # Hoisted: the token map is invariant across the whole scan.
    token_map = langsec["map"]

    i = 0
    src_len = len(src)
    dest_ls = []
    # Loop through source characters. The increment of each loop depends on
    # the length of the token that eventually matches.
    while i < src_len:
        for src_tk, dest_tk in token_map:
            # An empty source token would match everywhere without advancing
            # the cursor, looping forever; skip it.
            if not src_tk:
                continue
            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            if src.startswith(src_tk, i):
                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.
                dest_ls.append(dest_tk)
                i += len(src_tk)
                break
        else:
            # No token matched: copy non-mapped character (one at a time).
            logger.info("Token %s at position %s is not mapped.", src[i], i)
            dest_ls.append(src[i])
            i += 1

    # Guard against an empty output list (e.g. empty source string) before
    # touching the first element.
    if dest_ls and langsec_dir.get("capitalize", False):
        dest_ls[0] = dest_ls[0].capitalize()

    logger.info("Output list: %s", dest_ls)

    return "".join(dest_ls)