# trans.py — transliteration engine (table-driven script <-> Latin conversion).
  1. import logging
  2. import re
  3. from transliterator.tables import load_table
  4. # Match multiple spaces.
  5. MULTI_WS_RE = re.compile(r"\s{2,}")
  6. logger = logging.getLogger(__name__)
  7. def transliterate(src, lang, s2r=True):
  8. """
  9. Transliterate a single string.
  10. Args:
  11. src (str): Source string.
  12. lang (str): Language name.
  13. Keyword args:
  14. s2r (bool): If True (the default), the source is considered to be a
  15. non-latin script in the language and script specified, and the output
  16. the Romanization thereof; if False, the source is considered to be
  17. romanized text to be transliterated into the specified script/language.
  18. Return:
  19. str: The transliterated string.
  20. """
  21. source_str = "Latin" if s2r else lang
  22. target_str = lang if s2r else "Latin"
  23. logger.info(f"Transliteration is from {source_str} to {target_str}.")
  24. cfg = load_table(lang)
  25. logger.info(f"Loaded table for {lang}.")
  26. # General directives.
  27. # general_dir = cfg.get("directives", {})
  28. if s2r and "script_to_roman" not in cfg:
  29. raise NotImplementedError(
  30. f"Script-to-Roman transliteration not yet supported for {lang}."
  31. )
  32. elif not s2r and "roman_to_script" not in cfg:
  33. raise NotImplementedError(
  34. f"Roman-to-script transliteration not yet supported for {lang}."
  35. )
  36. langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
  37. langsec_dir = langsec.get("directives", {})
  38. i = 0
  39. dest_ls = []
  40. # Loop through source characters. The increment of each loop depends on the
  41. # length of the token that eventually matches.
  42. ignore_list = langsec.get("ignore", []) # Only present in R2S
  43. while i < len(src):
  44. # Check ignore list first. Find as many subsequent ignore tokens
  45. # as possible before moving on to looking for match tokens.
  46. while True:
  47. ignoring = False
  48. for tk in ignore_list:
  49. step = len(tk)
  50. if tk == src[i:i + step]:
  51. logger.info(f"Ignored token: {tk}")
  52. dest_ls.append(tk)
  53. i += step
  54. ignoring = True
  55. break
  56. # We looked through all ignore tokens, not found any. Move on.
  57. if not ignoring:
  58. break
  59. match = False
  60. for src_tk, dest_tk in langsec["map"]:
  61. # Longer tokens should be guaranteed to be scanned before their
  62. # substrings at this point.
  63. step = len(src_tk)
  64. if src_tk == src[i:i + step]:
  65. # A match is found. Stop scanning tokens, append result, and
  66. # proceed scanning the source.
  67. dest_ls.append(dest_tk)
  68. match = True
  69. i += step
  70. break
  71. if not match:
  72. # No match found. Copy non-mapped character (one at a time).
  73. logger.info(f"Token {src[i]} at position {i} is not mapped.")
  74. dest_ls.append(src[i])
  75. i += 1
  76. if langsec_dir.get("capitalize", False):
  77. dest_ls[0] = dest_ls[0].capitalize()
  78. logger.debug(f"Output list: {dest_ls}")
  79. dest = "".join(dest_ls)
  80. dest = re.sub(MULTI_WS_RE, ' ', dest.strip())
  81. return dest