scossu
/
scriptshifter


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
							import logging
import re

from transliterator.exceptions import BREAK, CONT
from transliterator.tables import load_table


# Match multiple spaces.
MULTI_WS_RE = re.compile(r"\s{2,}")
# Default characters defining a word boundary. TODO Make this configurable
# per-table.
WORD_BOUNDARY = " \n\t:;.,\"'"

# Cursor flags.
CUR_BOW = 1
CUR_EOW = 2

logger = logging.getLogger(__name__)


class Context:
    """
    Context used within the transliteration and passed to hook functions.
    """
    def __init__(self, src, general, langsec):
        """
        Initialize a context.

        Args:
            src (str): The original text. This is meant to never change.
            general (dict): general section of the current config.
            langsec (dict): Language configuration section being used.
        """
        self.src = src
        self.general = general
        self.langsec = langsec
        self.dest_ls = []


def transliterate(src, lang, r2s=False):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.

        lang (str): Language name.

    Keyword args:
        r2s (bool): If False (the default), the source is considered to be a
        non-latin script in the language and script specified, and the output
        the Romanization thereof; if True, the source is considered to be
        romanized text to be transliterated into the specified script/language.

    Return:
        str: The transliterated string.
    """
    source_str = "Latin" if r2s else lang
    target_str = lang if r2s else "Latin"
    logger.info(f"Transliteration is from {source_str} to {target_str}.")

    cfg = load_table(lang)
    logger.info(f"Loaded table for {lang}.")

    # General directives.
    general = cfg.get("general", {})

    if not r2s and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif r2s and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if not r2s else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})
    langsec_hooks = langsec.get("hooks", {})

    ctx = Context(src, general, langsec)

    # This hook may take over the whole transliteration process or delegate it
    # to some external process, and return the output string directly.
    if _run_hook("post_config", ctx, langsec_hooks) == BREAK:
        return getattr(ctx, "dest", "")

    # Loop through source characters. The increment of each loop depends on
    # the length of the token that eventually matches.
    ignore_list = langsec.get("ignore", [])  # Only present in R2S
    ignore_ptn_list = langsec.get("ignore_ptn", [])  # Only present in R2S
    ctx.cur = 0
    while ctx.cur < len(src):
        # Reset cursor position flags.
        ctx.cur_flags = 0

        # Look for a word boundary and flag word beginning/end it if found.
        if ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY:
            # Beginning of word.
            logger.debug(f"Beginning of word at position {ctx.cur}.")
            ctx.cur_flags |= CUR_BOW
        if ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY:
            # End of word.
            logger.debug(f"End of word at position {ctx.cur}.")
            ctx.cur_flags |= CUR_EOW

        # This hook may skip the parsing of the current
        # token or exit the scanning loop altogether.
        hret = _run_hook("begin_input_token", ctx, langsec_hooks)
        if hret == BREAK:
            logger.debug("Breaking text scanning from hook signal.")
            break
        if hret == CONT:
            logger.debug("Skipping scanning iteration from hook signal.")
            continue

        # Check ignore lists. Find as many subsequent ignore tokens
        # as possible before moving on to looking for match tokens.
        ctx.tk = None
        while True:
            ctx.ignoring = False
            # Ignore patterns.
            for ctx.tk in ignore_ptn_list:
                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                step = len(ctx.tk)
                # FIXME This is an issue if we want to specify
                # beginning-of-word matches, as we aren't reading the
                # previous token and we only know that from the CUR_BOW.
                # Which means we would have to analyze the regexp to find if
                # it's looking for BOW. Messy.
                match = re.match(src[ctx.cur:])
                if match:
                    # The position matches an ignore token.
                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
                    if hret == BREAK:
                        break
                    if hret == CONT:
                        continue

                    logger.info(f"Ignored token: {ctx.tk}")
                    ctx.dest_ls.append(ctx.tk)
                    ctx.cur += step
                    ctx.ignoring = True
                    break

            # Ignore plain strings.
            for ctx.tk in ignore_list:
                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                step = len(ctx.tk)
                if ctx.tk == src[ctx.cur:ctx.cur + step]:
                    # The position matches an ignore token.
                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
                    if hret == BREAK:
                        break
                    if hret == CONT:
                        continue

                    logger.info(f"Ignored token: {ctx.tk}")
                    ctx.dest_ls.append(ctx.tk)
                    ctx.cur += step
                    ctx.ignoring = True
                    break
            # We looked through all ignore tokens, not found any. Move on.
            if not ctx.ignoring:
                break
            # Otherwise, if we found a match, check if the next position may be
            # ignored as well.

        delattr(ctx, "tk")
        delattr(ctx, "ignoring")

        # Begin transliteration token lookup.
        ctx.match = False
        for ctx.src_tk, ctx.dest_tk in langsec["map"]:
            hret = _run_hook("pre_tx_token", ctx, langsec_hooks)
            if hret == BREAK:
                break
            if hret == CONT:
                continue

            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            step = len(ctx.src_tk)
            if ctx.src_tk == src[ctx.cur:ctx.cur + step]:
                ctx.match = True
                # This hook may skip this token or break out of the token
                # lookup for the current position.
                hret = _run_hook("on_tx_token_match", ctx, langsec_hooks)
                if hret == BREAK:
                    break
                if hret == CONT:
                    continue

                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.
                ctx.dest_ls.append(ctx.dest_tk)
                ctx.cur += step
                break

        if ctx.match is False:
            delattr(ctx, "match")
            hret = _run_hook("on_no_tx_token_match", ctx, langsec_hooks)
            if hret == BREAK:
                break
            if hret == CONT:
                continue

            # No match found. Copy non-mapped character (one at a time).
            logger.info(
                    f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]})"
                    f"at position {ctx.cur} is not mapped.")
            ctx.dest_ls.append(src[ctx.cur])
            ctx.cur += 1
        else:
            delattr(ctx, "match")
        delattr(ctx, "cur_flags")

    delattr(ctx, "cur")

    # This hook may take care of the assembly and cause the function to return
    # its own return value.
    hret = _run_hook("pre_assembly", ctx, langsec_hooks)
    if hret is not None:
        return hret

    if langsec_dir.get("capitalize", False):
        ctx.dest_ls[0] = ctx.dest_ls[0].capitalize()

    logger.debug(f"Output list: {ctx.dest_ls}")
    ctx.dest = "".join(ctx.dest_ls)

    # This hook may reassign the output string and/or cause the function to
    # return it immediately.
    hret = _run_hook("post_assembly", ctx, langsec_hooks)
    if hret == "ret":
        return ctx.dest

    # Strip multiple spaces and leading/trailing whitespace.
    ctx.dest = re.sub(MULTI_WS_RE, ' ', ctx.dest.strip())

    return ctx.dest


def _run_hook(hname, ctx, hooks):
    ret = None
    for hook_def in hooks.get(hname, []):
        kwargs = hook_def[1] if len(hook_def) > 1 else {}
        ret = hook_def[0](ctx, **kwargs)
        if ret in (BREAK, CONT):
            # This will stop parsing hooks functions and tell the caller to
            # break out of the outer loop or skip iteration.
            return ret

    return ret