import logging
import re

from transliterator.tables import load_table

# Match runs of two or more whitespace characters.
MULTI_WS_RE = re.compile(r"\s{2,}")

logger = logging.getLogger(__name__)
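
# NOTE (illustrative, inferred from how transliterate() reads the table):
# load_table() is expected to return a dict shaped roughly like the sketch
# below; the actual tables may carry additional fields.
#
#   {
#       "script_to_roman": {
#           "directives": {"capitalize": <bool>},
#           "map": [(<source token>, <destination token>), ...],
#       },
#       "roman_to_script": {
#           "directives": {...},
#           "ignore": [<token copied verbatim to the output>, ...],
#           "map": [...],
#       },
#   }
#
# In each "map", longer tokens must appear before their substrings so that
# the longest match wins.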


def transliterate(src, lang, s2r=True):
    """
    Transliterate a single string.

    Args:
        src (str): Source string.
        lang (str): Language name.

    Keyword args:
        s2r (bool): If True (the default), the source is assumed to be text
            in the non-Latin script of the given language, and the output is
            its Romanization; if False, the source is assumed to be Romanized
            text to be transliterated into the given script/language.

    Returns:
        str: The transliterated string.

    Raises:
        NotImplementedError: If transliteration in the requested direction is
            not supported for the given language.
    """
    source_str = lang if s2r else "Latin"
    target_str = "Latin" if s2r else lang
    logger.info(f"Transliteration is from {source_str} to {target_str}.")

    cfg = load_table(lang)
    logger.info(f"Loaded table for {lang}.")
    # General directives.
    # general_dir = cfg.get("directives", {})

    if s2r and "script_to_roman" not in cfg:
        raise NotImplementedError(
            f"Script-to-Roman transliteration not yet supported for {lang}."
        )
    elif not s2r and "roman_to_script" not in cfg:
        raise NotImplementedError(
            f"Roman-to-script transliteration not yet supported for {lang}."
        )

    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
    langsec_dir = langsec.get("directives", {})

    i = 0
    dest_ls = []
    # Loop through the source. Each iteration advances by the length of the
    # token that eventually matches.
    ignore_list = langsec.get("ignore", [])  # Only present in R2S.
    while i < len(src):
        # Check the ignore list first. Consume as many consecutive ignore
        # tokens as possible before looking for mapping tokens.
        while True:
            ignoring = False
            for tk in ignore_list:
                step = len(tk)
                if tk == src[i:i + step]:
                    logger.info(f"Ignored token: {tk}")
                    dest_ls.append(tk)
                    i += step
                    ignoring = True
                    break
            # We looked through all ignore tokens and found none. Move on.
            if not ignoring:
                break
        match = False
        for src_tk, dest_tk in langsec["map"]:
            # Longer tokens should be guaranteed to be scanned before their
            # substrings at this point.
            step = len(src_tk)
            if src_tk == src[i:i + step]:
                # A match is found. Stop scanning tokens, append result, and
                # proceed scanning the source.
                dest_ls.append(dest_tk)
                match = True
                i += step
                break

        if not match:
            # No match found. Copy non-mapped character (one at a time).
            logger.info(f"Token {src[i]} at position {i} is not mapped.")
            dest_ls.append(src[i])
            i += 1
    if dest_ls and langsec_dir.get("capitalize", False):
        dest_ls[0] = dest_ls[0].capitalize()

    logger.debug(f"Output list: {dest_ls}")
    dest = "".join(dest_ls)
    # Strip and collapse whitespace runs in the output.
    dest = MULTI_WS_RE.sub(" ", dest.strip())

    return dest
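

# --- Usage sketch (illustrative; not part of the original module) ---
# The block below demonstrates the greedy longest-match behavior with a toy,
# made-up table so it runs without any real language data. The table layout
# only mirrors what transliterate() reads above; the stand-in load_table()
# and the "toy" language name are assumptions for this demo, not part of the
# transliterator API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    _TOY_TABLE = {
        "script_to_roman": {
            "directives": {"capitalize": True},
            # The digraph is listed first so it wins over its single letters.
            "map": [
                ("ch", "č"),
                ("c", "k"),
                ("h", "x"),
                ("a", "a"),
                ("o", "o"),
            ],
        }
    }

    def load_table(lang):
        """Hypothetical stand-in; rebinding the module-level name makes
        transliterate() use the toy table instead of real language data."""
        return _TOY_TABLE

    # "ch" matches as a whole token, so this prints "Čao" rather than "Kxao".
    print(transliterate("chao", "toy"))
    # Unmapped characters are copied through and whitespace runs collapse:
    # this prints "Čao !".
    print(transliterate("chao   !", "toy"))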