@@ -0,0 +1,76 @@
+import logging
+
+from transliterator.tables import load_table
+
+
+logger = logging.getLogger(__name__)
+
+
+def transliterate(src, script, lang, s2r=True):
+    """
+    Transliterate a single string.
+
+    Args:
+        src (str): Source string.
+
+        script (str): Name of the script that the language is written in.
+
+        lang (str): Language name.
+
+    Keyword args:
+        s2r (bool): If True (the default), the source is assumed to be text
+            in the given language and script, and the output is its
+            romanization; if False, the source is assumed to be romanized
+            text to be transliterated into the given script and language.
+
+    Returns:
+        str: The transliterated string.
+    """
+    # TODO: script is ignored at the moment.
+    cfg = load_table(lang)
+    # General directives.
+    # general_dir = cfg.get("directives", {})
+
+    # We could be clever here but let's give the users a precise message.
+    if s2r and "script_to_roman" not in cfg:
+        raise NotImplementedError(
+            f"Script-to-Roman transliteration not yet supported for {lang}."
+        )
+    elif not s2r and "roman_to_script" not in cfg:
+        raise NotImplementedError(
+            f"Roman-to-script transliteration not yet supported for {lang}."
+        )
+
+    langsec = cfg["script_to_roman"] if s2r else cfg["roman_to_script"]
+    langsec_dir = langsec.get("directives", {})
+
+    i = 0
+    dest_ls = []
+    # Loop through source characters. The increment of each loop depends on
+    # the length of the token that eventually matches.
+    while i < len(src):
+        match = False
+        for src_tk, dest_tk in langsec["map"]:
+            # Longer tokens should be guaranteed to be scanned before their
+            # substrings at this point.
+            step = len(src_tk)
+            if src_tk == src[i:i + step]:
+                # A match is found. Stop scanning tokens, append result, and
+                # proceed scanning the source.
+                dest_ls.append(dest_tk)
+                match = True
+                i += step
+                break
+        if not match:
+            # Copy non-mapped character (one at a time).
+            logger.info(f"Token {src[i]} at position {i} is not mapped.")
+            dest_ls.append(src[i])
+            i += 1
+
+    if langsec_dir.get("capitalize", False):
+        dest_ls[0] = dest_ls[0].capitalize()
+
+    logger.info(f"Output list: {dest_ls}")
+    dest = "".join(dest_ls)
+
+    return dest
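
For context, a minimal sketch of the table shape this function expects from load_table, inferred only from the keys read above ("script_to_roman" / "roman_to_script", each with an optional "directives" mapping and a "map" of token pairs). The serialized table format is not part of this hunk, so example_cfg and the Cyrillic sample pairs below are purely illustrative.

    # Hypothetical in-memory shape of a loaded table; real tables may carry
    # more keys and different token pairs.
    example_cfg = {
        "script_to_roman": {
            "directives": {"capitalize": False},
            "map": [
                ("щ", "šč"),
                ("ш", "š"),
            ],
        },
        "roman_to_script": {
            "directives": {},
            # Longer tokens must precede their substrings ("šč" before "š")
            # so the greedy scan in transliterate() matches them first.
            "map": [
                ("šč", "щ"),
                ("š", "ш"),
            ],
        },
    }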
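
A usage sketch under the same assumptions; the import path and the language/script names are placeholders, since neither the path of the file added by this hunk nor the available tables are shown here.

    # Placeholder import path and table names; adjust to the module and
    # tables that actually exist.
    from transliterator.trans import transliterate

    # Script-to-Roman, the default direction (s2r=True). Characters with no
    # mapping in the table are copied through unchanged and logged.
    romanized = transliterate("щука", script="Cyrillic", lang="russian")

    # Roman-to-script; raises NotImplementedError if the loaded table has
    # no "roman_to_script" section.
    original = transliterate("ščuka", script="Cyrillic", lang="russian", s2r=False)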