
Merge pull request #141 from lcnetdev/thai_word_splitting

Thai word splitting
Stefano Cossu 7 months ago
parent
commit
ac291353f0
5 changed files with 27 additions and 5 deletions
  1. requirements.txt (+1 -0)
  2. scriptshifter/hooks/asian_tokenizer/__init__.py (+8 -0)
  3. scriptshifter/tables/data/thai.yml (+3 -0)
  4. scriptshifter/tables/data/thai_alt.yml (+5 -0)
  5. scriptshifter/trans.py (+10 -5)

+ 1 - 0
requirements.txt

@@ -1,5 +1,6 @@
 # Core application dependencies.
 aksharamukha>=2.2,<3
+esupar>=1.7.5
 flask>=2.3,<3
 flask-cors>=4.0,<5
 python-dotenv>=1.0,<2

+ 8 - 0
scriptshifter/hooks/asian_tokenizer/__init__.py

@@ -0,0 +1,8 @@
+from esupar import load
+
+
+def s2r_tokenize(ctx, model):
+    nlp = load(model)
+    token_data = nlp(ctx.src)
+
+    ctx._src = " ".join(token_data.values[1])

+ 3 - 0
scriptshifter/tables/data/thai.yml

@@ -33,6 +33,9 @@ options:
 script_to_roman:
   hooks:
     post_config:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "KoichiYasuoka/roberta-base-thai-spm-upos"
       -
         - aksharamukha.romanizer.s2r_post_config
         - src_script: "Thai"
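
Each hook entry above is a two-item list: the dotted name of a hook function under scriptshifter.hooks, followed by a dict of keyword arguments. Entries presumably run in list order, so the tokenizer fires before the aksharamukha romanization step. A hypothetical dispatcher sketch (not scriptshifter's actual resolution code) of how one such entry could be invoked:

    from importlib import import_module

    def run_entry(ctx, entry):
        # entry = ["asian_tokenizer.s2r_tokenize",
        #          {"model": "KoichiYasuoka/roberta-base-thai-spm-upos"}]
        dotted_name, kwargs = entry
        mod_name, fn_name = dotted_name.rsplit(".", 1)
        fn = getattr(import_module(f"scriptshifter.hooks.{mod_name}"), fn_name)
        return fn(ctx, **kwargs)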

+ 5 - 0
scriptshifter/tables/data/thai_alt.yml

@@ -4,6 +4,11 @@ general:
   case_sensitive: false
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - asian_tokenizer.s2r_tokenize
+        - model: "th"
   map:
     # COMMON SPECIAL CHARACTERS
 

+ 10 - 5
scriptshifter/trans.py

@@ -120,11 +120,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
         if _run_hook("post_config", ctx) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
-        _normalize_src(ctx, get_lang_normalize(ctx.conn, ctx.lang_id))
-
-        if _run_hook("post_normalize", ctx) == BREAK:
+        # _normalize_src returns the result of the post_normalize hook.
+        if _normalize_src(
+                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
             return getattr(ctx, "dest", ""), ctx.warnings
 
+        logger.debug(f"Normalized source: {ctx.src}")
         lang_map = list(get_lang_map(ctx.conn, ctx.lang_id, ctx.t_dir))
 
         # Loop through source characters. The increment of each loop depends on
@@ -151,7 +152,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
             # token or exit the scanning loop altogether.
             hret = _run_hook("begin_input_token", ctx)
             if hret == BREAK:
-                logger.debug("Breaking text scanning from hook signal.")
+                Logger.debug("Breaking text scanning from hook signal.")
                 break
             if hret == CONT:
                 logger.debug("Skipping scanning iteration from hook signal.")
@@ -315,10 +316,14 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 def _normalize_src(ctx, norm_rules):
     """
     Normalize source text according to rules.
+
+    NOTE: this manipulates the protected source attribute so it may not
+    correspond to the originally provided source.
     """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
-    logger.debug(f"Normalized source: {ctx.src}")
+
+    return _run_hook("post_normalize", ctx)
 
 
 def _is_bow(cur, ctx, word_boundary):
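
The trans.py change folds the post_normalize hook call into _normalize_src, so the caller can normalize and bail out on BREAK in a single expression. A self-contained toy of that control-flow pattern (Ctx, run_hook, and the sentinel value are illustrative stand-ins, not scriptshifter's real objects):

    class Ctx:
        def __init__(self, src, hooks=None):
            self.src = src
            self.hooks = hooks or {}

    BREAK = "BREAK"  # stand-in for scriptshifter's hook sentinel

    def run_hook(name, ctx):
        # Run hooks registered under `name`; first non-None return wins.
        for fn in ctx.hooks.get(name, []):
            ret = fn(ctx)
            if ret is not None:
                return ret

    def normalize_src(ctx, norm_rules):
        for nk, nv in norm_rules.items():
            ctx.src = ctx.src.replace(nk, nv)
        # The hook now runs here; its result propagates to the caller,
        # which returns early on BREAK instead of re-running the hook.
        return run_hook("post_normalize", ctx)

    # A hook that aborts the pipeline right after normalization:
    ctx = Ctx("ABC", {"post_normalize": [lambda c: BREAK]})
    if normalize_src(ctx, {"A": "a"}) == BREAK:
        print("stopped early:", ctx.src)  # -> stopped early: aBC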