Browse Source

Use Camel ALA Romanizer with mini model.

scossu 1 year ago
parent
commit
e93e081d91
1 changed file with 21 additions and 19 deletions
  1. 21 19
      scriptshifter/hooks/arabic/camel_tools.py

+ 21 - 19
scriptshifter/hooks/arabic/camel_tools.py

@@ -1,28 +1,30 @@
-from camel_tools.utils.charmap import CharMapper
-from camel_tools.utils.dediac import dediac_ar
-from camel_tools.utils.normalize import (
-        normalize_unicode,
-        normalize_alef_maksura_ar,
-        normalize_alef_ar,
-        normalize_teh_marbuta_ar)
+from os import path
+from sys import path as syspath
 
+from scriptshifter import APP_ROOT
 from scriptshifter.exceptions import BREAK
 
 
-def s2r_post_config(ctx):
-    # Unicode normalization
-    src = normalize_unicode(ctx.src)
+CAMEL_DIR = path.join(path.dirname(APP_ROOT), "ext", "arabic_rom")
+MODULE_DIR = path.join(CAMEL_DIR, "src")
+MODEL_DIR = path.join(CAMEL_DIR, "models", "mle")
+MODEL_PATH = path.join(MODEL_DIR, "size1.0.tsv")
+
+syspath.append(MODULE_DIR)
 
-    # Orthographic normalization
-    src = normalize_alef_maksura_ar(src)
-    src = normalize_alef_ar(src)
-    src = normalize_teh_marbuta_ar(src)
 
-    # Dediacritization.
-    src = dediac_ar(src)
+def s2r_post_config(ctx):
+    from predict import mle_predict as mle
+    from predict import translit_rules as tr
+
+    loc_exceptional = tr.load_exceptional_spellings()
+    loc_mappings = tr.load_loc_mappings()
 
-    # Conversion proper.
-    ar2bw = CharMapper.builtin_mapper("ar2bw")
-    ctx.dest = ar2bw(src)
+    mle_model = mle.load_mle_model(mle_model_tsv=MODEL_PATH)
+    ctx.dest = mle.apply_mle_translit_simple_backoff(
+            ctx.src,
+            mle_model,
+            loc_mappings,
+            loc_exceptional)
 
     return BREAK