Explorar el Código

WIP Camel Tools setup.

scossu hace 1 año
padre
commit
9ee61432af

+ 28 - 0
scriptshifter/hooks/arabic/camel_tools.py

@@ -0,0 +1,28 @@
+from camel_tools.utils.charmap import CharMapper
+from camel_tools.utils.dediac import dediac_ar
+from camel_tools.utils.normalize import (
+        normalize_unicode,
+        normalize_alef_maksura_ar,
+        normalize_alef_ar,
+        normalize_teh_marbuta_ar)
+
+from scriptshifter.exceptions import BREAK
+
+
+def s2r_post_config(ctx):  # post_config hook: romanize ctx.src into ctx.dest
+    # Unicode normalization of the source text (camel_tools normalize_unicode).
+    src = normalize_unicode(ctx.src)
+
+    # Orthographic normalization: alef maksura, alef, teh marbuta, in order.
+    src = normalize_alef_maksura_ar(src)
+    src = normalize_alef_ar(src)
+    src = normalize_teh_marbuta_ar(src)
+
+    # Dediacritization (camel_tools dediac_ar), done before character mapping.
+    src = dediac_ar(src)
+
+    # Conversion proper: built-in "ar2bw" map (presumably Buckwalter; confirm).
+    ar2bw = CharMapper.builtin_mapper("ar2bw")
+    ctx.dest = ar2bw(src)
+
+    return BREAK  # NOTE(review): presumably stops further processing; confirm

+ 0 - 22
scriptshifter/hooks/arabic_ext.py

@@ -1,22 +0,0 @@
-import logging
-
-# This requires ArabicTransliterator to be installed as a package.
-from arabic.ArabicTransliterator import ALA_LC_Transliterator as Trans
-from mishkal.tashkeel.tashkeel import TashkeelClass
-
-from scriptshifter.exceptions import BREAK
-
-
-__doc__ = """ Integrate external ArabicTransliterator library. """
-
-
-logger = logging.getLogger(__name__)
-
-
-def s2r_post_config(ctx):
-    trans = Trans()
-    vocalizer = TashkeelClass()
-    voc = vocalizer.tashkeel(ctx.src)
-    ctx.dest = trans.do(voc.strip())
-
-    return BREAK

+ 2 - 2
scriptshifter/tables/data/arabic_ext.yml → scriptshifter/tables/data/arabic.yml

@@ -2,11 +2,11 @@
 # https://github.com/MTG/ArabicTransliterator
 
 general:
-  name: Arabic (ArabicTransliterator)
+  name: Arabic
   description: Arabic S2R using a 3rd party library.
 
 script_to_roman:
   hooks:
     post_config:
       -
-        - arabic_ext.s2r_post_config
+        - arabic.camel_tools.s2r_post_config

+ 1 - 1
scriptshifter/tables/data/index.yml

@@ -11,7 +11,7 @@ abkhaz_cyrillic:
   name: Abkhaz (Cyrillic)
 altai_cyrillic:
   name: Altai (Cyrillic)
-arabic_ext:
+arabic:
   name: Arabic (S2R)
   description: Arabic-to-Roman transliterator using the ArabicTransliterator external library.
 armenian: