|
@@ -14,6 +14,35 @@ MODEL_PATH = path.join(MODEL_DIR, "size1.0.tsv")
|
|
|
# Add MODULE_DIR to the import search path.
# NOTE(review): `syspath` is presumably an alias of `sys.path`, and this is
# presumably what lets the function-level `from predict import ...`
# statements further down resolve -- confirm against the file's imports.
syspath.append(MODULE_DIR)
|
|
|
|
|
|
|
|
|
+# From https://en.wikipedia.org/wiki/Arabic_script_in_Unicode:
|
|
|
+#
|
|
|
+# As of Unicode 16.0, the Arabic script is contained in the following blocks:
|
|
|
+#
|
|
|
+# - Arabic (0600–06FF, 256 characters)
|
|
|
+# - Arabic Supplement (0750–077F, 48 characters)
|
|
|
+# - Arabic Extended-B (0870–089F, 42 characters)
|
|
|
+# - Arabic Extended-A (08A0–08FF, 96 characters)
|
|
|
+# - Arabic Presentation Forms-A (FB50–FDFF, 631 characters)
|
|
|
+# - Arabic Presentation Forms-B (FE70–FEFF, 141 characters)
|
|
|
+# - Rumi Numeral Symbols (10E60–10E7F, 31 characters)
|
|
|
+# - Arabic Extended-C (10EC0–10EFF, 7 characters)
|
|
|
+# - Indic Siyaq Numbers (1EC70–1ECBF, 68 characters)
|
|
|
+# - Ottoman Siyaq Numbers (1ED00–1ED4F, 61 characters)
|
|
|
+# - Arabic Mathematical Alphabetic Symbols (1EE00–1EEFF, 143 characters)
|
|
|
# Inclusive (first, last) code point pairs, one per Unicode block that holds
# Arabic-script characters. Every entry is a pair of single-character strings,
# so a character `ch` belongs to a block when `first <= ch <= last`.
#
# Blocks above U+FFFF must use the 8-digit "\UXXXXXXXX" escape: "\u" consumes
# exactly four hex digits, so "\u10E60" would be the two-character string
# U+10E6 followed by "0", not the code point U+10E60.
ARABIC_UNICODE_BLOCKS = (
    ("\u0600", "\u06FF"),  # Arabic
    ("\u0750", "\u077F"),  # Arabic Supplement
    ("\u0870", "\u089F"),  # Arabic Extended-B
    ("\u08A0", "\u08FF"),  # Arabic Extended-A
    ("\uFB50", "\uFDFF"),  # Arabic Presentation Forms-A
    ("\uFE70", "\uFEFF"),  # Arabic Presentation Forms-B
    ("\U00010E60", "\U00010E7F"),  # Rumi Numeral Symbols
    ("\U00010EC0", "\U00010EFF"),  # Arabic Extended-C
    ("\U0001EC70", "\U0001ECBF"),  # Indic Siyaq Numbers
    ("\U0001ED00", "\U0001ED4F"),  # Ottoman Siyaq Numbers
    ("\U0001EE00", "\U0001EEFF"),  # Arabic Mathematical Alphabetic Symbols
)
|
|
|
+
|
|
|
+
|
|
|
def s2r_post_config(ctx):
|
|
|
from predict import mle_predict as mle
|
|
|
from predict import translit_rules as tr
|