Pārlūkot izejas kodu

WIP Add Mishkal vocalization for Persian.

scossu 3 mēneši atpakaļ
vecāks
revīzija
3fe068df03

+ 41 - 0
scriptshifter/hooks/arabic/normalize.py

@@ -0,0 +1,41 @@
+from camel_tools.utils.normalize import (
+    normalize_alef_ar,
+    normalize_alef_maksura_ar,
+    normalize_teh_marbuta_ar,
+    normalize_unicode,
+)
+from mishkal.tashkeel import TashkeelClass
+
+from scriptshifter.exceptions import BREAK
+
+
+vocalizer = TashkeelClass()  # Shared instance, built once at import time.
+
+
+def camel_normalize(ctx):
+    """
+    Normalize complex Arabic characters.
+
+    Note: this doesn't normalize Persian (and probably other non-Arabic)
+    letters, e.g. ﻯ (U+FEEF) → ی (U+06CC)
+    """
+    norm = normalize_unicode(ctx.src)
+    norm = normalize_alef_ar(norm)
+    norm = normalize_alef_maksura_ar(norm)
+    norm = normalize_teh_marbuta_ar(norm)
+
+    ctx._src = norm
+
+
+def tashkeel_vocalize(ctx):
+    """
+    Vocalize Arabic text by adding implicit vowel marks.
+
+    Note that this changes the source text.
+    """
+    try:
+        ctx._src = vocalizer.tashkeel(ctx.src)
+    except Exception as e:
+        ctx.warnings.append(f"Error vocalizing text: {e}")
+
+        return BREAK

+ 13 - 4
scriptshifter/tables/data/persian.yml

@@ -311,9 +311,15 @@ roman_to_script:
     "%\u02BC": "\u0621"
 
 script_to_roman:
+  hooks:
+    post_normalize:
+      -
+        - arabic.normalize.camel_normalize
+      -
+        - arabic.normalize.tashkeel_vocalize
+
   map:
-    # Copy & paste from
-    # https://www.loc.gov/catdir/cpso/romanization/persian.pdf
+    # From https://www.loc.gov/catdir/cpso/romanization/persian.pdf
     "\u0627": ""  # ا
     "\u0628": "b"  # ب
     "\u067E": "p"  # پ
@@ -331,7 +337,7 @@ script_to_roman:
     "\u0633": "s"  # س
     "\u0634": "sh"  # ش
     "\u0635": "\u1E63"  # ص
-    "\u0636": "z̤"  # ض
+    "\u0636": "z\u0324"  # ض
     "\u0637": "\u1E6D"  # ط
     "\u0638": "\u1E93"  # ظ
     "\u0639": "\u2018"  # ع
@@ -346,7 +352,7 @@ script_to_roman:
     "\u0648": "v"  # و
     "\u0647": "h"  # ه
     "\u0629": "h"  # ة
-    "\u064A": "y"  # ي
+    "\u064A": "y"  # ي  NOTE: this looks wrong; this is the Arabic Yeh, not the Farsi Yeh.
 
     # Vowels and diphthongs
     "\u064E": "a"               # ◌َ
@@ -358,3 +364,6 @@ script_to_roman:
     "\u064E\u0652\u0648": "aw"  # ◌َوْ
     "\u064F\u0648": "\u016B"    # ◌ُو
     "\u064E\u0649\u0648": "ay"  # ◌َىْ
+
+    # Not in ALA-LC spec sheet
+    "\u06CC": "y"  # ی  Farsi Yeh