Browse Source

WIP Persian.

scossu 4 months ago
parent
commit
bb957fbd9c

+ 29 - 0
scriptshifter/hooks/arabic/arabic_romanizer.py

@@ -14,6 +14,35 @@ MODEL_PATH = path.join(MODEL_DIR, "size1.0.tsv")
 syspath.append(MODULE_DIR)
 
 
+# From https://en.wikipedia.org/wiki/Arabic_script_in_Unicode:
+#
+# As of Unicode 16.0, the Arabic script is contained in the following blocks:
+#
+# - Arabic (0600–06FF, 256 characters)
+# - Arabic Supplement (0750–077F, 48 characters)
+# - Arabic Extended-B (0870–089F, 42 characters)
+# - Arabic Extended-A (08A0–08FF, 96 characters)
+# - Arabic Presentation Forms-A (FB50–FDFF, 631 characters)
+# - Arabic Presentation Forms-B (FE70–FEFF, 141 characters)
+# - Rumi Numeral Symbols (10E60–10E7F, 31 characters)
+# - Arabic Extended-C (10EC0-10EFF, 7 characters)
+# - Indic Siyaq Numbers (1EC70–1ECBF, 68 characters)
+# - Ottoman Siyaq Numbers (1ED00–1ED4F, 61 characters)
+# - Arabic Mathematical Alphabetic Symbols (1EE00–1EEFF, 143 characters)
+#
+# Each entry below is a (first, last) pair of single-character strings, listed
+# in code point order, one pair per block above.
+#
+# Code points above U+FFFF must use the 8-digit "\UXXXXXXXX" escape: "\u"
+# consumes exactly four hex digits, so e.g. "\u10E60" would be the
+# two-character string U+10E6 followed by "0", not the single code point
+# U+10E60.
+ARABIC_UNICODE_BLOCKS = (
+    ("\u0600", "\u06FF"),
+    ("\u0750", "\u077F"),
+    ("\u0870", "\u089F"),
+    ("\u08A0", "\u08FF"),
+    ("\uFB50", "\uFDFF"),
+    ("\uFE70", "\uFEFF"),
+    ("\U00010E60", "\U00010E7F"),
+    ("\U00010EC0", "\U00010EFF"),
+    ("\U0001EC70", "\U0001ECBF"),
+    ("\U0001ED00", "\U0001ED4F"),
+    ("\U0001EE00", "\U0001EEFF"),
+)
+
+
 def s2r_post_config(ctx):
     from predict import mle_predict as mle
     from predict import translit_rules as tr

+ 15 - 0
scriptshifter/hooks/persian/__init__.py

@@ -0,0 +1,15 @@
+# @package ext
+
+__doc__ = """
+Persian transliteration hooks.
+
+conforms to https://www.loc.gov/catdir/cpso/romanization/persian.pdf
+"""
+
+def s2r_on_tx_token_match(ctx):
+    """
+    Apply special substitution rules according to ALA-LC spec.
+
+    Work in progress: no substitution is performed yet. The only attribute of
+    ``ctx`` read here is ``token``, and the function always returns ``None``.
+    """
+    if ctx.token == "ا":
+        # Note 1
+        # TODO: the alif rule is not implemented yet. The explicit ``pass``
+        # gives this branch a body so that the module can be imported.
+        pass

+ 49 - 0
scriptshifter/tables/data/persian.yml

@@ -309,3 +309,52 @@ roman_to_script:
     # hamza (alone in final position)
     "%\u02BE": "\u0621"
     "%\u02BC": "\u0621"
+
+script_to_roman:
+  map:
+    # Copy & paste from
+    # https://www.loc.gov/catdir/cpso/romanization/persian.pdf
+    #
+    # Keys are Arabic-script code points (or sequences of them); values are
+    # their romanized form.
+    "\u0627": ""  # ا
+    "\u0628": "b"  # ب
+    "\u067E": "p"  # پ
+    "\u062A": "t"  # ت
+    # NOTE(review): same output as س (\u0633) below. The source table may mark
+    # this letter with a diacritic that was lost in copy & paste — confirm.
+    "\u062B": "s"  # ث
+    "\u062C": "j"  # ج
+    "\u0686": "ch"  # چ
+    "\u062D": "\u1E25"  # ح
+    "\u062E": "kh"  # خ
+    "\u062F": "d"  # د
+    # NOTE(review): same output as ز (\u0632) below. The source table may mark
+    # this letter with a diacritic that was lost in copy & paste — confirm.
+    "\u0630": "z"  # ذ
+    "\u0631": "r"  # ر
+    "\u0632": "z"  # ز
+    "\u0698": "zh"  # ژ
+    "\u0633": "s"  # س
+    "\u0634": "sh"  # ش
+    "\u0635": "\u1E63"  # ص
+    "\u0636": "z̤"  # ض
+    "\u0637": "\u1E6D"  # ط
+    "\u0638": "\u1E93"  # ظ
+    # NOTE(review): \u2018 is LEFT SINGLE QUOTATION MARK. The hamza entries in
+    # roman_to_script use modifier letters (\u02BE, \u02BC); confirm whether a
+    # modifier letter such as \u02BB is intended here as well.
+    "\u0639": "\u2018"  # ع
+    "\u063A": "gh"  # غ
+    "\u0641": "f"  # ف
+    "\u0642": "q"  # ق
+    "\u0643": "k"  # ك
+    # Persian code point for kāf; romanized like the Arabic form above.
+    "\u06A9": "k"  # ک
+    "\u06AF": "g"  # گ
+    "\u0644": "l"  # ل
+    "\u0645": "m"  # م
+    "\u0646": "n"  # ن
+    "\u0648": "v"  # و
+    "\u0647": "h"  # ه
+    "\u0629": "h"  # ة
+    "\u064A": "y"  # ي
+    # Persian code point for yā; romanized like the Arabic form above.
+    "\u06CC": "y"  # ی
+
+    # Vowels and diphthongs
+    # Multi-character keys list the code points in text order: the short
+    # vowel mark first, then the letter, then the sukūn (\u0652) if any.
+    "\u064E": "a"               # ◌َ
+    "\u0622": "\u0101"          # آ
+    "\u064E\u0627": "\u0101"    # ◌َا
+    "\u0650": "i"               # ◌ِ
+    "\u0650\u0649": "\u012B"    # ◌ِى
+    "\u064F": "u"               # ◌ُ
+    "\u064E\u0648\u0652": "aw"  # ◌َوْ
+    "\u064F\u0648": "\u016B"    # ◌ُو
+    "\u064E\u0649\u0652": "ay"  # ◌َىْ