فهرست منبع

Merge pull request #155 from lcnetdev/arabic_r2s

added arabic r2s tables from transliterator
Matt Miller 5 ماه پیش
والد
کامیت
91a76dfc95
2فایلهای تغییر یافته به همراه377 افزوده شده و 2 حذف شده
  1. 376 1
      scriptshifter/tables/data/arabic.yml
  2. 1 1
      scriptshifter/tables/index.yml

+ 376 - 1
scriptshifter/tables/data/arabic.yml

@@ -4,9 +4,384 @@
 ---
 general:
   name: Arabic
-  description: Arabic S2R using a 3rd party library.
+  description: Arabic R2S using a conversion table and S2R using a 3rd party library.
   case_sensitive: false
 
+  parents:
+    - _ignore_base
+
+
+roman_to_script:
+  map:
+
+    # Original table by David Bucknum
+    # Last updated 25 January 2019
+    # Modified by WK with testing by Arabic Cat Staff LOC-CAIRO 
+    # Additional info from R. Vassie, [n.d.] "Marrying the Arabic and Latin Scripts Conceptually" 
+
+
+    # Punctuation marks:    
+    "*": "\u066D"
+    ",": "\u060C"
+    ";": "\u061B"
+    "?": "\u061F"
+
+    # Exceptions for specific words 
+    # Allah
+    "Alla\u0304h": "\u0627\u0644\u0644\u0647"
+
+    # Qur'an
+    "Qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646"
+
+    # lillah
+    "lilla\u0304h": "\u0644\u0644\u0647"
+
+    # billah
+    "billa\u0304h": "\u0628\u0644\u0644\u0647"
+
+    # Rahman
+    "Rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646"
+
+    # Ruwat
+    "Ruwa\u0304t": "\u0631\u0648\u0627\u0629"
+    "ruwa\u0304t": "\u0631\u0648\u0627\u0629"
+
+    # Hadha
+    "Ha\u0304dha\u0304": "\u0647\u0630\u0627"
+    "ha\u0304dha\u0304": "\u0647\u0630\u0627"
+
+    # Hadhihi
+    "Ha\u0304dhi\u0304hi": "\u0647\u0630\u0647"
+    "ha\u0304dhi\u0304hi": "\u0647\u0630\u0647"
+
+    # dhalika
+    "dha\u0304lika": "\u0630\u0644\u0643"
+
+    # Ibn when it appears in the middle of a name sequence
+    "ibn": "\u0628\u0646"
+
+    # H[dot below]aya[macron]t
+    "h\u0323aya\u0304t": "\u062D\u064A\u0627\u0629"
+    "H\u0323aya\u0304t": "\u062D\u064A\u0627\u0629"
+
+    # "sh[dot below] as in "Ishaq"
+
+    "%sh\u0323%": "\u0633\u062D"
+
+    # "s[prime]h" combos
+
+    "%s\u02B9h%": "\u0633\u0647"
+
+    # "th[dot below]"
+
+    "%th\u0323%": "\u062A\u062D"
+
+    # dh[dot under] 
+
+    "%dh\u0323%": "\u062F\u062D"
+
+    # La-hu
+
+    "la-hu": "\u0644\u0647"
+
+    # Mi'ah
+    "Mi\u02BEah": "\u0645\u0627\u0626\u0629"
+    "Mi\u02BCah": "\u0645\u0627\u0626\u0629"
+    "mi\u02BEah": "\u0645\u0627\u0626\u0629"
+    "mi\u02BCah": "\u0645\u0627\u0626\u0629"
+
+    # Mi'at
+    "Mi\u02BEat": "\u0645\u0627\u0626\u0629"
+    "Mi\u02BCat": "\u0645\u0627\u0626\u0629"
+    "mi\u02BEat": "\u0645\u0627\u0626\u0629"
+    "mi\u02BCat": "\u0645\u0627\u0626\u0629"
+
+    # Numbers (I have set these to Hindi numbers. Note that Persian and Urdu will technically use \u06F0-06F9. This needs further discussion with PSD as RLIN21 used Hindi numbers, Connexion and Voyager does not.)
+
+    # Edition statements with Latin number
+    "al-T\u0323ab\u02BBah 1": "\u0627\u0644\u0637\u0628\u0639\u0629 1"
+    "al-T\u0323ab\u02BBah 2": "\u0627\u0644\u0637\u0628\u0639\u0629 2"
+    "al-T\u0323ab\u02BBah 3": "\u0627\u0644\u0637\u0628\u0639\u0629 3"
+    "al-T\u0323ab\u02BBah 4": "\u0627\u0644\u0637\u0628\u0639\u0629 4"
+    "al-T\u0323ab\u02BBah 5": "\u0627\u0644\u0637\u0628\u0639\u0629 5"
+    "al-T\u0323ab\u02BBah 6": "\u0627\u0644\u0637\u0628\u0639\u0629 6"
+    "al-T\u0323ab\u02BBah 7": "\u0627\u0644\u0637\u0628\u0639\u0629 7"
+    "al-T\u0323ab\u02BBah 8": "\u0627\u0644\u0637\u0628\u0639\u0629 8"
+    "al-T\u0323ab\u02BBah 9": "\u0627\u0644\u0637\u0628\u0639\u0629 9"
+
+    # Use Basic Arabic-Indic \u0660-0669
+    "0": "\u0660"
+    "1": "\u0661"
+    "2": "\u0662"
+    "3": "\u0663"
+    "4": "\u0664"
+    "5": "\u0665"
+    "6": "\u0666"
+    "7": "\u0667"
+    "8": "\u0668"
+    "9": "\u0669"
+
+    # Hyphenated prefixes:
+    "wa-": "\u0648"
+    "bi-": "\u0628"
+    "al-": "\u0627\u0644"
+    "lil-": "\u0644\u0644"
+    "li-": "\u0644"
+    "la\u0304-": "\u0644"
+    "fi\u0304-": "\u0641\u064A"
+    "ka-": "\u0643"
+
+    # Vowels and vowel/consonant combinations
+    "%ah": "\u0629"
+    "%at": "\u0629"
+
+    #tanwin
+    "%an": "\u0627"
+
+    # ayn-alif combo
+    "%\u02BBa\u0304\u02BE": "\u0639\u0627\u0621"
+    "%\u02BBa\u0304\u02BC": "\u0639\u0627\u0621"
+
+    "\u02BBA\u0304": "\u0639\u0627"
+    "\u02BBa\u0304": "\u0639\u0627"
+
+    "\u02BBI\u0304": "\u0639\u064A"
+    "\u02BBi\u0304": "\u0639\u064A"
+
+    "\u02BBU\u0304": "\u0639\u0648"
+    "\u02BBu\u0304": "\u0639\u0648"
+    "\u02BBU": "\u0639"
+    "\u02BBu": "\u0639"
+
+    "\u02BBA%": "\u0639"
+    #"\u02BBa%": "\u0639"
+
+    # alif and hamzas for all occasions
+
+    # truncation necessary? It seems to work fine with. 
+
+    "%i\u0304\u02BEah": "\u064A\u0626\u0629"
+    "%i\u0304\u02BCah": "\u064A\u0626\u0629"
+
+    "%i\u0304\u02BEat": "\u064A\u0626\u0629"
+    "%i\u0304\u02BCat": "\u064A\u0626\u0629"
+
+    "%i\u02BEa\u0304": "\u0626\u0627"
+    "%i\u02BCa\u0304": "\u0626\u0627"
+
+    "%i\u02BE": "\u0626"
+    "%i\u02BC": "\u0626"
+    "a\u0304\u02BEa\u0304": "\u0627\u0621\u0627"
+    "a\u0304\u02BCa\u0304": "\u0627\u0621\u0627"
+     
+    "a\u02BE": "\u0623"
+    "a\u02BC": "\u0623"
+    "\u02BEi": "\u0626"
+    "\u02BCi": "\u0626"
+    "\u02BEa\u0304": "\u0622"
+    "\u02BCa\u0304": "\u0622"
+    "\u02BEa": "\u0623"
+    "\u02BCa": "\u0623"
+
+    "y\u02BCah": "\u064A\u0626\u0629"
+    "y\u02BEah": "\u064A\u0626\u0629"
+
+    "y\u02BCat": "\u064A\u0626\u0629"
+    "y\u02BEat": "\u064A\u0626\u0629"
+
+    # A
+
+    "a\u0304\u02BCi\u0304": "\u0627\u0626\u064A"
+    "a\u0304\u02BEi\u0304": "\u0627\u0626\u064A"
+
+    "a\u0304\u02BCi": "\u0627\u0626"
+    "a\u0304\u02BEi": "\u0627\u0626"
+    "a\u0304\u02BC": "\u0627\u0621"
+    "a\u0304\u02BE": "\u0627\u0621"
+    "A\u0304%": "\u0622"
+    "a\u0304%": "\u0622"
+    "A\u0304": "\u0627"
+    "a\u0304": "\u0627"
+
+    # These next two lines were intended to convert to alif-ayn when it is at the beginning of a word, definite or indefinine (i.e. al-a[ayn]ma[macron]l or [space]a[ayn]ma[macron]l" 
+    "A\u02BB%": "\u0623\u0639"
+    "a\u02BB%": "\u0623\u0639"
+    "a\u02BB": "\u0639"
+    "A\u0301": "\u0649"
+    "a\u0301": "\u0649"
+
+    "ayy": "\u064A"
+    "A%": "\u0623"
+    "a%": "\u0627"
+    "A": "\u0623"
+    "a": ""
+
+    # I - Capital I at beginning of word is usually alif hamzah-below.
+
+    "%i\u0304": "\u064A"
+    "i\u0304y": "\u064A"
+    "iy": "\u064A"
+    "I\u0304%": "\u0625\u064A"
+    "i\u0304": "\u064A"
+    "\u02BBI%": "\u0639"
+
+    #"i\u02BB": "\u0625\u0639"
+
+    "I\u02BE": "\u0627\u0626"
+    "I\u02BC": "\u0627\u0626"
+    "i\u02BE": "\u0626"
+    "i\u02BC": "\u0627\u0626"
+
+    "I%": "\u0625"
+    "i%": "\u0625"
+    "I": "\u0625"
+    "i": ""
+
+    # U 
+
+    "u\u0304\u02BE": "\u0624"
+    "u\u0304\u02BC": "\u0624"
+    "U\u0304w%": "\u0623\u0648"
+    "u\u0304w%": "\u0623\u0648"
+    "U\u0304%": "\u0623\u0648"
+    "u\u0304%": "\u0623\u0648"
+    "u\u0304w": "\u0648"
+    "u\u0304": "\u0648"
+    "u\u02BE": "\u0624"
+    "u\u02BC": "\u0624"
+
+    "U%": "\u0623"
+    "u%": "\u0623"
+    "U": "\u0623"
+    "u": ""
+
+    # Consonants, with tashdid added 
+
+    "B": "\u0628"
+    "bb": "\u0628"
+    "b": "\u0628"
+    "Th": "\u062B"
+    "thth": "\u062B"
+    "th": "\u062B"
+    "T\u0323": "\u0637"
+    "t\u0323t\u0323": "\u0637"
+    "t\u0323": "\u0637"
+    "T": "\u062A"
+    "tt": "\u062A"
+    "t": "\u062A"
+    "J": "\u062C"
+    "jj": "\u062C"
+    "j": "\u062C"
+    "H\u0323": "\u062D"
+    "h\u0323h\u0323": "\u062D"
+    "h\u0323": "\u062D"
+    "H": "\u0647"
+    "hh": "\u0647"
+    "h": "\u0647"
+    "Kh": "\u062E"
+    "khkh": "\u062E"
+    "kh": "\u062E"
+    "K": "\u0643"
+    "kk": "\u0643"
+    "k": "\u0643"
+    "Dh": "\u0630"
+    "dhdh": "\u0630"
+    "dh": "\u0630"
+    "D\u0323": "\u0636"
+    "d\u0323d\u0323": "\u0636"
+    "d\u0323": "\u0636"
+    "D": "\u062F"
+    "dd": "\u062F"
+    "d": "\u062F"
+    "R": "\u0631"
+    "rr": "\u0631"
+    "r": "\u0631"
+    "Z\u0323": "\u0638"
+    "z\u0323z\u0323": "\u0638"
+    "z\u0323": "\u0638"
+    "Z": "\u0632"
+    "zz": "\u0632"
+    "z": "\u0632"
+    "Sh": "\u0634"
+    "shsh": "\u0634"
+    "sh": "\u0634"
+    "S\u0323": "\u0635"
+    "s\u0323s\u0323": "\u0635"
+    "s\u0323": "\u0635"
+    "S": "\u0633"
+    "ss": "\u0633"
+    "s": "\u0633"
+    "Gh": "\u063A"
+    "ghgh": "\u063A"
+    "gh": "\u063A"
+    "F": "\u0641"
+    "ff": "\u0641"
+    "f": "\u0641"
+    "Q": "\u0642"
+    "qq": "\u0642"
+    "q": "\u0642"
+    "L": "\u0644"
+    "ll": "\u0644"
+    "l": "\u0644"
+    "M": "\u0645"
+    "mm": "\u0645"
+    "m": "\u0645"
+    "N": "\u0646"
+    "nn": "\u0646"
+    "n": "\u0646"
+    "W": "\u0648"
+    "ww": "\u0648"
+    "w": "\u0648"
+    "Y": "\u064A"
+    "yy": "\u064A"
+    "y": "\u064A"
+
+    # non-Arabic consonants:
+    "P": "\u067E"
+    "p": "\u067E"
+    "Ch": "\u0686"
+    "ch": "\u0686"
+    "V": "\u06A4"
+    "v": "\u06A4"
+    "G": "\u06AF"
+    "g": "\u06AF"
+
+    # Diacritic characters:
+    # ain (\u0639) - not transliterated alone:
+    "\u02BB": "\u0639"
+    # hamza - not romanized
+    # "\u0621"
+    # hamza (alone in final position)
+    "%\u02BE": "\u0621"
+    "%\u02BC": "\u0621"
+
+    # Do not know what, if anything, is needed here:
+    # tatweel:
+    # "\u0640"
+    # fathatan:
+    # "\u064B"
+    # dammatan:
+    # "\u064C"
+    # kasratan:
+    # "\u064D"
+    # fatha:
+    # "\u064E"
+    # damma:
+    # "\u064F"
+    # kasra:
+    # "\u0650"
+    # shadda:
+    # "\u0651"
+    # sukun:
+    # "\u0652"
+    # superscript alef:
+    # "\u0670"
+    # alef wasla
+    # "\u0671"
+
+
+
+
 script_to_roman:
   hooks:
     post_config:

+ 1 - 1
scriptshifter/tables/index.yml

@@ -19,7 +19,7 @@ arabic:
     Arabic-to-Roman transliterator using the ArabicTransliterator external
     library.
   marc_code: ara
-  name: Arabic (S2R)
+  name: Arabic
 armenian:
   marc_code: arm
   name: Armenian