فهرست منبع

Update Mongol bichig - test failing.

scossu 1 ماه پیش
والد
کامیت
d2909c4a14
2 فایلهای تغییر یافته به همراه 98 افزوده شده و 104 حذف شده
  1. 94 104
      scriptshifter/tables/data/mongolian_mongol_bichig.yml
  2. 4 0
      test/data/script_samples/mongolian.csv

+ 94 - 104
scriptshifter/tables/data/mongolian_mongol_bichig.yml

@@ -8,111 +8,104 @@ general:
 roman_to_script:
 
   map:
+    "\u0020latin": "\u0020\u182F\u1820\u1832\u180B\u1822\u1828"
+    "mate\u0307riyal": "\u182E\u1820\u1832\u180B\u1827\u1837\u1822\u1836\u1820\u182F"
     "\u002Daca": "\u202F\u1820\u1834\u1820"
-    "\u002DA": "\u180E\u1820"
     "\u002Da": "\u180E\u1820"
-    "A": "\u1820"
     "a": "\u1820"
     "\u002Dece": "\u202F\u1821\u1834\u1821"
-    "\u002DE": "\u180E\u1821"
     "\u002De": "\u180E\u1821"
     "\u002D": "\u202F"
-    "E\u0307": "\u1827"
     "e\u0307": "\u1827"
-    "E": "\u1821"
     "e": "\u1821"
-    "\u002DI": "\u180E\u1822"
-    "\u002Di": "\u180E\u1822"
-    "I": "\u1822"
     "i": "\u1822"
-    "O\u0307": "\u1825"
     "o\u0307": "\u1825"
-    "O": "\u1823"
     "o": "\u1823"
-    "U\u0307": "\u1826"
     "u\u0307": "\u1826"
-    "U": "\u1824"
     "u": "\u1824"
-    "NG": "\u1829"
-    "nG": "\u1829"
+    # Feminine g control when followed by l
+    "nggl": "\u1829\u182D\u180D\u182F"
+    "ng\u0307": "\u1828\u182D"
     "ng": "\u1829"
-    "N": "\u1828"
     "n": "\u1828"
-    "B": "\u182A"
     "b": "\u182A"
-    "P": "\u182B"
     "p": "\u182B"
-    "Q": "\u182C"
     "q": "\u182C"
-    "KH": "\u183B"
-    "Kh": "\u183B"
-    "kH": "\u183B"
     "kh": "\u183B"
-    "K\u0307": "\u183A"
     "k\u0307": "\u183A"
-    "K": "\u182C"
     "k": "\u182C"
-    "G\u0307": "\u182D"
     "g\u0307": "\u182D"
-    "G": "\u182D"
     "g": "\u182D"
-    "M": "\u182E"
     "m": "\u182E"
-    "LH": "\u1840"
-    "Lh": "\u1840"
-    "lH": "\u1840"
     "lh": "\u1840"
-    "L": "\u182F"
     "l": "\u182F"
-    "TS\u0307": "\u183C"
-    "Ts\u0307": "\u183C"
-    "tS\u0307": "\u183C"
     "ts\u0307": "\u183C"
-    "S\u0301": "\u1831"
     "s\u0301": "\u1831"
-    "S": "\u1830"
     "s": "\u1830"
-    "T": "\u1832"
+    "t'": "\u1832\u180B"
     "t": "\u1832"
-    "D": "\u1833"
+    "d'": "\u1833\u180B"
     "d": "\u1833"
     "J": "\u1835"
     "j": "\u1835"
-    "Y": "\u1836"
     "y": "\u1836"
-    "V": "\u1838"
     "v": "\u1838"
-    "W": "\u1838"
     "w": "\u1838"
-    "F": "\u1839"
     "f": "\u1839"
-    "ZR": "\u183F"
-    "Zr": "\u183F"
-    "zR": "\u183F"
     "zr": "\u183F"
-    "R": "\u1837"
     "r": "\u1837"
-    "ZH": "\u1841"
-    "Zh": "\u1841"
-    "zH": "\u1841"
     "zh": "\u1841"
-    "Z": "\u183D"
     "z": "\u183D"
-    "CH": "\u1842"
-    "Ch": "\u1842"
-    "cH": "\u1842"
-    "ch": "\u1842"
-    # this is a Buryat letter
-    "C\u0307": "\u1878"
-    "c\u0307": "\u1878"
-    "C": "\u1834"
+    "h\u0307": "\u1842"
     "c": "\u1834"
-    "H": "\u183E"
     "h": "\u183E"
-    "-": "\u180E"
+    # Double hyphen: kept in data
+    "\u002D\u002D": "\u002D\u002D"
+    # Mongolian ellipsis
+    "\u002E\u002E\u002E": "\u1801"
+    # Comma at end of subfield
+    "\u002C\u0020\u2021": "\u002C\u0020\u2021"
+    # Mongolian comma
+    "\u002C": "\u1802"
+    # Mongolian full stop
+    "\u002E\u002E": "\u1803"
+    # Mongolian four dots (chapter end)
+    "\u002B": "\u1805"
+    # Mongolian soft hyphen
+    "\u0020\u002D\u0020": "\u1806"
+    # Mongolian nirugu (letter extender added to initial ending in a full stop)
+    "\u002E\u0020": "\u180A\u0020"
+    # Mongolian Free Variation Separator One (FVS1) apostrophe used after t and d
+    "\u0027": "\u180B"
+    # Mongolian Free Variation Separator Two (FVS2) quotation mark used to force final alternate letter shape
+    "\u0022": "\u180C"
+    # Mongolian Free Variation Separator Three (FVS3) grave used to force intermediate alternate letter shape
+    "\u0060": "\u180D"
+    # Mongolian Vowel Separator (MVS) low line used as an unambiguous final vowel separator
+    "\u005F": "\u180E"
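+    # Narrow No-Break Space (NNBSP) hyphen used before Mongolian grammatical endings. NOTE(review): "\u002D" is already mapped to "\u202F" earlier in this map, so the entry below is a duplicate key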
+    "\u002D": "\u202F"
+    "\u003C\u003C": "\u300A"
+    "\u003E\u003E": "\u300B"
+    # Middle dot; asterisk used to separate parts of one person's name (clan * forename)
+    "\u002A": "\u00B7"
+    "0": "\u1810"
+    "1": "\u1811"
+    "2": "\u1812"
+    "3": "\u1813"
+    "4": "\u1814"
+    "5": "\u1815"
+    "6": "\u1816"
+    "7": "\u1817"
+    "8": "\u1818"
+    "9": "\u1819"
 
 script_to_roman:
   map:
+    # Middle dot; asterisk used to separate parts of one person's name (clan * forename)
+    "\u00B7": "\u002A"
+    "\u0020\u182F\u1820\u1832\u180B\u1822\u1828": "\u0020latin"
+    "\u182E\u1820\u1832\u180B\u1827\u1837\u1822\u1836\u1820\u182F": "mate\u0307riyal"
     # ga
     "\u182D\u1820": "g\u0307a"
     # go
@@ -137,8 +130,6 @@ script_to_roman:
     "\u182D\u1827": "ge\u0307"
     # eg
     "\u1821\u182D": "eg"
-    # ig
-    "\u1822\u182D": "ig"
     # oeg
     "\u1825\u182D": "o\u0307g"
     # ueg
@@ -146,17 +137,11 @@ script_to_roman:
     # eeg
     "\u1827\u182D": "e\u0307g"
     # qa
-    "\u182C\u1820": "q\u0307a"
+    "\u182C\u1820": "qa"
     # qo
-    "\u182C\u1823": "q\u0307o"
+    "\u182C\u1823": "qo"
     # qu
-    "\u182C\u1824": "q\u0307u"
-    # aq (should not occur)
-    "\u1820\u182C": "aq"
-    # oq (should not occur)
-    "\u1823\u182C": "oq"
-    # uq (should not occur)
-    "\u1824\u182C": "uq"
+    "\u182C\u1824": "qu"
     # ke
     "\u182C\u1821": "ke"
     # ki
@@ -167,37 +152,36 @@ script_to_roman:
     "\u182C\u1826": "ku\u0307"
     # kee
     "\u182C\u1827": "ke\u0307"
-    # ek (should not occur)
-    "\u1821\u182C": "ek"
-    # ik should not occur)
-    "\u1822\u182C": "ik"
-    # oek (should not occur)
-    "\u1825\u182C": "o\u0307k"
-    # uek (should not occur)
-    "\u1826\u182C": "o\u0307k"
-    # eek should not occur)
-    "\u1827\u182C": "e\u0307k"
-    # non-connecting vowel a
-    "\u180E\u1820": "\u002Da"
-    # non-connecting vowel e
-    "\u180E\u1821": "\u002De"
-    # non-connectubg vowel i
-    "\u180E\u1822": "\u002Di"
-    # Other Mongolian vowel separators to hyphen
-    "\u180E": "\u002De"
-    # Narrow no-break space to hyphen
-    "\u202F": "\u002D"
-    # Other Mongolian vowel NOT associated with g or k/q
-    "\u1801": "..."
-    "\u1802": ","
-    "\u1803": "."
-    "\u1804": ":"
-    "\u1805": "*"
-    "\u1806": "-"
+    # Double hyphen: kept in data
+    "\u002D\u002D": "\u002D\u002D"
+    # Mongolian ellipsis
+    "\u1801": "\u002E\u002E\u002E"
+    # Mongolian comma
+    "\u1802": "\u002C"
+    # Mongolian full stop
+    "\u1803": "\u002E\u002E"
+    # Mongolian colon
+    "\u1804": "\u003A"
+    # Mongolian four dots (chapter end)
+    "\u1805": "\u002B"
+    # Mongolian soft hyphen
+    "\u1806": "\u0020\u002D\u0020"
     "\u1807": "\u0020"
-    "\u1808": ","
-    "\u1809": "."
-    "\u180A": "-"
+    "\u1808": "\u002C"
+    "\u1809": "\u002E"
+    # Mongolian nirugu (letter extender to force initial form rather than isolated form in initials)
+    "\u180A\u0020": "\u002E\u0020"
+    # Mongolian Free Variation Separator One (FVS1) apostrophe used after t and d
+    "\u180B": "\u0027"
+    # Mongolian Free Variation Separator Two (FVS2) quotation mark used to force final alternate letter shape
+    "\u180C": "\u0022"
+    # Mongolian Free Variation Separator Three (FVS3) grave used to force intermediate alternate letter shape
+    "\u180D": "\u0060"
+    # Mongolian Vowel Separator (MVS) converts to hyphen as the vowel separator
+    "\u180E": "\u002D"
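+    # Mongolian Free Variation Separator Four (FVS4) [not currently used]. NOTE(review): the value below spells "[?[" - possibly intended "[?]" (\u005B\u003F\u005D); confirm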
+    "\u180F": "\u005B\u003F\u005B"
+    "\u180A": "."
     "\u1810": "0"
     "\u1811": "1"
     "\u1812": "2"
@@ -219,10 +203,12 @@ script_to_roman:
     "\u1827": "e\u0307"
     "\u1828": "n"
     "\u1829": "ng"
+    # Feminine g control when followed by l
+    "\u1829\u182D\u180D\u182F": "ngg`l"
     "\u182A": "b"
     "\u182B": "p"
     "\u182C": "q"
-    "\u182D": "g\u0307"
+    "\u182D": "g"
     "\u182E": "m"
     "\u182F": "l"
     "\u1830": "s"
@@ -239,9 +225,13 @@ script_to_roman:
     "\u183B": "kh"
     "\u183C": "ts\u0307"
     "\u183D": "z"
-    "\u183E": "h\u0307"
+    "\u183E": "h"
     "\u183F": "zr"
     "\u1840": "lh"
     "\u1841": "zh"
-    "\u1842": "ch"
-    "\u1878": "c\u0307"
+    "\u1842": "h\u0307"
+    # Narrow No-Break Space (NNBSP) converts to hyphen before Mongolian grammatical endings
+    "\u202F": "\u002D"
+    # Double angle brackets (U+300A, U+300B) convert to doubled less-than / greater-than signs
+    "\u300A": "\u003C\u003C"
+    "\u300B": "\u003E\u003E"

+ 4 - 0
test/data/script_samples/mongolian.csv

@@ -0,0 +1,4 @@
+mongolian_mongol_bichig,"ᠠ ᠠ ᠎ᠠ ᠎ᠠ ᠎ᠠ ᠎ᠠ ᠡ ᠡ ᠎ᠡ ᠎ᠡ ᠎ᠡ ᠎ᠡ ᠧ ᠧ ᠢ ᠢ ᠣ ᠣ ᠤ ᠤ ᠥ ᠥ ᠦ ᠦ ᠨ ᠨ ᠨᠭ ᠩ ᠨᠭ ᠨᠭ ᠬ ᠬ ᠭ ᠭ ᠭᠠ ᠭᠠ ᠭᠣ ᠭᠣ ᠭᠤ ᠭᠤ ᠭ ᠭ ᠭᠡ ᠭᠡ ᠭᠧ ᠭᠧ ᠭᠢ ᠭᠢ ᠭᠥ ᠭᠥ ᠭᠦ ᠭᠦ ᠪ ᠪ ᠫ ᠫ ᠹ ᠹ ᠰ ᠰ ᠱ ᠱ ᠲ ᠲ ᠼ ᠼ ᠳ ᠳ ᠯ ᠯ ᠮ ᠮ ᠴ ᠴ ᠽ ᠽ ᠵ ᠵ ᠶ ᠶ ᠬ ᠬ ᠬᠡ ᠬᠡ ᠬᠢ ᠬᠢ ᠬᠥ ᠬᠥ ᠬᠦ ᠬᠦ ᠺ ᠺ ᠻ ᠻ ᠷ ᠷ ᠸ ᠸ ᠸ ᠸ ᠾ ᠾ ᡂ ᡂ ᡀ ᡀ ᡁ ᡁ ᠿ ᠿ ᠐ ᠑ ᠒ ᠓ ᠔ ᠕ ᠖ ᠗ ᠘ ᠙","a a -a -a _a _a e e -e -e _e _e ė ė i i o o u u ȯ ȯ u̇ u̇  n n ng ng nġ nġ q q ġ ġ ga ga go go gu gu g g ge ge gė gė gi gi gȯ gȯ gu̇ gu̇ b b p p f f s s ś ś t t tṡ tṡ d d l l m m c c z z j j y y k k ke ke ki ki kȯ kȯ ku̇ ku̇ k̇ k̇ kh kh r r v v w w h h ḣ ḣ lh lh zh zh zr zr 0 1 2 3 4 5 6 7 8 9","s2r",
+mongolian_mongol_bichig,"A a -A -a _A _a E e -E -e _E _e Ė ė I i O o U u Ȯ ȯ U̇ u̇  N n Ng ng Nġ nġ Q q Ġ ġ Ga ga Go go Gu gu G g Ge ge Gė gė Gi gi Gȯ gȯ Gu̇ gu̇ B b P p F f S s Ś ś T t Tṡ tṡ D d L l M m C c Z z J j Y y K k Ke ke Ki ki Kȯ kȯ Ku̇ ku̇ K̇ k̇ Kh kh R r V v W w H h Ḣ ḣ Lh lh Zh zh Zr zr 0 1 2 3 4 5 6 7 8 9","ᠠ ᠠ ᠎ᠠ ᠎ᠠ ᠎ᠠ ᠎ᠠ ᠡ ᠡ ᠎ᠡ ᠎ᠡ ᠎ᠡ ᠎ᠡ ᠧ ᠧ ᠢ ᠢ ᠣ ᠣ ᠤ ᠤ ᠥ ᠥ ᠦ ᠦ ᠨ ᠨ ᠨᠭ ᠩ ᠨᠭ ᠨᠭ ᠬ ᠬ ᠭ ᠭ ᠭᠠ ᠭᠠ ᠭᠣ ᠭᠣ ᠭᠤ ᠭᠤ ᠭ ᠭ ᠭᠡ ᠭᠡ ᠭᠧ ᠭᠧ ᠭᠢ ᠭᠢ ᠭᠥ ᠭᠥ ᠭᠦ ᠭᠦ ᠪ ᠪ ᠫ ᠫ ᠹ ᠹ ᠰ ᠰ ᠱ ᠱ ᠲ ᠲ ᠼ ᠼ ᠳ ᠳ ᠯ ᠯ ᠮ ᠮ ᠴ ᠴ ᠽ ᠽ ᠵ ᠵ ᠶ ᠶ ᠬ ᠬ ᠬᠡ ᠬᠡ ᠬᠢ ᠬᠢ ᠬᠥ ᠬᠥ ᠬᠦ ᠬᠦ ᠺ ᠺ ᠻ ᠻ ᠷ ᠷ ᠸ ᠸ ᠸ ᠸ ᠾ ᠾ ᡂ ᡂ ᡀ ᡀ ᡁ ᡁ ᠿ ᠿ ᠐ ᠑ ᠒ ᠓ ᠔ ᠕ ᠖ ᠗ ᠘ ᠙","r2s",
+mongolian_mongol_bichig,"ᠰᠠᠷ᠎ᠠ ᠶᠢᠨ ᠨᠡᠷ᠎ᠡ ᠶᠢᠨ ᠰᠠᠷ᠎ᠠ ᠨᠡᠷ᠎ᠡ ᠪ᠊ ᠨᠠᠰᠤᠨᠪᠠᠲᠤ -- ᠳ᠋ᠣᠯᠯᠠᠷ ᠮᠠᠲ᠋ᠧᠷᠢᠶᠠᠯ ᠳᠡᠳ᠋ ᠡᠳ᠋ ᠠᠪᠢᠰᠢᠭ᠌ ᠠᠩᠭ᠍ᠯᠢ ᠨᠣᠮ ᠠᠴᠠ ᠭᠡᠷ ᠡᠴᠡ ᠪ᠊ ᠳ᠊ ᠨᠠ᠊ ᠳᠣᠺᠲ᠋ᠤᠷ","sar-a-yin ner-e-yin sar_a ner_e b. nasunbatu -- d'ollar mat'ėriyal ded' ed' abisig" angg`li nom-aca ger-ece b. d. na. dok̇t'ur","s2r",
+mongolian_mongol_bichig,"sar-a-yin ner-e-yin sar_a ner_e B. Nasunbatu -- d'ollar mat'ėriyal ded' ed' abisig""angg`li nom-aca ger-ece B. D. Na. dok̇t'ur","ᠰᠠᠷ᠎ᠠ ᠶᠢᠨ ᠨᠡᠷ᠎ᠡ ᠶᠢᠨ ᠰᠠᠷ᠎ᠠ ᠨᠡᠷ᠎ᠡ ᠪ᠊ ᠨᠠᠰᠤᠨᠪᠠᠲᠤ -- ᠳ᠋ᠣᠯᠯᠠᠷ ᠮᠠᠲ᠋ᠧᠷᠢᠶᠠᠯ ᠳᠡᠳ᠋ ᠡᠳ᠋ ᠠᠪᠢᠰᠢᠭ᠌ ᠠᠩᠭ᠍ᠯᠢ ᠨᠣᠮ ᠠᠴᠠ ᠭᠡᠷ ᠡᠴᠡ ᠪ᠊ ᠳ᠊ ᠨᠠ᠊ ᠳᠣᠺᠲ᠋ᠤᠷ","r2s",