Explorar o código

Update Mongol bichig.

Stefano Cossu hai 1 ano
pai
achega
a06198d072
Modificáronse 1 ficheiros con 197 adicións e 57 borrados
  1. 197 57
      transliterator/tables/data/mongolian_mongol_bichig.yml

+ 197 - 57
transliterator/tables/data/mongolian_mongol_bichig.yml

@@ -2,8 +2,121 @@ general:
   name: Mongolian (Mongol bichig)
 
 roman_to_script:
+  ignore:
+    - "at head of title"
+    - "colophon"
+    - "date of publication not identified"
+    - "place of publication not identified"
+    - "publisher not identified"
+    # NOTE There is ambiguity about ignoring these
+    # words. Note that the single-character Roman
+    # numerals are not included on purpose.
+    # Ideally the source editors should use the
+    # dedicated U+2160–U+216F (uppercase Roman
+    # numerals) and/or U+2170–U+217F (lowercase Roman
+    # numerals) ranges to avoid this ambiguity.
+    # TODO implement regular expressions for ignore patterns.
+    #- re: "I{2,3}"
+    #- re: "I(V|X)"
+    #- re: "LI{,3}"
+    #- re: "LI?(V|X)"
+    #- re: "L(V|X{1,3})I{,3}"
+    #- re: "LX{1,3}I?V"
+    #- re: "LX{1,3}VI{,3}"
+    #- re: "(V|X{1,3})I{,3}"
+    #- re: "X{1,3}I{,3}"
+    #- re: "X{1,3}I(V|X)"
+    #- re: "X{1,3}VI{,3}"
+    - "II"
+    - "III"
+    - "IV"
+    - "IX"
+    - "LI"
+    - "LII"
+    - "LIII"
+    - "LIV"
+    - "LIX"
+    - "LV"
+    - "LVI"
+    - "LVII"
+    - "LVIII"
+    - "LX"
+    - "LXI"
+    - "LXII"
+    - "LXIII"
+    - "LXIV"
+    - "LXIX"
+    - "LXV"
+    - "LXVI"
+    - "LXVII"
+    - "LXVIII"
+    - "LXX"
+    - "LXXI"
+    - "LXXII"
+    - "LXXIII"
+    - "LXXIV"
+    - "LXXIX"
+    - "LXXV"
+    - "LXXVI"
+    - "LXXVII"
+    - "LXXVIII"
+    - "LXXX"
+    - "LXXXI"
+    - "LXXXII"
+    - "LXXXIII"
+    - "LXXXIV"
+    - "LXXXIX"
+    - "LXXXV"
+    - "LXXXVI"
+    - "LXXXVII"
+    - "LXXXVIII"
+    - "VI"
+    - "VII"
+    - "VIII"
+    - "XI"
+    - "XII"
+    - "XIII"
+    - "XIV"
+    - "XIX"
+    - "XL"
+    - "XLI"
+    - "XLII"
+    - "XLIII"
+    - "XLIV"
+    - "XLIX"
+    - "XLV"
+    - "XLVI"
+    - "XLVII"
+    - "XLVIII"
+    - "XV"
+    - "XVI"
+    - "XVII"
+    - "XVIII"
+    - "XX"
+    - "XXI"
+    - "XXII"
+    - "XXIII"
+    - "XXIV"
+    - "XXIX"
+    - "XXV"
+    - "XXVI"
+    - "XXVII"
+    - "XXVIII"
+    - "XXX"
+    - "XXXI"
+    - "XXXII"
+    - "XXXIII"
+    - "XXXIV"
+    - "XXXIX"
+    - "XXXV"
+    - "XXXVI"
+    - "XXXVII"
+    - "XXXVIII"
+    - "and one other"
+    - "and ([a-z]+ )?others"  # NOTE(review): regex syntax, but the TODO in this list says regex ignore patterns are not implemented; confirm this entry is honored
+    - "et al."
+
   map:
-    "A": "\u0531"
     "\u002DA": "\u180E\u1820"
     "\u002Da": "\u180E\u1820"
     "A": "\u1820"
@@ -15,6 +128,8 @@ roman_to_script:
     "e\u0307": "\u1827"
     "E": "\u1821"
     "e": "\u1821"
+    "\u002DI": "\u180E\u1822"
+    "\u002Di": "\u180E\u1822"
     "I": "\u1822"
     "i": "\u1822"
     "O\u0307": "\u1825"
@@ -73,14 +188,10 @@ roman_to_script:
     "t": "\u1832"
     "D": "\u1833"
     "d": "\u1833"
-    "C": "\u1834"
-    "c": "\u1834"
     "J": "\u1835"
     "j": "\u1835"
     "Y": "\u1836"
     "y": "\u1836"
-    "R": "\u1837"
-    "r": "\u1837"
     "V": "\u1838"
     "v": "\u1838"
     "W": "\u1838"
@@ -93,6 +204,8 @@ roman_to_script:
     # this conversion should not be needed, but does no harm
     "zR": "\u183F"
     "zr": "\u183F"
+    "R": "\u1837"
+    "r": "\u1837"
     "ZH": "\u1841"
     "Zh": "\u1841"
     # this conversion should not be needed, but does no harm
@@ -103,110 +216,131 @@ roman_to_script:
     # this conversion should not be needed, but does no harm
     "cH": "\u1842"
     "ch": "\u1842"
+    # this is a Buryat letter
+    "C\u0307": "\u1878"
+    "c\u0307": "\u1878"
+    "C": "\u1834"
+    "c": "\u1834"
     "H": "\u183E"
     "h": "\u183E"
     "-": "\u180E"
 
 script_to_roman:
   map:
-    # g followed by vowel a
+    # ga
     "\u182D\u1820": "g\u0307a"
-    # g followed by vowel o
+    # go
     "\u182D\u1823": "g\u0307o"
-    # g followed by vowel u
+    # gu
     "\u182D\u1824": "g\u0307u"
-    # g preceded by vowel a
+    # ag
     "\u1820\u182D": "ag\u0307"
-    # g preceded by vowel o
+    # og
     "\u1823\u182D": "og\u0307"
-    # g preceded by vowel u
+    # ug
     "\u1824\u182D": "ug\u0307"
-    # g followed by vowel e
+    # ge
     "\u182D\u1821": "ge"
-    # g followed by vowel i
+    # gi
     "\u182D\u1822": "gi"
-    # g followed by vowel oe
+    # goe
     "\u182D\u1825": "go\u0307"
-    # g followed by vowel ue
+    # gue
     "\u182D\u1826": "gu\u0307"
-    # g followed by vowel ee
+    # gee
     "\u182D\u1827": "ge\u0307"
-    # g preceded by vowel e
+    # eg
     "\u1821\u182D": "eg"
-    # g preceded by vowel i
+    # ig
     "\u1822\u182D": "ig"
-    # g preceded by vowel oe
+    # oeg
     "\u1825\u182D": "o\u0307g"
-    # g preceded by vowel ue
+    # ueg
     "\u1826\u182D": "u\u0307g"
-    # g preceded by vowel ee
+    # eeg
     "\u1827\u182D": "e\u0307g"
-    # q followed by vowel a
+    # qa
     "\u182C\u1820": "q\u0307a"
-    # q followed by vowel o
+    # qo
     "\u182C\u1823": "q\u0307o"
-    # q followed by vowel u
+    # qu
     "\u182C\u1824": "q\u0307u"
-    # q preceded by vowel a (should not occur)
-    "\u1820\u182C": "q\u0307a"
-    # q preceded by vowel o (should not occur)
-    "\u1823\u182C": "q\u0307o"
-    # q preceded by vowel u (should not occur)
-    "\u1824\u182C": "q\u0307u"
-    # k followed by vowel e
+    # aq (should not occur)
+    "\u1820\u182C": "aq"
+    # oq (should not occur)
+    "\u1823\u182C": "oq"
+    # uq (should not occur)
+    "\u1824\u182C": "uq"
+    # ke
     "\u182C\u1821": "ke"
-    # k followed by vowel i
+    # ki
     "\u182C\u1822": "ki"
-    # k followed by vowel oe
+    # koe
     "\u182C\u1825": "ko\u0307"
-    # k followed by vowel ue
-    "\u182C\u1826": "ko\u0307"
-    # k followed by vowel ee
+    # kue
+    "\u182C\u1826": "ku\u0307"
+    # kee
     "\u182C\u1827": "ke\u0307"
-    # k preceded by vowel e (should not occur)
-    "\u1821\u182C": "ke"
-    # k preceded by vowel i (should not occur)
-    "\u1822\u182C": "ki"
-    # k preceded by vowel oe (should not occur)
-    "\u1825\u182C": "ko\u0307"
-    # k preceded by vowel ue (should not occur)
-    "\u1826\u182C": "ko\u0307"
-    # k preceded by vowel ee (should not occur)
-    "\u1827\u182C": "ke\u0307"
+    # ek (should not occur)
+    "\u1821\u182C": "ek"
+    # ik (should not occur)
+    "\u1822\u182C": "ik"
+    # oek (should not occur)
+    "\u1825\u182C": "o\u0307k"
+    # uek (should not occur); U+1826 romanizes as u + dot above, not o
+    "\u1826\u182C": "u\u0307k"
+    # eek (should not occur)
+    "\u1827\u182C": "e\u0307k"
     # non-connecting vowel a
     "\u180E\u1820": "\u002Da"
     # non-connecting vowel e
     "\u180E\u1821": "\u002De"
+    # non-connecting vowel i
+    "\u180E\u1822": "\u002Di"
     # Any other Mongolian vowel separator to a bare hyphen (mirrors "-" -> U+180E)
     "\u180E": "\u002D"
     # Narrow no-break space to hyphen
     "\u202F": "\u002D"
     # Mongolian punctuation marks and digits
+    "\u1801": "..."
+    "\u1802": ","
+    "\u1803": "."
+    "\u1804": ":"
+    "\u1805": "*"
+    "\u1806": "-"
+    "\u1807": "\u0020"
+    "\u1808": ","
+    "\u1809": "."
+    "\u180A": "-"
+    "\u1810": "0"  # U+1810..U+1819: Mongolian digits 0-9
+    "\u1811": "1"
+    "\u1812": "2"
+    "\u1813": "3"
+    "\u1814": "4"
+    "\u1815": "5"
+    "\u1816": "6"
+    "\u1817": "7"
+    "\u1818": "8"
+    "\u1819": "9"
+    # Mongolian vowels NOT associated with g/g+dot or k/q
     "\u1820": "a"
-    "\u1823": "o"
-    "\u1826": "u"
     "\u1821": "e"
     "\u1822": "i"
-    "\u1825": "o\u0307"
     "\u1823": "o"
     "\u1824": "u"
+    "\u1825": "o\u0307"
     "\u1826": "u\u0307"
     "\u1827": "e\u0307"
-    "\u1829": "ng"
     "\u1828": "n"
+    "\u1829": "ng"
     "\u182A": "b"
     "\u182B": "p"
     "\u182C": "q"
-    "\u183B": "kh"
-    "\u183A": "k\u0307"
-    "\u182C": "k"
     "\u182D": "g\u0307"
     "\u182E": "m"
-    "\u1840": "lh"
     "\u182F": "l"
-    "\u183C": "ts\u0307"
-    "\u1831": "s\u0301"
     "\u1830": "s"
+    "\u1831": "s\u0301"
     "\u1832": "t"
     "\u1833": "d"
     "\u1834": "c"
@@ -215,7 +349,13 @@ script_to_roman:
     "\u1837": "r"
     "\u1838": "v"
     "\u1839": "f"
+    "\u183A": "k\u0307"
+    "\u183B": "kh"
+    "\u183C": "ts\u0307"
+    "\u183D": "z"
+    "\u183E": "h\u0307"
     "\u183F": "zr"
+    "\u1840": "lh"
     "\u1841": "zh"
     "\u1842": "ch"
-    "\u183E": "h\u0307"
+    "\u1878": "c\u0307"