|
@@ -2,8 +2,121 @@ general:
|
|
name: Mongolian (Mongol bichig)
|
|
name: Mongolian (Mongol bichig)
|
|
|
|
|
|
roman_to_script:
|
|
roman_to_script:
|
|
|
|
+ ignore:
|
|
|
|
+ - "at head of title"
|
|
|
|
+ - "colophon"
|
|
|
|
+ - "date of publication not identified"
|
|
|
|
+ - "place of publication not identified"
|
|
|
|
+ - "publisher not identified"
|
|
|
|
+ # NOTE There is ambiguity about ignoring these
|
|
|
|
+ # words. Note that the single-character Roman
|
|
|
|
+ # numerals are not included on purpose.
|
|
|
|
+ # Ideally the source editors should use the
|
|
|
|
+ # dedicated U+2160–U+216F (uppercase Roman
|
|
|
|
+ # numerals) and/or U+2170–U+217F (lowercase Roman
|
|
|
|
+ # numerals) ranges to avoid this ambiguity.
|
|
|
|
+ # TODO implement regular expressions for ignore patterns.
|
|
|
|
+ #- re: "I{2,3}"
|
|
|
|
+ #- re: "I(V|X)"
|
|
|
|
+ #- re: "LI{,3}"
|
|
|
|
+ #- re: "LI?(V|X)"
|
|
|
|
+ #- re: "L(V|X{1,3})I{,3}"
|
|
|
|
+ #- re: "LX{1,3}I?V"
|
|
|
|
+ #- re: "LX{1,3}VI{,3}"
|
|
|
|
+ #- re: "(V|X{1,3})I{,3}"
|
|
|
|
+ #- re: "X{1,3}I{,3}"
|
|
|
|
+ #- re: "X{1,3}I(V|X)"
|
|
|
|
+ #- re: "X{1,3}VI{,3}"
|
|
|
|
+ - "II"
|
|
|
|
+ - "III"
|
|
|
|
+ - "IV"
|
|
|
|
+ - "IX"
|
|
|
|
+ - "LI"
|
|
|
|
+ - "LII"
|
|
|
|
+ - "LIII"
|
|
|
|
+ - "LIV"
|
|
|
|
+ - "LIX"
|
|
|
|
+ - "LV"
|
|
|
|
+ - "LVI"
|
|
|
|
+ - "LVII"
|
|
|
|
+ - "LVIII"
|
|
|
|
+ - "LX"
|
|
|
|
+ - "LXI"
|
|
|
|
+ - "LXII"
|
|
|
|
+ - "LXIII"
|
|
|
|
+ - "LXIV"
|
|
|
|
+ - "LXIX"
|
|
|
|
+ - "LXV"
|
|
|
|
+ - "LXVI"
|
|
|
|
+ - "LXVII"
|
|
|
|
+ - "LXVIII"
|
|
|
|
+ - "LXX"
|
|
|
|
+ - "LXXI"
|
|
|
|
+ - "LXXII"
|
|
|
|
+ - "LXXIII"
|
|
|
|
+ - "LXXIV"
|
|
|
|
+ - "LXXIX"
|
|
|
|
+ - "LXXV"
|
|
|
|
+ - "LXXVI"
|
|
|
|
+ - "LXXVII"
|
|
|
|
+ - "LXXVIII"
|
|
|
|
+ - "LXXX"
|
|
|
|
+ - "LXXXI"
|
|
|
|
+ - "LXXXII"
|
|
|
|
+ - "LXXXIII"
|
|
|
|
+ - "LXXXIV"
|
|
|
|
+ - "LXXXIX"
|
|
|
|
+ - "LXXXV"
|
|
|
|
+ - "LXXXVI"
|
|
|
|
+ - "LXXXVII"
|
|
|
|
+ - "LXXXVIII"
|
|
|
|
+ - "VI"
|
|
|
|
+ - "VII"
|
|
|
|
+ - "VIII"
|
|
|
|
+ - "XI"
|
|
|
|
+ - "XII"
|
|
|
|
+ - "XIII"
|
|
|
|
+ - "XIV"
|
|
|
|
+ - "XIX"
|
|
|
|
+ - "XL"
|
|
|
|
+ - "XLI"
|
|
|
|
+ - "XLII"
|
|
|
|
+ - "XLIII"
|
|
|
|
+ - "XLIV"
|
|
|
|
+ - "XLIX"
|
|
|
|
+ - "XLV"
|
|
|
|
+ - "XLVI"
|
|
|
|
+ - "XLVII"
|
|
|
|
+ - "XLVIII"
|
|
|
|
+ - "XV"
|
|
|
|
+ - "XVI"
|
|
|
|
+ - "XVII"
|
|
|
|
+ - "XVIII"
|
|
|
|
+ - "XX"
|
|
|
|
+ - "XXI"
|
|
|
|
+ - "XXII"
|
|
|
|
+ - "XXIII"
|
|
|
|
+ - "XXIV"
|
|
|
|
+ - "XXIX"
|
|
|
|
+ - "XXV"
|
|
|
|
+ - "XXVI"
|
|
|
|
+ - "XXVII"
|
|
|
|
+ - "XXVIII"
|
|
|
|
+ - "XXX"
|
|
|
|
+ - "XXXI"
|
|
|
|
+ - "XXXII"
|
|
|
|
+ - "XXXIII"
|
|
|
|
+ - "XXXIV"
|
|
|
|
+ - "XXXIX"
|
|
|
|
+ - "XXXV"
|
|
|
|
+ - "XXXVI"
|
|
|
|
+ - "XXXVII"
|
|
|
|
+ - "XXXVIII"
|
|
|
|
+ - "and one other"
|
|
|
|
+ - "and ([a-z]+ )?others"
|
|
|
|
+ - "et al."
|
|
|
|
+
|
|
map:
|
|
map:
|
|
- "A": "\u0531"
|
|
|
|
"\u002DA": "\u180E\u1820"
|
|
"\u002DA": "\u180E\u1820"
|
|
"\u002Da": "\u180E\u1820"
|
|
"\u002Da": "\u180E\u1820"
|
|
"A": "\u1820"
|
|
"A": "\u1820"
|
|
@@ -15,6 +128,8 @@ roman_to_script:
|
|
"e\u0307": "\u1827"
|
|
"e\u0307": "\u1827"
|
|
"E": "\u1821"
|
|
"E": "\u1821"
|
|
"e": "\u1821"
|
|
"e": "\u1821"
|
|
|
|
+ "\u002DI": "\u180E\u1822"
|
|
|
|
+ "\u002Di": "\u180E\u1822"
|
|
"I": "\u1822"
|
|
"I": "\u1822"
|
|
"i": "\u1822"
|
|
"i": "\u1822"
|
|
"O\u0307": "\u1825"
|
|
"O\u0307": "\u1825"
|
|
@@ -73,14 +188,10 @@ roman_to_script:
|
|
"t": "\u1832"
|
|
"t": "\u1832"
|
|
"D": "\u1833"
|
|
"D": "\u1833"
|
|
"d": "\u1833"
|
|
"d": "\u1833"
|
|
- "C": "\u1834"
|
|
|
|
- "c": "\u1834"
|
|
|
|
"J": "\u1835"
|
|
"J": "\u1835"
|
|
"j": "\u1835"
|
|
"j": "\u1835"
|
|
"Y": "\u1836"
|
|
"Y": "\u1836"
|
|
"y": "\u1836"
|
|
"y": "\u1836"
|
|
- "R": "\u1837"
|
|
|
|
- "r": "\u1837"
|
|
|
|
"V": "\u1838"
|
|
"V": "\u1838"
|
|
"v": "\u1838"
|
|
"v": "\u1838"
|
|
"W": "\u1838"
|
|
"W": "\u1838"
|
|
@@ -93,6 +204,8 @@ roman_to_script:
|
|
# this conversion should not be needed, but does no harm
|
|
# this conversion should not be needed, but does no harm
|
|
"zR": "\u183F"
|
|
"zR": "\u183F"
|
|
"zr": "\u183F"
|
|
"zr": "\u183F"
|
|
|
|
+ "R": "\u1837"
|
|
|
|
+ "r": "\u1837"
|
|
"ZH": "\u1841"
|
|
"ZH": "\u1841"
|
|
"Zh": "\u1841"
|
|
"Zh": "\u1841"
|
|
# this conversion should not be needed, but does no harm
|
|
# this conversion should not be needed, but does no harm
|
|
@@ -103,110 +216,131 @@ roman_to_script:
|
|
# this conversion should not be needed, but does no harm
|
|
# this conversion should not be needed, but does no harm
|
|
"cH": "\u1842"
|
|
"cH": "\u1842"
|
|
"ch": "\u1842"
|
|
"ch": "\u1842"
|
|
|
|
+ # this is a Buryat letter
|
|
|
|
+ "C\u0307": "\u1878"
|
|
|
|
+ "c\u0307": "\u1878"
|
|
|
|
+ "C": "\u1834"
|
|
|
|
+ "c": "\u1834"
|
|
"H": "\u183E"
|
|
"H": "\u183E"
|
|
"h": "\u183E"
|
|
"h": "\u183E"
|
|
"-": "\u180E"
|
|
"-": "\u180E"
|
|
|
|
|
|
script_to_roman:
|
|
script_to_roman:
|
|
map:
|
|
map:
|
|
- # g followed by vowel a
|
|
|
|
|
|
+ # ga
|
|
"\u182D\u1820": "g\u0307a"
|
|
"\u182D\u1820": "g\u0307a"
|
|
- # g followed by vowel o
|
|
|
|
|
|
+ # go
|
|
"\u182D\u1823": "g\u0307o"
|
|
"\u182D\u1823": "g\u0307o"
|
|
- # g followed by vowel u
|
|
|
|
|
|
+ # gu
|
|
"\u182D\u1824": "g\u0307u"
|
|
"\u182D\u1824": "g\u0307u"
|
|
- # g preceded by vowel a
|
|
|
|
|
|
+ # ag
|
|
"\u1820\u182D": "ag\u0307"
|
|
"\u1820\u182D": "ag\u0307"
|
|
- # g preceded by vowel o
|
|
|
|
|
|
+ # og
|
|
"\u1823\u182D": "og\u0307"
|
|
"\u1823\u182D": "og\u0307"
|
|
- # g preceded by vowel u
|
|
|
|
|
|
+ # ug
|
|
"\u1824\u182D": "ug\u0307"
|
|
"\u1824\u182D": "ug\u0307"
|
|
- # g followed by vowel e
|
|
|
|
|
|
+ # ge
|
|
"\u182D\u1821": "ge"
|
|
"\u182D\u1821": "ge"
|
|
- # g followed by vowel i
|
|
|
|
|
|
+ # gi
|
|
"\u182D\u1822": "gi"
|
|
"\u182D\u1822": "gi"
|
|
- # g followed by vowel oe
|
|
|
|
|
|
+ # goe
|
|
"\u182D\u1825": "go\u0307"
|
|
"\u182D\u1825": "go\u0307"
|
|
- # g followed by vowel ue
|
|
|
|
|
|
+ # gue
|
|
"\u182D\u1826": "gu\u0307"
|
|
"\u182D\u1826": "gu\u0307"
|
|
- # g followed by vowel ee
|
|
|
|
|
|
+ # gee
|
|
"\u182D\u1827": "ge\u0307"
|
|
"\u182D\u1827": "ge\u0307"
|
|
- # g preceded by vowel e
|
|
|
|
|
|
+ # eg
|
|
"\u1821\u182D": "eg"
|
|
"\u1821\u182D": "eg"
|
|
- # g preceded by vowel i
|
|
|
|
|
|
+ # ig
|
|
"\u1822\u182D": "ig"
|
|
"\u1822\u182D": "ig"
|
|
- # g preceded by vowel oe
|
|
|
|
|
|
+ # oeg
|
|
"\u1825\u182D": "o\u0307g"
|
|
"\u1825\u182D": "o\u0307g"
|
|
- # g preceded by vowel ue
|
|
|
|
|
|
+ # ueg
|
|
"\u1826\u182D": "u\u0307g"
|
|
"\u1826\u182D": "u\u0307g"
|
|
- # g preceded by vowel ee
|
|
|
|
|
|
+ # eeg
|
|
"\u1827\u182D": "e\u0307g"
|
|
"\u1827\u182D": "e\u0307g"
|
|
- # q followed by vowel a
|
|
|
|
|
|
+ # qa
|
|
"\u182C\u1820": "q\u0307a"
|
|
"\u182C\u1820": "q\u0307a"
|
|
- # q followed by vowel o
|
|
|
|
|
|
+ # qo
|
|
"\u182C\u1823": "q\u0307o"
|
|
"\u182C\u1823": "q\u0307o"
|
|
- # q followed by vowel u
|
|
|
|
|
|
+ # qu
|
|
"\u182C\u1824": "q\u0307u"
|
|
"\u182C\u1824": "q\u0307u"
|
|
- # q preceded by vowel a (should not occur)
|
|
|
|
- "\u1820\u182C": "q\u0307a"
|
|
|
|
- # q preceded by vowel o (should not occur)
|
|
|
|
- "\u1823\u182C": "q\u0307o"
|
|
|
|
- # q preceded by vowel u (should not occur)
|
|
|
|
- "\u1824\u182C": "q\u0307u"
|
|
|
|
- # k followed by vowel e
|
|
|
|
|
|
+ # aq (should not occur)
|
|
|
|
+ "\u1820\u182C": "aq"
|
|
|
|
+ # oq (should not occur)
|
|
|
|
+ "\u1823\u182C": "oq"
|
|
|
|
+ # uq (should not occur)
|
|
|
|
+ "\u1824\u182C": "uq"
|
|
|
|
+ # ke
|
|
"\u182C\u1821": "ke"
|
|
"\u182C\u1821": "ke"
|
|
- # k followed by vowel i
|
|
|
|
|
|
+ # ki
|
|
"\u182C\u1822": "ki"
|
|
"\u182C\u1822": "ki"
|
|
- # k followed by vowel oe
|
|
|
|
|
|
+ # koe
|
|
"\u182C\u1825": "ko\u0307"
|
|
"\u182C\u1825": "ko\u0307"
|
|
- # k followed by vowel ue
|
|
|
|
- "\u182C\u1826": "ko\u0307"
|
|
|
|
- # k followed by vowel ee
|
|
|
|
|
|
+ # kue
|
|
|
|
+ "\u182C\u1826": "ku\u0307"
|
|
|
|
+ # kee
|
|
"\u182C\u1827": "ke\u0307"
|
|
"\u182C\u1827": "ke\u0307"
|
|
- # k preceded by vowel e (should not occur)
|
|
|
|
- "\u1821\u182C": "ke"
|
|
|
|
- # k preceded by vowel i (should not occur)
|
|
|
|
- "\u1822\u182C": "ki"
|
|
|
|
- # k preceded by vowel oe (should not occur)
|
|
|
|
- "\u1825\u182C": "ko\u0307"
|
|
|
|
- # k preceded by vowel ue (should not occur)
|
|
|
|
- "\u1826\u182C": "ko\u0307"
|
|
|
|
- # k preceded by vowel ee (should not occur)
|
|
|
|
- "\u1827\u182C": "ke\u0307"
|
|
|
|
|
|
+ # ek (should not occur)
|
|
|
|
+ "\u1821\u182C": "ek"
|
|
|
|
+ # ik (should not occur)
|
|
|
|
+ "\u1822\u182C": "ik"
|
|
|
|
+ # oek (should not occur)
|
|
|
|
+ "\u1825\u182C": "o\u0307k"
|
|
|
|
+ # uek (should not occur)
|
|
|
|
+ "\u1826\u182C": "o\u0307k"
|
|
|
|
+ # eek (should not occur)
|
|
|
|
+ "\u1827\u182C": "e\u0307k"
|
|
# non-connecting vowel a
|
|
# non-connecting vowel a
|
|
"\u180E\u1820": "\u002Da"
|
|
"\u180E\u1820": "\u002Da"
|
|
# non-connecting vowel e
|
|
# non-connecting vowel e
|
|
"\u180E\u1821": "\u002De"
|
|
"\u180E\u1821": "\u002De"
|
|
|
|
+ # non-connecting vowel i
|
|
|
|
+ "\u180E\u1822": "\u002Di"
|
|
# Other Mongolian vowel separators to hyphen
|
|
# Other Mongolian vowel separators to hyphen
|
|
"\u180E": "\u002De"
|
|
"\u180E": "\u002De"
|
|
# Narrow no-break space to hyphen
|
|
# Narrow no-break space to hyphen
|
|
"\u202F": "\u002D"
|
|
"\u202F": "\u002D"
|
|
# Other Mongolian vowel NOT associated with g or k/q
|
|
# Other Mongolian vowel NOT associated with g or k/q
|
|
|
|
+ "\u1801": "..."
|
|
|
|
+ "\u1802": ","
|
|
|
|
+ "\u1803": "."
|
|
|
|
+ "\u1804": ":"
|
|
|
|
+ "\u1805": "*"
|
|
|
|
+ "\u1806": "-"
|
|
|
|
+ "\u1807": "\u0020"
|
|
|
|
+ "\u1808": ","
|
|
|
|
+ "\u1809": "."
|
|
|
|
+ "\u180A": "-"
|
|
|
|
+ "u\1810": "0"
|
|
|
|
+ "u\1811": "1"
|
|
|
|
+ "u\1812": "2"
|
|
|
|
+ "u\1813": "3"
|
|
|
|
+ "u\1814": "4"
|
|
|
|
+ "u\1815": "5"
|
|
|
|
+ "u\1816": "6"
|
|
|
|
+ "u\1817": "7"
|
|
|
|
+ "u\1818": "8"
|
|
|
|
+ "u\1819": "9"
|
|
|
|
+ # Mongolian vowels NOT associated with g/g+dot or k/q
|
|
"\u1820": "a"
|
|
"\u1820": "a"
|
|
- "\u1823": "o"
|
|
|
|
- "\u1826": "u"
|
|
|
|
"\u1821": "e"
|
|
"\u1821": "e"
|
|
"\u1822": "i"
|
|
"\u1822": "i"
|
|
- "\u1825": "o\u0307"
|
|
|
|
"\u1823": "o"
|
|
"\u1823": "o"
|
|
"\u1824": "u"
|
|
"\u1824": "u"
|
|
|
|
+ "\u1825": "o\u0307"
|
|
"\u1826": "u\u0307"
|
|
"\u1826": "u\u0307"
|
|
"\u1827": "e\u0307"
|
|
"\u1827": "e\u0307"
|
|
- "\u1829": "ng"
|
|
|
|
"\u1828": "n"
|
|
"\u1828": "n"
|
|
|
|
+ "\u1829": "ng"
|
|
"\u182A": "b"
|
|
"\u182A": "b"
|
|
"\u182B": "p"
|
|
"\u182B": "p"
|
|
"\u182C": "q"
|
|
"\u182C": "q"
|
|
- "\u183B": "kh"
|
|
|
|
- "\u183A": "k\u0307"
|
|
|
|
- "\u182C": "k"
|
|
|
|
"\u182D": "g\u0307"
|
|
"\u182D": "g\u0307"
|
|
"\u182E": "m"
|
|
"\u182E": "m"
|
|
- "\u1840": "lh"
|
|
|
|
"\u182F": "l"
|
|
"\u182F": "l"
|
|
- "\u183C": "ts\u0307"
|
|
|
|
- "\u1831": "s\u0301"
|
|
|
|
"\u1830": "s"
|
|
"\u1830": "s"
|
|
|
|
+ "\u1831": "s\u0301"
|
|
"\u1832": "t"
|
|
"\u1832": "t"
|
|
"\u1833": "d"
|
|
"\u1833": "d"
|
|
"\u1834": "c"
|
|
"\u1834": "c"
|
|
@@ -215,7 +349,13 @@ script_to_roman:
|
|
"\u1837": "r"
|
|
"\u1837": "r"
|
|
"\u1838": "v"
|
|
"\u1838": "v"
|
|
"\u1839": "f"
|
|
"\u1839": "f"
|
|
|
|
+ "\u183A": "k\u0307"
|
|
|
|
+ "\u183B": "kh"
|
|
|
|
+ "\u183C": "ts\u0307"
|
|
|
|
+ "\u183D": "z"
|
|
|
|
+ "\u183E": "h\u0307"
|
|
"\u183F": "zr"
|
|
"\u183F": "zr"
|
|
|
|
+ "\u1840": "lh"
|
|
"\u1841": "zh"
|
|
"\u1841": "zh"
|
|
"\u1842": "ch"
|
|
"\u1842": "ch"
|
|
- "\u183E": "h\u0307"
|
|
|
|
|
|
+ "\u1878": "c\u0307"
|