Преглед на файлове

Add all Cyrillic tables.

Stefano Cossu преди 1 година
родител
ревизия
5f1753fb37

+ 452 - 0
transliterator/tables/data/asian_cyrillic.yml

@@ -0,0 +1,452 @@
+general:
+  name: Asian Cyrillic
+  inherits: _cyrillic_base
+
+roman_to_script:
+  map:
+    # COMMON COMBINING CHARACTERS (always follow a base letter): 
+    # combining grave U+0300
+    # combining acute U+0301
+    # combining circumflex U+0302
+    # combining macron U+0304
+    # combining breve U+0306
+    # combining dot above U+0307
+    # combining diaeresis U+0308
+    # combining ring above U+030A
+    # combining double acute U+030B
+    # combining caron (hachek) U+030C
+    # combining candrabindu U+0310
+    # combining dot below U+0323
+    # combining comma below U+0326 (Romanian, Latvian, Livonian)
+    # combining cedilla U+0327 (French, Turkish, Azeri)
+    # combining ogonek (hook) U+0328 (Polish, Lithuanian)
+    # combining left ligature U+FE20 (Cyrillic transliteration)
+    # combining right ligature U+FE21 (Cyrillic transliteration)
+    # soft sign/prime (spacing) U+02B9 (Cyrillic transliteration)
+    # hard sign/double prime (spacing) U+02BA (Cyrillic transliteration)
+    # ayn (spacing) U+02BB (Semitic and Caucasian languages)
+    # alif (spacing) U+02BC (Semitic languages)
+    # middle dot (spacing) U+00B7 (Catalan)
+
+    # REGULAR LATIN ALPHABETIC CHARACTERS TO BE CONVERTED
+
+    # CONVERSION OF "I/i" LIGATED TO "A/a" (all capitalization patterns)
+    "I\uFE20A\uFE21": "\u042F"
+    "I\uFE20a\uFE21": "\u042F"
+    "i\uFE20a\uFE21": "\u044F"
+    "i\uFE20A\uFE21": "\u044F"
+
+    # CONVERSION OF "A/a" WITH BREVE (0306)
+    "A\u0306": "\u04D8"
+    "a\u0306": "\u04D9"
+
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "A/a" WITH BREVE DUE TO CONFLICTING ROMANIZATION
+    #"A\u0306": "\u04D2"
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "A/a" WITH BREVE DUE TO CONFLICTING ROMANIZATION
+    #"a\u0306": "\u04D3"
+
+    # REMAINING LONE "A/a"
+
+
+    "V\u0307": "\u0474"
+    "v\u0307": "\u0475"
+
+    "Gh": "\u0492"
+    "GH": "\u0492"
+    "gH": "\u0493"
+    "gh": "\u0493"
+
+    # DE-ACTIVATED CONVERSION OF YAKUT "GH" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    #"Gh": "\u0494"
+    # DE-ACTIVATED CONVERSION OF YAKUT "GH" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    #"GH": "\u0494"
+    # DE-ACTIVATED CONVERSION OF YAKUT "gh" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    #"gH": "\u0495"
+    # DE-ACTIVATED CONVERSION OF YAKUT "gh" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    #"gh": "\u0495"
+
+    "G\u0301": "\u0494"
+    "g\u0301": "\u0495"
+    "G\u0307": "\u049C"
+    "g\u0307": "\u049D"
+    "G": "\u0413"
+    "g": "\u0433"
+
+
+    # CONVERSION OF "I/i" LIGATED TO "E/e", SOME WITH MACRON (0304) AND OGONEK (0328)
+    "I\uFE20E\uFE21\u0304": "\u0464"
+    "I\uFE20E\u0304\uFE21": "\u0464"
+    "I\uFE20e\uFE21\u0304": "\u0464"
+    "I\uFE20e\u0304\uFE21": "\u0464"
+    "I\uFE20E\uFE21\u0328": "\u0468"
+    "I\uFE20E\u0328\uFE21": "\u0468"
+    "I\uFE20e\uFE21\u0328": "\u0468"
+    "I\uFE20e\u0328\uFE21": "\u0468"
+    "i\uFE20e\uFE21\u0304": "\u0465"
+    "i\uFE20e\u0304\uFE21": "\u0465"
+    "i\uFE20E\uFE21\u0304": "\u0465"
+    "i\uFE20E\u0304\uFE21": "\u0465"
+    "i\uFE20e\uFE21\u0328": "\u0469"
+    "i\uFE20e\u0328\uFE21": "\u0469"
+    "i\uFE20E\uFE21\u0328": "\u0469"
+    "i\uFE20E\u0328\uFE21": "\u0469"
+    "I\uFE20E\uFE21": "\u0462"
+    "I\uFE20e\uFE21": "\u0462"
+    "i\uFE20e\uFE21": "\u0463"
+    "i\uFE20E\uFE21": "\u0463"
+
+    # CONVERSION OF "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), OGONEK (0328), & CARON (030C)
+    "E\u030C": "\u0462"
+    "E\u0304": "\u0404"
+    "E\u0307": "\u042D"
+    "E\u0308": "\u0401"
+    "E\u0328": "\u0466"
+    "e\u030C": "\u0463"
+    "e\u0304": "\u0454"
+    "e\u0307": "\u044D"
+    "e\u0308": "\u0451"
+    "e\u0328": "\u0467"
+
+    # CONVERSION OF REMAINING LONE "E/e"
+
+    "ZH": "\u0416"
+    "Zh": "\u0416"
+    "zH": "\u0436"
+    "zh": "\u0436"
+
+
+    # CONVERSION OF "T/t" LIGATED OR BLENDED WITH "H/h" (all capitalization patterns)
+    "T\uFE20H\uFE21": "\u0498"
+    "T\uFE20h\uFE21": "\u0498"
+    "t\uFE20H\uFE21": "\u0499"
+    "t\uFE20h\uFE21": "\u0499"
+    "Th": "\u04AA"
+    "TH": "\u04AA"
+    "tH": "\u04AB"
+    "th": "\u04AB"
+
+    # CONVERSION OF "I/i" LIGATED TO "O/o" WITH MACRON (0304) AND OGONEK (0328)
+    "I\uFE20O\uFE21\u0328": "\u046C"
+    "I\uFE20O\u0328\uFE21": "\u046C"
+    "I\uFE20o\uFE21\u0328": "\u046C"
+    "I\uFE20o\u0328\uFE21": "\u046C"
+    "i\uFE20o\uFE21\u0328": "\u046D"
+    "i\uFE20o\u0328\uFE21": "\u046D"
+    "i\uFE20O\uFE21\u0328": "\u046D"
+    "i\uFE20O\u0328\uFE21": "\u046D"
+
+
+    # CONVERSION OF "I/i" LIGATED TO "U/u"
+    "I\uFE20U\uFE21": "\u042E"
+    "I\uFE20u\uFE21": "\u042E"
+    "i\uFE20u\uFE21": "\u044E"
+    "i\uFE20U\uFE21": "\u044E"
+
+
+    # CONVERSION OF "I/i" WITH MACRON (0304), BREVE (0306), AND CANDRABINDU (0310)
+    "I\u0304": "\u0406"
+    "I\u0306": "\u0419"
+    "I\u0310": "\u0408"
+    "i\u0304": "\u0456"
+    "i\u0306": "\u0439"
+    "i\u0310": "\u0458"
+
+    # CONVERSION OF REMAINING LONE "I/i"
+    "I": "\u0418"
+    "i": "\u0438"
+
+    "J": "\u0496"
+    "j": "\u0497"
+
+    # DE-ACTIVATED CONVERSION OF AZERI "J" DUE TO CONFLICTING ROMANIZATION
+    #"J": "\u04B8"
+    # DE-ACTIVATED CONVERSION OF AZERI "j" DUE TO CONFLICTING ROMANIZATION
+    #"j": "\u04B9"
+    # DE-ACTIVATED CONVERSION OF TAJIK "J" DUE TO CONFLICTING ROMANIZATION
+    #"J": "\u04B6"
+    # DE-ACTIVATED CONVERSION OF TAJIK "j" DUE TO CONFLICTING ROMANIZATION
+    #"j": "\u04B7"
+
+
+    "K\uFE20S\uFE21": "\u046E"
+    "K\uFE20s\uFE21": "\u046E"
+    "k\uFE20s\uFE21": "\u046F"
+    "k\uFE20S\uFE21": "\u046F"
+    "Q": "\u04A0"
+    "q": "\u04A1"
+
+    # DE-ACTIVATED CONVERSION OF KHANTY "Q" DUE TO CONFLICTING ROMANIZATION
+    #"Q": "\u04C3"
+    # DE-ACTIVATED CONVERSION OF KHANTY "q" DUE TO CONFLICTING ROMANIZATION
+    #"q": "\u04C4"
+
+
+
+    "N\uFE20G\uFE21": "\u04A2"
+    "N\uFE20g\uFE21": "\u04A2"
+    "n\uFE20G\uFE21": "\u04A3"
+    "n\uFE20g\uFE21": "\u04A3"
+
+    # DE-ACTIVATED CONVERSION OF YAKUT "NG/ng" DUE TO CONFLICTING ROMANIZATION
+    #"N\uFE20G\uFE21": "\u04A4"
+    #"N\uFE20g\uFE21": "\u04A4"
+    #"n\uFE20G\uFE21": "\u04A5"
+    #"n\uFE20g\uFE21": "\u04A5"
+
+    # DE-ACTIVATED CONVERSION OF CHUKCHI AND EVENKI "NG/ng" DUE TO CONFLICTING ROMANIZATION
+    #"N\uFE20G\uFE21": "\u04C7"
+    #"N\uFE20g\uFE21": "\u04C7"
+    #"n\uFE20G\uFE21": "\u04C8"
+    #"n\uFE20g\uFE21": "\u04C8"
+
+
+    # CONVERSION OF "O/o" WITH OR WITHOUT MACRON (0304), LIGATED TO "T/t"
+    "O\u0304\uFE20T\uFE21": "\u047E"
+    "O\u0304\uFE20t\uFE21": "\u047E"
+    "O\uFE20\u0304T\uFE21": "\u047E"
+    "O\uFE20\u0304t\uFE21": "\u047E"
+    "O\uFE20T\uFE21": "\u047E"
+    "O\uFE20t\uFE21": "\u047E"
+    "o\u0304\uFE20t\uFE21": "\u047F"
+    "o\u0304\uFE20T\uFE21": "\u047F"
+    "o\uFE20\u0304t\uFE21": "\u047F"
+    "o\uFE20\u0304T\uFE21": "\u047F"
+    "o\uFE20t\uFE21": "\u047F"
+    "o\uFE20T\uFE21": "\u047F"
+
+
+    # CONVERSION OF "O/o" WITH MACRON(0304)
+    "O\u0304": "\u04EA"
+    "o\u0304": "\u04EB"
+    # CONVERSION OF "O/o" WITH DOT ABOVE (0307) USED IN MOST CENTRAL ASIAN LANGUAGES
+    "O\u0307": "\u04E8"
+    "o\u0307": "\u04E9"
+
+    # DE-ACTIVATED CONVERSION OF GAGAUZ, KOMI, AND MARI "O" WITH DOT ABOVE (0307)DUE TO CONFLICTING ROMANIZATION
+    #"O\u0307": "\u04E6"
+    #"o\u0307": "\u04E7"
+
+    # CONVERSION OF REMAINING LONE "O/o"
+
+    "P\uFE20S\uFE21": "\u0470"
+    "P\uFE20s\uFE21": "\u0470"
+    "p\uFE20s\uFE21": "\u0471"
+    "p\uFE20S\uFE21": "\u0471"
+
+
+    "SHCH": "\u0429"
+    "SHCh": "\u0429"
+    "SHch": "\u0429"
+    "Shch": "\u0429"
+    "sHCH": "\u0449"
+    "shCH": "\u0449"
+    "shcH": "\u0449"
+    "shch": "\u0449"
+
+    "sH": "\u0448"
+
+    "T\uFE20S\uFE21\u0307": "\u04B4"
+    "T\uFE20S\u0307\uFE21": "\u04B4"
+    "T\uFE20s\uFE21\u0307": "\u04B4"
+    "T\uFE20s\u0307\uFE21": "\u04B4"
+    "t\uFE20S\uFE21\u0307": "\u04B5"
+    "t\uFE20S\u0307\uFE21": "\u04B5"
+    "t\uFE20s\uFE21\u0307": "\u04B5"
+    "t\uFE20s\u0307\uFE21": "\u04B5"
+
+    "T\uFE20S\uFE21": "\u0426"
+    "T\uFE20s\uFE21": "\u0426"
+    "t\uFE20s\uFE21": "\u0446"
+    "t\uFE20S\uFE21": "\u0446"
+
+    # CONVERSION OF "U/u" WITH MACRON(0304), BREVE (0306), AND DOT ABOVE (0307)
+    "U\u0304": "\u04B0"
+    "u\u0304": "\u04B1"
+
+    # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
+    #"U\u0304": "\u04EE"
+    # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
+    #"u\u0304": "\u04EF"
+
+    "U\u0306": "\u040E"
+    "u\u0306": "\u045E"
+    "U\u0307": "\u04AE"
+    "u\u0307": "\u04AF"
+
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "O/o" WITH DOT ABOVE DUE TO CONFLICTING ROMANIZATION
+    #"U\u0307": "\u04E6"
+    #"u\u0307": "\u04E7"
+
+    # CONVERSION OF ESKIMO AND KARAKALPAK "W/w" THAT MAPS TO THE SAME CHARACTERS AS "U/u" WITH BREVE
+    "W": "\u040E"
+    "w": "\u045E"
+
+
+    "F\u0307": "\u0472"
+    "f\u0307": "\u0473"
+
+    "cH": "\u0447"
+
+    # CONVERSION OF CYRILLIC PALOCHKA (ASPIRATION SIGN) USED IN MANY CENTRAL ASIAN LANGUAGES (NOT NORMALLY INITIALLY)
+    "H\u0307": "\u04BA"
+    "h\u0307": "\u04BB"
+
+    # DE-ACTIVATED CONVERSION OF TAJIK AND UZBEK LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
+    #"H\u0307": "\u04B2"
+    #"h\u0307": "\u04B3"
+    # DE-ACTIVATED CONVERSION OF ARCHAIC LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
+    #"H\u0307": "\u04FC"
+    #"h\u0307": "\u04FD"
+
+    "Y\u0307": "\u04F8"
+    "y\u0307": "\u04F9"
+
+    "Y": "\u042B"
+    "y": "\u044B"
+
+    "\u0027": "\u044C"
+    # this conversion is ambiguous - \u042A is also theoretically possible
+    "\u02BA": "\u044A"
+
+script_to_roman:
+  map:
+    "\u044F": "i\uFE20a\uFE21"
+    "\u04D8": "A\u0306"
+    "\u04D9": "a\u0306"
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "A/a" WITH BREVE DUE TO CONFLICTING ROMANIZATION
+    "\u04D2": "A\u0306"
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "A/a" WITH BREVE DUE TO CONFLICTING ROMANIZATION
+    "\u04D3": "a\u0306"
+    "\u0474": "V\u0307"
+    "\u0475": "v\u0307"
+    "\u0492": "Gh"
+    "\u0493": "gh"
+    # DE-ACTIVATED CONVERSION OF YAKUT "GH" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    "\u0494": "Gh"
+    # DE-ACTIVATED CONVERSION OF YAKUT "gh" (GHE WITH MIDDLE HOOK) DUE TO CONFLICTING ROMANIZATION
+    "\u0495": "gh"
+    "\u0494": "G\u0301"
+    "\u0495": "g\u0301"
+    "\u049C": "G\u0307"
+    "\u049D": "g\u0307"
+    "\u0413": "G"
+    "\u0433": "g"
+    # CONVERSION OF "I/i" LIGATED TO "E/e", SOME WITH MACRON (0304) AND OGONEK (0328)
+    "\u0464": "I\uFE20E\uFE21\u0304"
+    "\u0468": "I\uFE20E\uFE21\u0328"
+    "\u0465": "i\uFE20e\uFE21\u0304"
+    "\u0469": "i\uFE20e\uFE21\u0328"
+    "\u0462": "I\uFE20E\uFE21"
+    "\u0463": "i\uFE20e\uFE21"
+    # CONVERSION OF "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), OGONEK (0328), & CARON (030C)
+    "\u0404": "E\u0304"
+    "\u042D": "E\u0307"
+    "\u0401": "E\u0308"
+    "\u0466": "E\u0328"
+    "\u0454": "e\u0304"
+    "\u044D": "e\u0307"
+    "\u0451": "e\u0308"
+    "\u0467": "e\u0328"
+    "\u0416": "Zh"
+    "\u0436": "zh"
+    # CONVERSION OF "T/t" LIGATED OR BLENDED WITH "H/h" (all capitalization patterns)
+    "\u0498": "T\uFE20H\uFE21"
+    "\u0499": "t\uFE20h\uFE21"
+    "\u04AA": "Th"
+    "\u04AB": "th"
+    # CONVERSION OF "I/i" LIGATED TO "O/o" WITH MACRON (0304) AND OGONEK (0328)
+    "\u046C": "I\uFE20O\uFE21\u0328"
+    "\u046D": "i\uFE20o\uFE21\u0328"
+    # CONVERSION OF "I/i" LIGATED TO "U/u"
+    "\u044E": "i\uFE20u\uFE21"
+    # CONVERSION OF "I/i" WITH MACRON (0304), BREVE (0306), AND CANDRABINDU (0310)
+    "\u0406": "I\u0304"
+    "\u0408": "I\u0310"
+    "\u0456": "i\u0304"
+    "\u0458": "i\u0310"
+    # CONVERSION OF REMAINING LONE "I/i"
+    "\u0418": "I"
+    "\u0438": "i"
+    "\u0496": "J"
+    "\u0497": "j"
+    # DE-ACTIVATED CONVERSION OF AZERI "J" DUE TO CONFLICTING ROMANIZATION
+    "\u04B8": #"J"
+    # DE-ACTIVATED CONVERSION OF AZERI "j" DUE TO CONFLICTING ROMANIZATION
+    "\u04B9": #"J"
+    # DE-ACTIVATED CONVERSION OF TAJIK "J" DUE TO CONFLICTING ROMANIZATION
+    "\u04B6": #"J"
+    # DE-ACTIVATED CONVERSION OF TAJIK "j" DUE TO CONFLICTING ROMANIZATION
+    "\u04B7": #"J"
+    "\u0445": "kh"
+    "\u046E": "K\uFE20S\uFE21"
+    "\u046F": "k\uFE20s\uFE21"
+    "\u04A0": "Q"
+    "\u04A1": "q"
+    # DE-ACTIVATED CONVERSION OF KHANTY "Q" DUE TO CONFLICTING ROMANIZATION
+    "\u04C3": "Q"
+    # DE-ACTIVATED CONVERSION OF KHANTY "q" DUE TO CONFLICTING ROMANIZATION
+    "\u04C4": "q"
+    "\u04A2": "N\uFE20G\uFE21"
+    "\u04A3": "n\uFE20g\uFE21"
+    # DE-ACTIVATED CONVERSION OF YAKUT "NG/ng" DUE TO CONFLICTING ROMANIZATION
+    "\u04A4": #"N\uFE20G\uFE21"
+    "\u04A5": #"n\uFE20g\uFE21"
+    # DE-ACTIVATED CONVERSION OF CHUKCHI AND EVENKI "NG/ng" DUE TO CONFLICTING ROMANIZATION
+    "\u04C7": #"N\uFE20G\uFE21"
+    "\u04C8": #"n\uFE20g\uFE21"
+    # CONVERSION OF "O/o" WITH OR WITHOUT MACRON (0304), LIGATED TO "T/t"
+    "\u047E": "O\u0304\uFE20T\uFE21"
+    "\u047F": "o\u0304\uFE20t\uFE21"
+    # CONVERSION OF "O/o" WITH MACRON(0304)
+    "\u04EA": "O\u0304"
+    "\u04EB": "o\u0304"
+    # CONVERSION OF "O/o" WITH DOT ABOVE (0307) USED IN MOST CENTRAL ASIAN LANGUAGES
+    "\u04E8": "O\u0307"
+    "\u04E9": "o\u0307"
+    # DE-ACTIVATED CONVERSION OF GAGAUZ, KOMI, AND MARI "O" WITH DOT ABOVE (0307)DUE TO CONFLICTING ROMANIZATION
+    "\u04E6": #"O\u0307"
+    "\u04E7": #"o\u0307"
+    # CONVERSION OF REMAINING LONE "O/o"
+    "\u0470": "P\uFE20S\uFE21"
+    "\u0471": "p\uFE20s\uFE21"
+    "\u04B4": "T\uFE20S\uFE21\u0307"
+    "\u04B5": "t\uFE20s\uFE21\u0307"
+    "\u0426": "T\uFE20S\uFE21"
+    "\u0446": "t\uFE20s\uFE21"
+    # CONVERSION OF "U/u" WITH MACRON(0304), BREVE (0306), AND DOT ABOVE (0307)
+    "\u04B0": "U\u0304"
+    "\u04B1": "u\u0304"
+    # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
+    "\u04EE": #"U\u0304"
+    # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
+    "\u04EF": #"U\u0304"
+    "\u040E": "U\u0306"
+    "\u045E": "u\u0306"
+    "\u04AE": "U\u0307"
+    "\u04AF": "u\u0307"
+    # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "O/o" WITH DOT ABOVE DUE TO CONFLICTING ROMANIZATION
+    "\u04E6": #"U\u0307"
+    "\u04E7": #"u\u0307"
+    # CONVERSION OF ESKIMO AND KARAKALPAK "W/w" THAT MAPS TO THE SAME CHARACTERS AS "U/u" WITH BREVE
+    "\u040E": "W"
+    "\u045E": "w"
+    "\u0472": "F\u0307"
+    "\u0473": "f\u0307"
+    "\u0444": "f"
+    "\u0427": "Ch"
+    # CONVERSION OF CYRILLIC PALOCHKA (ASPIRATION SIGN) USED IN MANY CENTRAL ASIAN LANGUAGES (NOT NORMALLY INITIALLY)
+    "\u04BA": "H\u0307"
+    "\u04BB": "h\u0307"
+    # DE-ACTIVATED CONVERSION OF TAJIK AND UZBEK LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
+    "\u04B2": "H\u0307"
+    "\u04B3": "h\u0307"
+    # DE-ACTIVATED CONVERSION OF ARCHAIC LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
+    "\u04FC": "H\u0307"
+    "\u04FD": "h\u0307"
+    "\u04F8": "Y\u0307"
+    "\u04F9": "y\u0307"
+    "\u042B": "Y"
+    "\u044B": "y"
+    # this conversion is ambiguous - \u044C is also theoretically possible
+    "\u044A": "\u02BA"
+

+ 55 - 0
transliterator/tables/data/bulgarian.yml

@@ -0,0 +1,55 @@
+general:
+  name: Bulgarian
+  inherits: _cyrillic_base
+
+roman_to_script:
+  map:
+    "G": "\u0413"
+    "g": "\u0433"
+    # this conversion shouldn't be needed, but does no harm
+    "ZH": "\u0416"
+    "Zh": "\u0416"
+    "zh": "\u0436"
+    "I\uFE20E\uFE21": "\u0462"
+    # this conversion shouldn't be needed, but does no harm
+    "I\uFE20e\uFE21": "\u0462"
+    # this conversion shouldn't be needed, but does no harm
+    # this conversion shouldn't be needed, but does no harm
+    "I": "\u0418"
+    "i\uFE20e\uFE21": "\u0463"
+    "i": "\u0438"
+    # this conversion shouldn't be needed, but does no harm
+    "SHT": "\u0429"
+    "Sht": "\u0429"
+    "sht": "\u0449"
+    "T\uFE20S\uFE21": "\u0426"
+    # this conversion shouldn't be needed, but does no harm
+    "T\uFE20s\uFE21": "\u0426"
+    "t\uFE20s\uFE21": "\u0446"
+    "U\u0310": "\u046A"
+    "u\u0306": "\u044A"
+    "u\u0310": "\u046B"
+    # this conversion is ambiguous - \u042A is also theoretically possible
+    "\u02BA": "\u044A"
+
+script_to_roman:
+  map:
+    "\u044C": ""
+    "\u042C": ""
+    "\u044A": ""
+    "\u042A%": "u\u0306"
+    "\u042A": ""
+    "\u0413": "G"
+    "\u0433": "g"
+    "\u0416": "Zh"
+    "\u0436": "zh"
+    "\u0462": "I\uFE20E\uFE21"
+    "\u0418": "I"
+    "\u0463": "i\uFE20e\uFE21"
+    "\u0438": "i"
+    "\u0429": "Sht"
+    "\u0449": "sht"
+    "\u0426": "T\uFE20S\uFE21"
+    "\u0446": "t\uFE20s\uFE21"
+    "\u046A": "U\u0310"
+    "\u046B": "u\u0310"

+ 209 - 0
transliterator/tables/data/church_slavonic.yml

@@ -0,0 +1,209 @@
+general:
+  name: Church Slavonic
+  inherits: _cyrillic_base
+
+roman_to_script:
+  map:
+    # CONVERSION OF "I/i" LIGATED TO "A/a" (all capitalization patterns)
+    "i\uFE20A\uFE21": "\u044F"
+
+    "V\u0307": "\u0474"
+    "v\u0307": "\u0475"
+
+    "G\u0301": "\u0494"
+    "g\u0301": "\u0495"
+    "G": "\u0413"
+    "g": "\u0433"
+
+    # CONVERSION OF "I/i" LIGATED TO "E/e", SOME WITH MACRON (0304) AND OGONEK (0328)
+    "I\uFE20E\uFE21\u0304": "\u0464"
+    "I\uFE20E\u0304\uFE21": "\u0464"
+    "I\uFE20e\uFE21\u0304": "\u0464"
+    "I\uFE20e\u0304\uFE21": "\u0464"
+    "I\uFE20E\uFE21\u0328": "\u0468"
+    "I\uFE20E\u0328\uFE21": "\u0468"
+    "I\uFE20e\uFE21\u0328": "\u0468"
+    "I\uFE20e\u0328\uFE21": "\u0468"
+    "i\uFE20e\uFE21\u0304": "\u0465"
+    "i\uFE20e\u0304\uFE21": "\u0465"
+    "i\uFE20E\uFE21\u0304": "\u0465"
+    "i\uFE20E\u0304\uFE21": "\u0465"
+    "i\uFE20e\uFE21\u0328": "\u0469"
+    "i\uFE20e\u0328\uFE21": "\u0469"
+    "i\uFE20E\uFE21\u0328": "\u0469"
+    "i\uFE20E\u0328\uFE21": "\u0469"
+    "I\uFE20E\uFE21": "\u0462"
+    "I\uFE20e\uFE21": "\u0462"
+    "i\uFE20e\uFE21": "\u0463"
+    "i\uFE20E\uFE21": "\u0463"
+
+    # CONVERSION OF "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), OGONEK (0328), & CARON (030C)
+    "E\u030C": "\u0462"
+    "E\u0304": "\u0404"
+    "E\u0307": "\u042D"
+    "E\u0308": "\u0401"
+    "E\u0328": "\u0466"
+    "e\u030C": "\u0463"
+    "e\u0304": "\u0454"
+    "e\u0307": "\u044D"
+    "e\u0308": "\u0451"
+    "e\u0328": "\u0467"
+
+    "ZH": "\u0416"
+    "Zh": "\u0416"
+    "zH": "\u0436"
+    "zh": "\u0436"
+
+    # CONVERSION OF "I/i" LIGATED TO "O/o" WITH MACRON (0304) AND OGONEK (0328)
+    "I\uFE20O\uFE21\u0328": "\u046C"
+    "I\uFE20O\u0328\uFE21": "\u046C"
+    "I\uFE20o\uFE21\u0328": "\u046C"
+    "I\uFE20o\u0328\uFE21": "\u046C"
+    "i\uFE20o\uFE21\u0328": "\u046D"
+    "i\uFE20o\u0328\uFE21": "\u046D"
+    "i\uFE20O\uFE21\u0328": "\u046D"
+    "i\uFE20O\u0328\uFE21": "\u046D"
+
+    # CONVERSION OF "I/i" LIGATED TO "U/u"
+    "i\uFE20U\uFE21": "\u044E"
+
+    # CONVERSION OF "I/i" WITH MACRON (0304) AND BREVE (0306)
+    "I\u0304": "\u0406"
+    "i\u0304": "\u0456"
+
+    # CONVERSION OF REMAINING LONE "I/i"
+    "I": "\u0418"
+    "i": "\u0438"
+
+    "kH": "\u0445"
+
+    "K\uFE20S\uFE21": "\u046E"
+    "K\uFE20s\uFE21": "\u046E"
+    "k\uFE20s\uFE21": "\u046F"
+    "k\uFE20S\uFE21": "\u046F"
+
+    # CONVERSION OF "O/o" WITH OR WITHOUT MACRON (0304), LIGATED TO "T/t"
+    "O\u0304\uFE20T\uFE21": "\u047E"
+    "O\u0304\uFE20t\uFE21": "\u047E"
+    "O\uFE20\u0304T\uFE21": "\u047E"
+    "O\uFE20\u0304t\uFE21": "\u047E"
+    "O\uFE20T\uFE21": "\u047E"
+    "O\uFE20t\uFE21": "\u047E"
+    "o\u0304\uFE20t\uFE21": "\u047F"
+    "o\u0304\uFE20T\uFE21": "\u047F"
+    "o\uFE20\u0304t\uFE21": "\u047F"
+    "o\uFE20\u0304T\uFE21": "\u047F"
+    "o\uFE20t\uFE21": "\u047F"
+    "o\uFE20T\uFE21": "\u047F"
+
+    # CONVERSION OF "O/o" WITH MACRON(0304) AND OGONEK (0328)
+    "O\u0328": "\u046A"
+    "o\u0328": "\u046B"
+    "O\u0304": "\u0460"
+    "o\u0304": "\u0461"
+
+    "P\uFE20S\uFE21": "\u0470"
+    "P\uFE20s\uFE21": "\u0470"
+    "p\uFE20s\uFE21": "\u0471"
+    "p\uFE20S\uFE21": "\u0471"
+
+    "SHT": "\u0429"
+    "SHt": "\u0429"
+    "Sht": "\u0429"
+    "sHT": "\u0449"
+    "shT": "\u0449"
+    "sht": "\u0449"
+
+    "sH": "\u0448"
+
+    "T\uFE20S\uFE21": "\u0426"
+    "T\uFE20s\uFE21": "\u0426"
+    "t\uFE20s\uFE21": "\u0446"
+    "t\uFE20S\uFE21": "\u0446"
+
+    "U\u0304": "\u0478"
+    "u\u0304": "\u0479"
+
+    "F\u0307": "\u0472"
+    "f\u0307": "\u0473"
+
+    "cH": "\u0447"
+
+    "Y\u0307": "\u0476"
+    "y\u0307": "\u0477"
+    "Y": "\u042B"
+    "y": "\u044B"
+
+    # this conversion is ambiguous - \u042C is also theoretically possible
+    "\u0027": "\u044C"
+    # this conversion is ambiguous - \u042A is also theoretically possible
+    "\u02BA": "\u044A"
+
+script_to_roman:
+  map:
+    # CONVERSION TO "I/i" LIGATED TO "A/a"
+    "\u0474": "V\u0307"
+    "\u0475": "v\u0307"
+    "\u0494": "G\u0301"
+    "\u0495": "g\u0301"
+    "\u0413": "G"
+    "\u0433": "g"
+    # CONVERSION TO "I/i" LIGATED TO "E/e" WITH DIACRITICS
+    "\u0464": "I\uFE20E\uFE21\u0304"
+    "\u0468": "I\uFE20E\uFE21\u0328"
+    "\u0465": "i\uFE20e\uFE21\u0304"
+    "\u0469": "i\uFE20e\uFE21\u0328"
+    # CONVERSION TO "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), OGONEK (0328), & CARON (030C)
+    "\u0462": "E\u030C"
+    "\u0404": "E\u0304"
+    "\u042D": "E\u0307"
+    "\u0401": "E\u0308"
+    "\u0466": "E\u0328"
+    "\u0463": "e\u030C"
+    "\u0454": "e\u0304"
+    "\u044D": "e\u0307"
+    "\u0451": "e\u0308"
+    "\u0467": "e\u0328"
+    "\u0416": "Zh"
+    "\u0436": "zh"
+    # CONVERSION TO "I/i" LIGATED TO "O/o" WITH MACRON (0304) AND OGONEK (0328)
+    "\u046C": "I\uFE20O\uFE21\u0328"
+    "\u046D": "i\uFE20o\uFE21\u0328"
+    # CONVERSION TO "I/i" LIGATED TO "U/u"
+    # CONVERSION TO "I/i" WITH MACRON (0304) AND BREVE (0306)
+    "\u0406": "I\u0304"
+    "\u0456": "i\u0304"
+    # CONVERSION TO LONE "I/i"
+    "\u0418": "I"
+    "\u0438": "i"
+    "\u046E": "K\uFE20S\uFE21"
+    "\u046F": "k\uFE20s\uFE21"
+    # CONVERSION TO "O/o" WITH MACRON (0304) LIGATED TO "T/t"
+    "\u047E": "O\uFE20\u0304t\uFE21"
+    "\u047F": "o\uFE20\u0304t\uFE21"
+    # CONVERSION TO "O/o" WITH MACRON(0304) AND OGONEK (0328)
+    "\u046A": "O\u0328"
+    "\u046B": "o\u0328"
+    "\u0460": "O\u0304"
+    "\u0461": "o\u0304"
+    # CONVERSION TO LONE "O/o"
+    "\u0470": "P\uFE20S\uFE21"
+    "\u0471": "p\uFE20s\uFE21"
+    "\u0429": "Sht"
+    "\u0449": "sht"
+    "\u0426": "T\uFE20S\uFE21"
+    "\u0446": "t\uFE20s\uFE21"
+    "\u0478": "U\u0304"
+    "\u0479": "u\u0304"
+    "\u0472": "F\u0307"
+    "\u0473": "f\u0307"
+    "\u0476": "Y\u0307"
+    "\u0477": "y\u0307"
+    # Uppercase hard sign (ambiguously maps to one Latin character)
+    "\u042A": "\u02BA"
+    # Lowercase hard sign (ambiguously maps to one Latin character)
+    "\u044A": "\u02BA"
+    # Uppercase soft sign (ambiguously maps to one Latin character)
+    # Lowercase soft sign (ambiguously maps to one Latin character)
+    "\u042B": "Y"
+    "\u044B": "y"

+ 14 - 1
transliterator/tables/data/index.yml

@@ -1,16 +1,29 @@
 # Map index file.
 #
-# Add all mapping files meant to be used in the app here.
+# Configurations not listed here will not show in the UI drop-down menu or
+# in the `/languages` endpoint, but can still be used in the `trans` endpoint.
 #
 # The entry key is the file name without the `.yml` extension, and the `name`
 # key within the entry is the human-readable label that can be used in a
 # multiple-choice menu.
 
+asian_cyrillic:
+  name: Asian Cyrillic
+  description: >
+    Multi-purpose transliteration for non-Slavic Cyrillic script: Abaza, Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Buryat, Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz, Ingush, Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak, Karelian, Kazakh, Khakass, Khanty, Komi, Komi-Permyak, Koryak, Kumyk, Kyrgyz, Lak, Lapp, Lezghian, Lithuanian, Mansi, Mari, Moldovan, Molodstov, Mongolian, Mordvin, Nanai, Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany, Selkup, Shor, Tabasaran, Tajik, Tat, Tatar, Turkmen, Tuva, Udekhe, Udmurt, Uzbek, Yakut.
 belarusian:
   name: Belarusian
+bulgarian:
+  name: Bulgarian
 chinese:
   name: Chinese (Hanzi)
+church_slavonic:
+  name: Church Slavonic
+greek:
+  name: Greek (classic)
 russian:
   name: Russian
+serbian_macedonian:
+  name: Serbian and Macedonian
 ukrainian:
   name: Ukrainian

+ 82 - 0
transliterator/tables/data/serbian_macedonian.yml

@@ -0,0 +1,82 @@
+general:
+  name: Serbian and Macedonian
+  inherits: _cyrillic_base
+
+roman_to_script:
+  map:
+    "G\u0301": "\u0403"
+    "G": "\u0413"
+    "g\u0301": "\u0453"
+    "g": "\u0433"
+    "\u0110": "\u0402"
+    # this conversion shouldn't be needed, but does no harm
+    "DZ\u030C": "\u040F"
+    # this conversion shouldn't be needed, but does no harm
+    "DZ": "\u0405"
+    "Dz\u030C": "\u040F"
+    "Dz": "\u0405"
+    "\u0111": "\u0452"
+    "dz\u030C": "\u045F"
+    "dz": "\u0455"
+    "Z\u030C": "\u0416"
+    "z\u030C": "\u0436"
+    "z": "\u0437"
+    "I": "\u0418"
+    "i": "\u0438"
+    "J": "\u0408"
+    "j": "\u0458"
+    "K\u0301": "\u040C"
+    "H": "\u0425"
+    "k\u0301": "\u045C"
+    "h": "\u0445"
+    # this conversion shouldn't be needed, but does no harm
+    "LJ": "\u0409"
+    "Lj": "\u0409"
+    "lj": "\u0459"
+    # this conversion shouldn't be needed, but does no harm
+    "NJ": "\u040A"
+    "Nj": "\u040A"
+    "nj": "\u045A"
+    "S\u030C": "\u0428"
+    "s\u030C": "\u0448"
+    "C\u0301": "\u040B"
+    "C\u030C": "\u0427"
+    "C": "\u0426"
+    "c\u0301": "\u045B"
+    "c\u030C": "\u0447"
+    "c": "\u0446"
+
+script_to_roman:
+  map:
+    "\u0403": "G\u0301"
+    "\u0413": "G"
+    "\u0453": "g\u0301"
+    "\u0433": "g"
+    "\u0402": "\u0110"
+    "\u0452": "\u0111"
+    "\u0416": "Z\u030C"
+    "\u0436": "z\u030C"
+    "\u0405": "Dz"
+    "\u0455": "dz"
+    "\u0418": "I"
+    "\u0438": "i"
+    "\u0408": "J"
+    "\u0458": "j"
+    "\u040C": "K\u0301"
+    "\u0425": "H"
+    "\u045C": "k\u0301"
+    "\u0445": "h"
+    "\u0409": "Lj"
+    "\u0459": "lj"
+    "\u040A": "Nj"
+    "\u045A": "nj"
+    "\u0428": "S\u030C"
+    "\u0448": "s\u030C"
+    "\u040B": "C\u0301"
+    "\u0426": "C"
+    "\u045B": "c\u0301"
+    "\u0446": "c"
+    "\u0427": "C\u030C"
+    "\u0447": "c\u030C"
+    "\u040F": "Dz\u030C"
+    "\u045F": "dz\u030C"

+ 2 - 2
transliterator/trans.py

@@ -163,8 +163,8 @@ def transliterate(src, lang, r2s=False):
 
             # No match found. Copy non-mapped character (one at a time).
             logger.info(
-                f"Token {src[ctx.cur]} at position {ctx.cur} is not mapped."
-            )
+                    f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]})"
+                    f"at position {ctx.cur} is not mapped.")
             ctx.dest_ls.append(src[ctx.cur])
             ctx.cur += 1
         else: