---
- general:  # table metadata
- name: Asian Cyrillic
- inherits: _cyrillic_base  # parent table; presumably supplies mappings not listed in this file - TODO confirm
- roman_to_script:  # Latin-to-Cyrillic direction: keys are romanized strings, values are Cyrillic code points
- map:
- # COMMON COMBINING CHARACTERS (always follow a base letter):
- # combining grave U+0300
- # combining acute U+0301
- # combining circumflex U+0302
- # combining macron U+0304
- # combining breve U+0306
- # combining dot above U+0307
- # combining diaeresis U+0308
- # combining ring above U+030A
- # combining double acute U+030B
- # combining caron (hachek) U+030C
- # combining candrabindu U+0310
- # combining dot below U+0323
- # combining comma below U+0326 (Romanian, Latvian, Livonian)
- # combining cedilla U+0327 (French, Turkish, Azeri)
- # combining ogonek (hook) U+0328 (Polish, Lithuanian)
- # combining left ligature U+FE20 (Cyrillic transliteration)
- # combining right ligature U+FE21 (Cyrillic transliteration)
- # soft sign/prime (spacing) U+02B9 (Cyrillic transliteration)
- # hard sign/double prime (spacing) U+02BA (Cyrillic transliteration)
- # ayn (spacing) U+02BB (Semitic and Caucasian languages)
- # alif (spacing) U+02BC (Semitic languages)
- # middle dot (spacing) U+00B7 (Catalan)
- # REGULAR LATIN ALPHABETIC CHARACTERS TO BE CONVERTED
- # CONVERSION OF "I/i" LIGATED (FE20/FE21) TO "A/a" (all capitalization patterns; the case of the first letter decides the case of the result)
- "I\uFE20A\uFE21": "\u042F"
- "I\uFE20a\uFE21": "\u042F"
- "i\uFE20a\uFE21": "\u044F"
- "i\uFE20A\uFE21": "\u044F"
- # CONVERSION OF "A/a" WITH BREVE (0306)
- "A\u0306": "\u04D8"
- "a\u0306": "\u04D9"
- # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "A" WITH BREVE DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u04D8 ABOVE)
- #"A\u0306": "\u04D2"
- # DE-ACTIVATED CONVERSION OF GAGAUZ AND MARI LETTER "a" WITH BREVE DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u04D9 ABOVE)
- #"a\u0306": "\u04D3"
- # REMAINING LONE "A/a" (no entries listed here; presumably handled by the inherited table - TODO confirm)
- # CONVERSION OF "V/v" WITH DOT ABOVE (0307)
- "V\u0307": "\u0474"
- "v\u0307": "\u0475"
- # CONVERSION OF "G/g" BLENDED WITH "H/h" (all capitalization patterns)
- "Gh": "\u0492"
- "GH": "\u0492"
- "gH": "\u0493"
- "gh": "\u0493"
- # DE-ACTIVATED CONVERSION OF YAKUT "GH" DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u0492 ABOVE)
- #"Gh": "\u0494"
- # DE-ACTIVATED CONVERSION OF YAKUT "GH" DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u0492 ABOVE)
- #"GH": "\u0494"
- # DE-ACTIVATED CONVERSION OF YAKUT "gh" DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u0493 ABOVE)
- #"gH": "\u0495"
- # DE-ACTIVATED CONVERSION OF YAKUT "gh" DUE TO CONFLICTING ROMANIZATION (KEY ALREADY MAPPED TO \u0493 ABOVE)
- #"gh": "\u0495"
- # CONVERSION OF "G/g" WITH ACUTE (0301) AND DOT ABOVE (0307)
- "G\u0301": "\u0494"
- "g\u0301": "\u0495"
- "G\u0307": "\u049C"
- "g\u0307": "\u049D"
- # CONVERSION OF REMAINING LONE "G/g"
- "G": "\u0413"
- "g": "\u0433"
- # CONVERSION OF "I/i" LIGATED TO "E/e", SOME WITH MACRON (0304) AND OGONEK (0328); THE DIACRITIC IS ACCEPTED EITHER BEFORE OR AFTER THE RIGHT LIGATURE HALF (FE21)
- "I\uFE20E\uFE21\u0304": "\u0464"
- "I\uFE20E\u0304\uFE21": "\u0464"
- "I\uFE20e\uFE21\u0304": "\u0464"
- "I\uFE20e\u0304\uFE21": "\u0464"
- "I\uFE20E\uFE21\u0328": "\u0468"
- "I\uFE20E\u0328\uFE21": "\u0468"
- "I\uFE20e\uFE21\u0328": "\u0468"
- "I\uFE20e\u0328\uFE21": "\u0468"
- "i\uFE20e\uFE21\u0304": "\u0465"
- "i\uFE20e\u0304\uFE21": "\u0465"
- "i\uFE20E\uFE21\u0304": "\u0465"
- "i\uFE20E\u0304\uFE21": "\u0465"
- "i\uFE20e\uFE21\u0328": "\u0469"
- "i\uFE20e\u0328\uFE21": "\u0469"
- "i\uFE20E\uFE21\u0328": "\u0469"
- "i\uFE20E\u0328\uFE21": "\u0469"
- "I\uFE20E\uFE21": "\u0462"
- "I\uFE20e\uFE21": "\u0462"
- "i\uFE20e\uFE21": "\u0463"
- "i\uFE20E\uFE21": "\u0463"
- # CONVERSION OF "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), OGONEK (0328), & CARON (030C)
- "E\u030C": "\u0462"
- "E\u0304": "\u0404"
- "E\u0307": "\u042D"
- "E\u0308": "\u0401"
- "E\u0328": "\u0466"
- "e\u030C": "\u0463"
- "e\u0304": "\u0454"
- "e\u0307": "\u044D"
- "e\u0308": "\u0451"
- "e\u0328": "\u0467"
- # CONVERSION OF REMAINING LONE "E/e" (no entries listed here; presumably handled by the inherited table - TODO confirm)
- "ZH": "\u0416"  # "Z/z" blended with "H/h" (all capitalization patterns)
- "Zh": "\u0416"
- "zH": "\u0436"
- "zh": "\u0436"
- # CONVERSION OF "T/t" LIGATED OR BLENDED WITH "H/h" (all capitalization patterns)
- "T\uFE20H\uFE21": "\u0498"
- "T\uFE20h\uFE21": "\u0498"
- "t\uFE20H\uFE21": "\u0499"
- "t\uFE20h\uFE21": "\u0499"
- "Th": "\u04AA"
- "TH": "\u04AA"
- "tH": "\u04AB"
- "th": "\u04AB"
- # CONVERSION OF "I/i" LIGATED TO "O/o" WITH OGONEK (0328); THE OGONEK IS ACCEPTED EITHER BEFORE OR AFTER THE RIGHT LIGATURE HALF (FE21)
- "I\uFE20O\uFE21\u0328": "\u046C"
- "I\uFE20O\u0328\uFE21": "\u046C"
- "I\uFE20o\uFE21\u0328": "\u046C"
- "I\uFE20o\u0328\uFE21": "\u046C"
- "i\uFE20o\uFE21\u0328": "\u046D"
- "i\uFE20o\u0328\uFE21": "\u046D"
- "i\uFE20O\uFE21\u0328": "\u046D"
- "i\uFE20O\u0328\uFE21": "\u046D"
- # CONVERSION OF "I/i" LIGATED TO "U/u"
- "I\uFE20U\uFE21": "\u042E"
- "I\uFE20u\uFE21": "\u042E"
- "i\uFE20u\uFE21": "\u044E"
- "i\uFE20U\uFE21": "\u044E"
- # CONVERSION OF "I/i" WITH MACRON (0304), BREVE (0306), AND CANDRABINDU (0310)
- "I\u0304": "\u0406"
- "I\u0306": "\u0419"
- "I\u0310": "\u0408"
- "i\u0304": "\u0456"
- "i\u0306": "\u0439"
- "i\u0310": "\u0458"
- # CONVERSION OF REMAINING LONE "I/i"
- "I": "\u0418"
- "i": "\u0438"
- "J": "\u0496"  # active "J/j" mapping; the alternatives that follow are de-activated
- "j": "\u0497"
- # DE-ACTIVATED CONVERSION OF AZERI "J" DUE TO CONFLICTING ROMANIZATION ("J" IS ALREADY MAPPED TO \u0496)
- #"J": "\u04B8"
- # DE-ACTIVATED CONVERSION OF AZERI "j" DUE TO CONFLICTING ROMANIZATION ("j" IS ALREADY MAPPED TO \u0497)
- #"j": "\u04B9"
- # DE-ACTIVATED CONVERSION OF TAJIK "J" DUE TO CONFLICTING ROMANIZATION ("J" IS ALREADY MAPPED TO \u0496)
- #"J": "\u04B6"
- # DE-ACTIVATED CONVERSION OF TAJIK "j" DUE TO CONFLICTING ROMANIZATION ("j" IS ALREADY MAPPED TO \u0497)
- #"j": "\u04B7"
- "K\uFE20S\uFE21": "\u046E"  # "K/k" ligated to "S/s" (all capitalization patterns)
- "K\uFE20s\uFE21": "\u046E"
- "k\uFE20s\uFE21": "\u046F"
- "k\uFE20S\uFE21": "\u046F"
- "Q": "\u04A0"
- "q": "\u04A1"
- # DE-ACTIVATED CONVERSION OF KHANTY "Q" DUE TO CONFLICTING ROMANIZATION ("Q" IS ALREADY MAPPED TO \u04A0)
- #"Q": "\u04C3"
- # DE-ACTIVATED CONVERSION OF KHANTY "q" DUE TO CONFLICTING ROMANIZATION ("q" IS ALREADY MAPPED TO \u04A1)
- #"q": "\u04C4"
- "N\uFE20G\uFE21": "\u04A2"  # "N/n" ligated to "G/g" (all capitalization patterns)
- "N\uFE20g\uFE21": "\u04A2"
- "n\uFE20G\uFE21": "\u04A3"
- "n\uFE20g\uFE21": "\u04A3"
- # DE-ACTIVATED CONVERSION OF YAKUT "NG/ng" DUE TO CONFLICTING ROMANIZATION (KEYS ALREADY MAPPED TO \u04A2/\u04A3 ABOVE)
- #"N\uFE20G\uFE21": "\u04A4"
- #"N\uFE20g\uFE21": "\u04A4"
- #"n\uFE20G\uFE21": "\u04A5"
- #"n\uFE20g\uFE21": "\u04A5"
- # DE-ACTIVATED CONVERSION OF CHUKCHI AND EVENKI "NG/ng" DUE TO CONFLICTING ROMANIZATION (KEYS ALREADY MAPPED TO \u04A2/\u04A3 ABOVE)
- #"N\uFE20G\uFE21": "\u04C7"
- #"N\uFE20g\uFE21": "\u04C7"
- #"n\uFE20G\uFE21": "\u04C8"
- #"n\uFE20g\uFE21": "\u04C8"
- # CONVERSION OF "O/o" WITH OR WITHOUT MACRON (0304), LIGATED TO "T/t"; THE MACRON IS ACCEPTED EITHER BEFORE OR AFTER THE LEFT LIGATURE HALF (FE20)
- "O\u0304\uFE20T\uFE21": "\u047E"
- "O\u0304\uFE20t\uFE21": "\u047E"
- "O\uFE20\u0304T\uFE21": "\u047E"
- "O\uFE20\u0304t\uFE21": "\u047E"
- "O\uFE20T\uFE21": "\u047E"
- "O\uFE20t\uFE21": "\u047E"
- "o\u0304\uFE20t\uFE21": "\u047F"
- "o\u0304\uFE20T\uFE21": "\u047F"
- "o\uFE20\u0304t\uFE21": "\u047F"
- "o\uFE20\u0304T\uFE21": "\u047F"
- "o\uFE20t\uFE21": "\u047F"
- "o\uFE20T\uFE21": "\u047F"
- # CONVERSION OF "O/o" WITH MACRON (0304)
- "O\u0304": "\u04EA"
- "o\u0304": "\u04EB"
- # CONVERSION OF "O/o" WITH DOT ABOVE (0307) USED IN MOST CENTRAL ASIAN LANGUAGES
- "O\u0307": "\u04E8"
- "o\u0307": "\u04E9"
- # DE-ACTIVATED CONVERSION OF GAGAUZ, KOMI, AND MARI "O" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION (KEYS ALREADY MAPPED TO \u04E8/\u04E9 ABOVE)
- #"O\u0307": "\u04E6"
- #"o\u0307": "\u04E7"
- # CONVERSION OF REMAINING LONE "O/o" (no entries listed here; presumably handled by the inherited table - TODO confirm)
- "P\uFE20S\uFE21": "\u0470"  # "P/p" ligated to "S/s" (all capitalization patterns)
- "P\uFE20s\uFE21": "\u0470"
- "p\uFE20s\uFE21": "\u0471"
- "p\uFE20S\uFE21": "\u0471"
- "SHCH": "\u0429"  # "SHCH/shch" blends; only the capitalization patterns listed here are covered in this file
- "SHCh": "\u0429"
- "SHch": "\u0429"
- "Shch": "\u0429"
- "sHCH": "\u0449"
- "shCH": "\u0449"
- "shcH": "\u0449"
- "shch": "\u0449"
- "sH": "\u0448"
- "T\uFE20S\uFE21\u0307": "\u04B4"  # "T/t" ligated to "S/s" with dot above (0307), dot before or after FE21
- "T\uFE20S\u0307\uFE21": "\u04B4"
- "T\uFE20s\uFE21\u0307": "\u04B4"
- "T\uFE20s\u0307\uFE21": "\u04B4"
- "t\uFE20S\uFE21\u0307": "\u04B5"
- "t\uFE20S\u0307\uFE21": "\u04B5"
- "t\uFE20s\uFE21\u0307": "\u04B5"
- "t\uFE20s\u0307\uFE21": "\u04B5"
- "T\uFE20S\uFE21": "\u0426"
- "T\uFE20s\uFE21": "\u0426"
- "t\uFE20s\uFE21": "\u0446"
- "t\uFE20S\uFE21": "\u0446"
- # CONVERSION OF "U/u" WITH MACRON (0304), BREVE (0306), AND DOT ABOVE (0307)
- "U\u0304": "\u04B0"
- "u\u0304": "\u04B1"
- # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION ("U" WITH MACRON IS ALREADY MAPPED TO \u04B0)
- #"U\u0304": "\u04EE"
- # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION ("u" WITH MACRON IS ALREADY MAPPED TO \u04B1)
- #"u\u0304": "\u04EF"
- # LOWERCASE SHORT U IS \u045E; \u0454 IS ALREADY THE TARGET OF "e" WITH MACRON AND MUST NOT BE REUSED HERE
- "U\u0306": "\u040E"
- "u\u0306": "\u045E"
- "U\u0307": "\u04AE"
- "u\u0307": "\u04AF"
- # DE-ACTIVATED CONVERSION OF "U/u" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
- # NOTE(review): these targets are the same code points as the de-activated "O/o" WITH DOT ABOVE entries; verify before re-activating
- #"U\u0307": "\u04E6"
- #"u\u0307": "\u04E7"
- # CONVERSION OF ESKIMO AND KARAKALPAK "W/w" THAT MAPS TO THE SAME CHARACTERS AS "U/u" WITH BREVE
- "W": "\u040E"
- "w": "\u045E"
- # CONVERSION OF "F/f" WITH DOT ABOVE (0307)
- "F\u0307": "\u0472"
- "f\u0307": "\u0473"
- "cH": "\u0447"
- # CONVERSION OF "H/h" WITH DOT ABOVE (0307). ORIGINAL NOTE: CYRILLIC PALOCHKA (ASPIRATION SIGN) USED IN MANY CENTRAL ASIAN LANGUAGES (NOT NORMALLY INITIALLY). NOTE(review): \u04BA/\u04BB look like SHHA rather than palochka (\u04C0) - confirm intent
- "H\u0307": "\u04BA"
- "h\u0307": "\u04BB"
- # DE-ACTIVATED CONVERSION OF TAJIK AND UZBEK LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION (KEYS ALREADY MAPPED TO \u04BA/\u04BB ABOVE)
- #"H\u0307": "\u04B2"
- #"h\u0307": "\u04B3"
- # DE-ACTIVATED CONVERSION OF ARCHAIC LETTER "H/h" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION (KEYS ALREADY MAPPED TO \u04BA/\u04BB ABOVE)
- #"H\u0307": "\u04FC"
- #"h\u0307": "\u04FD"
- "Y\u0307": "\u04F8"  # "Y/y" with dot above (0307)
- "y\u0307": "\u04F9"
- "Y": "\u042B"
- "y": "\u044B"
- "\u0027": "\u044C"  # ASCII apostrophe (U+0027) used as the key
- # this conversion is ambiguous - \u044C is also theoretically possible
- "\u02BA": "\u044A"
- script_to_roman:  # Cyrillic-to-Latin direction: keys are Cyrillic code points, values are romanized strings
- map:
- "\u044F": "i\uFE20a\uFE21"
- "\u04D8": "A\u0306"  # "A/a" with breve (0306)
- "\u04D9": "a\u0306"
- # GAGAUZ AND MARI LETTER ROMANIZED AS "A" WITH BREVE (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04D2": "A\u0306"
- # GAGAUZ AND MARI LETTER ROMANIZED AS "a" WITH BREVE (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04D3": "a\u0306"
- "\u0474": "V\u0307"
- "\u0475": "v\u0307"
- "\u0492": "Gh"
- "\u0493": "gh"
- # DE-ACTIVATED CONVERSION OF YAKUT "GH/gh": THESE KEYS DUPLICATED THE "G/g" WITH ACUTE (0301) ENTRIES BELOW,
- # WHICH ALREADY TOOK PRECEDENCE IN LOADERS THAT KEEP THE LAST DUPLICATE KEY
- #"\u0494": "Gh"
- #"\u0495": "gh"
- "\u0494": "G\u0301"
- "\u0495": "g\u0301"
- "\u049C": "G\u0307"
- "\u049D": "g\u0307"
- "\u0413": "G"
- "\u0433": "g"
- # CONVERSION TO "I/i" LIGATED TO "E/e", SOME WITH MACRON (0304) AND OGONEK (0328); THE DIACRITIC IS EMITTED AFTER THE RIGHT LIGATURE HALF (FE21)
- "\u0464": "I\uFE20E\uFE21\u0304"
- "\u0468": "I\uFE20E\uFE21\u0328"
- "\u0465": "i\uFE20e\uFE21\u0304"
- "\u0469": "i\uFE20e\uFE21\u0328"
- "\u0462": "I\uFE20E\uFE21"
- "\u0463": "i\uFE20e\uFE21"
- # CONVERSION TO "E/e" WITH MACRON (0304), DOT ABOVE (0307), DIAERESIS (0308), AND OGONEK (0328)
- "\u0404": "E\u0304"
- "\u042D": "E\u0307"
- "\u0401": "E\u0308"
- "\u0466": "E\u0328"
- "\u0454": "e\u0304"
- "\u044D": "e\u0307"
- "\u0451": "e\u0308"
- "\u0467": "e\u0328"
- "\u0416": "Zh"
- "\u0436": "zh"
- # CONVERSION TO "T/t" LIGATED OR BLENDED WITH "H/h"
- "\u0498": "T\uFE20H\uFE21"
- "\u0499": "t\uFE20h\uFE21"
- "\u04AA": "Th"
- "\u04AB": "th"
- # CONVERSION TO "I/i" LIGATED TO "O/o" WITH OGONEK (0328)
- "\u046C": "I\uFE20O\uFE21\u0328"
- "\u046D": "i\uFE20o\uFE21\u0328"
- # CONVERSION TO "I/i" LIGATED TO "U/u"
- "\u044E": "i\uFE20u\uFE21"
- # CONVERSION TO "I/i" WITH MACRON (0304) AND CANDRABINDU (0310)
- "\u0406": "I\u0304"
- "\u0408": "I\u0310"
- "\u0456": "i\u0304"
- "\u0458": "i\u0310"
- # CONVERSION TO REMAINING LONE "I/i"
- "\u0418": "I"
- "\u0438": "i"
- "\u0496": "J"
- "\u0497": "j"
- # DE-ACTIVATED CONVERSION OF AZERI "J" DUE TO CONFLICTING ROMANIZATION
- # (the whole entry is commented out; commenting out only the value left the key mapped to null)
- #"\u04B8": "J"
- # DE-ACTIVATED CONVERSION OF AZERI "j" DUE TO CONFLICTING ROMANIZATION
- #"\u04B9": "j"
- # DE-ACTIVATED CONVERSION OF TAJIK "J" DUE TO CONFLICTING ROMANIZATION
- #"\u04B6": "J"
- # DE-ACTIVATED CONVERSION OF TAJIK "j" DUE TO CONFLICTING ROMANIZATION
- #"\u04B7": "j"
- "\u0445": "kh"
- "\u046E": "K\uFE20S\uFE21"
- "\u046F": "k\uFE20s\uFE21"
- "\u04A0": "Q"
- "\u04A1": "q"
- # KHANTY LETTER ROMANIZED AS "Q" (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04C3": "Q"
- # KHANTY LETTER ROMANIZED AS "q" (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04C4": "q"
- "\u04A2": "N\uFE20G\uFE21"
- "\u04A3": "n\uFE20g\uFE21"
- # DE-ACTIVATED CONVERSION OF YAKUT "NG/ng" DUE TO CONFLICTING ROMANIZATION
- #"\u04A4": "N\uFE20G\uFE21"
- #"\u04A5": "n\uFE20g\uFE21"
- # DE-ACTIVATED CONVERSION OF CHUKCHI AND EVENKI "NG/ng" DUE TO CONFLICTING ROMANIZATION
- #"\u04C7": "N\uFE20G\uFE21"
- #"\u04C8": "n\uFE20g\uFE21"
- # CONVERSION TO "O/o" WITH MACRON (0304), LIGATED TO "T/t"
- "\u047E": "O\u0304\uFE20T\uFE21"
- "\u047F": "o\u0304\uFE20t\uFE21"
- # CONVERSION TO "O/o" WITH MACRON (0304)
- "\u04EA": "O\u0304"
- "\u04EB": "o\u0304"
- # CONVERSION TO "O/o" WITH DOT ABOVE (0307) USED IN MOST CENTRAL ASIAN LANGUAGES
- "\u04E8": "O\u0307"
- "\u04E9": "o\u0307"
- # DE-ACTIVATED CONVERSION OF GAGAUZ, KOMI, AND MARI "O" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
- # (the whole entry is commented out; commenting out only the value left the key mapped to null)
- #"\u04E6": "O\u0307"
- #"\u04E7": "o\u0307"
- # CONVERSION OF REMAINING LONE "O/o" (no entries listed here; presumably handled by the inherited table - TODO confirm)
- "\u0470": "P\uFE20S\uFE21"
- "\u0471": "p\uFE20s\uFE21"
- "\u04B4": "T\uFE20S\uFE21\u0307"
- "\u04B5": "t\uFE20s\uFE21\u0307"
- "\u0426": "T\uFE20S\uFE21"
- "\u0446": "t\uFE20s\uFE21"
- # CONVERSION TO "U/u" WITH MACRON (0304), BREVE (0306), AND DOT ABOVE (0307)
- "\u04B0": "U\u0304"
- "\u04B1": "u\u0304"
- # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
- # (the whole entry is commented out; commenting out only the value left the key mapped to null)
- #"\u04EE": "U\u0304"
- # DE-ACTIVATED CONVERSION OF TAJIK LETTER DUE TO CONFLICTING ROMANIZATION
- #"\u04EF": "u\u0304"
- # DE-ACTIVATED "U/u" WITH BREVE ROMANIZATION OF SHORT U: THESE KEYS DUPLICATED THE "W/w" ENTRIES BELOW,
- # WHICH ALREADY TOOK PRECEDENCE IN LOADERS THAT KEEP THE LAST DUPLICATE KEY.
- # NOTE(review): swap the active pair if "U/u" with breve is the preferred output.
- # LOWERCASE SHORT U IS \u045E; \u0454 BELONGS TO "e" WITH MACRON ELSEWHERE IN THIS MAP.
- #"\u040E": "U\u0306"
- #"\u045E": "u\u0306"
- "\u04AE": "U\u0307"
- "\u04AF": "u\u0307"
- # DE-ACTIVATED CONVERSION TO "U/u" WITH DOT ABOVE (0307) DUE TO CONFLICTING ROMANIZATION
- # NOTE(review): \u04E6/\u04E7 also appear in the de-activated "O/o" WITH DOT ABOVE entries; verify the code points before re-activating
- #"\u04E6": "U\u0307"
- #"\u04E7": "u\u0307"
- # CONVERSION TO ESKIMO AND KARAKALPAK "W/w", WHICH SHARES ITS CYRILLIC LETTERS WITH "U/u" WITH BREVE
- "\u040E": "W"
- "\u045E": "w"
- "\u0472": "F\u0307"
- "\u0473": "f\u0307"
- "\u0444": "f"
- "\u0427": "Ch"
- # CONVERSION TO "H/h" WITH DOT ABOVE (0307). ORIGINAL NOTE: CYRILLIC PALOCHKA (ASPIRATION SIGN) USED IN MANY CENTRAL ASIAN LANGUAGES (NOT NORMALLY INITIALLY). NOTE(review): \u04BA/\u04BB look like SHHA rather than palochka (\u04C0) - confirm intent
- "\u04BA": "H\u0307"
- "\u04BB": "h\u0307"
- # TAJIK AND UZBEK LETTER ROMANIZED AS "H/h" WITH DOT ABOVE (0307) (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04B2": "H\u0307"
- "\u04B3": "h\u0307"
- # ARCHAIC LETTER ROMANIZED AS "H/h" WITH DOT ABOVE (0307) (ACTIVE IN THIS DIRECTION; ONLY THE roman_to_script COUNTERPART IS DE-ACTIVATED)
- "\u04FC": "H\u0307"
- "\u04FD": "h\u0307"
- "\u04F8": "Y\u0307"  # "Y/y" with dot above (0307)
- "\u04F9": "y\u0307"
- "\u042B": "Y"
- "\u044B": "y"
- # this conversion is ambiguous - \u044C is also theoretically possible
- "\u044A": "\u02BA"
...