_cyrillic_base.yml 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. general:
  2. name: Cyrillic base
  3. notes: >
  4. copied from Russian .cfg file and stripped
  5. of language-specific tokens. Russian ignore list
  6. has been left here on purpose, assuming it's valid
  7. for all child languages.
  8. roman_to_script:
  9. ignore:
  10. - "at head of title"
  11. - "colophon"
  12. - "date of publication not identified"
  13. - "place of publication not identified"
  14. - "publisher not identified"
  15. # NOTE There is ambiguity about ignoring these
  16. # words. Single-character Roman numerals are
  17. # intentionally left out of this list.
  18. # Ideally the source editors should use the
  19. # dedicated U+2160..U+216F (uppercase Roman
  20. # numerals) and/or U+2170..U+217F (lowercase Roman
  21. # numerals) ranges to avoid this ambiguity.
  22. # TODO implement regular expressions for ignore patterns.
  23. #- re: "I{2,3}"
  24. #- re: "I(V|X)"
  25. #- re: "LI{,3}"
  26. #- re: "LI?(V|X)"
  27. #- re: "L(V|X{1,3})I{,3}"
  28. #- re: "LX{1,3}I?V"
  29. #- re: "LX{1,3}VI{,3}"
  30. #- re: "(V|X{1,3})I{,3}"
  31. #- re: "X{1,3}I{,3}"
  32. #- re: "X{1,3}I(V|X)"
  33. #- re: "X{1,3}VI{,3}"
  34. - "II"
  35. - "III"
  36. - "IV"
  37. - "IX"
  38. - "LI"
  39. - "LII"
  40. - "LIII"
  41. - "LIV"
  42. - "LIX"
  43. - "LV"
  44. - "LVI"
  45. - "LVII"
  46. - "LVIII"
  47. - "LX"
  48. - "LXI"
  49. - "LXII"
  50. - "LXIII"
  51. - "LXIV"
  52. - "LXIX"
  53. - "LXV"
  54. - "LXVI"
  55. - "LXVII"
  56. - "LXVIII"
  57. - "LXX"
  58. - "LXXI"
  59. - "LXXII"
  60. - "LXXIII"
  61. - "LXXIV"
  62. - "LXXIX"
  63. - "LXXV"
  64. - "LXXVI"
  65. - "LXXVII"
  66. - "LXXVIII"
  67. - "LXXX"
  68. - "LXXXI"
  69. - "LXXXII"
  70. - "LXXXIII"
  71. - "LXXXIV"
  72. - "LXXXIX"
  73. - "LXXXV"
  74. - "LXXXVI"
  75. - "LXXXVII"
  76. - "LXXXVIII"
  77. - "VI"
  78. - "VII"
  79. - "VIII"
  80. - "XI"
  81. - "XII"
  82. - "XIII"
  83. - "XIV"
  84. - "XIX"
  85. - "XL"
  86. - "XLI"
  87. - "XLII"
  88. - "XLIII"
  89. - "XLIV"
  90. - "XLIX"
  91. - "XLV"
  92. - "XLVI"
  93. - "XLVII"
  94. - "XLVIII"
  95. - "XV"
  96. - "XVI"
  97. - "XVII"
  98. - "XVIII"
  99. - "XX"
  100. - "XXI"
  101. - "XXII"
  102. - "XXIII"
  103. - "XXIV"
  104. - "XXIX"
  105. - "XXV"
  106. - "XXVI"
  107. - "XXVII"
  108. - "XXVIII"
  109. - "XXX"
  110. - "XXXI"
  111. - "XXXII"
  112. - "XXXIII"
  113. - "XXXIV"
  114. - "XXXIX"
  115. - "XXXV"
  116. - "XXXVI"
  117. - "XXXVII"
  118. - "XXXVIII"
  119. - "and one other"
  120. #- re: "and ([a-z]+ )?others"
  121. - "et al."
  122. map:
  123. "A": "\u0410"
  124. "a": "\u0430"
  125. "B": "\u0411"
  126. "b": "\u0431"
  127. "V": "\u0412"
  128. "v": "\u0432"
  129. "D": "\u0414"
  130. "d": "\u0434"
  131. "E": "\u0415"
  132. "e": "\u0435"
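  133. # NOTE(review): a "shouldn't be needed, but does no harm" comment stood here; it looks orphaned (its entry was likely stripped from this base table) and does not describe the Z mapping below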
  134. "Z": "\u0417"
  135. "z": "\u0437"
  136. "I\u0306": "\u0419"
  137. # this conversion shouldn't be needed, but does no harm
  138. "I\uFE20U\uFE21": "\u042E"
  139. # this conversion shouldn't be needed, but does no harm
  140. "I\uFE20u\uFE21": "\u042E"
  141. "I\uFE20A\uFE21": "\u042F"
  142. # this conversion shouldn't be needed, but does no harm
  143. "I\uFE20a\uFE21": "\u042F"
  144. "i\u0306": "\u0439"
  145. "i\uFE20u\uFE21": "\u044E"
  146. "i\uFE20a\uFE21": "\u044F"
  147. # this conversion shouldn't be needed, but does no harm
  148. "KH": "\u0425"
  149. "Kh": "\u0425"
  150. "K": "\u041A"
  151. "kh": "\u0445"
  152. "k": "\u043A"
  153. "L": "\u041B"
  154. "l": "\u043B"
  155. "M": "\u041C"
  156. "m": "\u043C"
  157. "N": "\u041D"
  158. "n": "\u043D"
  159. "O": "\u041E"
  160. "o": "\u043E"
  161. "P": "\u041F"
  162. "p": "\u043F"
  163. "R": "\u0420"
  164. "r": "\u0440"
  165. # this conversion (presumably the all-caps digraph below)
  166. # shouldn't be needed, but does no harm
  167. "SH": "\u0428"
  168. "Sh": "\u0428"
  169. "S": "\u0421"
  170. "sh": "\u0448"
  171. "s": "\u0441"
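  172. # NOTE(review): a "shouldn't be needed, but does no harm" comment stood here; it looks orphaned (its entry was likely stripped from this base table) and does not describe the T mapping below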
  173. "T": "\u0422"
  174. "t": "\u0442"
  175. "U": "\u0423"
  176. "u": "\u0443"
  177. "F": "\u0424"
  178. "f": "\u0444"
  179. # this conversion shouldn't be needed, but does no harm
  180. "CH": "\u0427"
  181. "Ch": "\u0427"
  182. "ch": "\u0447"
  183. # this conversion shouldn't be needed, but does no harm
  184. "\uFE20": ""
  185. # this conversion shouldn't be needed, but does no harm
  186. "\uFE21": ""
  187. # this conversion is ambiguous: the uppercase soft sign \u042C is also theoretically possible
  188. "\u02B9": "\u044C"
  189. script_to_roman:
  190. map:
  191. "\u0404": "I\uFE20E\uFE21"
  192. "\u0407": "I\u0308"
  193. "\u0410": "A"
  194. "\u0411": "B"
  195. "\u0412": "V"
  196. "\u0414": "D"
  197. "\u0415": "E"
  198. "\u0417": "Z"
  199. "\u0419": "I\u0306"
  200. "\u041A": "K"
  201. "\u041B": "L"
  202. "\u041C": "M"
  203. "\u041D": "N"
  204. "\u041E": "O"
  205. "\u041F": "P"
  206. "\u0420": "R"
  207. "\u0421": "S"
  208. "\u0422": "T"
  209. "\u0423": "U"
  210. "\u0424": "F"
  211. "\u0425": "Kh"
  212. "\u0427": "Ch"
  213. "\u0428": "Sh"
  214. "\u0429": "Shch"
  215. "\u042C": "\u02B9"
  216. "\u042E": "I\uFE20U\uFE21"
  217. "\u042F": "I\uFE20A\uFE21"
  218. "\u0430": "a"
  219. "\u0431": "b"
  220. "\u0432": "v"
  221. "\u0434": "d"
  222. "\u0435": "e"
  223. "\u0437": "z"
  224. "\u0439": "i\u0306"
  225. "\u043A": "k"
  226. "\u043B": "l"
  227. "\u043C": "m"
  228. "\u043D": "n"
  229. "\u043E": "o"
  230. "\u043F": "p"
  231. "\u0440": "r"
  232. "\u0441": "s"
  233. "\u0442": "t"
  234. "\u0443": "u"
  235. "\u0444": "f"
  236. "\u0445": "kh"
  237. "\u0447": "ch"
  238. "\u0448": "sh"
  239. "\u0449": "shch"
  240. "\u044C": "\u02B9"
  241. "\u044E": "i\uFE20u\uFE21"
  242. "\u044F": "i\uFE20a\uFE21"