Browse Source

Add Cyrillic table and language overrides.

Stefano Cossu 2 years ago
parent
commit
2c1cf9d1a2

+ 268 - 0
transliterator/tables/data/_cyrillic_base.yml

@@ -0,0 +1,268 @@
+general:  # table metadata; language tables point here via `inherits: _cyrillic_base`
+  name: Cyrillic base
+  notes: copied from Ukrainian .cfg file.
+
+roman_to_script:
+  ignore:  # Latin-script strings; presumably left untransliterated by the loader - TODO confirm
+    - "At head of title"
+    - "Colophon"
+    - "II"  # multi-letter Roman numerals II-LXXXIX follow; single-letter I, V, X, L are not listed
+    - "III"
+    - "IV"
+    - "IX"
+    - "LI"
+    - "LII"
+    - "LIII"
+    - "LIV"
+    - "LIX"
+    - "LV"
+    - "LVI"
+    - "LVII"
+    - "LVIII"
+    - "LX"
+    - "LXI"
+    - "LXII"
+    - "LXIII"
+    - "LXIV"
+    - "LXIX"
+    - "LXV"
+    - "LXVI"
+    - "LXVII"
+    - "LXVIII"
+    - "LXX"
+    - "LXXI"
+    - "LXXII"
+    - "LXXIII"
+    - "LXXIV"
+    - "LXXIX"
+    - "LXXV"
+    - "LXXVI"
+    - "LXXVII"
+    - "LXXVIII"
+    - "LXXX"
+    - "LXXXI"
+    - "LXXXII"
+    - "LXXXIII"
+    - "LXXXIV"
+    - "LXXXIX"
+    - "LXXXV"
+    - "LXXXVI"
+    - "LXXXVII"
+    - "LXXXVIII"
+    - "VI"
+    - "VII"
+    - "VIII"
+    - "XI"
+    - "XII"
+    - "XIII"
+    - "XIV"
+    - "XIX"
+    - "XL"
+    - "XLI"
+    - "XLII"
+    - "XLIII"
+    - "XLIV"
+    - "XLIX"
+    - "XLV"
+    - "XLVI"
+    - "XLVII"
+    - "XLVIII"
+    - "XV"
+    - "XVI"
+    - "XVII"
+    - "XVIII"
+    - "XX"
+    - "XXI"
+    - "XXII"
+    - "XXIII"
+    - "XXIV"
+    - "XXIX"
+    - "XXV"
+    - "XXVI"
+    - "XXVII"
+    - "XXVIII"
+    - "XXX"
+    - "XXXI"
+    - "XXXII"
+    - "XXXIII"
+    - "XXXIV"
+    - "XXXIX"
+    - "XXXV"
+    - "XXXVI"
+    - "XXXVII"
+    - "XXXVIII"
+    - "and one other"  # "and N others" phrases, spelled out from one through nineteen
+    - "and two others"
+    - "and three others"
+    - "and four others"
+    - "and five others"
+    - "and six others"
+    - "and seven others"
+    - "and eight others"
+    - "and nine others"
+    - "and ten others"
+    - "and eleven others"
+    - "and twelve others"
+    - "and thirteen others"
+    - "and fourteen others"
+    - "and fifteen others"
+    - "and sixteen others"
+    - "and seventeen others"
+    - "and eighteen others"
+    - "and nineteen others"
+    - "and others"
+    - "et al."
+    - "date of publication not identified"  # "... not identified" phrases; note the mixed capitalization
+    - "Place of publication not identified"
+    - "publisher not identified"
+
+  map:  # Latin token -> Cyrillic; keys and values are double-quoted so the \u escapes are decoded by YAML
+    "A": "\u0410"
+    "a": "\u0430"
+    "B": "\u0411"
+    "b": "\u0431"
+    "V": "\u0412"
+    "v": "\u0432"
+    "H": "\u0413"
+    "h": "\u0433"
+    "G": "\u0490"
+    "g": "\u0491"
+    "D": "\u0414"
+    "d": "\u0434"
+    "E": "\u0415"
+    "e": "\u0435"
+    "Z\uFE20H\uFE21": "\u0416"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "Z\uFE20h\uFE21": "\u0416"
+    "Z": "\u0417"
+    "z\uFE20h\uFE21": "\u0436"
+    "z": "\u0437"
+    "Y": "\u0418"
+    "y": "\u0438"
+    "I\u0306": "\u0419"
+    "I\u0308": "\u0407"
+    "I\uFE20E\uFE21": "\u0404"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "I\uFE20e\uFE21": "\u0404"
+    "I\uFE20O\uFE21": "\u0401"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "I\uFE20o\uFE21": "\u0401"
+    "I\uFE20U\uFE21": "\u042E"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "I\uFE20u\uFE21": "\u042E"
+    "I\uFE20A\uFE21": "\u042F"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "I\uFE20a\uFE21": "\u042F"
+    "I": "\u0406"
+    "i\u0306": "\u0439"
+    "i\u0308": "\u0457"
+    "i\uFE20e\uFE21": "\u0454"
+    "i\uFE20o\uFE21": "\u0451"
+    "i\uFE20u\uFE21": "\u044E"
+    "i\uFE20a\uFE21": "\u044F"
+    "i": "\u0456"
+    # all-caps digraph variant; shouldn't be needed, but does no harm
+    "KH": "\u0425"
+    "Kh": "\u0425"
+    "K": "\u041A"
+    "kh": "\u0445"
+    "k": "\u043A"
+    "L": "\u041B"
+    "l": "\u043B"
+    "M": "\u041C"
+    "m": "\u043C"
+    "N": "\u041D"
+    "n": "\u043D"
+    "O": "\u041E"
+    "o": "\u043E"
+    "P": "\u041F"
+    "p": "\u043F"
+    "R": "\u0420"
+    "r": "\u0440"
+    # all-caps digraph variant; shouldn't be needed, but does no harm
+    "SHCH": "\u0429"
+    "Shch": "\u0429"
+    # all-caps digraph variant; shouldn't be needed, but does no harm
+    "SH": "\u0428"
+    "Sh": "\u0428"
+    "S": "\u0421"
+    "shch": "\u0449"
+    "sh": "\u0448"
+    "s": "\u0441"
+    # uppercase ligature digraph for \u0426 (same pattern as the Z\uFE20H\uFE21 entry)
+    "T\uFE20S\uFE21": "\u0426"
+    # mixed-case ligature variant; shouldn't be needed, but does no harm
+    "T\uFE20s\uFE21": "\u0426"
+    "T": "\u0422"
+    "t\uFE20s\uFE21": "\u0446"
+    "t": "\u0442"
+    "U": "\u0423"
+    "u": "\u0443"
+    "F": "\u0424"
+    "f": "\u0444"
+    # all-caps digraph variant; shouldn't be needed, but does no harm
+    "CH": "\u0427"
+    "Ch": "\u0427"
+    "ch": "\u0447"
+    # stray ligature left half (U+FE20) maps to nothing; shouldn't be needed, but does no harm
+    "\uFE20": ""
+    # stray ligature right half (U+FE21) maps to nothing; shouldn't be needed, but does no harm
+    "\uFE21": ""
+    # this conversion is ambiguous - uppercase \u042C is also theoretically possible
+    "\u02B9": "\u044C"
+
+script_to_roman:  # Cyrillic -> Latin entries shared via `inherits`; how child maps are merged is in the loader - TODO confirm
+  map:
+    "\u0404": "I\uFE20E\uFE21"
+    "\u0407": "I\u0308"
+    "\u0410": "A"
+    "\u0411": "B"
+    "\u0412": "V"
+    "\u0414": "D"
+    "\u0415": "E"
+    "\u0417": "Z"
+    "\u0419": "I\u0306"
+    "\u041A": "K"
+    "\u041B": "L"
+    "\u041C": "M"
+    "\u041D": "N"
+    "\u041E": "O"
+    "\u041F": "P"
+    "\u0420": "R"
+    "\u0421": "S"
+    "\u0422": "T"
+    "\u0423": "U"
+    "\u0424": "F"
+    "\u0425": "Kh"
+    "\u0427": "Ch"
+    "\u0428": "Sh"
+    "\u0429": "Shch"
+    "\u042C": "\u02B9"
+    "\u042E": "I\uFE20U\uFE21"
+    "\u042F": "I\uFE20A\uFE21"
+    "\u0430": "a"
+    "\u0431": "b"
+    "\u0432": "v"
+    "\u0433": "h"  # lowercase only here; uppercase \u0413 is defined in each language table
+    "\u0434": "d"
+    "\u0435": "e"
+    "\u0437": "z"
+    "\u0439": "i\u0306"
+    "\u043A": "k"
+    "\u043B": "l"
+    "\u043C": "m"
+    "\u043D": "n"
+    "\u043E": "o"
+    "\u043F": "p"
+    "\u0440": "r"
+    "\u0441": "s"
+    "\u0442": "t"
+    "\u0443": "u"
+    "\u0444": "f"
+    "\u0445": "kh"
+    "\u0447": "ch"
+    "\u0448": "sh"
+    "\u0449": "shch"
+    "\u044C": "\u02B9"
+    "\u044E": "i\uFE20u\uFE21"
+    "\u044F": "i\uFE20a\uFE21"

+ 24 - 0
transliterator/tables/data/belorusian.yml

@@ -0,0 +1,24 @@
+general:
+  name: Belorusian
+  inherits: _cyrillic_base  # parent table: _cyrillic_base.yml
+
+script_to_roman:
+  map:  # entries defined by this language on top of the parent table
+    "\u0401": "I\uFE20O\uFE21"  # Ё
+    "\u0406": "I"  # І
+    "\u040E": "U\u0306"  # Ў -> U + combining breve
+    "\u0413": "H"  # Г
+    "\u0416": "Z\uFE20H\uFE21"  # Ж
+    "\u0426": "TS"  # Ц (no ligature half marks, unlike Ж above)
+    "\u042B": "Y"  # Ы
+    "\u042D": "E\u0307"  # Э -> E + combining dot above
+    "\u0433": "h"  # г (same value as the parent table)
+    "\u0436": "z\uFE20h\uFE21"  # ж
+    "\u0446": "ts"  # ц
+    "\u044B": "y"  # ы
+    "\u044D": "e\u0307"  # э
+    "\u0451": "i\uFE20o\uFE21"  # ё
+    "\u0456": "i"  # і
+    "\u045E": "u\u0306"  # ў
+    "\u0490": "G"  # Ґ
+    "\u0491": "g"  # ґ

+ 38 - 0
transliterator/tables/data/russian.yml

@@ -0,0 +1,38 @@
+general:
+  name: Russian
+  inherits: _cyrillic_base  # parent table: _cyrillic_base.yml
+
+script_to_roman:
+  map:  # entries defined by this language on top of the parent table
+    "\u0401": "E\u0308"  # Ё -> E + combining diaeresis
+    "\u0406": "I\u0304"  # І -> I + combining macron
+    "\u0413": "G"  # Г
+    "\u0416": "Zh"  # Ж (plain digraph, no ligature half marks)
+    "\u0418": "I"  # И
+    "\u0426": "T\uFE20S\uFE21"  # Ц
+    "\u042A": "\u02BA"  # Ъ -> modifier letter double prime
+    "\u042B": "Y"  # Ы
+    "\u042D": "E\u0307"  # Э -> E + combining dot above
+    "\u0433": "g"  # г (parent table has "h")
+    "\u0436": "zh"  # ж
+    "\u0438": "i"  # и
+    "\u0446": "t\uFE20s\uFE21"  # ц
+    "\u044A": "\u02BA"  # ъ
+    "\u044B": "y"  # ы
+    "\u044D": "e\u0307"  # э
+    "\u0451": "e\u0308"  # ё
+    "\u0456": "i\u0304"  # і
+    "\u0462": "I\uFE20E\uFE21"  # Ѣ (yat)
+    "\u0463": "i\uFE20e\uFE21"  # ѣ
+    "\u0466": "E\u0328"  # Ѧ (little yus) -> E + combining ogonek
+    "\u0467": "e\u0328"  # ѧ
+    "\u0472": "F\u0307"  # Ѳ (fita)
+    "\u0473": "f\u0307"  # ѳ
+    "\u0474": "Y\u0307"  # Ѵ (izhitsa)
+    "\u0475": "y\u0307"  # ѵ
+    "\u04AE": "U\u0307"  # Ү (straight U)
+    "\u04AF": "u\u0307"  # ү
+    "\u04BA": "H\u0307"  # Һ (shha)
+    "\u04BB": "h\u0307"  # һ
+    "\u04E8": "O\u0307"  # Ө (barred O)
+    "\u04E9": "o\u0307"  # ө

+ 24 - 0
transliterator/tables/data/ukrainian.yml

@@ -0,0 +1,24 @@
+general:
+  name: Ukrainian
+  inherits: _cyrillic_base  # parent table: _cyrillic_base.yml
+
+script_to_roman:
+  map:  # entries defined by this language on top of the parent table
+    "\u0401": "I\uFE20O\uFE21"  # Ё
+    "\u0404": "I\uFE20E\uFE21"  # Є (same value as the parent table)
+    "\u0406": "I"  # І
+    "\u0407": "I\u0308"  # Ї (same value as the parent table)
+    "\u0413": "H"  # Г
+    "\u0416": "Z\uFE20H\uFE21"  # Ж
+    "\u0418": "Y"  # И
+    "\u0426": "T\uFE20S\uFE21"  # Ц
+    "\u0433": "h"  # г (same value as the parent table)
+    "\u0436": "z\uFE20h\uFE21"  # ж
+    "\u0438": "y"  # и
+    "\u0446": "t\uFE20s\uFE21"  # ц
+    "\u0451": "i\uFE20o\uFE21"  # ё
+    "\u0454": "i\uFE20e\uFE21"  # є
+    "\u0456": "i"  # і
+    "\u0457": "i\u0308"  # ї
+    "\u0490": "G"  # Ґ
+    "\u0491": "g"  # ґ

+ 1 - 1
transliterator/trans.py

@@ -78,6 +78,6 @@ def transliterate(src, script, lang, s2r=True):
     logger.info(f"Output list: {dest_ls}")
     dest = "".join(dest_ls)
 
-    dest = re.sub(MULTI_WS_RE, ' ', dest)
+    dest = re.sub(MULTI_WS_RE, ' ', dest.strip())
 
     return dest