3 years ago · 28f0f7c29f
--- a/transliterator/tables/data/_cyrillic_base.yml
+++ b/transliterator/tables/data/_cyrillic_base.yml
@@ -1,5 +1,7 @@
 
				 general:
			
 
				   name: Cyrillic base
			
 
				+  parents:
			
 
				+    - _ignore_base
			
 
				   notes: >
			
 
				     copied from Russian .cfg file and stripped
			
 
				     of language-specific tokens. Russian ignore list
			
@@ -7,120 +9,6 @@ general:
 
				     for all child languages.
			
 
				 
			
 
				 roman_to_script:
			
 
				-  ignore:
			
 
				-    - "at head of title"
			
 
				-    - "colophon"
			
 
				-    - "date of publication not identified"
			
 
				-    - "place of publication not identified"
			
 
				-    - "publisher not identified"
			
 
				-    # NOTE There is ambiguity about ignoring these
			
 
				-    # words. Note that the single-character Roman
			
 
				-    # numerals are not included on purpose.
			
 
				-    # Ideally the source editors should use the
			
 
				-    # dedicated U+2160÷U+216F (uppercase Roman
			
 
				-    # numerals) and/or U+2170÷U+217F (lower case Roman
			
 
				-    # numerals) ranges to avoid this ambiguity.
			
 
				-    # TODO implement regular expressions for ignore patterns.
			
 
				-    #- re: "I{2,3}"
			
 
				-    #- re: "I(V|X)"
			
 
				-    #- re: "LI{,3}"
			
 
				-    #- re: "LI?(V|X)"
			
 
				-    #- re: "L(V|X{1,3})I{,3}"
			
 
				-    #- re: "LX{1,3}I?V"
			
 
				-    #- re: "LX{1,3}VI{,3}"
			
 
				-    #- re: "(V|X{1,3})I{,3}"
			
 
				-    #- re: "X{1,3}I{,3}"
			
 
				-    #- re: "X{1,3}I(V|X)"
			
 
				-    #- re: "X{1,3}VI{,3}"
			
 
				-    - "II"
			
 
				-    - "III"
			
 
				-    - "IV"
			
 
				-    - "IX"
			
 
				-    - "LI"
			
 
				-    - "LII"
			
 
				-    - "LIII"
			
 
				-    - "LIV"
			
 
				-    - "LIX"
			
 
				-    - "LV"
			
 
				-    - "LVI"
			
 
				-    - "LVII"
			
 
				-    - "LVIII"
			
 
				-    - "LX"
			
 
				-    - "LXI"
			
 
				-    - "LXII"
			
 
				-    - "LXIII"
			
 
				-    - "LXIV"
			
 
				-    - "LXIX"
			
 
				-    - "LXV"
			
 
				-    - "LXVI"
			
 
				-    - "LXVII"
			
 
				-    - "LXVIII"
			
 
				-    - "LXX"
			
 
				-    - "LXXI"
			
 
				-    - "LXXII"
			
 
				-    - "LXXIII"
			
 
				-    - "LXXIV"
			
 
				-    - "LXXIX"
			
 
				-    - "LXXV"
			
 
				-    - "LXXVI"
			
 
				-    - "LXXVII"
			
 
				-    - "LXXVIII"
			
 
				-    - "LXXX"
			
 
				-    - "LXXXI"
			
 
				-    - "LXXXII"
			
 
				-    - "LXXXIII"
			
 
				-    - "LXXXIV"
			
 
				-    - "LXXXIX"
			
 
				-    - "LXXXV"
			
 
				-    - "LXXXVI"
			
 
				-    - "LXXXVII"
			
 
				-    - "LXXXVIII"
			
 
				-    - "VI"
			
 
				-    - "VII"
			
 
				-    - "VIII"
			
 
				-    - "XI"
			
 
				-    - "XII"
			
 
				-    - "XIII"
			
 
				-    - "XIV"
			
 
				-    - "XIX"
			
 
				-    - "XL"
			
 
				-    - "XLI"
			
 
				-    - "XLII"
			
 
				-    - "XLIII"
			
 
				-    - "XLIV"
			
 
				-    - "XLIX"
			
 
				-    - "XLV"
			
 
				-    - "XLVI"
			
 
				-    - "XLVII"
			
 
				-    - "XLVIII"
			
 
				-    - "XV"
			
 
				-    - "XVI"
			
 
				-    - "XVII"
			
 
				-    - "XVIII"
			
 
				-    - "XX"
			
 
				-    - "XXI"
			
 
				-    - "XXII"
			
 
				-    - "XXIII"
			
 
				-    - "XXIV"
			
 
				-    - "XXIX"
			
 
				-    - "XXV"
			
 
				-    - "XXVI"
			
 
				-    - "XXVII"
			
 
				-    - "XXVIII"
			
 
				-    - "XXX"
			
 
				-    - "XXXI"
			
 
				-    - "XXXII"
			
 
				-    - "XXXIII"
			
 
				-    - "XXXIV"
			
 
				-    - "XXXIX"
			
 
				-    - "XXXV"
			
 
				-    - "XXXVI"
			
 
				-    - "XXXVII"
			
 
				-    - "XXXVIII"
			
 
				-    - "and one other"
			
 
				-    #- re: "and ([a-z]+ )?others"
			
 
				-    - "et al."
			
 
				-
			
 
				   map:
			
 
				     "A": "\u0410"
			
 
				     "a": "\u0430"
			
--- a/transliterator/tables/data/_ignore_base.yml
+++ b/transliterator/tables/data/_ignore_base.yml
@@ -0,0 +1,119 @@
 
				+general:
			
 
				+  name: Common ignore list.
			
 
				+
			
 
				+roman_to_script:
			
 
				+  ignore:
			
 
				+    - "at head of title"
			
 
				+    - "colophon"
			
 
				+    - "date of publication not identified"
			
 
				+    - "place of publication not identified"
			
 
				+    - "publisher not identified"
			
 
				+    # NOTE There is ambiguity about ignoring these
			
 
				+    # words. Note that the single-character Roman
			
 
				+    # numerals are not included on purpose.
			
 
				+    # Ideally the source editors should use the
			
 
				+    # dedicated U+2160-U+216F (uppercase Roman
			
 
				+    # numerals) and/or U+2170-U+217F (lowercase Roman
			
 
				+    # numerals) ranges to avoid this ambiguity.
			
 
				+    # TODO implement regular expressions for ignore patterns.
			
 
				+    #- re: "I{2,3}"
			
 
				+    #- re: "I(V|X)"
			
 
				+    #- re: "LI{,3}"
			
 
				+    #- re: "LI?(V|X)"
			
 
				+    #- re: "L(V|X{1,3})I{,3}"
			
 
				+    #- re: "LX{1,3}I?V"
			
 
				+    #- re: "LX{1,3}VI{,3}"
			
 
				+    #- re: "(V|X{1,3})I{,3}"
			
 
				+    #- re: "X{1,3}I{,3}"
			
 
				+    #- re: "X{1,3}I(V|X)"
			
 
				+    #- re: "X{1,3}VI{,3}"
			
 
				+    - "II"
			
 
				+    - "III"
			
 
				+    - "IV"
			
 
				+    - "IX"
			
 
				+    - "LI"
			
 
				+    - "LII"
			
 
				+    - "LIII"
			
 
				+    - "LIV"
			
 
				+    - "LIX"
			
 
				+    - "LV"
			
 
				+    - "LVI"
			
 
				+    - "LVII"
			
 
				+    - "LVIII"
			
 
				+    - "LX"
			
 
				+    - "LXI"
			
 
				+    - "LXII"
			
 
				+    - "LXIII"
			
 
				+    - "LXIV"
			
 
				+    - "LXIX"
			
 
				+    - "LXV"
			
 
				+    - "LXVI"
			
 
				+    - "LXVII"
			
 
				+    - "LXVIII"
			
 
				+    - "LXX"
			
 
				+    - "LXXI"
			
 
				+    - "LXXII"
			
 
				+    - "LXXIII"
			
 
				+    - "LXXIV"
			
 
				+    - "LXXIX"
			
 
				+    - "LXXV"
			
 
				+    - "LXXVI"
			
 
				+    - "LXXVII"
			
 
				+    - "LXXVIII"
			
 
				+    - "LXXX"
			
 
				+    - "LXXXI"
			
 
				+    - "LXXXII"
			
 
				+    - "LXXXIII"
			
 
				+    - "LXXXIV"
			
 
				+    - "LXXXIX"
			
 
				+    - "LXXXV"
			
 
				+    - "LXXXVI"
			
 
				+    - "LXXXVII"
			
 
				+    - "LXXXVIII"
			
 
				+    - "VI"
			
 
				+    - "VII"
			
 
				+    - "VIII"
			
 
				+    - "XI"
			
 
				+    - "XII"
			
 
				+    - "XIII"
			
 
				+    - "XIV"
			
 
				+    - "XIX"
			
 
				+    - "XL"
			
 
				+    - "XLI"
			
 
				+    - "XLII"
			
 
				+    - "XLIII"
			
 
				+    - "XLIV"
			
 
				+    - "XLIX"
			
 
				+    - "XLV"
			
 
				+    - "XLVI"
			
 
				+    - "XLVII"
			
 
				+    - "XLVIII"
			
 
				+    - "XV"
			
 
				+    - "XVI"
			
 
				+    - "XVII"
			
 
				+    - "XVIII"
			
 
				+    - "XX"
			
 
				+    - "XXI"
			
 
				+    - "XXII"
			
 
				+    - "XXIII"
			
 
				+    - "XXIV"
			
 
				+    - "XXIX"
			
 
				+    - "XXV"
			
 
				+    - "XXVI"
			
 
				+    - "XXVII"
			
 
				+    - "XXVIII"
			
 
				+    - "XXX"
			
 
				+    - "XXXI"
			
 
				+    - "XXXII"
			
 
				+    - "XXXIII"
			
 
				+    - "XXXIV"
			
 
				+    - "XXXIX"
			
 
				+    - "XXXV"
			
 
				+    - "XXXVI"
			
 
				+    - "XXXVII"
			
 
				+    - "XXXVIII"
			
 
				+    - "and one other"
			
 
				+    #- re: "and ([a-z0-9]+ )?others"
			
 
				+    - "et al."
			
 
				+
			
 
				+
			
--- a/transliterator/tables/data/armenian.yml
+++ b/transliterator/tables/data/armenian.yml
@@ -1,121 +1,9 @@
 
				 general:

			
 
				   name: Armenian

			
 
				+  parents:

			
 
				+    - _ignore_base

			
 
				 

			
 
				 roman_to_script:

			
 
				-  ignore:

			
 
				-    - "at head of title"

			
 
				-    - "colophon"

			
 
				-    - "date of publication not identified"

			
 
				-    - "place of publication not identified"

			
 
				-    - "publisher not identified"

			
 
				-    # NOTE There is ambiguity about ignoring these

			
 
				-    # words. Note that the single-character Roman

			
 
				-    # numerals are not included on purpose.

			
 
				-    # Ideally the source editors should use the

			
 
				-    # dedicated U+2160÷U+216F (uppercase Roman

			
 
				-    # numerals) and/or U+2170÷U+217F (lower case Roman

			
 
				-    # numerals) ranges to avoid this ambiguity.

			
 
				-    # TODO implement regular expressions for ignore patterns.

			
 
				-    #- re: "I{2,3}"

			
 
				-    #- re: "I(V|X)"

			
 
				-    #- re: "LI{,3}"

			
 
				-    #- re: "LI?(V|X)"

			
 
				-    #- re: "L(V|X{1,3})I{,3}"

			
 
				-    #- re: "LX{1,3}I?V"

			
 
				-    #- re: "LX{1,3}VI{,3}"

			
 
				-    #- re: "(V|X{1,3})I{,3}"

			
 
				-    #- re: "X{1,3}I{,3}"

			
 
				-    #- re: "X{1,3}I(V|X)"

			
 
				-    #- re: "X{1,3}VI{,3}"

			
 
				-    - "II"

			
 
				-    - "III"

			
 
				-    - "IV"

			
 
				-    - "IX"

			
 
				-    - "LI"

			
 
				-    - "LII"

			
 
				-    - "LIII"

			
 
				-    - "LIV"

			
 
				-    - "LIX"

			
 
				-    - "LV"

			
 
				-    - "LVI"

			
 
				-    - "LVII"

			
 
				-    - "LVIII"

			
 
				-    - "LX"

			
 
				-    - "LXI"

			
 
				-    - "LXII"

			
 
				-    - "LXIII"

			
 
				-    - "LXIV"

			
 
				-    - "LXIX"

			
 
				-    - "LXV"

			
 
				-    - "LXVI"

			
 
				-    - "LXVII"

			
 
				-    - "LXVIII"

			
 
				-    - "LXX"

			
 
				-    - "LXXI"

			
 
				-    - "LXXII"

			
 
				-    - "LXXIII"

			
 
				-    - "LXXIV"

			
 
				-    - "LXXIX"

			
 
				-    - "LXXV"

			
 
				-    - "LXXVI"

			
 
				-    - "LXXVII"

			
 
				-    - "LXXVIII"

			
 
				-    - "LXXX"

			
 
				-    - "LXXXI"

			
 
				-    - "LXXXII"

			
 
				-    - "LXXXIII"

			
 
				-    - "LXXXIV"

			
 
				-    - "LXXXIX"

			
 
				-    - "LXXXV"

			
 
				-    - "LXXXVI"

			
 
				-    - "LXXXVII"

			
 
				-    - "LXXXVIII"

			
 
				-    - "VI"

			
 
				-    - "VII"

			
 
				-    - "VIII"

			
 
				-    - "XI"

			
 
				-    - "XII"

			
 
				-    - "XIII"

			
 
				-    - "XIV"

			
 
				-    - "XIX"

			
 
				-    - "XL"

			
 
				-    - "XLI"

			
 
				-    - "XLII"

			
 
				-    - "XLIII"

			
 
				-    - "XLIV"

			
 
				-    - "XLIX"

			
 
				-    - "XLV"

			
 
				-    - "XLVI"

			
 
				-    - "XLVII"

			
 
				-    - "XLVIII"

			
 
				-    - "XV"

			
 
				-    - "XVI"

			
 
				-    - "XVII"

			
 
				-    - "XVIII"

			
 
				-    - "XX"

			
 
				-    - "XXI"

			
 
				-    - "XXII"

			
 
				-    - "XXIII"

			
 
				-    - "XXIV"

			
 
				-    - "XXIX"

			
 
				-    - "XXV"

			
 
				-    - "XXVI"

			
 
				-    - "XXVII"

			
 
				-    - "XXVIII"

			
 
				-    - "XXX"

			
 
				-    - "XXXI"

			
 
				-    - "XXXII"

			
 
				-    - "XXXIII"

			
 
				-    - "XXXIV"

			
 
				-    - "XXXIX"

			
 
				-    - "XXXV"

			
 
				-    - "XXXVI"

			
 
				-    - "XXXVII"

			
 
				-    - "XXXVIII"

			
 
				-    - "and one other"

			
 
				-    #- re: "and ([a-z0-9]+ )?others"

			
 
				-    - "et al."

			
 
				-

			
 
				   map:

			
 
				     "A": "\u0531"

			
 
				     "a": "\u0561"

			
@@ -363,6 +251,10 @@ script_to_roman:
 
				     "\u0577": "sh"

			
 
				     "\u0540": "H"

			
 
				     "\u0570": "h"

			
 
				+    # U uppercase

			
 
				+    "\u0548\u0582": "U"

			
 
				+    # u lowercase

			
 
				+    "\u0578\u0582": "u"

			
 
				     # O uppercase with combining macron

			
 
				     "\u0555": "O\u0304"

			
 
				     # o lowercase with combining macron

			
--- a/transliterator/tables/data/chinese.yml
+++ b/transliterator/tables/data/chinese.yml
@@ -1,35 +1,16 @@
 
				 general: # Section names and other keywords are all snake_cased.

			
 
				   name: Chinese

			
 
				-  #allow_define_button: true # Probably not a relevant section.

			
 
				-

			
 
				-# roman_to_script: Empty section removed.

			
 
				+  parents:

			
 
				+    - _ignore_base

			
 
				 

			
 
				 script_to_roman:

			
 
				   directives: # Directives section.

			
 
				-    # Irrelevant settings removed.

			
 
				-    #fields_included: 100 110 111 130 240 245 246 250 260 264 440 490 505 600 610 611 630 651 700 710 711 730 740 800 830

			
 
				-    #subfields_always_excluded: uvxy0123456789

			
 
				-    #other_subfields_excluded_by_tag: 100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i

			
 
				-    #uppercase_first_character_in_subfield: 260/b

			
 
				-    # personal_name_handling: true  # This is handled by the text scanner.

			
 
				-    # Capitalize the first letter of the string only;

			
 
				-    # TODO Implement a list that includes all punctuation marks that want the

			
 
				-    # following letter capitalized.

			
 
				+    # Capitalize only the first letter of the string.

			
 
				+    # TODO Implement a list of all punctuation marks after which

			
 
				+    # the following letter must be capitalized.

			
 
				     capitalize: true

			
 
				   map: # Mapping section.

			
 
				-    # RDA boilerplate phrases not transliterated.

			
 
				-    # All characters not found in the mapping are copied verbatim. No need for

			
 
				-    # a "copy verbatim" section.

			
 
				-    #Place of publication not identified=Place of publication not identified_

			
 
				-    #publisher not identified=publisher not identified_

			
 
				-

			
 
				-    # Multi-character transliteration.

			
 
				-    #

			
 
				-    # These should be kept at the beginning of the mapping for readability,

			
 
				-    # however the software will rearrange them internally after parsing the

			
 
				-    # config file, so the longest strings are searched for first.

			
 
				-    # Q: Separate whole names from the rest? We'll see when we implement the

			
 
				-    # logic maybe.

			
 
				+    # Whole words.

			
 
				     "\u4E2D\u56FD": "Zhongguo "

			
 
				     "\u5317\u4EAC": "Beijing "

			
 
				     "\u9808\u5F4C": "Xumi "

			
--- a/transliterator/tables/data/ethiopic.yml
+++ b/transliterator/tables/data/ethiopic.yml
@@ -1,132 +1,9 @@
 
				 general:

			
 
				   name: Ethiopic

			
 
				+  parents:

			
 
				+    - _ignore_base

			
 
				 

			
 
				 roman_to_script:

			
 
				-  ignore:

			
 
				-    - "at head of title"

			
 
				-    - "colophon"

			
 
				-    - "date of publication not identified"

			
 
				-    - "place of publication not identified"

			
 
				-    - "publisher not identified"

			
 
				-    # NOTE There is ambiguity about ignoring these

			
 
				-    # words. Note that the single-character Roman

			
 
				-    # numerals are not included on purpose.

			
 
				-    # Ideally the source editors should use the

			
 
				-    # dedicated \u2160-\u216F (uppercase Roman

			
 
				-    # numerals) and/or \u2170-\u217F (lower case Roman

			
 
				-    # numerals) ranges to avoid this ambiguity.

			
 
				-    # TODO implement regular expressions for ignore patterns.

			
 
				-    #- re: "I{2,3}"

			
 
				-    #- re: "I(V|X)"

			
 
				-    #- re: "LI{,3}"

			
 
				-    #- re: "LI?(V|X)"

			
 
				-    #- re: "L(V|X{1,3})I{,3}"

			
 
				-    #- re: "LX{1,3}I?V"

			
 
				-    #- re: "LX{1,3}VI{,3}"

			
 
				-    #- re: "(V|X{1,3})I{,3}"

			
 
				-    #- re: "X{1,3}I{,3}"

			
 
				-    #- re: "X{1,3}I(V|X)"

			
 
				-    #- re: "X{1,3}VI{,3}"

			
 
				-    - "II"

			
 
				-    - "III"

			
 
				-    - "IV"

			
 
				-    - "IX"

			
 
				-    - "LI"

			
 
				-    - "LII"

			
 
				-    - "LIII"

			
 
				-    - "LIV"

			
 
				-    - "LIX"

			
 
				-    - "LV"

			
 
				-    - "LVI"

			
 
				-    - "LVII"

			
 
				-    - "LVIII"

			
 
				-    - "LX"

			
 
				-    - "LXI"

			
 
				-    - "LXII"

			
 
				-    - "LXIII"

			
 
				-    - "LXIV"

			
 
				-    - "LXIX"

			
 
				-    - "LXV"

			
 
				-    - "LXVI"

			
 
				-    - "LXVII"

			
 
				-    - "LXVIII"

			
 
				-    - "LXX"

			
 
				-    - "LXXI"

			
 
				-    - "LXXII"

			
 
				-    - "LXXIII"

			
 
				-    - "LXXIV"

			
 
				-    - "LXXIX"

			
 
				-    - "LXXV"

			
 
				-    - "LXXVI"

			
 
				-    - "LXXVII"

			
 
				-    - "LXXVIII"

			
 
				-    - "LXXX"

			
 
				-    - "LXXXI"

			
 
				-    - "LXXXII"

			
 
				-    - "LXXXIII"

			
 
				-    - "LXXXIV"

			
 
				-    - "LXXXIX"

			
 
				-    - "LXXXV"

			
 
				-    - "LXXXVI"

			
 
				-    - "LXXXVII"

			
 
				-    - "LXXXVIII"

			
 
				-    - "VI"

			
 
				-    - "VII"

			
 
				-    - "VIII"

			
 
				-    - "XI"

			
 
				-    - "XII"

			
 
				-    - "XIII"

			
 
				-    - "XIV"

			
 
				-    - "XIX"

			
 
				-    - "XL"

			
 
				-    - "XLI"

			
 
				-    - "XLII"

			
 
				-    - "XLIII"

			
 
				-    - "XLIV"

			
 
				-    - "XLIX"

			
 
				-    - "XLV"

			
 
				-    - "XLVI"

			
 
				-    - "XLVII"

			
 
				-    - "XLVIII"

			
 
				-    - "XV"

			
 
				-    - "XVI"

			
 
				-    - "XVII"

			
 
				-    - "XVIII"

			
 
				-    - "XX"

			
 
				-    - "XXI"

			
 
				-    - "XXII"

			
 
				-    - "XXIII"

			
 
				-    - "XXIV"

			
 
				-    - "XXIX"

			
 
				-    - "XXV"

			
 
				-    - "XXVI"

			
 
				-    - "XXVII"

			
 
				-    - "XXVIII"

			
 
				-    - "XXX"

			
 
				-    - "XXXI"

			
 
				-    - "XXXII"

			
 
				-    - "XXXIII"

			
 
				-    - "XXXIV"

			
 
				-    - "XXXIX"

			
 
				-    - "XXXV"

			
 
				-    - "XXXVI"

			
 
				-    - "XXXVII"

			
 
				-    - "XXXVIII"

			
 
				-    - "and one other"

			
 
				-    - "and ([a-z]+ )?others"

			
 
				-    - "et al."

			
 
				-

			
 
				-    # combining acute \u0301

			
 
				-    # combining tilde \u0303

			
 
				-    # combining macron \u0304

			
 
				-    # combining breve \u0306

			
 
				-    # combining dot above \u0307

			
 
				-    # combining caron (hachek) \u030C

			
 
				-    # combining dot below \u0323

			
 
				-    # combining low line (underscore) \u0332

			
 
				-    # ayn(spacing) \u02BB

			
 
				-    # alif (spacing) \u02BC

			
 
				-

			
 
				   map:

			
 
				     "Rya": "\u1358"

			
 
				     "rya": "\u1358"