浏览代码

Use common ignore list.

Stefano Cossu 2 年之前
父节点
当前提交
28f0f7c29f

+ 2 - 114
transliterator/tables/data/_cyrillic_base.yml

@@ -1,5 +1,7 @@
 general:
   name: Cyrillic base
+  parents:
+    - _ignore_base
   notes: >
     copied from Russian .cfg file and stripped
     off language-specific tokens. Russian ignore list
@@ -7,120 +9,6 @@ general:
     for all child languages.
 
 roman_to_script:
-  ignore:
-    - "at head of title"
-    - "colophon"
-    - "date of publication not identified"
-    - "place of publication not identified"
-    - "publisher not identified"
-    # NOTE There is ambiguity about ignoring these
-    # words. Note that the single-character Roman
-    # numerals are not included on purpose.
-    # Ideally the source editors should use the
-    # dedicated U+2160÷U+216F (uppercase Roman
-    # numerals) and/or U+2170÷U+217F (lower case Roman
-    # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
-    - "and one other"
-    #- re: "and ([a-z]+ )?others"
-    - "et al."
-
   map:
     "A": "\u0410"
     "a": "\u0430"

+ 119 - 0
transliterator/tables/data/_ignore_base.yml

@@ -0,0 +1,119 @@
+general:
+  name: Common ignore list.
+
+roman_to_script:
+  ignore:
+    - "at head of title"
+    - "colophon"
+    - "date of publication not identified"
+    - "place of publication not identified"
+    - "publisher not identified"
+    # NOTE There is ambiguity about ignoring these
+    # words. The single-character Roman numerals
+    # are intentionally not included.
+    # Ideally the source editors should use the
+    # dedicated U+2160–U+216F (uppercase Roman
+    # numerals) and/or U+2170–U+217F (lowercase Roman
+    # numerals) ranges to avoid this ambiguity.
+    # TODO Implement regular expressions for ignore patterns.
+    #- re: "I{2,3}"
+    #- re: "I(V|X)"
+    #- re: "LI{,3}"
+    #- re: "LI?(V|X)"
+    #- re: "L(V|X{1,3})I{,3}"
+    #- re: "LX{1,3}I?V"
+    #- re: "LX{1,3}VI{,3}"
+    #- re: "(V|X{1,3})I{,3}"
+    #- re: "X{1,3}I{,3}"
+    #- re: "X{1,3}I(V|X)"
+    #- re: "X{1,3}VI{,3}"
+    - "II"
+    - "III"
+    - "IV"
+    - "IX"
+    - "LI"
+    - "LII"
+    - "LIII"
+    - "LIV"
+    - "LIX"
+    - "LV"
+    - "LVI"
+    - "LVII"
+    - "LVIII"
+    - "LX"
+    - "LXI"
+    - "LXII"
+    - "LXIII"
+    - "LXIV"
+    - "LXIX"
+    - "LXV"
+    - "LXVI"
+    - "LXVII"
+    - "LXVIII"
+    - "LXX"
+    - "LXXI"
+    - "LXXII"
+    - "LXXIII"
+    - "LXXIV"
+    - "LXXIX"
+    - "LXXV"
+    - "LXXVI"
+    - "LXXVII"
+    - "LXXVIII"
+    - "LXXX"
+    - "LXXXI"
+    - "LXXXII"
+    - "LXXXIII"
+    - "LXXXIV"
+    - "LXXXIX"
+    - "LXXXV"
+    - "LXXXVI"
+    - "LXXXVII"
+    - "LXXXVIII"
+    - "VI"
+    - "VII"
+    - "VIII"
+    - "XI"
+    - "XII"
+    - "XIII"
+    - "XIV"
+    - "XIX"
+    - "XL"
+    - "XLI"
+    - "XLII"
+    - "XLIII"
+    - "XLIV"
+    - "XLIX"
+    - "XLV"
+    - "XLVI"
+    - "XLVII"
+    - "XLVIII"
+    - "XV"
+    - "XVI"
+    - "XVII"
+    - "XVIII"
+    - "XX"
+    - "XXI"
+    - "XXII"
+    - "XXIII"
+    - "XXIV"
+    - "XXIX"
+    - "XXV"
+    - "XXVI"
+    - "XXVII"
+    - "XXVIII"
+    - "XXX"
+    - "XXXI"
+    - "XXXII"
+    - "XXXIII"
+    - "XXXIV"
+    - "XXXIX"
+    - "XXXV"
+    - "XXXVI"
+    - "XXXVII"
+    - "XXXVIII"
+    - "and one other"
+    #- re: "and ([a-z0-9]+ )?others"
+    - "et al."
+
+

+ 6 - 114
transliterator/tables/data/armenian.yml

@@ -1,121 +1,9 @@
 general:
   name: Armenian
+  parents:
+    - _ignore_base
 
 roman_to_script:
-  ignore:
-    - "at head of title"
-    - "colophon"
-    - "date of publication not identified"
-    - "place of publication not identified"
-    - "publisher not identified"
-    # NOTE There is ambiguity about ignoring these
-    # words. Note that the single-character Roman
-    # numerals are not included on purpose.
-    # Ideally the source editors should use the
-    # dedicated U+2160÷U+216F (uppercase Roman
-    # numerals) and/or U+2170÷U+217F (lower case Roman
-    # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
-    - "and one other"
-    #- re: "and ([a-z0-9]+ )?others"
-    - "et al."
-
   map:
     "A": "\u0531"
     "a": "\u0561"
@@ -363,6 +251,10 @@ script_to_roman:
     "\u0577": "sh"
     "\u0540": "H"
     "\u0570": "h"
+    # U uppercase
+    "\u0548\u0582": "U"
+    # u lowercase
+    "\u0578\u0582": "u"
     # O uppercase with combining macron
     "\u0555": "O\u0304"
     # o lowercase with combining macron

+ 6 - 25
transliterator/tables/data/chinese.yml

@@ -1,35 +1,16 @@
 general: # Section names and other keywords are all snake_cased.
   name: Chinese
-  #allow_define_button: true # Probably not a relevant section.
-
-# roman_to_script: Empty section removed.
+  parents:
+    - _ignore_base
 
 script_to_roman:
   directives: # Directives section.
-    # Irrelevant settings removed.
-    #fields_included: 100 110 111 130 240 245 246 250 260 264 440 490 505 600 610 611 630 651 700 710 711 730 740 800 830
-    #subfields_always_excluded: uvxy0123456789
-    #other_subfields_excluded_by_tag: 100/e 110/e 111/j 246/i 260/c 264/c 650/a 700/e 700/i 710/e 710/i 711/i 711/j 730/i
-    #uppercase_first_character_in_subfield: 260/b
-    # personal_name_handling: true  # This is handled by the text scanner.
-    # Capitalize the first letter of the string only;
-    # TODO Implement a list that includes all punctuation marks that want the
-    # following letter capitalized.
+    # Capitalize the first letter of the string only.
+    # TODO Implement a list of all punctuation marks after
+    # which the following letter should be capitalized.
     capitalize: true
   map: # Mapping section.
-    # RDA boilerplate phrases not transliterated.
-    # All characters not found in the mapping are copid verbatim. No need for
-    # a "copy verbatim" section.
-    #Place of publication not identified=Place of publication not identified_
-    #publisher not identified=publisher not identified_
-
-    # Multi-character transliteration.
-    #
-    # These should be kept at the beginning of the mapping for readability,
-    # however the software will rearrange them internally after parsing the
-    # config file, so the longest strings are searched for first.
-    # Q: Separate whole names from the rest? We'll see when we implement the
-    # logic maybe.
+    # Whole words.
     "\u4E2D\u56FD": "Zhongguo "
     "\u5317\u4EAC": "Beijing "
     "\u9808\u5F4C": "Xumi "

+ 2 - 125
transliterator/tables/data/ethiopic.yml

@@ -1,132 +1,9 @@
 general:
   name: Ethiopic
+  parents:
+    - _ignore_base
 
 roman_to_script:
-  ignore:
-    - "at head of title"
-    - "colophon"
-    - "date of publication not identified"
-    - "place of publication not identified"
-    - "publisher not identified"
-    # NOTE There is ambiguity about ignoring these
-    # words. Note that the single-character Roman
-    # numerals are not included on purpose.
-    # Ideally the source editors should use the
-    # dedicated \u2160-\u216F (uppercase Roman
-    # numerals) and/or \u2170-\u217F (lower case Roman
-    # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
-    - "and one other"
-    - "and ([a-z]+ )?others"
-    - "et al."
-
-    # combining acute \u0301
-    # combining tilde \u0303
-    # combining macron \u0304
-    # combining breve \u0306
-    # combining dot above \u0307
-    # combining caron (hachek) \u030C
-    # combining dot below \u0323
-    # combining low line (underscore) \u0332
-    # ayn(spacing) \u02BB
-    # alif (spacing) \u02BC
-
   map:
     "Rya": "\u1358"
     "rya": "\u1358"