Ver Fonte

Merge pull request #245 from lcnetdev/dec_config

Decouple configs
Stefano Cossu há 1 mês atrás
pai
commit
90127b747f

+ 44 - 7
doc/config.md

@@ -13,9 +13,7 @@ Configuration files, also called transliteration tables, are contained in the
 The configuration file names are key to most operations in the software. They
 are all-lowercase and use underscores to separate words, e.g.
 `church_slavonic`. They have the `.yml` extension and are written in the
-[YAML](https://yaml.org/) configuration language. Hence, a transliteration
-request to the `/trans` REST API endpoint providing `church_slavonic` as the
-transliteration language, uses the `church_slavonic.yml` configuration file.
+[YAML](https://yaml.org/) configuration language.
 
 Other files are present in the `data` directory that are not exposed to the end
 user via Web UI or REST API. These files may be incomplete transliteration
@@ -35,10 +33,49 @@ transliteration table key names as described previously, and the values are
 key-value pairs which can have arbitrary contents. These contents are displayed
 to the user in the `/languages` API endpoint.
 
-The only mandatory key for each key-value pair is `name`, which is the
-human-readable label that is displayed in the Web UI. Other keys, such as
-`description`, may be used to inform the user about the scope of a particular
-table.
+Each entry of the index file is structured as follows:
+
+### `<entry_name>`
+
+The key for the language/script. This is referred to in multiple places across
+the application, e.g. the `/trans/mongolian_cyrillic` API method transliterates
+a sentence using the `mongolian_cyrillic` index entry. By default, the
+`mongolian_cyrillic` entry in the index file (see below) uses the
+`mongolian_cyrillic.yml` configuration file in the `data/` folder. This can be
+overridden by the `conf` key (see below).
+
+By convention, an entry name uses the name of the language, followed by the
+name of the script, separated by an underscore, *only if that language is known
+to exist in multiple scripts*. For example, `mongolian_cyrillic` is used for
+Mongolian written in Cyrillic, and `mongolian_mongol_bichig` for the native
+Mongol Bichig script; while `persian` is only found in Arabic script, so
+the `_arabic` suffix is not added.
+
+### `<entry_name>.name`
+
+Human-readable label that is displayed in the Web UI. Mandatory.
+
+### `<entry_name>.conf`
+
+Override the default configuration file lookup. By default,
+the configuration file is inferred from the key name, e.g. `chinese` looks up
+`data/chinese.yml`. However, some entries in the index may not have a distinct
+configuration and reuse an existing configuration that works for that language.
+Several languages in the Cyrillic script use this method.
+
+The value is the full file name relative to the `data/` directory, e.g.
+`cyrillic_generic.yml`.
+
+### `<entry_name>.marc_code`
+
+MARC code from the [MARC Standards Office registry
+](https://www.loc.gov/marc/languages/language_name.html). This may be used by
+external applications to more easily look up entries. Optional.
+
+### `<entry_name>.description`
+
+Additional description that may be used to inform the user about the scope of
+a particular table. Optional.
 
 ## Inheritance
 

+ 13 - 5
scriptshifter/tables/__init__.py

@@ -66,6 +66,8 @@ FEAT_RE = 1 << 3        # Regular expression.
 
 logger = logging.getLogger(__name__)
 
+tbl_index = None  # Module-level index of all scripts.
+
 
 class Token(str):
     """
@@ -165,11 +167,12 @@ def init_db():
             conn.executescript(fh.read())
 
     # Populate tables.
+    global tbl_index
     with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
-        tlist = load(fh, Loader=Loader)
+        tbl_index = load(fh, Loader=Loader)
     try:
         with conn:
-            for tname, tdata in tlist.items():
+            for tname, tdata in tbl_index.items():
                 populate_table(conn, tname, tdata)
 
         # If the DB already exists, it will be overwritten ONLY on success at
@@ -340,9 +343,14 @@ def load_table(tname):
     the language & script metadata and parsing rules.
     """
 
-    fname = path.join(TABLE_DIR, tname + ".yml")
+    try:
+        fname = path.join(TABLE_DIR, tbl_index[tname]["conf"])
+    except KeyError:
+        # If no `conf` key is provided, use the conventional table name + .yml.
+        fname = path.join(TABLE_DIR, tname + ".yml")
     if not access(fname, R_OK):
-        raise ValueError(f"No transliteration table for {tname}!")
+        raise ValueError(
+                f"No transliteration table `{fname}` found for {tname}!")
 
     with open(fname) as fh:
         tdata = load(fh, Loader=Loader)
@@ -400,7 +408,7 @@ def load_table(tname):
 
         # Inherit normalization rules.
         for parent in parents:
-            parent_langsec = load_table(parent)["script_to_roman"]
+            parent_langsec = load_table(parent).get("script_to_roman", {})
             normalize |= parent_langsec.get("normalize", {})
 
         for k, v in tdata["script_to_roman"].get("normalize", {}).items():

+ 12 - 17
scriptshifter/tables/data/_ignore_base.yml

@@ -22,22 +22,17 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
-    - "\\bI{2,3}\\b"
-    - "\\bI(V|X)\\b"
-    - "\\bLI{,3}\\b"
-    - "\\bLI?(V|X)\\b"
-    - "\\bL(V|X{1,3})I{,3}\\b"
-    - "\\bLX{1,3}I?V\\b"
-    - "\\bLX{1,3}VI{,3}\\b"
-    - "\\b(V|X{1,3})I{,3}\\b"
-    - "\\bX{1,3}I{,3}\\b"
-    - "\\bX{1,3}I(V|X)\\b"
-    - "\\bX{1,3}VI{,3}\\b"
+    - "I{2,3}\\b"
+    - "I(V|X)\\b"
+    - "LI{,3}\\b"
+    - "LI?(V|X)\\b"
+    - "L(V|X{1,3})I{,3}\\b"
+    - "LX{1,3}I?V\\b"
+    - "LX{1,3}VI{,3}\\b"
+    - "(V|X{1,3})I{,3}\\b"
+    - "X{1,3}I{,3}\\b"
+    - "X{1,3}I(V|X)\\b"
+    - "X{1,3}VI{,3}\\b"
 
     # MARC sub-field markers.
-    - "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
-
-script_to_roman:
-  ignore_ptn:
-    # MARC sub-field markers.
-    - "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
+    - "[\u2021\u01C2\\$][0-9a-z]\\b"

+ 102 - 38
scriptshifter/tables/index.yml

@@ -22,26 +22,12 @@ arabic:
   name: Arabic
 amharic:
   marc_code: amh
-  name: Amharic  
+  name: Amharic
 armenian:
   marc_code: arm
   name: Armenian
-# asian_cyrillic:
-#   description: >
-#     Multi-purpose transliteration for non-Slavic Cyrillic scripts: Abaza,
-#     Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Buryat,
-#     Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz,
-#     Ingush, Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak,
-#     Karelian, Khakass, Khanty, Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp,
-#     Lezghian, Lithuanian, Mansi, Mari, Moldovan, Molodstov, Mordvin, Nanai,
-#     Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany, Selkup, Shor,
-#     Tabasaran, Tat, Tuva, Udekhe, Udmurt, Yakut.
-#   marc_code: >
-#     abk, ady, alt, ava, bak, che, chv, dar, ale, esk, kbd, xal, krc, kaa,
-#     krl, kom, kum, lez, lit, chm, nog, oss, rum, rom, sel, udm, sah
-#   name: Asian Cyrillic
 assamese:
-  marc_code: asm  
+  marc_code: asm
   name: assamese
 azerbaijani_cyrillic:
   marc_code: aze
@@ -60,7 +46,7 @@ bihari_devanagari:
   name: Bihari (Devanagari)
 braj_devanagari:
   marc_code: bra
-  name: Braj (Devanagari)  
+  name: Braj (Devanagari)
 bulgarian:
   marc_code: bul
   name: Bulgarian
@@ -81,20 +67,10 @@ church_slavonic:
 chuvash_cyrillic:
   marc_code: chv
   name: Chuvash (Cyrillic)
-cyrillic_generic:
-  description: 'Multi-purpose transliteration for most languages that use the Cyrillic script:
-    Abaza, Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Belarusian, Bulgarian,
-    Buryat, Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz, Ingush,
-    Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak, Karelian, Khakass, Khanty,
-    Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp, Lezghian, Lithuanian, Macedonian, Mansi, Mari,
-    Moldovan, Molodstov, Mordvin, Nanai, Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany,
-    Russian, Selkup, Serbian, Shor, Tabasaran, Tat, Tuva, Udekhe, Udmurt, Ukrainian, Yakut.'
-  marc_code: abk, ady, alt, ava, bak, bel, bul, che, chm, chv, dar, ale, esk, kbd, xal, krc, kaa,
-    krl, kom, kum, lez, lit, mac, nog, oss, rum, rom, sah, sel, srp, udm, ukr
-  name: Cyrillic (Generic)
-devanagari:
-  marc_code: hin, san
-  name: Devanagari
+dargwa_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Dargwa (Cyrillic)
+  marc_code: dar
 divehi_thaana:
   marc_code: div
   name: Divehi (Thaana)
@@ -102,8 +78,12 @@ dogri_devanagari:
   marc_code: doi
   name: Dogri (Devanagari)
 dungan_cyrillic:
-  marc_code: sit  
+  marc_code: sit
   name: Dungan (Cyrillic)
+eskimo_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Eskimo (Cyrillic)
+  marc_code: esk
 ethiopic:
   marc_code: amh, eth
   name: Ethiopic (Amharic)
@@ -159,7 +139,7 @@ khakass_cyrillic:
   marc_code: tut
   name: Khakass (Cyrillic)
 khanty_cyrillic:
-  marc_code: fiu  
+  marc_code: fiu
   name: Khanty (Cyrillic)
 khmer:
   marc_code: khm
@@ -186,15 +166,55 @@ kurdish:
 kyrgyz_cyrillic:
   marc_code: kir
   name: Kyrgyz (Cyrillic)
+ingush_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Ingush (Cyrillic)
+  marc_code: inh
+inuit_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Inuit (Cyrillic)
+  marc_code: ipk
+kabardian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Kabardian (Cyrillic)
+  marc_code: kbd
+karakalpak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Karakalpak (Cyrillic)
+  marc_code: kaa
+komi-Permyak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Permyak (Cyrillic)
+  marc_code: kom
+kumyk_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Kumyk (Cyrillic)
+  marc_code: kum
+lak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lak (Cyrillic)
+  marc_code: cau
+lapp_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lapp (Cyrillic)
+  marc_code: smi
+lezghian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lezghian (Cyrillic)
+  marc_code: lez
 lithuanian_cyrillic:
   marc_code: lit
   name: Lithuanian (Cyrillic)
+mari_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Mari (Cyrillic)
+  marc_code: chm
 macedonian:
   marc_code: mac
   name: Macedonian
 maithili_devanagari:
   marc_code: mai
-  name: Maithili (Devanagari)  
+  name: Maithili (Devanagari)
 malayalam:
   marc_code: mal
   name: Malayalam
@@ -209,6 +229,9 @@ marathi_devanagari:
 moldovan_cyrillic:
   marc_code: mol
   name: Moldovan (Cyrillic)
+molodstov_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Molodstov (Cyrillic)
 mongolian_cyrillic:
   marc_code: mon
   name: Mongolian (Cyrillic)
@@ -217,6 +240,10 @@ mongolian_mongol_bichig:
   name: Mongolian (Mongol bichig)
 mordvin_cyrillic:
   name: Mordvin (Cyrillic)
+nanai_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nanai (Cyrillic)
+  marc_code: tut
 nenets_cyrillic:
   name: Nenets (Cyrillic)
 nepali_devanagari:
@@ -225,6 +252,14 @@ nepali_devanagari:
 newari_devanagari:
   marc_code: new
   name: Newari (Devanagari)
+nivkh_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nivkh (Cyrillic)
+  marc_code: mis
+nogai_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nogai (Cyrillic)
+  marc_code: ng
 oriya:
   marc_code: ori
   name: Oriya
@@ -233,13 +268,17 @@ ossetic_cyrillic:
   name: Ossetic (Cyrillic)
 pahari_devanagari:
   marc_code: him
-  name: Pahari (Devanagari)  
+  name: Pahari (Devanagari)
 pali:
   marc_code: pli
   name: Pali
 panjabi_gurmukhi:
   marc_code: pan
   name: Panjabi (Gurmukhi)
+permyak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Permyak (Cyrillic)
+  marc_code: kom
 persian:
   marc_code: per
   name: Persian
@@ -261,6 +300,14 @@ rajasthani_devanagari:
 romani_cyrillic:
   marc_code: rom
   name: Romani (Cyrillic)
+romany_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Romany (Cyrillic)
+  marc_code: rom
+romanian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Romanian (Cyrillic)
+  marc_code: rum
 russian:
   marc_code: rus
   name: Russian
@@ -270,6 +317,10 @@ sanskrit_devanagari:
 serbian:
   marc_code: srp
   name: Serbian
+selkup_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Selkup (Cyrillic)
+  marc_code: sel
 shor_cyrillic:
   name: Shor (Cyrillic)
 sinhalese:
@@ -278,6 +329,10 @@ sinhalese:
 syriac_cyrillic:
   marc_code: syc
   name: Syriac (Cyrillic)
+tabasaran_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tabasaran (Cyrillic)
+  marc_code: cau
 tajik_cyrillic:
   marc_code: tgk
   name: Tajik (Cyrillic)
@@ -290,6 +345,10 @@ tamil_brahmi:
 tamil_extended:
   marc_code: tam
   name: Tamil (extended)
+tat_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tat (Cyrillic)
+  marc_code: ira
 tatar-kryashen_cyrillic:
   name: Tatar-Kryashen (Cyrillic)
 tatar_cyrillic:
@@ -306,16 +365,23 @@ tibetan:
   name: Tibetan
 tigrinya:
   marc_code: tir
-  name: Tigrinya  
+  name: Tigrinya
 tod_mongolian:
   marc_code: xal
   name: Tod Mongolian
 turkmen_cyrillic:
   marc_code: tuk
   name: Turkmen (Cyrillic)
+tuva_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tuva (Cyrillic)
 tuvinian_cyrillic:
   marc_code: tyv
   name: Tuvinian (Cyrillic)
+udekhe_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Udekhe (Cyrillic)
+  marc_code: tut
 udmurt_cyrillic:
   marc_code: udm
   name: Udmurt (Cyrillic)
@@ -343,5 +409,3 @@ yiddish:
 yuit_cyrillic:
   marc_code: ypk
   name: Yuit (Cyrillic)
-  
-