Browse Source

Merge pull request #245 from lcnetdev/dec_config

Decouple configs
Stefano Cossu 1 month ago
parent
commit
90127b747f

+ 44 - 7
doc/config.md

@@ -13,9 +13,7 @@ Configuration files, also called transliteration tables, are contained in the
 The configuration file names are key to most operations in the software. They
 The configuration file names are key to most operations in the software. They
 are all-lowercase and use underscores to separate words, e.g.
 are all-lowercase and use underscores to separate words, e.g.
 `church_slavonic`. They have the `.yml` extension and are written in the
 `church_slavonic`. They have the `.yml` extension and are written in the
-[YAML](https://yaml.org/) configuration language. Hence, a transliteration
-request to the `/trans` REST API endpoint providing `church_slavonic` as the
-transliteration language, uses the `church_slavonic.yml` configuration file.
+[YAML](https://yaml.org/) configuration language.
 
 
 Other files are present in the `data` directory that are not exposed to the end
 Other files are present in the `data` directory that are not exposed to the end
 user via Web UI or REST API. These files may be incomplete transliteration
 user via Web UI or REST API. These files may be incomplete transliteration
@@ -35,10 +33,49 @@ transliteration table key names as described previously, and the values are
 key-value pairs which can have arbitrary contents. These contents are displayed
 key-value pairs which can have arbitrary contents. These contents are displayed
 to the user in the `/languages` API endpoint.
 to the user in the `/languages` API endpoint.
 
 
-The only mandatory key for each key-value pair is `name`, which is the
-human-readable label that is displayed in the Web UI. Other keys, such as
-`description`, may be used to inform the user about the scope of a particular
-table.
+Each entry of the index file is structured as follows:
+
+### `<entry_name>`
+
+The key for the language/script. This is referred to in multiple places across
+the application, e.g. the `/trans/mongolian_cyrillic` API method transliterates
+a sentence using the `mongolian_cyrillic` index entry. By default, the
+`mongolian_cyrillic` entry in the index file (see below) uses the
+`mongolian_cyrillic.yml` configuration file in the `data/` folder. This can be
+overridden by the `conf` key (see below).
+
+By convention, an entry name uses the name of the language, followed by the
+name of the script, separated by an underscore, *only if that language is known
+to exist in multiple scripts*. For example, `mongolian_cyrillic` is used for
+Mongolian written in Cyrillic, and `mongolian_mongol_bichig` for the native
+Mongol Bichig script; while Persian is only written in the Arabic script, so
+its entry is simply `persian`, with no `_arabic` suffix added.
+
+### `<entry_name>.name`
+
+Human-readable label that is displayed in the Web UI. Mandatory.
+
+### `<entry_name>.conf`
+
+Override the default configuration file lookup. By default,
+the configuration file is inferred from the key name, e.g. `chinese` looks up
+`data/chinese.yml`. However, some entries in the index may not have a distinct
+configuration and reuse an existing configuration that works for that language.
+Several languages in the Cyrillic script use this method.
+
+The value is the full file name relative to the `data/` directory, e.g.
+`cyrillic_generic.yml`.
+
+### `<entry_name>.marc_code`
+
+MARC code from the
+[MARC Standards Office registry](https://www.loc.gov/marc/languages/language_name.html).
+This may be used by external applications to more easily look up entries.
+Optional.
+
+### `<entry_name>.description`
+
+Additional description that may be used to inform the user about the scope of
+a particular table. Optional.
 
 
 ## Inheritance
 ## Inheritance
 
 

+ 13 - 5
scriptshifter/tables/__init__.py

@@ -66,6 +66,8 @@ FEAT_RE = 1 << 3        # Regular expression.
 
 
 logger = logging.getLogger(__name__)
 logger = logging.getLogger(__name__)
 
 
+tbl_index = None  # Module-level index of all scripts.
+
 
 
 class Token(str):
 class Token(str):
     """
     """
@@ -165,11 +167,12 @@ def init_db():
             conn.executescript(fh.read())
             conn.executescript(fh.read())
 
 
     # Populate tables.
     # Populate tables.
+    global tbl_index
     with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
     with open(path.join(path.dirname(TABLE_DIR), "index.yml")) as fh:
-        tlist = load(fh, Loader=Loader)
+        tbl_index = load(fh, Loader=Loader)
     try:
     try:
         with conn:
         with conn:
-            for tname, tdata in tlist.items():
+            for tname, tdata in tbl_index.items():
                 populate_table(conn, tname, tdata)
                 populate_table(conn, tname, tdata)
 
 
         # If the DB already exists, it will be overwritten ONLY on success at
         # If the DB already exists, it will be overwritten ONLY on success at
@@ -340,9 +343,14 @@ def load_table(tname):
     the language & script metadata and parsing rules.
     the language & script metadata and parsing rules.
     """
     """
 
 
-    fname = path.join(TABLE_DIR, tname + ".yml")
+    try:
+        fname = path.join(TABLE_DIR, tbl_index[tname]["conf"])
+    except KeyError:
+        # If no `conf` key is provided, use the conventional table name + .yml.
+        fname = path.join(TABLE_DIR, tname + ".yml")
     if not access(fname, R_OK):
     if not access(fname, R_OK):
-        raise ValueError(f"No transliteration table for {tname}!")
+        raise ValueError(
+                f"No transliteration table `{fname}` found for {tname}!")
 
 
     with open(fname) as fh:
     with open(fname) as fh:
         tdata = load(fh, Loader=Loader)
         tdata = load(fh, Loader=Loader)
@@ -400,7 +408,7 @@ def load_table(tname):
 
 
         # Inherit normalization rules.
         # Inherit normalization rules.
         for parent in parents:
         for parent in parents:
-            parent_langsec = load_table(parent)["script_to_roman"]
+            parent_langsec = load_table(parent).get("script_to_roman", {})
             normalize |= parent_langsec.get("normalize", {})
             normalize |= parent_langsec.get("normalize", {})
 
 
         for k, v in tdata["script_to_roman"].get("normalize", {}).items():
         for k, v in tdata["script_to_roman"].get("normalize", {}).items():

+ 12 - 17
scriptshifter/tables/data/_ignore_base.yml

@@ -22,22 +22,17 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
     # numerals) ranges to avoid this ambiguity.
-    - "\\bI{2,3}\\b"
-    - "\\bI(V|X)\\b"
-    - "\\bLI{,3}\\b"
-    - "\\bLI?(V|X)\\b"
-    - "\\bL(V|X{1,3})I{,3}\\b"
-    - "\\bLX{1,3}I?V\\b"
-    - "\\bLX{1,3}VI{,3}\\b"
-    - "\\b(V|X{1,3})I{,3}\\b"
-    - "\\bX{1,3}I{,3}\\b"
-    - "\\bX{1,3}I(V|X)\\b"
-    - "\\bX{1,3}VI{,3}\\b"
+    - "I{2,3}\\b"
+    - "I(V|X)\\b"
+    - "LI{,3}\\b"
+    - "LI?(V|X)\\b"
+    - "L(V|X{1,3})I{,3}\\b"
+    - "LX{1,3}I?V\\b"
+    - "LX{1,3}VI{,3}\\b"
+    - "(V|X{1,3})I{,3}\\b"
+    - "X{1,3}I{,3}\\b"
+    - "X{1,3}I(V|X)\\b"
+    - "X{1,3}VI{,3}\\b"
 
 
     # MARC sub-field markers.
     # MARC sub-field markers.
-    - "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
-
-script_to_roman:
-  ignore_ptn:
-    # MARC sub-field markers.
-    - "\\b[\u2021\u01C2\\$][0-9a-z]\\b"
+    - "[\u2021\u01C2\\$][0-9a-z]\\b"

+ 102 - 38
scriptshifter/tables/index.yml

@@ -22,26 +22,12 @@ arabic:
   name: Arabic
   name: Arabic
 amharic:
 amharic:
   marc_code: amh
   marc_code: amh
-  name: Amharic  
+  name: Amharic
 armenian:
 armenian:
   marc_code: arm
   marc_code: arm
   name: Armenian
   name: Armenian
-# asian_cyrillic:
-#   description: >
-#     Multi-purpose transliteration for non-Slavic Cyrillic scripts: Abaza,
-#     Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Buryat,
-#     Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz,
-#     Ingush, Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak,
-#     Karelian, Khakass, Khanty, Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp,
-#     Lezghian, Lithuanian, Mansi, Mari, Moldovan, Molodstov, Mordvin, Nanai,
-#     Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany, Selkup, Shor,
-#     Tabasaran, Tat, Tuva, Udekhe, Udmurt, Yakut.
-#   marc_code: >
-#     abk, ady, alt, ava, bak, che, chv, dar, ale, esk, kbd, xal, krc, kaa,
-#     krl, kom, kum, lez, lit, chm, nog, oss, rum, rom, sel, udm, sah
-#   name: Asian Cyrillic
 assamese:
 assamese:
-  marc_code: asm  
+  marc_code: asm
   name: assamese
   name: assamese
 azerbaijani_cyrillic:
 azerbaijani_cyrillic:
   marc_code: aze
   marc_code: aze
@@ -60,7 +46,7 @@ bihari_devanagari:
   name: Bihari (Devanagari)
   name: Bihari (Devanagari)
 braj_devanagari:
 braj_devanagari:
   marc_code: bra
   marc_code: bra
-  name: Braj (Devanagari)  
+  name: Braj (Devanagari)
 bulgarian:
 bulgarian:
   marc_code: bul
   marc_code: bul
   name: Bulgarian
   name: Bulgarian
@@ -81,20 +67,10 @@ church_slavonic:
 chuvash_cyrillic:
 chuvash_cyrillic:
   marc_code: chv
   marc_code: chv
   name: Chuvash (Cyrillic)
   name: Chuvash (Cyrillic)
-cyrillic_generic:
-  description: 'Multi-purpose transliteration for most languages that use the Cyrillic script:
-    Abaza, Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Belarusian, Bulgarian,
-    Buryat, Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz, Ingush,
-    Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak, Karelian, Khakass, Khanty,
-    Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp, Lezghian, Lithuanian, Macedonian, Mansi, Mari,
-    Moldovan, Molodstov, Mordvin, Nanai, Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany,
-    Russian, Selkup, Serbian, Shor, Tabasaran, Tat, Tuva, Udekhe, Udmurt, Ukrainian, Yakut.'
-  marc_code: abk, ady, alt, ava, bak, bel, bul, che, chm, chv, dar, ale, esk, kbd, xal, krc, kaa,
-    krl, kom, kum, lez, lit, mac, nog, oss, rum, rom, sah, sel, srp, udm, ukr
-  name: Cyrillic (Generic)
-devanagari:
-  marc_code: hin, san
-  name: Devanagari
+dargwa_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Dargwa (Cyrillic)
+  marc_code: dar
 divehi_thaana:
 divehi_thaana:
   marc_code: div
   marc_code: div
   name: Divehi (Thaana)
   name: Divehi (Thaana)
@@ -102,8 +78,12 @@ dogri_devanagari:
   marc_code: doi
   marc_code: doi
   name: Dogri (Devanagari)
   name: Dogri (Devanagari)
 dungan_cyrillic:
 dungan_cyrillic:
-  marc_code: sit  
+  marc_code: sit
   name: Dungan (Cyrillic)
   name: Dungan (Cyrillic)
+eskimo_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Eskimo (Cyrillic)
+  marc_code: esk
 ethiopic:
 ethiopic:
   marc_code: amh, eth
   marc_code: amh, eth
   name: Ethiopic (Amharic)
   name: Ethiopic (Amharic)
@@ -159,7 +139,7 @@ khakass_cyrillic:
   marc_code: tut
   marc_code: tut
   name: Khakass (Cyrillic)
   name: Khakass (Cyrillic)
 khanty_cyrillic:
 khanty_cyrillic:
-  marc_code: fiu  
+  marc_code: fiu
   name: Khanty (Cyrillic)
   name: Khanty (Cyrillic)
 khmer:
 khmer:
   marc_code: khm
   marc_code: khm
@@ -186,15 +166,55 @@ kurdish:
 kyrgyz_cyrillic:
 kyrgyz_cyrillic:
   marc_code: kir
   marc_code: kir
   name: Kyrgyz (Cyrillic)
   name: Kyrgyz (Cyrillic)
+ingush_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Ingush (Cyrillic)
+  marc_code: inh
+inuit_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Inuit (Cyrillic)
+  marc_code: ipk
+kabardian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Kabardian (Cyrillic)
+  marc_code: kbd
+karakalpak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Karakalpak (Cyrillic)
+  marc_code: kaa
+komi-Permyak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Permyak (Cyrillic)
+  marc_code: kom
+kumyk_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Kumyk (Cyrillic)
+  marc_code: kum
+lak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lak (Cyrillic)
+  marc_code: cau
+lapp_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lapp (Cyrillic)
+  marc_code: smi
+lezghian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Lezghian (Cyrillic)
+  marc_code: lez
 lithuanian_cyrillic:
 lithuanian_cyrillic:
   marc_code: lit
   marc_code: lit
   name: Lithuanian (Cyrillic)
   name: Lithuanian (Cyrillic)
+mari_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Mari (Cyrillic)
+  marc_code: chm
 macedonian:
 macedonian:
   marc_code: mac
   marc_code: mac
   name: Macedonian
   name: Macedonian
 maithili_devanagari:
 maithili_devanagari:
   marc_code: mai
   marc_code: mai
-  name: Maithili (Devanagari)  
+  name: Maithili (Devanagari)
 malayalam:
 malayalam:
   marc_code: mal
   marc_code: mal
   name: Malayalam
   name: Malayalam
@@ -209,6 +229,9 @@ marathi_devanagari:
 moldovan_cyrillic:
 moldovan_cyrillic:
   marc_code: mol
   marc_code: mol
   name: Moldovan (Cyrillic)
   name: Moldovan (Cyrillic)
+molodstov_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Molodstov (Cyrillic)
 mongolian_cyrillic:
 mongolian_cyrillic:
   marc_code: mon
   marc_code: mon
   name: Mongolian (Cyrillic)
   name: Mongolian (Cyrillic)
@@ -217,6 +240,10 @@ mongolian_mongol_bichig:
   name: Mongolian (Mongol bichig)
   name: Mongolian (Mongol bichig)
 mordvin_cyrillic:
 mordvin_cyrillic:
   name: Mordvin (Cyrillic)
   name: Mordvin (Cyrillic)
+nanai_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nanai (Cyrillic)
+  marc_code: tut
 nenets_cyrillic:
 nenets_cyrillic:
   name: Nenets (Cyrillic)
   name: Nenets (Cyrillic)
 nepali_devanagari:
 nepali_devanagari:
@@ -225,6 +252,14 @@ nepali_devanagari:
 newari_devanagari:
 newari_devanagari:
   marc_code: new
   marc_code: new
   name: Newari (Devanagari)
   name: Newari (Devanagari)
+nivkh_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nivkh (Cyrillic)
+  marc_code: mis
+nogai_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Nogai (Cyrillic)
+  marc_code: ng
 oriya:
 oriya:
   marc_code: ori
   marc_code: ori
   name: Oriya
   name: Oriya
@@ -233,13 +268,17 @@ ossetic_cyrillic:
   name: Ossetic (Cyrillic)
   name: Ossetic (Cyrillic)
 pahari_devanagari:
 pahari_devanagari:
   marc_code: him
   marc_code: him
-  name: Pahari (Devanagari)  
+  name: Pahari (Devanagari)
 pali:
 pali:
   marc_code: pli
   marc_code: pli
   name: Pali
   name: Pali
 panjabi_gurmukhi:
 panjabi_gurmukhi:
   marc_code: pan
   marc_code: pan
   name: Panjabi (Gurmukhi)
   name: Panjabi (Gurmukhi)
+permyak_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Permyak (Cyrillic)
+  marc_code: kom
 persian:
 persian:
   marc_code: per
   marc_code: per
   name: Persian
   name: Persian
@@ -261,6 +300,14 @@ rajasthani_devanagari:
 romani_cyrillic:
 romani_cyrillic:
   marc_code: rom
   marc_code: rom
   name: Romani (Cyrillic)
   name: Romani (Cyrillic)
+romany_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Romany (Cyrillic)
+  marc_code: rom
+romanian_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Romanian (Cyrillic)
+  marc_code: rum
 russian:
 russian:
   marc_code: rus
   marc_code: rus
   name: Russian
   name: Russian
@@ -270,6 +317,10 @@ sanskrit_devanagari:
 serbian:
 serbian:
   marc_code: srp
   marc_code: srp
   name: Serbian
   name: Serbian
+selkup_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Selkup (Cyrillic)
+  marc_code: sel
 shor_cyrillic:
 shor_cyrillic:
   name: Shor (Cyrillic)
   name: Shor (Cyrillic)
 sinhalese:
 sinhalese:
@@ -278,6 +329,10 @@ sinhalese:
 syriac_cyrillic:
 syriac_cyrillic:
   marc_code: syc
   marc_code: syc
   name: Syriac (Cyrillic)
   name: Syriac (Cyrillic)
+tabasaran_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tabasaran (Cyrillic)
+  marc_code: cau
 tajik_cyrillic:
 tajik_cyrillic:
   marc_code: tgk
   marc_code: tgk
   name: Tajik (Cyrillic)
   name: Tajik (Cyrillic)
@@ -290,6 +345,10 @@ tamil_brahmi:
 tamil_extended:
 tamil_extended:
   marc_code: tam
   marc_code: tam
   name: Tamil (extended)
   name: Tamil (extended)
+tat_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tat (Cyrillic)
+  marc_code: ira
 tatar-kryashen_cyrillic:
 tatar-kryashen_cyrillic:
   name: Tatar-Kryashen (Cyrillic)
   name: Tatar-Kryashen (Cyrillic)
 tatar_cyrillic:
 tatar_cyrillic:
@@ -306,16 +365,23 @@ tibetan:
   name: Tibetan
   name: Tibetan
 tigrinya:
 tigrinya:
   marc_code: tir
   marc_code: tir
-  name: Tigrinya  
+  name: Tigrinya
 tod_mongolian:
 tod_mongolian:
   marc_code: xal
   marc_code: xal
   name: Tod Mongolian
   name: Tod Mongolian
 turkmen_cyrillic:
 turkmen_cyrillic:
   marc_code: tuk
   marc_code: tuk
   name: Turkmen (Cyrillic)
   name: Turkmen (Cyrillic)
+tuva_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Tuva (Cyrillic)
 tuvinian_cyrillic:
 tuvinian_cyrillic:
   marc_code: tyv
   marc_code: tyv
   name: Tuvinian (Cyrillic)
   name: Tuvinian (Cyrillic)
+udekhe_cyrillic:
+  conf: "cyrillic_generic.yml"
+  name: Udekhe (Cyrillic)
+  marc_code: tut
 udmurt_cyrillic:
 udmurt_cyrillic:
   marc_code: udm
   marc_code: udm
   name: Udmurt (Cyrillic)
   name: Udmurt (Cyrillic)
@@ -343,5 +409,3 @@ yiddish:
 yuit_cyrillic:
 yuit_cyrillic:
   marc_code: ypk
   marc_code: ypk
   name: Yuit (Cyrillic)
   name: Yuit (Cyrillic)
-  
-