Browse Source

Force lowercase on R2S for case-insensitive scripts.

scossu 8 months ago
parent
commit
9f3ba34c49

File diff suppressed because it is too large
+ 1243 - 0
ext/oriya.html


+ 7 - 1
scriptshifter/tables/__init__.py

@@ -16,7 +16,7 @@ except ImportError:
     from yaml import Loader
 
 from scriptshifter import DB_PATH
-from scriptshifter.exceptions import BREAK, ConfigError
+from scriptshifter.exceptions import BREAK, ApiError, ConfigError
 
 
 __doc__ = """
@@ -209,6 +209,9 @@ def populate_table(conn, tid, tname):
     if "roman_to_script" in data:
         flags |= FEAT_R2S
 
+    if not data.get("general", {}).get("case_sensitive", True):
+        flags |= FEAT_CASEI
+
     conn.execute(
             "UPDATE tbl_language SET features = ? WHERE id = ?",
             (flags, tid))
@@ -555,6 +558,9 @@ def get_lang_general(conn, lang):
             FROM tbl_language WHERE name = ?""", (lang,))
     lang_data = lang_q.fetchone()
 
+    if not lang_data:
+        raise ApiError(f"No language data found for {lang}", 404)
+
     return {
         "id": lang_data[0],
         "data": {

+ 2 - 2
scriptshifter/tables/data/_chinese_base.yml

@@ -1,13 +1,13 @@
 # This file is derived and kept in sync with Princeton's OCLC Connexion Pinyin
 # converter (https://github.com/pulibrary/oclcpinyin/).
 
-general: # Section names and other keywords are all snake_cased.
+general:  # Section names and other keywords are all snake_cased.
   name: Chinese base (from Princeton)
   parents:
     - _ignore_base
 
 script_to_roman:
-  map: # Mapping section.
+  map:  # Mapping section.
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "
     "\u5DF4\u97F3\u90ED\u695E\u8499\u53E4\u81EA\u6CBB\u5DDE": "Bayinguoleng Menggu Zizhizhou "

+ 2 - 0
scriptshifter/tables/data/arabic.yml

@@ -1,9 +1,11 @@
 # Arabic S2R using the 3rd-party ArabicTransliterator library:
 # https://github.com/MTG/ArabicTransliterator
 
+---
 general:
   name: Arabic
   description: Arabic S2R using a 3rd party library.
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 0
scriptshifter/tables/data/chinese.yml

@@ -2,10 +2,13 @@
 #
 # All other Chinese mappings are kept in _chinese_base.yml. This mapping only
 # adds an overlay for parsing numerals and Scriptshifter-specific features.
+
+---
 general:
   name: Chinese
   parents:
     - _chinese_base
+  case_sensitive: false
 
 options:
   - id: marc_field

+ 2 - 0
scriptshifter/tables/data/gujarati.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Gujarati
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 1
scriptshifter/tables/data/hebrew.yml

@@ -1,6 +1,8 @@
+---
 general:
   name: Hebrew
   description: Hebrew S2R.
+  case_sensitive: false
 
 options:
   - id: genre
@@ -19,4 +21,3 @@ script_to_roman:
     post_config:
       -
         - hebrew.dicta_api.s2r_post_config
-

+ 2 - 0
scriptshifter/tables/data/kannada.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Kannada
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/malayalam.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Malayalam
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/marathi_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Marathi (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/nepali_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Nepali (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/oriya.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Oriya
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/pali.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Pali
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/sanskrit_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Sanskrit (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/sinhalese.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Sinhalese
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/telugu.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Telugu
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/thai.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Thai
+  case_sensitive: false
 
 options:
   - id: ThaiTranscription

+ 2 - 0
scriptshifter/tables/data/yiddish.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Yiddish
+  case_sensitive: false
 
 options:
   - id: loshn_koydesh

+ 8 - 1
scriptshifter/trans.py

@@ -5,7 +5,7 @@ from re import compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
-        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
+        BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
         get_lang_ignore, get_lang_map, get_lang_normalize)
 
@@ -111,6 +111,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                 f"Roman-to-script not yet supported for {lang}."
             )
 
+        # Normalize case before post_config and rule-based normalization.
+        if not ctx.general["case_sensitive"]:
+            ctx._src = ctx.src.lower()
+
         # This hook may take over the whole transliteration process or delegate
         # it to some external process, and return the output string directly.
         if _run_hook("post_config", ctx) == BREAK:
@@ -309,6 +313,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
 
 def _normalize_src(ctx, norm_rules):
+    """
+    Normalize source text according to rules.
+    """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
     logger.debug(f"Normalized source: {ctx.src}")

+ 1 - 1
tests/data/script_samples/unclassified.csv

@@ -2,7 +2,7 @@ armenian,Մեդիա իրավունք : (ուսումնական ձեռնարկ) ,
 armenian,Ա Բ Գ Դ Ե Զ Է Ը Թ Ժ Ի Լ Խ Ծ Կ Հ Ձ Ղ Ճ Մ Յ Ն Շ Ո Չ Պ Ջ Ռ Ս Վ Տ Ր Ւ Փ Ք Օ Ֆ ՙ ՚ ՛ ՜ ՝ ՞   ՟ ա բ գ դ ե զ է ը թ ժ ի լ խ ծ կ ձ ղ ճ մ յ ն շ ո չ պ ջ ռ ս վ տ ր ց ւ փ ք օ ֆ և ։ ֊ .,A B G D E Y Z Ē Ě Tʻ Zh I L Kh Ts K H Dz Gg Ch M Y N Sh O Chʻ P J Ṛ S V T R Tsʻ W U Pʻ Kʻ Ew Ev Ō Fa b g d e y z ē ě tʻ zh i l kh ts k h dz gh ch m y n sh o chʻ p j ṛ s v t r tsʻ w u pʻ kʻ ew ev ō f,,
 georgian,ადგილობრივი თვითმმართველობის კოდექსი : საქართველოს ორგანული კანონი; 2018 წლის 7 სექტებრის მდგომარეობით.,Adgilobrivi tʻvitʻmmartʻvelobis kodekʻsi : Sakʻartʻvelos organuli kanoni; 2018 clis 7 sekʻtembris mdgomareobitʻ.,,
 hindi,परमहंस की पीड़ा : महान क्रांतिकारी रामप्रसाद बिस्मिल के जीवन पर आधारित उपन्यास,Paramahaṃsa kī pīṛā : mahāna krāntikārī Rāmaprasāda Bismila ke jīvana para ādhārita upanyāsa,,
-mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ ᠡ ᠶᠢᠨ ᠥᠯᠠᠨ ᠺᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠸᠷᠸᠭᠰᠠᠨ ᠰᠸᠷᠪᠸᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠸᠳᠸᠯᠸᠯ,Dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
+mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ᠎ᠡ ᠶᠢᠨ ᠣᠯᠠᠨ ᠬᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠤᠷᠤᠭᠰᠠᠨ ᠰᠤᠷᠪᠤᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠤᠳᠤᠯᠤᠯ,dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
 ,আগবাৰীত  ফুলিলে  সোনে  মোৰ  চম্পা,Āgabārīta phulile soṇe mora campā,,
 ,Milli dövlətçilik hərəkatının yüksəlişi və Xalq Cümhuriyyəti dövründə Azərbaycançılıq ideyası,Milli dövlätçilik häräkatının yüksälişi vä Xalq Cümhuriyyäti dövründä azärbaycançılıq ideyası,,
 ,مجنون مجنون دوشون منى  شعر توپلوسو ,Macnūn macnūn düşün manī : şiʻr toplūsū,,

Some files were not shown because too many files changed in this diff