10 months ago · 9f3ba34c49
--- a/ext/oriya.html
+++ b/ext/oriya.html
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -16,7 +16,7 @@ except ImportError:
 
				     from yaml import Loader
			
 
				 
			
 
				 from scriptshifter import DB_PATH
			
 
				-from scriptshifter.exceptions import BREAK, ConfigError
			
 
				+from scriptshifter.exceptions import BREAK, ApiError, ConfigError
			
 
				 
			
 
				 
			
 
				 __doc__ = """
			
@@ -209,6 +209,9 @@ def populate_table(conn, tid, tname):
 
				     if "roman_to_script" in data:
			
 
				         flags |= FEAT_R2S
			
 
				 
			
 
				+    if not data.get("general", {}).get("case_sensitive", True):
			
 
				+        flags |= FEAT_CASEI
			
 
				+
			
 
				     conn.execute(
			
 
				             "UPDATE tbl_language SET features = ? WHERE id = ?",
			
 
				             (flags, tid))
			
@@ -555,6 +558,9 @@ def get_lang_general(conn, lang):
 
				             FROM tbl_language WHERE name = ?""", (lang,))
			
 
				     lang_data = lang_q.fetchone()
			
 
				 
			
 
				+    if not lang_data:
			
 
				+        raise ApiError(f"No language data found for {lang}", 404)
			
 
				+
			
 
				     return {
			
 
				         "id": lang_data[0],
			
 
				         "data": {
			
--- a/scriptshifter/tables/data/_chinese_base.yml
+++ b/scriptshifter/tables/data/_chinese_base.yml
@@ -1,13 +1,13 @@
 
				 # This file is derived and kept in sync with Princeton's OCLC Connexion Pinyin
			
 
				 # converter (https://github.com/pulibrary/oclcpinyin/).
			
 
				 
			
 
				-general: # Section names and other keywords are all snake_cased.
			
 
				+general:  # Section names and other keywords are all snake_cased.
			
 
				   name: Chinese base (from Princeton)
			
 
				   parents:
			
 
				     - _ignore_base
			
 
				 
			
 
				 script_to_roman:
			
 
				-  map: # Mapping section.
			
 
				+  map:  # Mapping section.
			
 
				     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
			
 
				     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "
			
 
				     "\u5DF4\u97F3\u90ED\u695E\u8499\u53E4\u81EA\u6CBB\u5DDE": "Bayinguoleng Menggu Zizhizhou "
			
--- a/scriptshifter/tables/data/arabic.yml
+++ b/scriptshifter/tables/data/arabic.yml
@@ -1,9 +1,11 @@
 
				 # Arabic S2R using the 3rd-party ArabicTransliterator library:
			
 
				 # https://github.com/MTG/ArabicTransliterator
			
 
				 
			
 
				+---
			
 
				 general:
			
 
				   name: Arabic
			
 
				   description: Arabic S2R using a 3rd party library.
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/chinese.yml
+++ b/scriptshifter/tables/data/chinese.yml
@@ -2,10 +2,13 @@
 
				 #
			
 
				 # All other Chinese mappings are kept in _chinese_base.yml. This mapping only
			
 
				 # adds an overlay for parsing numerals and Scriptshifter-specific features.
			
 
				+
			
 
				+---
			
 
				 general:
			
 
				   name: Chinese
			
 
				   parents:
			
 
				     - _chinese_base
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 options:
			
 
				   - id: marc_field
			
--- a/scriptshifter/tables/data/gujarati.yml
+++ b/scriptshifter/tables/data/gujarati.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Gujarati
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/hebrew.yml
+++ b/scriptshifter/tables/data/hebrew.yml
@@ -1,6 +1,8 @@
 
				+---
			
 
				 general:
			
 
				   name: Hebrew
			
 
				   description: Hebrew S2R.
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 options:
			
 
				   - id: genre
			
@@ -19,4 +21,3 @@ script_to_roman:
 
				     post_config:
			
 
				       -
			
 
				         - hebrew.dicta_api.s2r_post_config
			
 
				-
			
--- a/scriptshifter/tables/data/kannada.yml
+++ b/scriptshifter/tables/data/kannada.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Kannada
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/malayalam.yml
+++ b/scriptshifter/tables/data/malayalam.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Malayalam
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/marathi_devanagari.yml
+++ b/scriptshifter/tables/data/marathi_devanagari.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Marathi (Devanagari)
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/nepali_devanagari.yml
+++ b/scriptshifter/tables/data/nepali_devanagari.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Nepali (Devanagari)
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/oriya.yml
+++ b/scriptshifter/tables/data/oriya.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Oriya
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/pali.yml
+++ b/scriptshifter/tables/data/pali.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Pali
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/sanskrit_devanagari.yml
+++ b/scriptshifter/tables/data/sanskrit_devanagari.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Sanskrit (Devanagari)
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/sinhalese.yml
+++ b/scriptshifter/tables/data/sinhalese.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Sinhalese
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/telugu.yml
+++ b/scriptshifter/tables/data/telugu.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Telugu
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 script_to_roman:
			
 
				   hooks:
			
--- a/scriptshifter/tables/data/thai.yml
+++ b/scriptshifter/tables/data/thai.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Thai
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 options:
			
 
				   - id: ThaiTranscription
			
--- a/scriptshifter/tables/data/yiddish.yml
+++ b/scriptshifter/tables/data/yiddish.yml
@@ -1,5 +1,7 @@
 
				+---
			
 
				 general:
			
 
				   name: Yiddish
			
 
				+  case_sensitive: false
			
 
				 
			
 
				 options:
			
 
				   - id: loshn_koydesh
			
--- a/scriptshifter/trans.py
+++ b/scriptshifter/trans.py
@@ -5,7 +5,7 @@ from re import compile
 
				 
			
 
				 from scriptshifter.exceptions import BREAK, CONT
			
 
				 from scriptshifter.tables import (
			
 
				-        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
			
 
				+        BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
			
 
				         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
			
 
				         get_lang_ignore, get_lang_map, get_lang_normalize)
			
 
				 
			
@@ -111,6 +111,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
				                 f"Roman-to-script not yet supported for {lang}."
			
 
				             )
			
 
				 
			
 
				+        # Normalize case before post_config and rule-based normalization.
			
 
				+        if not ctx.general["case_sensitive"]:
			
 
				+            ctx._src = ctx.src.lower()
			
 
				+
			
 
				         # This hook may take over the whole transliteration process or delegate
			
 
				         # it to some external process, and return the output string directly.
			
 
				         if _run_hook("post_config", ctx) == BREAK:
			
@@ -309,6 +313,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
				 
			
 
				 
			
 
				 def _normalize_src(ctx, norm_rules):
			
 
				+    """
			
 
				+    Normalize source text according to rules.
			
 
				+    """
			
 
				     for nk, nv in norm_rules.items():
			
 
				         ctx._src = ctx.src.replace(nk, nv)
			
 
				     logger.debug(f"Normalized source: {ctx.src}")
			
--- a/tests/data/script_samples/unclassified.csv
+++ b/tests/data/script_samples/unclassified.csv
@@ -2,7 +2,7 @@ armenian,Մեդիա իրավունք : (ուսումնական ձեռնարկ) ,
 
				 armenian,Ա Բ Գ Դ Ե Զ Է Ը Թ Ժ Ի Լ Խ Ծ Կ Հ Ձ Ղ Ճ Մ Յ Ն Շ Ո Չ Պ Ջ Ռ Ս Վ Տ Ր Ւ Փ Ք Օ Ֆ ՙ ՚ ՛ ՜ ՝ ՞   ՟ ա բ գ դ ե զ է ը թ ժ ի լ խ ծ կ ձ ղ ճ մ յ ն շ ո չ պ ջ ռ ս վ տ ր ց ւ փ ք օ ֆ և ։ ֊ .,A B G D E Y Z Ē Ě Tʻ Zh I L Kh Ts K H Dz Gg Ch M Y N Sh O Chʻ P J Ṛ S V T R Tsʻ W U Pʻ Kʻ Ew Ev Ō Fa b g d e y z ē ě tʻ zh i l kh ts k h dz gh ch m y n sh o chʻ p j ṛ s v t r tsʻ w u pʻ kʻ ew ev ō f,,
			
 
				 georgian,ადგილობრივი თვითმმართველობის კოდექსი : საქართველოს ორგანული კანონი; 2018 წლის 7 სექტებრის მდგომარეობით.,Adgilobrivi tʻvitʻmmartʻvelobis kodekʻsi : Sakʻartʻvelos organuli kanoni; 2018 clis 7 sekʻtembris mdgomareobitʻ.,,
			
 
				 hindi,परमहंस की पीड़ा : महान क्रांतिकारी रामप्रसाद बिस्मिल के जीवन पर आधारित उपन्यास,Paramahaṃsa kī pīṛā : mahāna krāntikārī Rāmaprasāda Bismila ke jīvana para ādhārita upanyāsa,,
			
 
				-mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ ᠡ ᠶᠢᠨ ᠥᠯᠠᠨ ᠺᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠸᠷᠸᠭᠰᠠᠨ ᠰᠸᠷᠪᠸᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠸᠳᠸᠯᠸᠯ,Dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
			
 
				+mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ᠎ᠡ ᠶᠢᠨ ᠣᠯᠠᠨ ᠬᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠤᠷᠤᠭᠰᠠᠨ ᠰᠤᠷᠪᠤᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠤᠳᠤᠯᠤᠯ,dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
			
 
				 ,আগবাৰীত  ফুলিলে  সোনে  মোৰ  চম্পা,Āgabārīta phulile soṇe mora campā,,
			
 
				 ,Milli dövlətçilik hərəkatının yüksəlişi və Xalq Cümhuriyyəti dövründə Azərbaycançılıq ideyası,Milli dövlätçilik häräkatının yüksälişi vä Xalq Cümhuriyyäti dövründä azärbaycançılıq ideyası,,
			
 
				 ,مجنون مجنون دوشون منى  شعر توپلوسو ,Macnūn macnūn düşün manī : şiʻr toplūsū,,