Jelajahi Sumber

Merge pull request #134 from lcnetdev/casei

Fix case-insensitive R2S.
Stefano Cossu 8 bulan lalu
induk
melakukan
456aae040a
44 mengubah file dengan 1469 tambahan dan 139 penghapusan
  1. 1243 0
      ext/oriya.html
  2. 0 0
      legacy/korean_old.yml
  3. 7 1
      scriptshifter/tables/__init__.py
  4. 2 2
      scriptshifter/tables/data/_chinese_base.yml
  5. 2 0
      scriptshifter/tables/data/arabic.yml
  6. 2 0
      scriptshifter/tables/data/burmese.yml
  7. 3 0
      scriptshifter/tables/data/chinese.yml
  8. 2 0
      scriptshifter/tables/data/devanagari.yml
  9. 3 1
      scriptshifter/tables/data/divehi_thaana.yml
  10. 2 0
      scriptshifter/tables/data/dogri_devanagari.yml
  11. 2 0
      scriptshifter/tables/data/gujarati.yml
  12. 2 0
      scriptshifter/tables/data/gurmukhi.yml
  13. 2 1
      scriptshifter/tables/data/hebrew.yml
  14. 129 127
      scriptshifter/tables/data/hindi.yml
  15. 2 0
      scriptshifter/tables/data/hiragana.yml
  16. 2 0
      scriptshifter/tables/data/kannada.yml
  17. 2 0
      scriptshifter/tables/data/katakana.yml
  18. 2 0
      scriptshifter/tables/data/khmer.yml
  19. 2 0
      scriptshifter/tables/data/korean_names.yml
  20. 2 2
      scriptshifter/tables/data/korean_nonames.yml
  21. 2 0
      scriptshifter/tables/data/malayalam.yml
  22. 2 0
      scriptshifter/tables/data/marathi_devanagari.yml
  23. 2 0
      scriptshifter/tables/data/mongolian_mongol_bichig.yml
  24. 2 0
      scriptshifter/tables/data/nepali_devanagari.yml
  25. 2 0
      scriptshifter/tables/data/newari_devanagari.yml
  26. 2 0
      scriptshifter/tables/data/oriya.yml
  27. 2 0
      scriptshifter/tables/data/pali.yml
  28. 2 0
      scriptshifter/tables/data/panjabi.yml
  29. 3 1
      scriptshifter/tables/data/persian.yml
  30. 2 0
      scriptshifter/tables/data/prakrit_devanagari.yml
  31. 3 1
      scriptshifter/tables/data/pushto.yml
  32. 2 0
      scriptshifter/tables/data/sanskrit_devanagari.yml
  33. 2 0
      scriptshifter/tables/data/sinhalese.yml
  34. 2 0
      scriptshifter/tables/data/tamil.yml
  35. 2 0
      scriptshifter/tables/data/tamil_brahmi.yml
  36. 2 0
      scriptshifter/tables/data/tamil_extended.yml
  37. 2 0
      scriptshifter/tables/data/telugu.yml
  38. 2 0
      scriptshifter/tables/data/thai.yml
  39. 2 0
      scriptshifter/tables/data/thai_alt.yml
  40. 2 0
      scriptshifter/tables/data/tibetan.yml
  41. 3 1
      scriptshifter/tables/data/urdu.yml
  42. 2 0
      scriptshifter/tables/data/yiddish.yml
  43. 8 1
      scriptshifter/trans.py
  44. 1 1
      tests/data/script_samples/unclassified.csv

File diff ditekan karena terlalu besar
+ 1243 - 0
ext/oriya.html


+ 0 - 0
scriptshifter/tables/data/korean_old.yml → legacy/korean_old.yml


+ 7 - 1
scriptshifter/tables/__init__.py

@@ -16,7 +16,7 @@ except ImportError:
     from yaml import Loader
 
 from scriptshifter import DB_PATH
-from scriptshifter.exceptions import BREAK, ConfigError
+from scriptshifter.exceptions import BREAK, ApiError, ConfigError
 
 
 __doc__ = """
@@ -209,6 +209,9 @@ def populate_table(conn, tid, tname):
     if "roman_to_script" in data:
         flags |= FEAT_R2S
 
+    if not data.get("general", {}).get("case_sensitive", True):
+        flags |= FEAT_CASEI
+
     conn.execute(
             "UPDATE tbl_language SET features = ? WHERE id = ?",
             (flags, tid))
@@ -555,6 +558,9 @@ def get_lang_general(conn, lang):
             FROM tbl_language WHERE name = ?""", (lang,))
     lang_data = lang_q.fetchone()
 
+    if not lang_data:
+        raise ApiError(f"No language data found for {lang}", 404)
+
     return {
         "id": lang_data[0],
         "data": {

+ 2 - 2
scriptshifter/tables/data/_chinese_base.yml

@@ -1,13 +1,13 @@
 # This file is derived and kept in sync with Princeton's OCLC Connexion Pinyin
 # converter (https://github.com/pulibrary/oclcpinyin/).
 
-general: # Section names and other keywords are all snake_cased.
+general:  # Section names and other keywords are all snake_cased.
   name: Chinese base (from Princeton)
   parents:
     - _ignore_base
 
 script_to_roman:
-  map: # Mapping section.
+  map:  # Mapping section.
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u862D\u5171\u548C\u570B": "Bajisitan Yisilan Gongheguo "
     "\u5DF4\u57FA\u65AF\u5766\u4F0A\u65AF\u5170\u5171\u548C\u56FD": "Bajisitan Yisilan Gongheguo "
     "\u5DF4\u97F3\u90ED\u695E\u8499\u53E4\u81EA\u6CBB\u5DDE": "Bayinguoleng Menggu Zizhizhou "

+ 2 - 0
scriptshifter/tables/data/arabic.yml

@@ -1,9 +1,11 @@
 # Arabic S2R using the 3rd-party ArabicTransliterator library:
 # https://github.com/MTG/ArabicTransliterator
 
+---
 general:
   name: Arabic
   description: Arabic S2R using a 3rd party library.
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/burmese.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Burmese (Myanmar)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 0
scriptshifter/tables/data/chinese.yml

@@ -2,10 +2,13 @@
 #
 # All other Chinese mappings are kept in _chinese_base.yml. This mapping only
 # adds an overlay for parsing numerals and Scriptshifter-specific features.
+
+---
 general:
   name: Chinese
   parents:
     - _chinese_base
+  case_sensitive: false
 
 options:
   - id: marc_field

+ 2 - 0
scriptshifter/tables/data/devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Devanagari
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 1
scriptshifter/tables/data/divehi_thaana.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Divehi (Thaana)
+  case_sensitive: false
 
 roman_to_script:
   map:
@@ -50,7 +52,7 @@ roman_to_script:
     "bb": "\u0787\u07B0\u0784"
     "b": "\u0784"
 
-    # THAANA LETTER "L/l" WITH DOT BELOW (0323) 
+    # THAANA LETTER "L/l" WITH DOT BELOW (0323)
     "L\u0323": "\u0785"
     "l\u0323": "\u0785"
     "K": "\u0786"

+ 2 - 0
scriptshifter/tables/data/dogri_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Dogri (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/gujarati.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Gujarati
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/gurmukhi.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Punjabi (Gurmukhi)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 1
scriptshifter/tables/data/hebrew.yml

@@ -1,6 +1,8 @@
+---
 general:
   name: Hebrew
   description: Hebrew S2R.
+  case_sensitive: false
 
 options:
   - id: genre
@@ -19,4 +21,3 @@ script_to_roman:
     post_config:
       -
         - hebrew.dicta_api.s2r_post_config
-

File diff ditekan karena terlalu besar
+ 129 - 127
scriptshifter/tables/data/hindi.yml


+ 2 - 0
scriptshifter/tables/data/hiragana.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Japanese (Hiragana)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/kannada.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Kannada
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/katakana.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Japanese (Katakana)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/khmer.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Khmer
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/korean_names.yml

@@ -1,6 +1,8 @@
+---
 general:
   name: Korean (Names)
   description: Korean names S2R.
+  case_sensitive: false
 
 options:
   - id: marc_field

+ 2 - 2
scriptshifter/tables/data/korean_nonames.yml

@@ -1,11 +1,11 @@
+---
 general:
   name: Korean (Non-names)
   description: Korean S2R.
+  case_sensitive: false
 
 script_to_roman:
   hooks:
     post_config:
       -
         - korean.romanizer.s2r_nonames_post_config
-
-

+ 2 - 0
scriptshifter/tables/data/malayalam.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Malayalam
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/marathi_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Marathi (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/mongolian_mongol_bichig.yml

@@ -1,7 +1,9 @@
+---
 general:
   name: Mongolian (Mongol bichig)
   parents:
     - _ignore_base
+  case_sensitive: false
 
 roman_to_script:
 

+ 2 - 0
scriptshifter/tables/data/nepali_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Nepali (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/newari_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Newari (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/oriya.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Oriya
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/pali.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Pali
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/panjabi.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Panjabi
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 1
scriptshifter/tables/data/persian.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Persian
+  case_sensitive: false
 
 roman_to_script:
   map:
@@ -10,7 +12,7 @@ roman_to_script:
     ";": "\u061B"
     "?": "\u061F"
 
-    # Exceptions for specific words 
+    # Exceptions for specific words
     # Allah
     "Alla\u0304h": "\u0627\u0644\u0644\u0647"
 

+ 2 - 0
scriptshifter/tables/data/prakrit_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Prakrit (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 1
scriptshifter/tables/data/pushto.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Pushto
+  case_sensitive: false
 
 roman_to_script:
   map:
@@ -10,7 +12,7 @@ roman_to_script:
     ";": "\u061B"
     "?": "\u061F"
 
-    # Exceptions for specific words 
+    # Exceptions for specific words
     # Allah
     "Alla\u0304h": "\u0627\u0644\u0644\u0647"
 

+ 2 - 0
scriptshifter/tables/data/sanskrit_devanagari.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Sanskrit (Devanagari)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/sinhalese.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Sinhalese
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/tamil.yml

@@ -1,7 +1,9 @@
+---
 general:
   name: Tamil
   parents:
     - _ignore_base
+  case_sensitive: false
 
 roman_to_script:
   map:

+ 2 - 0
scriptshifter/tables/data/tamil_brahmi.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Tamil Brahmi
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/tamil_extended.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Tamil (extended)
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/telugu.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Telugu
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 2 - 0
scriptshifter/tables/data/thai.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Thai
+  case_sensitive: false
 
 options:
   - id: ThaiTranscription

+ 2 - 0
scriptshifter/tables/data/thai_alt.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Thai (alternative)
+  case_sensitive: false
 
 script_to_roman:
   map:

+ 2 - 0
scriptshifter/tables/data/tibetan.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Tibetan
+  case_sensitive: false
 
 script_to_roman:
   hooks:

+ 3 - 1
scriptshifter/tables/data/urdu.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Urdu
+  case_sensitive: false
 
 roman_to_script:
   map:
@@ -10,7 +12,7 @@ roman_to_script:
     ";": "\u061B"
     "?": "\u061F"
 
-    # Exceptions for specific words 
+    # Exceptions for specific words
     # Allah
     "Alla\u0304h": "\u0627\u0644\u0644\u0647"
     "alla\u0304h": "\u0627\u0644\u0644\u0647"

+ 2 - 0
scriptshifter/tables/data/yiddish.yml

@@ -1,5 +1,7 @@
+---
 general:
   name: Yiddish
+  case_sensitive: false
 
 options:
   - id: loshn_koydesh

+ 8 - 1
scriptshifter/trans.py

@@ -5,7 +5,7 @@ from re import compile
 
 from scriptshifter.exceptions import BREAK, CONT
 from scriptshifter.tables import (
-        BOW, EOW, WORD_BOUNDARY, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
+        BOW, EOW, WORD_BOUNDARY, FEAT_CASEI, FEAT_R2S, FEAT_S2R, HOOK_PKG_PATH,
         get_connection, get_lang_dcap, get_lang_general, get_lang_hooks,
         get_lang_ignore, get_lang_map, get_lang_normalize)
 
@@ -111,6 +111,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
                 f"Roman-to-script not yet supported for {lang}."
             )
 
+        # Normalize case before post_config and rule-based normalization.
+        if not ctx.general["case_sensitive"]:
+            ctx._src = ctx.src.lower()
+
         # This hook may take over the whole transliteration process or delegate
         # it to some external process, and return the output string directly.
         if _run_hook("post_config", ctx) == BREAK:
@@ -309,6 +313,9 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
 
 def _normalize_src(ctx, norm_rules):
+    """
+    Normalize source text according to rules.
+    """
     for nk, nv in norm_rules.items():
         ctx._src = ctx.src.replace(nk, nv)
     logger.debug(f"Normalized source: {ctx.src}")

+ 1 - 1
tests/data/script_samples/unclassified.csv

@@ -2,7 +2,7 @@ armenian,Մեդիա իրավունք : (ուսումնական ձեռնարկ) ,
 armenian,Ա Բ Գ Դ Ե Զ Է Ը Թ Ժ Ի Լ Խ Ծ Կ Հ Ձ Ղ Ճ Մ Յ Ն Շ Ո Չ Պ Ջ Ռ Ս Վ Տ Ր Ւ Փ Ք Օ Ֆ ՙ ՚ ՛ ՜ ՝ ՞   ՟ ա բ գ դ ե զ է ը թ ժ ի լ խ ծ կ ձ ղ ճ մ յ ն շ ո չ պ ջ ռ ս վ տ ր ց ւ փ ք օ ֆ և ։ ֊ .,A B G D E Y Z Ē Ě Tʻ Zh I L Kh Ts K H Dz Gg Ch M Y N Sh O Chʻ P J Ṛ S V T R Tsʻ W U Pʻ Kʻ Ew Ev Ō Fa b g d e y z ē ě tʻ zh i l kh ts k h dz gh ch m y n sh o chʻ p j ṛ s v t r tsʻ w u pʻ kʻ ew ev ō f,,
 georgian,ადგილობრივი თვითმმართველობის კოდექსი : საქართველოს ორგანული კანონი; 2018 წლის 7 სექტებრის მდგომარეობით.,Adgilobrivi tʻvitʻmmartʻvelobis kodekʻsi : Sakʻartʻvelos organuli kanoni; 2018 clis 7 sekʻtembris mdgomareobitʻ.,,
 hindi,परमहंस की पीड़ा : महान क्रांतिकारी रामप्रसाद बिस्मिल के जीवन पर आधारित उपन्यास,Paramahaṃsa kī pīṛā : mahāna krāntikārī Rāmaprasāda Bismila ke jīvana para ādhārita upanyāsa,,
-mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ ᠡ ᠶᠢᠨ ᠥᠯᠠᠨ ᠺᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠸᠷᠸᠭᠰᠠᠨ ᠰᠸᠷᠪᠸᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠸᠳᠸᠯᠸᠯ,Dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
+mongolian_mongol_bichig,ᠳᠠᠶᠢᠴᠢᠩ ᠭᠦᠷᠦᠨ ᠦ ᠦᠶ᠎ᠡ ᠶᠢᠨ ᠣᠯᠠᠨ ᠬᠡᠯᠡᠨ ᠦ ᠦᠰᠦᠭ ᠬᠠᠪᠰᠤᠷᠤᠭᠰᠠᠨ ᠰᠤᠷᠪᠤᠯᠵᠢ ᠪᠢᠴᠢᠭ ᠦᠨ ᠰᠤᠳᠤᠯᠤᠯ,dayicing gu̇ru̇n-u̇ u̇y-e-yin olan kelen-u̇ u̇su̇g qabsuruġsan surbulji bicig-u̇n sudulul,,
 ,আগবাৰীত  ফুলিলে  সোনে  মোৰ  চম্পা,Āgabārīta phulile soṇe mora campā,,
 ,Milli dövlətçilik hərəkatının yüksəlişi və Xalq Cümhuriyyəti dövründə Azərbaycançılıq ideyası,Milli dövlätçilik häräkatının yüksälişi vä Xalq Cümhuriyyäti dövründä azärbaycançılıq ideyası,,
 ,مجنون مجنون دوشون منى  شعر توپلوسو ,Macnūn macnūn düşün manī : şiʻr toplūsū,,

Beberapa file tidak ditampilkan karena terlalu banyak file yang berubah dalam diff ini