Browse Source

Merge branch 'main' into cyrillic

scossu 1 year ago
parent
commit
fa83a24053

+ 2 - 2
doc/hooks.md

@@ -78,7 +78,7 @@ happens:
       current position is added verbatim to the output list, and the cursor
       advances by one position.
 5. When the end of the input text is reached, if the configuration indicates
-   that capitalization is required (this is true by default), te first element
+   that capitalization is required (this is true by default), the first element
    of the output list is capitalized.
 6. The output list is joined into one string.
 7. The string is compacted by removing excessive whitespace: Two or more
@@ -118,7 +118,7 @@ registered as constants under `scriptshifter.exceptions`.
 
 The following members of the context object are available in all the hooks:
 
-- `ctx.src`: Source text. It should not be reassigned.
+- `ctx.src`: Source text. Read only.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
 - `ctx.options`: language-specific options defined in configuration and set

+ 4 - 3
requirements.txt

@@ -1,3 +1,4 @@
-flask
-pyyaml
-uwsgi
+aksharamukha>=2.1,<2.2
+flask>=2.3,<2.4
+pyyaml>=6.0,<7
+uwsgi>=2.0,<2.1

+ 0 - 0
scriptshifter/hooks/aksharamukha/__init__.py


+ 30 - 0
scriptshifter/hooks/aksharamukha/romanizer.py

@@ -0,0 +1,30 @@
+# @package ext
+
+__doc__ = """
+Transliterate a number of Indian and other Asian scripts using Aksharamukha:
+https://github.com/virtualvinodh/aksharamukha-python """
+
+
+from logging import getLogger
+
+from aksharamukha.transliterate import process
+
+from scriptshifter.exceptions import BREAK
+
+
+logger = getLogger(__name__)
+
+
+def s2r_post_config(ctx, src_script):
+    """
+    Hook: romanize `ctx.src` from `src_script` to IAST via Aksharamukha.
+
+    The names of all truthy entries in `ctx.options`, except "capitalize",
+    are passed to Aksharamukha's `process` as `pre_options`. The result is
+    stored in `ctx.dest`.
+
+    Args:
+        ctx: transliteration context; `src` and `options` are read, `dest`
+            is written.
+        src_script (str): source script name handed to `process` as-is.
+
+    Return:
+        The `BREAK` constant imported from `scriptshifter.exceptions`.
+        NOTE(review): presumably this tells the caller to skip its own
+        processing and use `ctx.dest` -- confirm against doc/hooks.md.
+    """
+    # Options come from ctx.options rather than being auto-detected
+    # (previous approach: detect_preoptions(ctx.src, src_script)).
+    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
+    ctx.dest = process(src_script, "IAST", ctx.src, pre_options=options)
+
+    return BREAK
+
+
+def r2s_post_config(ctx, dest_script):
+    """
+    Hook: convert IAST text in `ctx.src` to `dest_script` via Aksharamukha.
+
+    The names of all truthy entries in `ctx.options`, except "capitalize",
+    are passed to Aksharamukha's `process` as `post_options`. The result is
+    stored in `ctx.dest`.
+
+    Args:
+        ctx: transliteration context; `src` and `options` are read, `dest`
+            is written.
+        dest_script (str): target script name handed to `process` as-is.
+
+    Return:
+        The `BREAK` constant imported from `scriptshifter.exceptions`.
+        NOTE(review): presumably this tells the caller to skip its own
+        processing and use `ctx.dest` -- confirm against doc/hooks.md.
+    """
+    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
+    ctx.dest = process("IAST", dest_script, ctx.src, post_options=options)
+
+    return BREAK

+ 11 - 5
scriptshifter/hooks/korean/romanizer.py

@@ -167,7 +167,9 @@ def _romanize_name(src, options):
 
     # `parsed` can either be a modified Korean string with markers, or in case
     # of a foreign name, the final romanized name.
-    parsed, _warnings = _parse_kor_name(re.sub(r"\s{2,}", " ", src.strip()))
+    parsed, _warnings = _parse_kor_name(
+            re.sub(r"\s{2,}", " ", src.strip()),
+            options)
 
     if len(_warnings):
         warnings += _warnings
@@ -211,7 +213,7 @@ def _romanize_name(src, options):
     return "", warnings
 
 
-def _parse_kor_name(src):
+def _parse_kor_name(src, options):
     parsed = None
     warnings = []
 
@@ -225,14 +227,18 @@ def _parse_kor_name(src):
     src_len = len(src)
 
     # FKR005: Error if more than 7 syllables
-    if src_len > 7 or src_len < 2 or " " in src[3:]:
-        return _kor_corp_name_rom(src), warnings
+    if src_len > 7 or src_len < 2 or src.find(" ") > 2:
+        if options.get("foreign_name"):
+            return _kor_corp_name_rom(src), warnings
+        else:
+            warnings.append("ERROR: not a Korean name.")
+            return None, warnings
 
     ct_spaces = src.count(" ")
     # FKR0006: Error if more than 2 spaces
     if ct_spaces > 2:
         warnings.append("ERROR: not a name (too many spaces)")
-        return parsed, warnings
+        return None, warnings
 
     # FKR007: 2 spaces (two family names)
     if ct_spaces == 2:

+ 16 - 0
scriptshifter/tables/data/bengali.yml

@@ -0,0 +1,16 @@
+general:
+  name: Bengali (Bangla)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Bengali"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Bengali"

+ 16 - 0
scriptshifter/tables/data/burmese.yml

@@ -0,0 +1,16 @@
+general:
+  name: Burmese (Myanmar)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Burmese"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Burmese"

+ 16 - 0
scriptshifter/tables/data/devanagari.yml

@@ -0,0 +1,16 @@
+general:
+  name: Devanagari
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Devanagari"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Devanagari"

+ 16 - 0
scriptshifter/tables/data/gurmukhi.yml

@@ -0,0 +1,16 @@
+general:
+  name: Punjabi (Gurmukhi)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Gurmukhi"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Gurmukhi"

+ 16 - 0
scriptshifter/tables/data/hiragana.yml

@@ -0,0 +1,16 @@
+general:
+  name: Japanese (Hiragana)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Hiragana"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Hiragana"

+ 23 - 0
scriptshifter/tables/data/index.yml

@@ -26,10 +26,14 @@ bashkir_cyrillic:
   name: Bashkir (Cyrillic)
 belarusian:
   name: Belarusian
+bengali:
+  name: Bengali
 bulgarian:
   name: Bulgarian
 buriat:
   name: Buriat (Cyrillic)
+burmese:
+  name: Burmese (Myanmar)
 chinese:
   name: Chinese (Hanzi)
 chukchi_cyrillic:
@@ -38,6 +42,8 @@ church_slavonic:
   name: Church Slavonic
 chuvash_cyrillic:
   name: Chuvash (Cyrillic)
+devanagari:
+  name: Devanagari
 dungan_cyrillic:
   name: Dungan (Cyrillic)
 ethiopic:
@@ -50,6 +56,10 @@ gagauz_cyrillic:
   name: Gagauz (Cyrillic)
 hindi:
   name: Hindi (Devanagari)
+hiragana:
+  name: Japanese (Hiragana)
+katakana:
+  name: Japanese (Katakana)
 kalmyk_cyrillic:
   name: Kalmyk (Cyrillic)
 kara-kalpak_cyrillic:
@@ -96,6 +106,8 @@ ossetic_cyrillic:
   name: Ossetic (Cyrillic)
 pulaar:
   name: Pulaar (Adlam)
+gurmukhi:
+  name: Punjabi (Gurmukhi)
 romani_cyrillic:
   name: Romani (Cyrillic)
 russian:
@@ -107,11 +119,22 @@ shor_cyrillic:
 syriac_cyrillic:
   name: Syriac (Cyrillic)
 tajik_cyrillic:
   name: Tajik (Cyrillic)
+tamil:
+  name: Tamil
+tamil_brahmi:
+  name: Tamil Brahmi
+tamil_extended:
+  name: Tamil (extended)
+thai:
+  name: Thai
 tatar-kryashen_cyrillic:
   name: Tatar-Kryashen (Cyrillic)
 tatar_cyrillic:
   name: Tatar (Cyrillic)
+tibetan:
+  name: Tibetan
 turkmen_cyrillic:
   name: Turkmen (Cyrillic)
 tuvinian_cyrillic:

+ 16 - 0
scriptshifter/tables/data/katakana.yml

@@ -0,0 +1,16 @@
+general:
+  name: Japanese (Katakana)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Katakana"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Katakana"

+ 8 - 0
scriptshifter/tables/data/korean_names.yml

@@ -8,6 +8,14 @@ options:
     description: Romanize according to a specific MARC field format. Leave blank if not applicable.
     type: string
     default:
+  - id: foreign_name
+    label: Foreign name
+    description: >
+      The provided string shall be romanized as a foreign name.
+      If this option is deactivated, names not falling within the Korean name
+      schema will not be transliterated and a warning will be issued.
+    type: boolean
+    default: false
 
 script_to_roman:
   hooks:

+ 16 - 0
scriptshifter/tables/data/tamil.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Tamil"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Tamil"

+ 16 - 0
scriptshifter/tables/data/tamil_brahmi.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil Brahmi
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "TamilBrahmi"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "TamilBrahmi"

+ 16 - 0
scriptshifter/tables/data/tamil_extended.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil (extended)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "TamilExtended"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "TamilExtended"

+ 43 - 0
scriptshifter/tables/data/thai.yml

@@ -0,0 +1,43 @@
+general:
+  name: Thai
+
+options:
+  - id: ThaiTranscription
+    label: Thai Orthography
+    description: พุทฺธ → พุทธะ
+    type: boolean
+    default: false
+  - id: ThaiSajjhayaOrthography
+    label: Sajjhāya orthography
+    description: พุทฺธ → พุท์ธ
+    type: boolean
+    default: false
+  - id: ThaiSajjhayawithA
+    label: Nativized sajjhaya
+    description: พุทฺธํ → พุท์ธัง
+    type: boolean
+    default: false
+  - id: ThaiNativeConsonants
+    label: Thai phonetic
+    description: พุทฺธตฺว → บุดธะต͜วะ
+    type: boolean
+    default: false
+  - id: ThaiVisargaSaraA
+    label: Sara a ะ as Visarga
+    description: พุทฺธ → พุทธะ
+    type: boolean
+    default: false
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Thai"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Thai"

+ 16 - 0
scriptshifter/tables/data/tibetan.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tibetan
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Tibetan"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Tibetan"

+ 61 - 34
scriptshifter/templates/index.html

@@ -13,6 +13,11 @@
             height: 15vh;
             padding: 0.5em;
         }
+
+        fieldset.float-left {
+            margin-right: 2rem;
+        }
+
         #results{
             font-size: 1.25em;
             background-color: whitesmoke;
@@ -49,38 +54,41 @@
                 {% endfor %}
             </select>
         </fieldset>
-        <fieldset>
-            <legend>Direction</legend>
-            <div>
-                <label class="label-inline" for="s2r">Script to Roman</label>
-                <input
-                        type="radio" id="opt_s2r" name="t_dir" value="s2r"
-                        checked>
-            </div>
-            <div>
-                <label class="label-inline" for="r2s">Roman to script</label>
-                <input
-                        type="radio" id="opt_r2s" name="t_dir" value="r2s">
-            </div>
-        </fieldset>
-        <fieldset>
-            <legend>Capitalize</legend>
-            <div>
-                <label class="label-inline" for="no-change">No change</label>
-                <input
-                        type="radio" id="no-change" name="capitalize"
-                                                     value="no_change" checked>
-            </div>
-            <div>
-                <label class="label-inline" for="first">First word</label>
-                <input type="radio" id="first" name="capitalize" value="first">
-            </div>
-            <div>
-                <label class="label-inline" for="all">All words</label>
-                <input type="radio" id="all" name="capitalize" value="all">
-            </div>
-        </fieldset>
-        <div id="options"></div>
+        <div class="clearfix">
+            <h3>General Options</h3>
+            <fieldset class="float-left">
+                <legend>Direction</legend>
+                <div>
+                    <label class="label-inline" for="opt_s2r">Script to Roman</label>
+                    <input
+                            type="radio" id="opt_s2r" name="t_dir" value="s2r"
+                            checked>
+                </div>
+                <div>
+                    <label class="label-inline" for="opt_r2s">Roman to script</label>
+                    <input
+                            type="radio" id="opt_r2s" name="t_dir" value="r2s">
+                </div>
+            </fieldset>
+            <fieldset class="float-left">
+                <legend>Capitalize</legend>
+                <div>
+                    <label class="label-inline" for="no-change">No change</label>
+                    <input
+                            type="radio" id="no-change" name="capitalize"
+                                                         value="no_change" checked>
+                </div>
+                <div>
+                    <label class="label-inline" for="first">First word</label>
+                    <input type="radio" id="first" name="capitalize" value="first">
+                </div>
+                <div>
+                    <label class="label-inline" for="all">All words</label>
+                    <input type="radio" id="all" name="capitalize" value="all">
+                </div>
+            </fieldset>
+        </div>
+        <div id="options" class="clearfix"></div>
         <fieldset>
             <input class="button-primary" type="submit" value="Transliterate!">
         </fieldset>
@@ -101,17 +109,32 @@
               .then(response=>response.json())
                 .then((data) => {
                     document.getElementById("options").replaceChildren();
+                    if (data.length > 0) {
+                        let hdr = document.createElement("h3");
+                        hdr.innerText = "Language options";
+                        document.getElementById("options").append(hdr);
+                    }
                     data.forEach((opt)=>{
                         let fset = document.createElement("fieldset");
+                        fset.setAttribute("class", "float-left");
                         let label = document.createElement("label");
                         label.setAttribute("for", opt.id);
                         label.append(opt.label);
 
                         let input = document.createElement("input");
+                        if (opt.type == "boolean") {
+                            // Use checkbox for boolean type.
+                            input.setAttribute("type", "checkbox");
+                            if (opt.default) {
+                                input.setAttribute("checked", 1);
+                            }
+                        } else {
+                            // Use text for all other types.
+                            input.value = opt.default;
+                        }
                         input.setAttribute("id", opt.id);
                         input.setAttribute("name", opt.id);
                         input.classList.add("option_i");
-                        input.value = opt.default;
 
                         let descr = document.createElement("p");
                         descr.setAttribute("class", "input_descr");
@@ -146,7 +169,11 @@
             let option_inputs = document.getElementsByClassName("option_i");
             for (i = 0; i < option_inputs.length; i++) {
                 let el = option_inputs[i];
-                options[el.getAttribute('id')] = el.value;
+                if (el.type == "checkbox") {
+                    options[el.id] = el.checked;
+                } else {
+                    options[el.id] = el.value;
+                }
             };
             data.append('options', JSON.stringify(options));
 

+ 14 - 2
scriptshifter/trans.py

@@ -19,17 +19,29 @@ class Context:
     """
     Context used within the transliteration and passed to hook functions.
     """
+    @property
+    def src(self):
+        """Original source text. Read-only: set once in the constructor."""
+        return self._src
+
+    @src.setter
+    def src(self, value):
+        # A property setter is always invoked as fset(instance, value). The
+        # `value` parameter is required so that assignment reaches this
+        # explicit error instead of failing earlier with a TypeError about
+        # the number of arguments.
+        raise NotImplementedError("Attribute is read-only.")
+
+    @src.deleter
+    def src(self):
+        # Deletion is rejected the same way as reassignment.
+        raise NotImplementedError("Attribute is read-only.")
+
     def __init__(self, src, general, langsec, options={}):
         """
         Initialize a context.
 
         Args:
-            src (str): The original text. This is meant to never change.
+            src (str): The original text. Read-only.
             general (dict): general section of the current config.
             langsec (dict): Language configuration section being used.
             options (dict): extra options as a dict.
         """
-        self.src = src
+        self._src = src
         self.general = general
         self.options = options
         self.langsec = langsec