Browse Source

Add Thai support via Aksharamukha. (#79)

* Fix foreign name check; add foreign name option.

* Add default value and correct type to foreign_name option.

* Add UI checkbox.

* Add Aksharamukha integration and initial Thai support.

* Tidy up form layout.

* Add Aksharamukha option parsing in R2S.

* Add support for more Aksharamukha languages:

* Bengali
* Burmese
* Devanagari
* Gurmukhi
* Japanese (Katakana + Hiragana)
* Tamil (+ Brahmi + extended)
* Tibetan
Stefano Cossu 5 months ago
parent
commit
05979ef754

+ 2 - 2
doc/hooks.md

@@ -78,7 +78,7 @@ happens:
       current position is added verbatim to the output list, and the cursor
       advances by one position.
 5. When the end of the input text is reached, if the configuration indicates
-   that capitalization is required (this is true by default), te first element
+   that capitalization is required (this is true by default), the first element
    of the output list is capitalized.
 6. The output list is joined into one string.
 7. The string is compacted by removing excessive whitespace: Two or more
@@ -118,7 +118,7 @@ registered as constants under `scriptshifter.exceptions`.
 
 The following members of the context object are available in all the hooks:
 
-- `ctx.src`: Source text. It should not be reassigned.
+- `ctx.src`: Source text. Read-only.
 - `ctx.general`: Configuration general options.
 - `ctx.langsec`: language section (S2R or R2S) of configuration.
 - `ctx.options`: language-specific options defined in configuration and set

+ 4 - 3
requirements.txt

@@ -1,3 +1,4 @@
-flask
-pyyaml
-uwsgi
+aksharamukha>=2.1,<2.2
+flask>=2.3,<2.4
+pyyaml>=6.0,<7
+uwsgi>=2.0,<2.1

+ 0 - 0
scriptshifter/hooks/aksharamukha/__init__.py


+ 30 - 0
scriptshifter/hooks/aksharamukha/romanizer.py

@@ -0,0 +1,30 @@
+# @package ext
+
+__doc__ = """
+Transliterate a number of Indian and other Asian scripts using Aksharamukha:
+https://github.com/virtualvinodh/aksharamukha-python """
+
+
+from logging import getLogger
+
+from aksharamukha.transliterate import process
+
+from scriptshifter.exceptions import BREAK
+
+
+logger = getLogger(__name__)
+
+
+def s2r_post_config(ctx, src_script):
+    """
+    Romanize the source text with Aksharamukha (script to IAST).
+
+    The name of every truthy entry in `ctx.options`, except `capitalize`,
+    is forwarded to Aksharamukha as a pre-option.
+
+    Args:
+        ctx: Transliteration context. `ctx.src` and `ctx.options` are read;
+            the transliterated text is stored in `ctx.dest`.
+        src_script (str): Aksharamukha name of the source script.
+
+    Returns:
+        The `BREAK` constant from `scriptshifter.exceptions`.
+        NOTE(review): presumably this tells the caller to skip the rest of
+        the transliteration process -- confirm against the hooks docs.
+    """
+    # options = detect_preoptions(ctx.src, src_script)
+    # `capitalize` is filtered out; all other enabled option names are passed
+    # through verbatim.
+    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
+    # The target of the conversion is always IAST.
+    ctx.dest = process(src_script, "IAST", ctx.src, pre_options=options)
+
+    return BREAK
+
+
+def r2s_post_config(ctx, dest_script):
+    """
+    Convert romanized (IAST) text to a script with Aksharamukha.
+
+    The name of every truthy entry in `ctx.options`, except `capitalize`,
+    is forwarded to Aksharamukha as a post-option.
+
+    Args:
+        ctx: Transliteration context. `ctx.src` and `ctx.options` are read;
+            the transliterated text is stored in `ctx.dest`.
+        dest_script (str): Aksharamukha name of the destination script.
+
+    Returns:
+        The `BREAK` constant from `scriptshifter.exceptions`.
+        NOTE(review): presumably this tells the caller to skip the rest of
+        the transliteration process -- confirm against the hooks docs.
+    """
+    # `capitalize` is filtered out; all other enabled option names are passed
+    # through verbatim.
+    options = [n for n, v in ctx.options.items() if v and n != "capitalize"]
+    # The source of the conversion is always treated as IAST.
+    ctx.dest = process("IAST", dest_script, ctx.src, post_options=options)
+
+    return BREAK

+ 16 - 0
scriptshifter/tables/data/bengali.yml

@@ -0,0 +1,16 @@
+general:
+  name: Bengali (Bangla)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Bengali"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Bengali"

+ 16 - 0
scriptshifter/tables/data/burmese.yml

@@ -0,0 +1,16 @@
+general:
+  name: Burmese (Myanmar)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Burmese"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Burmese"

+ 16 - 0
scriptshifter/tables/data/devanagari.yml

@@ -0,0 +1,16 @@
+general:
+  name: Devanagari
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Devanagari"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Devanagari"

+ 16 - 0
scriptshifter/tables/data/gurmukhi.yml

@@ -0,0 +1,16 @@
+general:
+  name: Punjabi (Gurmukhi)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Gurmukhi"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Gurmukhi"

+ 16 - 0
scriptshifter/tables/data/hiragana.yml

@@ -0,0 +1,16 @@
+general:
+  name: Japanese (Hiragana)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Hiragana"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Hiragana"

+ 22 - 0
scriptshifter/tables/data/index.yml

@@ -22,12 +22,18 @@ asian_cyrillic:
     Multi-purpose transliteration for non-Slavic Cyrillic scripts: Abaza, Abkhaz, Adygei, Aisor, Altai, Avar, Azeri, Balkar, Bashkir, Buryat, Chechen, Chukchi, Chuvash, Dargwa, Dungan, Eskimo, Even, Evenki, Gagauz, Ingush, Inuit, Kabardian, Kalmyk, Karachay, Karachay-Balkar, Karakalpak, Karelian, Khakass, Khanty, Komi, Komi-Permyak, Koryak, Kumyk, Lak, Lapp, Lezghian, Lithuanian, Mansi, Mari, Moldovan, Molodstov, Mordvin, Nanai, Nenets, Nivkh, Nogai, Ossetic, Permyak, Romanian, Romany, Selkup, Shor, Tabasaran, Tat, Tuva, Udekhe, Udmurt, Yakut.
 belarusian:
   name: Belarusian
+bengali:
+  name: Bengali (Bangla)
 bulgarian:
   name: Bulgarian
+burmese:
+  name: Burmese (Myanmar)
 chinese:
   name: Chinese (Hanzi)
 church_slavonic:
   name: Church Slavonic
+devanagari:
+  name: Devanagari
 ethiopic:
   name: Ethiopic (Amharic)
 georgian:
@@ -44,6 +50,10 @@ korean_names:
   description: Korean S2R for strings ONLY containing personal names formatted as last + first name. Separate multiple names with a comma or a center-dot (U+00B7).
 kyrgyz:
   name: Kyrgyz (Cyrillic)
+hiragana:
+  name: Japanese (Hiragana)
+katakana:
+  name: Japanese (Katakana)
 mongolian:
   name: Mongolian (Cyrillic)
 mongolian_mongol_bichig:
@@ -56,10 +66,22 @@ serbian:
   name: Serbian
 pulaar:
   name: Pulaar (Adlam)
+gurmukhi:
+  name: Punjabi (Gurmukhi)
+tamil:
+  name: Tamil
+tamil_brahmi:
+  name: Tamil Brahmi
+tamil_extended:
+  name: Tamil (extended)
+thai:
+  name: Thai
 tajik:
   name: Tajik (Cyrillic)
 tatar:
   name: Tatar (Cyrillic)
+tibetan:
+  name: Tibetan
 turkmen:
   name: Turkmen (Cyrillic)
 ukrainian:

+ 16 - 0
scriptshifter/tables/data/katakana.yml

@@ -0,0 +1,16 @@
+general:
+  name: Japanese (Katakana)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Katakana"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Katakana"

+ 16 - 0
scriptshifter/tables/data/tamil.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Tamil"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Tamil"

+ 16 - 0
scriptshifter/tables/data/tamil_brahmi.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil Brahmi
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "TamilBrahmi"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "TamilBrahmi"

+ 16 - 0
scriptshifter/tables/data/tamil_extended.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tamil (extended)
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "TamilExtended"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "TamilExtended"

+ 43 - 0
scriptshifter/tables/data/thai.yml

@@ -0,0 +1,43 @@
+general:
+  name: Thai
+
+options:
+  - id: ThaiTranscription
+    label: Thai Orthography
+    description: พุทฺธ → พุทธะ
+    type: boolean
+    default: false
+  - id: ThaiSajjhayaOrthography
+    label: Sajjhāya orthography
+    description: พุทฺธ → พุท์ธ
+    type: boolean
+    default: false
+  - id: ThaiSajjhayawithA
+    label: Nativized sajjhaya
+    description: พุทฺธํ → พุท์ธัง
+    type: boolean
+    default: false
+  - id: ThaiNativeConsonants
+    label: Thai phonetic
+    description: พุทฺธตฺว → บุดธะต͜วะ
+    type: boolean
+    default: false
+  - id: ThaiVisargaSaraA
+    label: Sara a ะ as Visarga
+    description: พุทฺธ → พุทธะ
+    type: boolean
+    default: false
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Thai"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Thai"

+ 16 - 0
scriptshifter/tables/data/tibetan.yml

@@ -0,0 +1,16 @@
+general:
+  name: Tibetan
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.s2r_post_config
+        - src_script: "Tibetan"
+
+roman_to_script:
+  hooks:
+    post_config:
+      -
+        - aksharamukha.romanizer.r2s_post_config
+        - dest_script: "Tibetan"

+ 51 - 33
scriptshifter/templates/index.html

@@ -13,6 +13,11 @@
             height: 15vh;
             padding: 0.5em;
         }
+
+        fieldset.float-left {
+            margin-right: 2rem;
+        }
+
         #results{
             font-size: 1.25em;
             background-color: whitesmoke;
@@ -49,38 +54,41 @@
                 {% endfor %}
             </select>
         </fieldset>
-        <fieldset>
-            <legend>Direction</legend>
-            <div>
-                <label class="label-inline" for="s2r">Script to Roman</label>
-                <input
-                        type="radio" id="opt_s2r" name="t_dir" value="s2r"
-                        checked>
-            </div>
-            <div>
-                <label class="label-inline" for="r2s">Roman to script</label>
-                <input
-                        type="radio" id="opt_r2s" name="t_dir" value="r2s">
-            </div>
-        </fieldset>
-        <fieldset>
-            <legend>Capitalize</legend>
-            <div>
-                <label class="label-inline" for="no-change">No change</label>
-                <input
-                        type="radio" id="no-change" name="capitalize"
-                                                     value="no_change" checked>
-            </div>
-            <div>
-                <label class="label-inline" for="first">First word</label>
-                <input type="radio" id="first" name="capitalize" value="first">
-            </div>
-            <div>
-                <label class="label-inline" for="all">All words</label>
-                <input type="radio" id="all" name="capitalize" value="all">
-            </div>
-        </fieldset>
-        <div id="options"></div>
+        <div class="clearfix">
+            <h3>General Options</h3>
+            <fieldset class="float-left">
+                <legend>Direction</legend>
+                <!-- Each label's "for" must equal the id of its radio input. -->
+                <div>
+                    <label class="label-inline" for="opt_s2r">Script to Roman</label>
+                    <input
+                            type="radio" id="opt_s2r" name="t_dir" value="s2r"
+                            checked>
+                </div>
+                <div>
+                    <label class="label-inline" for="opt_r2s">Roman to script</label>
+                    <input
+                            type="radio" id="opt_r2s" name="t_dir" value="r2s">
+                </div>
+            </fieldset>
+            <fieldset class="float-left">
+                <legend>Capitalize</legend>
+                <div>
+                    <label class="label-inline" for="no-change">No change</label>
+                    <input
+                            type="radio" id="no-change" name="capitalize"
+                                                         value="no_change" checked>
+                </div>
+                <div>
+                    <label class="label-inline" for="first">First word</label>
+                    <input type="radio" id="first" name="capitalize" value="first">
+                </div>
+                <div>
+                    <label class="label-inline" for="all">All words</label>
+                    <input type="radio" id="all" name="capitalize" value="all">
+                </div>
+            </fieldset>
+        </div>
+        <div id="options" class="clearfix"></div>
         <fieldset>
             <input class="button-primary" type="submit" value="Transliterate!">
         </fieldset>
@@ -101,8 +109,14 @@
               .then(response=>response.json())
                 .then((data) => {
                     document.getElementById("options").replaceChildren();
+                    if (data.length > 0) {
+                        let hdr = document.createElement("h3");
+                        hdr.innerText = "Language options";
+                        document.getElementById("options").append(hdr);
+                    }
                     data.forEach((opt)=>{
                         let fset = document.createElement("fieldset");
+                        fset.setAttribute("class", "float-left");
                         let label = document.createElement("label");
                         label.setAttribute("for", opt.id);
                         label.append(opt.label);
@@ -155,7 +169,11 @@
             let option_inputs = document.getElementsByClassName("option_i");
             for (i = 0; i < option_inputs.length; i++) {
                 let el = option_inputs[i];
-                options[el.getAttribute('id')] = el.value;
+                if (el.type == "checkbox") {
+                    options[el.id] = el.checked;
+                } else {
+                    options[el.id] = el.value;
+                }
             };
             data.append('options', JSON.stringify(options));
 

+ 14 - 2
scriptshifter/trans.py

@@ -19,17 +19,29 @@ class Context:
     """
     Context used within the transliteration and passed to hook functions.
     """
+    @property
+    def src(self):
+        """Original source text. Read-only: set once at initialization."""
+        return self._src
+
+    @src.setter
+    def src(self, value):
+        # A property setter is always invoked with the assigned value. Without
+        # this parameter, assignment failed with TypeError instead of the
+        # intended NotImplementedError.
+        raise NotImplementedError("Attribute is read-only.")
+
+    @src.deleter
+    def src(self):
+        raise NotImplementedError("Attribute is read-only.")
+
     def __init__(self, src, general, langsec, options={}):
         """
         Initialize a context.
 
         Args:
-            src (str): The original text. This is meant to never change.
+            src (str): The original text. Read-only.
             general (dict): general section of the current config.
             langsec (dict): Language configuration section being used.
             options (dict): extra options as a dict.
         """
-        self.src = src
+        self._src = src
         self.general = general
         self.options = options
         self.langsec = langsec