Browse Source

Hebrew (#81)

* Add Hebrew transliteration via Dicta; add list options.

* Update README.

* Several Hebrew and general changes:

* Add capitalization options
* Move capitalization function to new "tools" module
* Use env file

* Add requests module.

* Remove camel_tools requirement.

* Make rabbinic the default style; add documentation for option type.

* Add Hebrew test strings.
Stefano Cossu 4 months ago
parent
commit
6413dd586c

+ 40 - 1
README.md

@@ -2,6 +2,45 @@
 
 REST API service to convert non-Latin scripts to Latin, and vice versa.
 
+## Environment variables
+
+The provided `example.env` can be renamed to `.env` in your deployment and/or
+moved to a location that is not under version control, and adjusted to fit the
+environment. The file will be parsed directly by the application if present,
+or it can be pre-loaded in a Docker environment.
+
+Currently, the following environment variables are defined:
+
+- `TXL_LOGLEVEL`: Application log level. Defaults to `WARN`.
+- `TXL_FLASK_SECRET`: Flask secret key.
+- `TXL_DICTA_EP`: Endpoint for the Dicta Hebrew transliteration service. This
+  is mandatory for using the Hebrew module.
+
+## Local development server
+
+For local development, it is easiest to run Flask without the WSGI wrapper,
+possibly in a virtual environment:
+
+``` bash
+# python -m venv /path/to/venv
+# source /path/to/venv/bin/activate
+# pip install -r requirements.txt
+# flask run
+```
+
+It is advised to set `FLASK_DEBUG=true` to reload the web app on code changes
+and print detailed stack traces when exceptions are raised. Note that changes
+to any .yml file do NOT trigger a reload of Flask.
+
+Alternatively, the transliteration interface can be accessed directly from
+Python: 
+
+``` python
+from scriptshifter.trans import transliterate
+
+transliterate("some text", "some language")
+```
+
 ## Run on Docker
 
 Build container in current dir:
@@ -13,7 +52,7 @@ docker build -t scriptshifter:latest .
 Start container:
 
 ```
-docker run -e TXL_FLASK_SECRET=changeme -p 8000:8000 scriptshifter:latest
+docker run --env-file .env -p 8000:8000 scriptshifter:latest
 ```
 
 For running in development mode, add `-e FLASK_ENV=development` to the options.

+ 7 - 1
doc/config.md

@@ -139,7 +139,13 @@ are used in the built-in API:
 - `id`: the option ID used as a HTML tag ID and as a variable name.
 - `label`: human-readable label usable in a UI.
 - `description`: description usable in a UI. Optional.
-- `type`: unused at the moment.
+- `type`: at the moment, if this is set to `list`, a select widget shall be
+  generated in the UI. An additional `options` key (see below) with the
+  individual options will be required. Any other value generates a single-line
+  text field.
+- `options`: list of maps, each representing a selectable option in the
+  drop-down menu when the type is `list`. Each list item has an `id` and a
+  `label` key, used respectively as the input value and human-readable label.
 - `default`: The default value that should be set for the option in a UI. Note
   that this does not set a default value in an API call [TODO].
 

+ 4 - 0
example.env

@@ -0,0 +1,4 @@
+FLASK_DEBUG=true
+TXL_DICTA_EP="changeme"
+TXL_FLASK_SECRET="changeme"
+TXL_LOGLEVEL="INFO"

+ 8 - 1
scriptshifter/__init__.py

@@ -2,10 +2,17 @@ import logging
 
 from os import environ, path
 
+from dotenv import load_dotenv
+
+
+env = load_dotenv()
 
 APP_ROOT = path.dirname(path.realpath(__file__))
 
 logging.basicConfig(
         # filename=environ.get("TXL_LOGFILE", "/dev/stdout"),
-        level=environ.get("TXL_LOGLEVEL", logging.INFO))
+        level=environ.get("TXL_LOGLEVEL", logging.WARN))
 logger = logging.getLogger(__name__)
+
+if not env:  # load_dotenv() returned a falsy value.
+    logger.warning("No .env file found. Assuming env was passed externally.")

+ 38 - 0
scriptshifter/hooks/hebrew/dicta_api.py

@@ -0,0 +1,38 @@
+from json import dumps
+from os import environ
+
+from requests import post
+
+from scriptshifter.exceptions import BREAK
+from scriptshifter.tools import capitalize
+
+EP = environ.get("TXL_DICTA_EP")  # Dicta service endpoint; None if unset.
+DEFAULT_GENRE = "rabbinic"  # Genre sent when the `genre` option is not set.
+
+
+def s2r_post_config(ctx):
+    """
+    Romanize Hebrew text using the Dicta API service.
+
+    Post `ctx.src` and the `genre` option to the `TXL_DICTA_EP` endpoint,
+    store the returned transliteration in `ctx.dest`, and return `BREAK`.
+    """
+    ctx.warnings = []
+    payload = {
+        "data": ctx.src,
+        "genre": ctx.options.get("genre", DEFAULT_GENRE),
+    }
+    rsp = post(EP, data=dumps(payload))
+    rsp.raise_for_status()
+    rom = rsp.json().get("transliteration")
+
+    # The `capitalize` option may not be set: don't fail with a KeyError.
+    cap = ctx.options.get("capitalize")
+    if not rom:
+        ctx.warnings.append("Upstream service returned empty result.")
+    elif cap == "all":
+        rom = capitalize(rom)
+    elif cap == "first":
+        rom = rom[0].upper() + rom[1:]
+    ctx.dest = rom
+    return BREAK

+ 3 - 10
scriptshifter/hooks/korean/romanizer.py

@@ -28,6 +28,7 @@ from csv import reader
 
 from scriptshifter.exceptions import BREAK
 from scriptshifter.hooks.korean import KCONF
+from scriptshifter.tools import capitalize
 
 
 PWD = path.dirname(path.realpath(__file__))
@@ -93,7 +94,7 @@ def _romanize_nonames(src, options):
     logger.debug(f"Before capitalization: {rom}")
     # FKR042: Capitalize all first letters
     if options["capitalize"] == "all":
-        rom = _capitalize(rom)
+        rom = capitalize(rom)
     # FKR043: Capitalize the first letter
     elif options["capitalize"] == "first":
         rom = rom[0].upper() + rom[1:]
@@ -283,7 +284,7 @@ def _kor_corp_name_rom(src):
     rom_tok = []
     for tok in src.split(" "):
         rom_tok.append(_romanize_oclc_auto(tok))
-    rom = _capitalize(" ".join(rom_tok))
+    rom = capitalize(" ".join(rom_tok))
 
     if chu == "L":
         rom = "(Chu) " + rom
@@ -720,14 +721,6 @@ def _kor_lname_rom(lname):
     return rom if lname != rom else False
 
 
-def _capitalize(src):
-    """ Only capitalize first word and words preceded by space."""
-    orig_ls = src.split(" ")
-    cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
-
-    return " ".join(cap_ls)
-
-
 def _fkr_log(fkr_i):
     fkr_k = f"FKR{fkr_i:03}"
     logger.debug(f"Applying {fkr_k}: {FKR_IDX[fkr_k]}")

+ 2 - 2
scriptshifter/tables/__init__.py

@@ -71,14 +71,14 @@ class Token(str):
         - BEFGH
         - B
         """
-        logger.debug(f"a: {self.content}, b: {other.content}")
+        # logger.debug(f"a: {self.content}, b: {other.content}")
         self_len = len(self.content)
         other_len = len(other.content)
         min_len = min(self_len, other_len)
 
         # If one of the strings is entirely contained in the other string...
         if self.content[:min_len] == other.content[:min_len]:
-            logger.debug("Roots match.")
+            # logger.debug("Roots match.")
             # ...then the longer one takes precedence (is "less")
             return self_len > other_len
 

+ 22 - 0
scriptshifter/tables/data/hebrew.yml

@@ -0,0 +1,22 @@
+general:
+  name: Hebrew
+  description: Hebrew S2R.
+
+options:
+  - id: genre
+    label: Genre
+    description: Genre of the script.
+    type: list
+    options:
+      - id: rabbinic
+        label: Rabbinic
+      - id: modern
+        label: Modern
+    default: rabbinic
+
+script_to_roman:
+  hooks:
+    post_config:
+      -
+        - hebrew.dicta_api.s2r_post_config
+

+ 2 - 0
scriptshifter/tables/data/index.yml

@@ -54,6 +54,8 @@ georgian:
   name: Georgian
 gagauz_cyrillic:
   name: Gagauz (Cyrillic)
+hebrew:
+  name: Hebrew
 hindi:
   name: Hindi (Devanagari)
 hiragana:

+ 9 - 0
scriptshifter/tools.py

@@ -0,0 +1,9 @@
+__doc__ = """ Common tools for core and hooks. """
+
+
+def capitalize(src):
+    """ Capitalize first word and words preceded by space; keep the rest."""
+    # Slice rather than index, so that empty tokens (from an empty string or
+    # consecutive spaces) don't raise an IndexError.
+    cap_ls = [tok[:1].upper() + tok[1:] for tok in src.split(" ")]
+    return " ".join(cap_ls)

+ 26 - 0
tests/data/script_samples/hebrew.csv

@@ -0,0 +1,26 @@
+hebrew,מִקְדָּשׁ דּוּד,Miḳdash Daṿid,,
+hebrew,פֵּרוּשׁ הִקְדִּישׁ,Perush ha-ḳadish,,
+hebrew,מֵעֵינֵי הַיְּשׁוּעָה,Maʻayene ha-yeshuʻah,,
+hebrew,לְאָמְרִי קוֹדֶשׁהוֹדוֹת וְהִלֵּל,le-hodot ṿe-halel,,
+hebrew,אָמְרִי קוֹדֶשׁ,imre ḳodesh,,
+hebrew,קֹבֶץ הִלְכוֹת,ḳovets halakhot,,
+hebrew,שִׁיחַ שָׂרְפִי קוֹדֶשׁ,śiaḥ śarfe ḳodesh,,
+hebrew,מָרְדְּכַי,Mordekhai,,
+hebrew,גְּאֻלַּת צִיּוּן,Geʼulat Tsiyon,,
+hebrew,זַמֶּרֶת הַיִּחוּד,Zimrat ha-yiḥud,,
+hebrew,פְּנֵי שְׁמוּאֵל,Pene Shemu’el,,
+hebrew,תּוֹרַת הַבֵּית,torat ha-bayit,,
+hebrew,טַהֲרַת הַבֵּית,ṭohorat ha-bayit,,
+hebrew,"מַאֲמָרִים בִּשְׂמֹאל-שְׂמֹאל, 2020-2015","Maʼamarim ba-śemol-śemol, 2015-2020",,
+hebrew,לְהַמְרִיא עִם הַחַיִּים : הִנְנִי הֶעָנִי מִמַּעַשׂ,Le-hamri ‘im ha-ḥayim : hineni he-‘ani mi-ma‘aś,,
+hebrew,פֵּרִיפֶרְיוֹת בַּמַּדִּים : מִן הַשּׁוּלַיִם לַצָּבָא וּבַחֲזָרָה,Periferyot be-madim : min ha-shulayim la-tsava uva-ḥazarah,,
+hebrew,"צְלִיל, דִּמּוּי וּמֶרְחָב : לִיטוּרְגִּיָּה וְאַמְּנוּיוֹת כִּמְעַצְּבֵי זִכְרוֹן תִּרְבּוּתִי בִּימֵי הַבֵּינַיִם","Tselil, dimui u-merḥav : liṭurgiyah ṿe-omanuyot ki-me‘atsve zikaron tarbuti bi-Yeme ha-Benayim",,
+hebrew,גֶּבֶר נִכְנָס בְּפַרְדֵּס,Gever nikhnas be/a-fardes,,
+hebrew,גּוֹי קָדוֹשׁ : תָּנָ''ךְ וּלְאֻמִּיּוּת בְּעִדַּן הַמּוֹדֶרְנִי,Goi ḳadosh : Tanakh u-le’umiyut ba-‘idan ha-moderni,,
+hebrew,נְקֻדּוֹת מִפְנֶה בַּסְּפָרוֹת הָעִבְרִית וְזִקָּתָן לְמַגָּעִים עִם סִפְרוּיוֹת אֲחֵרוֹת,Neḳudot mifneh ba-sifrut ha-‘Ivrit ṿe-ziḳatan le-maga‘im ‘im sifruyot aḥerot,,
+hebrew,בֵּאוּר לְתַלְמוּד יְרוּשַׁלְמִי,Be’ur le-Talmud Yerushalmi,,
+hebrew,חֲבוּרוֹת וְחִבּוּרִים : עִיּוּנִים בַּסְּפָרוֹת הַמּוֹסֵר הַקַּבָּלִית שֶׁל חַכְמֵי צְפַת בְּמֵאָה הט''ז,Ḥavurot ṿe-ḥiburim : ʻiyunim be-sifrut ha-musar ha-ḳabalit shel ḥakhme Tsefat ba-meʼah ha-16,,
+hebrew,עוּרִי דְּבָרִי שִׁיר,‘Uri dabri shir,,
+hebrew,הֲוָיַת הַחֲכָמָה וְגִדּוּלָהּ,Haṿayat ha-ḥokhamah ṿe-gidulah,,
+hebrew,שִׁירִים טְחוּבִים לַיְּלָדִים רְטֻבִּים,Shirim ṭeḥuvim li-yeladim reṭuvim,,
+hebrew,הַחַמְדָנִים : קוֹמֶדְיָה מֵחַיֵּי הָאֻמָּנִים,ha-Ḥamdanim : ḳomedyah me-ḥaye ha-omanim,,