1 year ago · 6413dd586c
--- a/README.md
+++ b/README.md
@@ -2,6 +2,45 @@
 
				 
			
 
				 REST API service to convert non-Latin scripts to Latin, and vice versa.
			
 
				 
			
 
				+## Environment variables
			
 
				+
			
 
				+The provided `example.env` can be renamed to `.env` in your deployment and/or
			
 
				+moved to a location that is not under version control, and adjusted to fit the
			
 
				+environment. The file will be parsed directly by the application if present,
			
 
				+or it can be pre-loaded in a Docker environment.
			
 
				+
			
 
				+Currently, the following environment variables are defined:
			
 
				+
			
 
				+- `TXL_LOGLEVEL`: Application log level. Defaults to `WARN`.
			
 
				+- `TXL_FLASK_SECRET`: Flask secret key.
			
 
				+- `TXL_DICTA_EP`: Endpoint for the Dicta Hebrew transliteration service. This
			
 
				+  is mandatory for using the Hebrew module.
			
 
				+
			
 
				+## Local development server
			
 
				+
			
 
				+For local development, it is easiest to run Flask without the WSGI wrapper,
			
 
				+possibly in a virtual environment:
			
 
				+
			
 
				+``` bash
			
 
				+# python -m venv /path/to/venv
			
 
				+# source /path/to/venv/bin/activate
			
 
				+# pip install -r requirements.txt
			
 
				+# flask run
			
 
				+```
			
 
				+
			
 
				+It is advised to set `FLASK_DEBUG=true` to reload the web app on code changes
			
 
				+and print detailed stack traces when exceptions are raised. Note that changes
			
 
				+to any .yml file do NOT trigger a reload of Flask.
			
 
				+
			
 
				+Alternatively, the transliteration interface can be accessed directly from
			
 
				+Python: 
			
 
				+
			
 
				+``` python
			
 
				+from scriptshifter.trans import transliterate
			
 
				+
			
 
				+transliterate("some text", "some language")
			
 
				+```
			
 
				+
			
 
				 ## Run on Docker
			
 
				 
			
 
				 Build container in current dir:
			
@@ -13,7 +52,7 @@ docker build -t scriptshifter:latest .
 
				 Start container:
			
 
				 
			
 
				 ```
			
 
				-docker run -e TXL_FLASK_SECRET=changeme -p 8000:8000 scriptshifter:latest
			
 
				+docker run --env-file .env -p 8000:8000 scriptshifter:latest
			
 
				 ```
			
 
				 
			
 
				 For running in development mode, add `-e FLASK_ENV=development` to the options.
			
--- a/doc/config.md
+++ b/doc/config.md
@@ -139,7 +139,13 @@ are used in the built-in API:
 
				 - `id`: the option ID used as a HTML tag ID and as a variable name.
			
 
				 - `label`: human-readable label usable in a UI.
			
 
				 - `description`: description usable in a UI. Optional.
			
 
				-- `type`: unused at the moment.
			
 
				+- `type`: at the moment, if this is set to `list`, a select widget shall be
			
 
				+  generated in the UI. An additional `options` key (see below) with the
			
 
				+  individual options will be required. Any other value generates a single-line
			
 
				+  text field.
			
 
				+- `options`: list of maps, each representing a selectable option in the
			
 
				+  drop-down menu when the type is `list`. Each list item has an `id` and a
			
 
				+  `label` key, used respectively as the input value and human-readable label.
			
 
				 - `default`: The default value that should be set for the option in a UI. Note
			
 
				   that this does not set a default value in an API call [TODO].
			
 
				 
			
--- a/example.env
+++ b/example.env
@@ -0,0 +1,4 @@
 
				+FLASK_DEBUG=true
			
 
				+TXL_DICTA_EP="changeme"
			
 
				+TXL_FLASK_SECRET="changeme"
			
 
				+TXL_LOGLEVEL="INFO"
			
--- a/scriptshifter/__init__.py
+++ b/scriptshifter/__init__.py
@@ -2,10 +2,17 @@ import logging
 
				 
			
 
				 from os import environ, path
			
 
				 
			
 
				+from dotenv import load_dotenv
			
 
				+
			
 
				+
			
 
				+env = load_dotenv()
			
 
				 
			
 
				 APP_ROOT = path.dirname(path.realpath(__file__))
			
 
				 
			
 
				 logging.basicConfig(
			
 
				         # filename=environ.get("TXL_LOGFILE", "/dev/stdout"),
			
 
				-        level=environ.get("TXL_LOGLEVEL", logging.INFO))
			
 
				+        level=environ.get("TXL_LOGLEVEL", logging.WARN))
			
 
				 logger = logging.getLogger(__name__)
			
 
				+
			
 
				+if not env:
			
 
				+    logger.warn("No .env file found. Assuming env was passed externally.")
			
--- a/scriptshifter/hooks/hebrew/dicta_api.py
+++ b/scriptshifter/hooks/hebrew/dicta_api.py
@@ -0,0 +1,38 @@
 
				+from json import dumps
			
 
				+from os import environ
			
 
				+
			
 
				+from requests import post
			
 
				+
			
 
				+from scriptshifter.exceptions import BREAK
			
 
				+from scriptshifter.tools import capitalize
			
 
				+
			
 
				+EP = environ.get("TXL_DICTA_EP")
			
 
				+DEFAULT_GENRE = "rabbinic"
			
 
				+
			
 
				+
			
 
				+def s2r_post_config(ctx):
			
 
				+    """
			
 
				+    Romanize Hebrew text using the Dicta API service.
			
 
				+    """
			
 
				+    ctx.warnings = []
			
 
				+    rsp = post(
			
 
				+            EP,
			
 
				+            data=dumps({
			
 
				+                "data": ctx.src,
			
 
				+                "genre": ctx.options.get("genre", DEFAULT_GENRE)
			
 
				+            }))
			
 
				+    rsp.raise_for_status()
			
 
				+
			
 
				+    rom = rsp.json().get("transliteration")
			
 
				+
			
 
				+    if rom:
			
 
				+        if ctx.options["capitalize"] == "all":
			
 
				+            rom = capitalize(rom)
			
 
				+        elif ctx.options["capitalize"] == "first":
			
 
				+            rom = rom[0].upper() + rom[1:]
			
 
				+    else:
			
 
				+        ctx.warnings.append("Upstream service returned empty result.")
			
 
				+
			
 
				+    ctx.dest = rom
			
 
				+
			
 
				+    return BREAK
			
--- a/scriptshifter/hooks/korean/romanizer.py
+++ b/scriptshifter/hooks/korean/romanizer.py
@@ -28,6 +28,7 @@ from csv import reader
 
				 
			
 
				 from scriptshifter.exceptions import BREAK
			
 
				 from scriptshifter.hooks.korean import KCONF
			
 
				+from scriptshifter.tools import capitalize
			
 
				 
			
 
				 
			
 
				 PWD = path.dirname(path.realpath(__file__))
			
@@ -93,7 +94,7 @@ def _romanize_nonames(src, options):
 
				     logger.debug(f"Before capitalization: {rom}")
			
 
				     # FKR042: Capitalize all first letters
			
 
				     if options["capitalize"] == "all":
			
 
				-        rom = _capitalize(rom)
			
 
				+        rom = capitalize(rom)
			
 
				     # FKR043: Capitalize the first letter
			
 
				     elif options["capitalize"] == "first":
			
 
				         rom = rom[0].upper() + rom[1:]
			
@@ -283,7 +284,7 @@ def _kor_corp_name_rom(src):
 
				     rom_tok = []
			
 
				     for tok in src.split(" "):
			
 
				         rom_tok.append(_romanize_oclc_auto(tok))
			
 
				-    rom = _capitalize(" ".join(rom_tok))
			
 
				+    rom = capitalize(" ".join(rom_tok))
			
 
				 
			
 
				     if chu == "L":
			
 
				         rom = "(Chu) " + rom
			
@@ -720,14 +721,6 @@ def _kor_lname_rom(lname):
 
				     return rom if lname != rom else False
			
 
				 
			
 
				 
			
 
				-def _capitalize(src):
			
 
				-    """ Only capitalize first word and words preceded by space."""
			
 
				-    orig_ls = src.split(" ")
			
 
				-    cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
			
 
				-
			
 
				-    return " ".join(cap_ls)
			
 
				-
			
 
				-
			
 
				 def _fkr_log(fkr_i):
			
 
				     fkr_k = f"FKR{fkr_i:03}"
			
 
				     logger.debug(f"Applying {fkr_k}: {FKR_IDX[fkr_k]}")
			
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -71,14 +71,14 @@ class Token(str):
 
				         - BEFGH
			
 
				         - B
			
 
				         """
			
 
				-        logger.debug(f"a: {self.content}, b: {other.content}")
			
 
				+        # logger.debug(f"a: {self.content}, b: {other.content}")
			
 
				         self_len = len(self.content)
			
 
				         other_len = len(other.content)
			
 
				         min_len = min(self_len, other_len)
			
 
				 
			
 
				         # If one of the strings is entirely contained in the other string...
			
 
				         if self.content[:min_len] == other.content[:min_len]:
			
 
				-            logger.debug("Roots match.")
			
 
				+            # logger.debug("Roots match.")
			
 
				             # ...then the longer one takes precedence (is "less")
			
 
				             return self_len > other_len
			
 
				 
			
--- a/scriptshifter/tables/data/hebrew.yml
+++ b/scriptshifter/tables/data/hebrew.yml
@@ -0,0 +1,22 @@
 
				+general:
			
 
				+  name: Hebrew
			
 
				+  description: Hebrew S2R.
			
 
				+
			
 
				+options:
			
 
				+  - id: genre
			
 
				+    label: Genre
			
 
				+    description: Genre of the script.
			
 
				+    type: list
			
 
				+    options:
			
 
				+      - id: rabbinic
			
 
				+        label: Rabbinic
			
 
				+      - id: modern
			
 
				+        label: Modern
			
 
				+    default: rabbinic
			
 
				+
			
 
				+script_to_roman:
			
 
				+  hooks:
			
 
				+    post_config:
			
 
				+      -
			
 
				+        - hebrew.dicta_api.s2r_post_config
			
 
				+
			
--- a/scriptshifter/tables/data/index.yml
+++ b/scriptshifter/tables/data/index.yml
@@ -54,6 +54,8 @@ georgian:
 
				   name: Georgian
			
 
				 gagauz_cyrillic:
			
 
				   name: Gagauz (Cyrillic)
			
 
				+hebrew:
			
 
				+  name: Hebrew
			
 
				 hindi:
			
 
				   name: Hindi (Devanagari)
			
 
				 hiragana:
			
--- a/scriptshifter/tools.py
+++ b/scriptshifter/tools.py
@@ -0,0 +1,9 @@
 
				+__doc__ = """ Common tools for core and hooks. """
			
 
				+
			
 
				+
			
 
				+def capitalize(src):
			
 
				+    """ Only capitalize first word and words preceded by space."""
			
 
				+    orig_ls = src.split(" ")
			
 
				+    cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
			
 
				+
			
 
				+    return " ".join(cap_ls)
			
--- a/tests/data/script_samples/hebrew.csv
+++ b/tests/data/script_samples/hebrew.csv
@@ -0,0 +1,26 @@
 
				+hebrew,מִקְדָּשׁ דּוּד,Miḳdash Daṿid,,
			
 
				+hebrew,פֵּרוּשׁ הִקְדִּישׁ,Perush ha-ḳadish,,
			
 
				+hebrew,מֵעֵינֵי הַיְּשׁוּעָה,Maʻayene ha-yeshuʻah,,
			
 
				+hebrew,לְאָמְרִי קוֹדֶשׁהוֹדוֹת וְהִלֵּל,le-hodot ṿe-halel,,
			
 
				+hebrew,אָמְרִי קוֹדֶשׁ,imre ḳodesh,,
			
 
				+hebrew,קֹבֶץ הִלְכוֹת,ḳovets halakhot,,
			
 
				+hebrew,שִׁיחַ שָׂרְפִי קוֹדֶשׁ,śiaḥ śarfe ḳodesh,,
			
 
				+hebrew,מָרְדְּכַי,Mordekhai,,
			
 
				+hebrew,גְּאֻלַּת צִיּוּן,Geʼulat Tsiyon,,
			
 
				+hebrew,זַמֶּרֶת הַיִּחוּד,Zimrat ha-yiḥud,,
			
 
				+hebrew,פְּנֵי שְׁמוּאֵל,Pene Shemu’el,,
			
 
				+hebrew,תּוֹרַת הַבֵּית,torat ha-bayit,,
			
 
				+hebrew,טַהֲרַת הַבֵּית,ṭohorat ha-bayit,,
			
 
				+hebrew,"מַאֲמָרִים בִּשְׂמֹאל-שְׂמֹאל, 2020-2015","Maʼamarim ba-śemol-śemol, 2015-2020",,
			
 
				+hebrew,לְהַמְרִיא עִם הַחַיִּים : הִנְנִי הֶעָנִי מִמַּעַשׂ,Le-hamri ‘im ha-ḥayim : hineni he-‘ani mi-ma‘aś,,
			
 
				+hebrew,פֵּרִיפֶרְיוֹת בַּמַּדִּים : מִן הַשּׁוּלַיִם לַצָּבָא וּבַחֲזָרָה,Periferyot be-madim : min ha-shulayim la-tsava uva-ḥazarah,,
			
 
				+hebrew,"צְלִיל, דִּמּוּי וּמֶרְחָב : לִיטוּרְגִּיָּה וְאַמְּנוּיוֹת כִּמְעַצְּבֵי זִכְרוֹן תִּרְבּוּתִי בִּימֵי הַבֵּינַיִם","Tselil, dimui u-merḥav : liṭurgiyah ṿe-omanuyot ki-me‘atsve zikaron tarbuti bi-Yeme ha-Benayim",,
			
 
				+hebrew,גֶּבֶר נִכְנָס בְּפַרְדֵּס,Gever nikhnas be/a-fardes,,
			
 
				+hebrew,גּוֹי קָדוֹשׁ : תָּנָ''ךְ וּלְאֻמִּיּוּת בְּעִדַּן הַמּוֹדֶרְנִי,Goi ḳadosh : Tanakh u-le’umiyut ba-‘idan ha-moderni,,
			
 
				+hebrew,נְקֻדּוֹת מִפְנֶה בַּסְּפָרוֹת הָעִבְרִית וְזִקָּתָן לְמַגָּעִים עִם סִפְרוּיוֹת אֲחֵרוֹת,Neḳudot mifneh ba-sifrut ha-‘Ivrit ṿe-ziḳatan le-maga‘im ‘im sifruyot aḥerot,,
			
 
				+hebrew,בֵּאוּר לְתַלְמוּד יְרוּשַׁלְמִי,Be’ur le-Talmud Yerushalmi,,
			
 
				+hebrew,חֲבוּרוֹת וְחִבּוּרִים : עִיּוּנִים בַּסְּפָרוֹת הַמּוֹסֵר הַקַּבָּלִית שֶׁל חַכְמֵי צְפַת בְּמֵאָה הט''ז,Ḥavurot ṿe-ḥiburim : ʻiyunim be-sifrut ha-musar ha-ḳabalit shel ḥakhme Tsefat ba-meʼah ha-16,,
			
 
				+hebrew,עוּרִי דְּבָרִי שִׁיר,‘Uri dabri shir,,
			
 
				+hebrew,הֲוָיַת הַחֲכָמָה וְגִדּוּלָהּ,Haṿayat ha-ḥokhamah ṿe-gidulah,,
			
 
				+hebrew,שִׁירִים טְחוּבִים לַיְּלָדִים רְטֻבִּים,Shirim ṭeḥuvim li-yeladim reṭuvim,,
			
 
				+hebrew,הַחַמְדָנִים : קוֹמֶדְיָה מֵחַיֵּי הָאֻמָּנִים,ha-Ḥamdanim : ḳomedyah me-ḥaye ha-omanim,,