4 Commits 4fca3e8efe ... d57108d30d

Author SHA1 Message Date
  scossu d57108d30d Add documentation for double cap conf; update TODO. 1 year ago
  scossu 76f9727776 Implement double caps; sample Belarusian config. 1 year ago
  scossu 4c1e5c1c3a Fix some cap and test configurations. 1 year ago
  scossu 67e11e8f3d Add configuration and tests for double caps. 1 year ago

+ 9 - 8
TODO.md

@@ -5,10 +5,11 @@ discussion, etc.); *X* = not implementing.
 
 - *D* Basic table loading & parsing
 - *D* Table inheritance
-- *W* Multiple recursive inheritance
+- *D* Multiple recursive inheritance
   - *D* Inherit map
   - *D* Inherit ignore
-  - *B* Inherit hooks
+  - *D* Inherit double cap configuration
+  - *X* Inherit hooks
 - *D* Ignore list (R2S)
 - *D* Basic transliteration in both directions
 - *D* Basic REST API
@@ -18,13 +19,13 @@ discussion, etc.); *X* = not implementing.
 - *D* Word boundaries
   - *D* Define word boundary characters per config
   - *D* Mark end-of-word and beginning-of-word characters
-- *B* Optimize token lookup
+- *D* Optimize token lookup
   - *D* Break loop early based on alphabetical order
-  - *B* Ignore word break characters
-  - *W* Capitalization
-    - *P* Separate capitalization function
-    - *B* Capitalize ligated letters (e.g. Cyrillic T͡͡S)
-    - *D* Option for capitalizing first word, all words, none, unchanged
+  - *X* Ignore word break characters
+- *D* Capitalization
+  - *X* Separate capitalization function
+  - *D* Capitalize ligated letters (e.g. Cyrillic T͡͡S)
+  - *D* Option for capitalizing first word, all words, unchanged
 - *D* API documentation
 - *D* Config file documentation
 - *D* Hooks documentation

+ 22 - 0
doc/config.md

@@ -218,6 +218,28 @@ an error if a S2R transliteration is attempted on this language.
 This section may have the `hooks` and `map` sections, that behave exactly as
 described for `roman_to_script`. The `ignore` section is… ignored.
 
+#### `script_to_roman.double_cap`
+
+Type: list
+
+This subsection is only valid under `script_to_roman` (S2R). Rules are
+inherited from all parent tables, and the items listed here are added to them.
+
+Each item in the list indicates a group of letters that, when encountered at
+the beginning of a word and slated for capitalization, are capitalized
+together, rather than only the first letter. This is the case for several
+ligated letter groups.
+
+Each rule must spell out the whole letter group in its romanized,
+all-lowercase form. E.g., to double-capitalize "z︠h︡", that string must be
+entered verbatim: it is then capitalized as "Z︠H︡"; without the rule, it would
+be capitalized as "Z︠h︡".
+
+#### `script_to_roman.no_double_cap`
+
+Type: list
+
+This subsection is only valid under `script_to_roman` (S2R). Each item listed
+here removes the matching double capitalization rule inherited from a parent.
 
 ##  Index file
 

+ 14 - 0
scriptshifter/tables/__init__.py

@@ -120,6 +120,10 @@ def load_table(tname):
     parents = tdata.get("general", {}).get("parents", [])
 
     if "script_to_roman" in tdata:
+        if "double_cap" in tdata["script_to_roman"]:
+            tdata["script_to_roman"]["double_cap"] = tuple(
+                    tdata["script_to_roman"]["double_cap"])
+
         tokens = {}
         for parent in parents:
             parent_tdata = load_table(parent)
@@ -129,6 +133,16 @@ def load_table(tname):
                 Token(k): v for k, v in parent_tdata.get(
                         "script_to_roman", {}).get("map", {})
             }
+            # Merge and/or remove double cap rules.
+            tdata["script_to_roman"]["double_cap"] = tuple((
+                set(parent_tdata.get(
+                    "script_to_roman", {}
+                ).get("double_cap", set())) |
+                set(tdata["script_to_roman"].get("double_cap", set()))
+            ) - set(tdata["script_to_roman"].get("no_double_cap", set())))
+        if "no_double_cap" in tdata["script_to_roman"]:
+            del tdata["script_to_roman"]["no_double_cap"]
+
         tokens |= {
                 Token(k): v
                 for k, v in tdata["script_to_roman"].get("map", {}).items()}

+ 3 - 0
scriptshifter/tables/data/belarusian.yml

@@ -44,6 +44,9 @@ roman_to_script:
     "\u0027": "\u044C"
 
 script_to_roman:
+  double_cap:
+    - "i\uFE20o\uFE21"
+    - "z\uFE20h\uFE21"
   map:
     "\u0401": "I\uFE20O\uFE21"
     "\u0406": "I"

+ 13 - 5
scriptshifter/trans.py

@@ -9,8 +9,8 @@ from scriptshifter.tables import WORD_BOUNDARY, load_table
 MULTI_WS_RE = re.compile(r"\s{2,}")
 
 # Cursor bitwise flags.
-CUR_BOW = 1
-CUR_EOW = 2
+CUR_BOW = 1 << 0
+CUR_EOW = 1 << 1
 
 logger = logging.getLogger(__name__)
 
@@ -89,6 +89,7 @@ def transliterate(src, lang, r2s=False, capitalize=False):
     word_boundary = langsec.get("word_boundary", WORD_BOUNDARY)
     while ctx.cur < len(src):
         # Reset cursor position flags.
+        # Carry over extended "beginning of word" flag.
         ctx.cur_flags = 0
         cur_char = src[ctx.cur]
 
@@ -187,7 +188,6 @@ def transliterate(src, lang, r2s=False, capitalize=False):
 
                 # A match is found. Stop scanning tokens, append result, and
                 # proceed scanning the source.
-                tk = ctx.dest_tk
                 # Capitalization.
                 if (
                     (capitalize == "first" and ctx.cur == 0)
@@ -195,8 +195,16 @@ def transliterate(src, lang, r2s=False, capitalize=False):
                     (capitalize == "all" and ctx.cur_flags & CUR_BOW)
                 ):
                     logger.info("Capitalizing token.")
-                    tk = tk.capitalize()
-                ctx.dest_ls.append(tk)
+                    double_cap = False
+                    for dcap_rule in ctx.langsec.get("double_cap", []):
+                        if ctx.dest_tk == dcap_rule:
+                            ctx.dest_tk = ctx.dest_tk.upper()
+                            double_cap = True
+                            break
+                    if not double_cap:
+                        ctx.dest_tk = ctx.dest_tk.capitalize()
+
+                ctx.dest_ls.append(ctx.dest_tk)
                 ctx.cur += step
                 break
 

+ 19 - 0
tests/data/cap_base1.yml

@@ -0,0 +1,19 @@
+# Test double capitalization.
+general:
+  name: Test ligature capitalization base 1
+
+script_to_roman:
+  double_cap:
+    # This capitalizes ligated z︠h︡
+    - "z\uFE20h\uFE21"
+  map:
+    # From Belarusian
+    "\u0432": "v"
+    "\u0433": "h"
+    "\u0436": "z\uFE20h\uFE21" # ж → z︠h︡
+    "\u0437": "z"
+    "\u043E": "o"
+    "\u0451": "i\uFE20o\uFE21" # ё → i︠o︡
+    "\u0456": "i"
+    "\u045E": "u\u0306"
+    "\u0491": "g"

+ 11 - 0
tests/data/cap_base2.yml

@@ -0,0 +1,11 @@
+# Test double capitalization.
+general:
+  name: Test ligature capitalization base 2
+  parents:
+    - cap_base1
+
+script_to_roman:
+  double_cap:
+    # This capitalizes ligated i︠o︡
+    - "i\uFE20o\uFE21"
+

+ 11 - 0
tests/data/cap_inherited.yml

@@ -0,0 +1,11 @@
+# Test double capitalization.
+general:
+  name: Test ligature capitalization working file
+  parents:
+    - cap_base2
+
+script_to_roman:
+  no_double_cap:
+    # Remove ligated i︠o︡ capitalization.
+    - "i\uFE20o\uFE21"
+

+ 27 - 0
tests/test01_cfg.py

@@ -116,3 +116,30 @@ class TestHooks(TestCase):
                         (scriptshifter.hooks.test.rotate, {"n": -3})
                     ]
                 })
+
+
+class TestDoubleCaps(TestCase):
+    """ Test double capitalization configuration. """
+    def setUp(self):
+        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
+        self.tables = reload_tables()
+
+    def test_dcaps_base1(self):
+        cap_base1 = self.tables.load_table("cap_base1")
+        assert "z︠h︡" in cap_base1["script_to_roman"]["double_cap"]
+
+    def test_dcaps_base2(self):
+        cap_base2 = self.tables.load_table("cap_base2")
+        dcap = cap_base2["script_to_roman"]["double_cap"]
+
+        assert len(dcap) == 2
+        assert "z︠h︡" in dcap
+        assert "i︠o︡" in dcap
+
+    def test_dcaps_inherited(self):
+        cap_inherited = self.tables.load_table("cap_inherited")
+        dcap = cap_inherited["script_to_roman"]["double_cap"]
+
+        assert len(dcap) == 1
+        assert "z︠h︡" in dcap
+        assert "i︠o︡" not in dcap

+ 37 - 0
tests/test03_capitalization.py

@@ -0,0 +1,37 @@
+from os import environ
+from unittest import TestCase
+
+from scriptshifter.trans import transliterate
+from tests import TEST_DATA_DIR, reload_tables
+
+
+class TestCapitalization(TestCase):
+    """
+    Test capitalization.
+    """
+
+    def setUp(self):
+        environ["TXL_CONFIG_TABLE_DIR"] = TEST_DATA_DIR
+        self.tables = reload_tables()
+
+    def test_cap(self):
+        tbl = "cap_inherited"
+        in_str = "зг іо"
+        tx = transliterate(in_str, tbl)
+        tx_cap = transliterate(in_str, tbl, capitalize="first")
+        tx_all = transliterate(in_str, tbl, capitalize="all")
+
+        self.assertEqual(tx, "zh io")
+        self.assertEqual(tx_cap, "Zh io")
+        self.assertEqual(tx_all, "Zh Io")
+
+    def test_cap_ligatures(self):
+        tbl = "cap_inherited"
+        in_str = "жзг ёіо зг іо"
+        tx = transliterate(in_str, tbl)
+        tx_cap = transliterate(in_str, tbl, capitalize="first")
+        tx_all = transliterate(in_str, tbl, capitalize="all")
+
+        self.assertEqual(tx, "z︠h︡zh i︠o︡io zh io")
+        self.assertEqual(tx_cap, "Z︠H︡zh i︠o︡io zh io")
+        self.assertEqual(tx_all, "Z︠H︡zh I︠o︡io Zh Io")

+ 0 - 0
tests/test03_rest_api.py → tests/test04_rest_api.py