2 months ago · 5a21c05369
--- a/scriptshifter/tables/__init__.py
+++ b/scriptshifter/tables/__init__.py
@@ -84,12 +84,12 @@ class Token(str):
 
															         # Standalone has precedence, then initial, then final, then medial.
														
 
															         # This is somewhat arbitrary and may change if special cases arise.
														
 
															         # WB markers are moved to flags to allow default comparison.
														
 
															-        if self.content.endswith(TOKEN_WB_MARKER):
														
 
															-            self.flags |= BOW
														
 
															-            self.content = self.content.rstrip(TOKEN_WB_MARKER)
														
 
															         if self.content.startswith(TOKEN_WB_MARKER):
														
 
															-            self.flags |= EOW
														
 
															+            self.flags |= BOW
														
 
															             self.content = self.content.lstrip(TOKEN_WB_MARKER)
														
 
															+        if self.content.endswith(TOKEN_WB_MARKER):
														
 
															+            self.flags |= EOW
														
 
															+            self.content = self.content.rstrip(TOKEN_WB_MARKER)
														
 
															     def __lt__(self, other):
														
 
															         """
														
@@ -115,9 +115,9 @@ class Token(str):
 
															         if (
														
 
															                 (self.flags > 0 or other.flags > 0)
														
 
															                 and self.content == other.content):
														
 
															-            logger.debug(f"{self.content} flags: {self.flags}")
														
 
															-            logger.debug(f"{other.content} flags: {other.flags}")
														
 
															-            logger.debug("Performing flags comparison.")
														
 
															+            # logger.debug(f"{self.content} flags: {self.flags}")
														
 
															+            # logger.debug(f"{other.content} flags: {other.flags}")
														
 
															+            # logger.debug("Performing flags comparison.")
														
 
															             return self.flags > other.flags
														
@@ -202,6 +202,8 @@ def populate_table(conn, tname, tdata):
 
															     @param tdata(dict): Table data.
														
 
															     """
														
 
															+    logger.info(f"Populating table: {tname}")
														
 
															+
														
 
															     res = conn.execute(
														
 
															         """INSERT INTO tbl_language (
														
 
															             name, label, marc_code, description
														
--- a/scriptshifter/tables/data/_ignore_base.yml
+++ b/scriptshifter/tables/data/_ignore_base.yml
@@ -1,9 +1,9 @@
 
															+---
														
 
															 general:
														
 
															   name: Common ignore list.
														
 
															 roman_to_script:
														
 
															   ignore:
														
 
															-    - " "
														
 
															     - "at head of title"
														
 
															     - "colophon"
														
 
															     - "date of publication not identified"
														
@@ -38,8 +38,6 @@ roman_to_script:
 
															     - "\\b[\u2021$][0-9a-z]\\b"
														
 
															 script_to_roman:
														
 
															-  ignore:
														
 
															-    - " "
														
 
															   ignore_ptn:
														
 
															     # MARC sub-field markers.
														
 
															     - "\\b[\u2021$][0-9a-z]\\b"
														
--- a/scriptshifter/tables/data/greek_classical.yml
+++ b/scriptshifter/tables/data/greek_classical.yml
@@ -404,7 +404,8 @@ script_to_roman:
 
															     "\u0399": "I"
														
 
															     "\u039A": "K"
														
 
															     "\u039B": "L"
														
 
															-    "\u039C\u03C0%": "B"
														
 
															+    "%\u039C\u03A0": "B"
														
 
															+    "%\u039C\u03C0": "B"
														
 
															     "\u039C": "M"
														
 
															     "\u039D\u03C4%": "\u1E0E"
														
 
															     "\u039D": "N"
														
@@ -461,10 +462,11 @@ script_to_roman:
 
															     "\u03B2": "b"
														
 
															     "\u03B3\u03B3": "ng"
														
 
															     "\u03B3\u03BA": "nk"
														
 
															-    "\u0393\u03BA%": "Gk"
														
 
															-    "\u03B3\u03BA%": "gk"
														
 
															     "%\u0393\u03BA": "Gk"
														
 
															+    "%\u0393\u039A": "GK"
														
 
															+    "\u0393\u039A%": "GK"
														
 
															     "%\u03B3\u03BA": "gk"
														
 
															+    "\u03B3\u03BA%": "gk"
														
 
															     "\u03B3\u03BE": "nx"
														
 
															     "\u03B3\u03C7": "nch"
														
 
															     "\u03B3": "g"
														
@@ -494,9 +496,9 @@ script_to_roman:
 
															     "\u03B9": "i"
														
 
															     "\u03BA": "k"
														
 
															     "\u03BB": "l"
														
 
															-    "\u03BC\u03C0%": "b"
														
 
															+    "%\u03BC\u03C0": "b"
														
 
															     "\u03BC": "m"
														
 
															-    "\u03BD\u03C4%": "\u1E0F"
														
 
															+    "%\u03BD\u03C4": "\u1E0F"
														
 
															     "\u03BD": "n"
														
 
															     "\u03BE": "x"
														
 
															     "\u1F41": "ho"
														
@@ -611,6 +613,8 @@ roman_to_script:
 
															     "Au": "\u0391\u03C5"
														
 
															     "au": "\u03B1\u03C5"
														
 
															     "a\u0301": "\u03AC"
														
 
															+    "%B": "\u039C\u03C0"
														
 
															+    "%b": "\u03BC\u03C0"
														
 
															     "B": "\u0392"
														
 
															     "b": "\u03B2"
														
 
															     "b\u0333": "\u03D0"
														
@@ -699,7 +703,7 @@ roman_to_script:
 
															     "m": "\u03BC"
														
 
															     "nch": "\u03B3\u03C7"
														
 
															     "ng": "\u03B3\u03B3"
														
 
															-    "%nk%": "\u03B3\u03BA"
														
 
															+    "nk": "\u03B3\u03BA"
														
 
															     "nx": "\u03B3\u03BE"
														
 
															     "No\u0332": "\u2116"
														
 
															     "N": "\u039D"
														
@@ -749,7 +753,7 @@ roman_to_script:
 
															     # "S": "\u03F9"  # FIXME ambiguous.
														
 
															     "S": "\u03A3"
														
 
															     # "s": "\u03F2"  # FIXME ambiguous.
														
 
															-    "%s": "\u03C2"
														
 
															+    "s%": "\u03C2"
														
 
															     "s": "\u03C3"
														
 
															     "T\u0333H\u0333": "\u03F4"
														
 
															     "t\u0333h\u0333": "\u03D1"
														
--- a/scriptshifter/trans.py
+++ b/scriptshifter/trans.py
@@ -17,9 +17,9 @@ MULTI_WS_RE = compile(r"(\s){2,}")
 
															 logger = logging.getLogger(__name__)
														
 
															-class Context:
														
 
															+class Transliterator:
														
 
															     """
														
 
															-    Context used within the transliteration and passed to hook functions.
														
 
															+    Context carrying the state of transliteration process.
														
 
															     Use within a `with` block for proper cleanup.
														
 
															     """
														
@@ -35,6 +35,10 @@ class Context:
 
															     def src(self):
														
 
															         raise NotImplementedError("Attribute is read-only.")
														
 
															+    @property
														
 
															+    def cur_char(self):
														
 
															+        return self.src[self.cur]
														
 
															+
														
 
															     def __init__(self, lang, src, t_dir, options={}):
														
 
															         """
														
 
															         Initialize a context.
														
@@ -64,6 +68,72 @@ class Context:
 
															     def __exit__(self, exc_type, exc_value, traceback):
														
 
															         self.conn.close()
														
 
															+    def run_hook(self, hname):
														
 
															+        ret = None
														
 
															+        for hook_def in self.hooks.get(hname, []):
														
 
															+            fn = getattr(
														
 
															+                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
														
 
															+                hook_def["fn_name"]
														
 
															+            )
														
 
															+            ret = fn(self, **hook_def["kwargs"])
														
 
															+            if ret in (BREAK, CONT):
														
 
															+                # This will stop parsing hooks functions and tell the caller to
														
 
															+                # break out of the outer loop or skip iteration.
														
 
															+                return ret
														
 
															+
														
 
															+        return ret
														
 
															+
														
 
															+    def normalize_src(self):
														
 
															+        """
														
 
															+        Normalize source text according to rules.
														
 
															+
														
 
															+        NOTE: this manipulates the protected source attribute so it may not
														
 
															+        correspond to the originally provided source.
														
 
															+        """
														
 
															+        # Normalize precomposed Unicode characters.
														
 
															+        #
														
 
															+        # In using diacritics, LC standards prefer the decomposed form
														
 
															+        # (combining diacritic + base character) to the pre-composed form
														
 
															+        # (single Unicode symbol for the letter with diacritic).
														
 
															+        #
														
 
															+        # Note: only safe for R2S.
														
 
															+        if self.t_dir == FEAT_R2S:
														
 
															+            logger.debug("Normalizing pre-composed symbols.")
														
 
															+            self._src = precomp_normalize("NFD", self.src)
														
 
															+
														
 
															+        norm_rules = get_lang_normalize(self.conn, self.lang_id)
														
 
															+
														
 
															+        for nk, nv in norm_rules.items():
														
 
															+            self._src = self.src.replace(nk, nv)
														
 
															+
														
 
															+        return self.run_hook("post_normalize")
														
 
															+
														
 
															+    def cur_at_bow(self, cur=None):
														
 
															+        """
														
 
															+        Check if cursor is at the beginning of a word.
														
 
															+
														
 
															+        @param cur(int): Position to check. By default, the current cursor.
														
 
															+        """
														
 
															+        if cur is None:
														
 
															+            cur = self.cur
														
 
															+        return (
														
 
															+            cur == 0
														
 
															+            or self.src[cur - 1] in WORD_BOUNDARY
														
 
															+        ) and (self.src[cur] not in WORD_BOUNDARY)
														
 
															+
														
 
															+    def cur_at_eow(self, cur=None):
														
 
															+        """
														
 
															+        Check if cursor is at the end of a word.
														
 
															+
														
 
															+        @param cur(int): Position to check. By default, the current cursor.
														
 
															+        """
														
 
															+        if cur is None:
														
 
															+            cur = self.cur
														
 
															+        return (
														
 
															+            cur == len(self.src) - 1
														
 
															+            or self.src[cur + 1] in WORD_BOUNDARY
														
 
															+        ) and (self.src[cur] not in WORD_BOUNDARY)
														
 
															+
														
 
															 def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
														
 
															     """
														
@@ -101,7 +171,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															     src = src.strip()
														
 
															     options["capitalize"] = capitalize
														
 
															-    with Context(lang, src, t_dir, options) as ctx:
														
 
															+    with Transliterator(lang, src, t_dir, options) as ctx:
														
 
															         if t_dir == FEAT_S2R and not ctx.general["has_s2r"]:
														
 
															             raise NotImplementedError(
														
@@ -118,12 +188,11 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															         # This hook may take over the whole transliteration process or delegate
														
 
															         # it to some external process, and return the output string directly.
														
 
															-        if _run_hook("post_config", ctx) == BREAK:
														
 
															+        if ctx.run_hook("post_config") == BREAK:
														
 
															             return getattr(ctx, "dest", ""), ctx.warnings
														
 
															-        # _normalize_src returns the results of the post_normalize hook.
														
 
															-        if _normalize_src(
														
 
															-                ctx, get_lang_normalize(ctx.conn, ctx.lang_id)) == BREAK:
														
 
															+        # ctx.normalize_src returns the results of the post_normalize hook.
														
 
															+        if ctx.normalize_src() == BREAK:
														
 
															             return getattr(ctx, "dest", ""), ctx.warnings
														
 
															         logger.debug(f"Normalized source: {ctx.src}")
														
@@ -137,21 +206,20 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															             # Reset cursor position flags.
														
 
															             # Carry over extended "beginning of word" flag.
														
 
															             ctx.cur_flags = 0
														
 
															-            cur_char = ctx.src[ctx.cur]
														
 
															             # Look for a word boundary and flag word beginning/end it if found.
														
 
															-            if _is_bow(ctx.cur, ctx, WORD_BOUNDARY):
														
 
															+            if ctx.cur_at_bow():
														
 
															                 # Beginning of word.
														
 
															                 logger.debug(f"Beginning of word at position {ctx.cur}.")
														
 
															                 ctx.cur_flags |= BOW
														
 
															-            if _is_eow(ctx.cur, ctx, WORD_BOUNDARY):
														
 
															+            if ctx.cur_at_eow():
														
 
															                 # End of word.
														
 
															                 logger.debug(f"End of word at position {ctx.cur}.")
														
 
															                 ctx.cur_flags |= EOW
														
 
															             # This hook may skip the parsing of the current
														
 
															             # token or exit the scanning loop altogether.
														
 
															-            hret = _run_hook("begin_input_token", ctx)
														
 
															+            hret = ctx.run_hook("begin_input_token")
														
 
															             if hret == BREAK:
														
 
															                 logger.debug("Breaking text scanning from hook signal.")
														
 
															                 break
														
@@ -165,7 +233,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															             while True:
														
 
															                 ctx.ignoring = False
														
 
															                 for ctx.tk in get_lang_ignore(ctx.conn, ctx.lang_id):
														
 
															-                    hret = _run_hook("pre_ignore_token", ctx)
														
 
															+                    hret = ctx.run_hook("pre_ignore_token")
														
 
															                     if hret == BREAK:
														
 
															                         break
														
 
															                     if hret == CONT:
														
@@ -187,7 +255,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                     if _matching:
														
 
															                         # The position matches an ignore token.
														
 
															-                        hret = _run_hook("on_ignore_match", ctx)
														
 
															+                        hret = ctx.run_hook("on_ignore_match")
														
 
															                         if hret == BREAK:
														
 
															                             break
														
 
															                         if hret == CONT:
														
@@ -202,7 +270,6 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                             ctx.ignoring = False
														
 
															                             break
														
 
															-                        cur_char = ctx.src[ctx.cur]
														
 
															                         ctx.ignoring = True
														
 
															                         break
														
 
															                 # We looked through all ignore tokens, not found any. Move on.
														
@@ -221,7 +288,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															             ctx.match = False
														
 
															             for ctx.src_tk, ctx.dest_str in lang_map:
														
 
															-                hret = _run_hook("pre_tx_token", ctx)
														
 
															+                hret = ctx.run_hook("pre_tx_token")
														
 
															                 if hret == BREAK:
														
 
															                     break
														
 
															                 if hret == CONT:
														
@@ -237,7 +304,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                 # point value) than the current character, then break the loop
														
 
															                 # without a match, because we know there won't be any more
														
 
															                 # match due to the alphabetical ordering.
														
 
															-                if ctx.src_tk.content[0] > cur_char:
														
 
															+                if ctx.src_tk.content[0] > ctx.cur_char:
														
 
															                     logger.debug(
														
 
															                             f"{ctx.src_tk.content} is after "
														
 
															                             f"{ctx.src[ctx.cur:ctx.cur + step]}. "
														
@@ -247,11 +314,12 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                 # If src_tk has a WB flag but the token is not at WB, skip.
														
 
															                 if (
														
 
															                     (ctx.src_tk.flags & BOW and not ctx.cur_flags & BOW)
														
 
															-                    or
														
 
															-                    # Can't rely on EOW flag, we must check on the last
														
 
															-                    # character of the potential match.
														
 
															-                    (ctx.src_tk.flags & EOW and not _is_eow(
														
 
															-                            ctx.cur + step - 1, ctx, WORD_BOUNDARY))
														
 
															+                    or (
														
 
															+                        # Can't rely on EOW flag, we must check on the last
														
 
															+                        # character of the potential match.
														
 
															+                        ctx.src_tk.flags & EOW
														
 
															+                        and not ctx.cur_at_eow(ctx.cur + step - 1)
														
 
															+                    )
														
 
															                 ):
														
 
															                     continue
														
@@ -262,7 +330,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                     ctx.match = True
														
 
															                     # This hook may skip this token or break out of the token
														
 
															                     # lookup for the current position.
														
 
															-                    hret = _run_hook("on_tx_token_match", ctx)
														
 
															+                    hret = ctx.run_hook("on_tx_token_match")
														
 
															                     if hret == BREAK:
														
 
															                         break
														
 
															                     if hret == CONT:
														
@@ -300,7 +368,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															             if ctx.match is False:
														
 
															                 delattr(ctx, "match")
														
 
															-                hret = _run_hook("on_no_tx_token_match", ctx)
														
 
															+                hret = ctx.run_hook("on_no_tx_token_match")
														
 
															                 if hret == BREAK:
														
 
															                     break
														
 
															                 if hret == CONT:
														
@@ -308,9 +376,10 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															                 # No match found. Copy non-mapped character (one at a time).
														
 
															                 logger.info(
														
 
															-                        f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
														
 
															+                        f"Token {ctx.cur_char} "
														
 
															+                        f"(\\u{hex(ord(ctx.cur_char))[2:]}) "
														
 
															                         f"at position {ctx.cur} is not mapped.")
														
 
															-                ctx.dest_ls.append(cur_char)
														
 
															+                ctx.dest_ls.append(ctx.cur_char)
														
 
															                 ctx.cur += 1
														
 
															             else:
														
 
															                 delattr(ctx, "match")
														
@@ -320,7 +389,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															         # This hook may take care of the assembly and cause the function to
														
 
															         # return its own return value.
														
 
															-        hret = _run_hook("pre_assembly", ctx)
														
 
															+        hret = ctx.run_hook("pre_assembly")
														
 
															         if hret is not None:
														
 
															             return hret, ctx.warnings
														
@@ -329,7 +398,7 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															         # This hook may reassign the output string and/or cause the function to
														
 
															         # return it immediately.
														
 
															-        hret = _run_hook("post_assembly", ctx)
														
 
															+        hret = ctx.run_hook("post_assembly")
														
 
															         if hret is not None:
														
 
															             return hret, ctx.warnings
														
@@ -337,54 +406,3 @@ def transliterate(src, lang, t_dir="s2r", capitalize=False, options={}):
 
															         ctx.dest = MULTI_WS_RE.sub(r"\1", ctx.dest.strip())
														
 
															         return ctx.dest, ctx.warnings
														
 
															-
														
 
															-
														
 
															-def _normalize_src(ctx, norm_rules):
														
 
															-    """
														
 
															-    Normalize source text according to rules.
														
 
															-
														
 
															-    NOTE: this manipluates the protected source attribute so it may not
														
 
															-    correspond to the originally provided source.
														
 
															-    """
														
 
															-    # Normalize precomposed Unicode characters.
														
 
															-    #
														
 
															-    # In using diacritics, LC standards prefer the decomposed form (combining
														
 
															-    # diacritic + base character) to the pre-composed form (single Unicode
														
 
															-    # symbol for the letter with diacritic).
														
 
															-    #
														
 
															-    # Note: only safe for R2S.
														
 
															-    if ctx.t_dir == FEAT_R2S:
														
 
															-        logger.debug("Normalizing pre-composed symbols.")
														
 
															-        ctx._src = precomp_normalize("NFD", ctx.src)
														
 
															-
														
 
															-    for nk, nv in norm_rules.items():
														
 
															-        ctx._src = ctx.src.replace(nk, nv)
														
 
															-
														
 
															-    return _run_hook("post_normalize", ctx)
														
 
															-
														
 
															-
														
 
															-def _is_bow(cur, ctx, word_boundary):
														
 
															-    return (cur == 0 or ctx.src[cur - 1] in word_boundary) and (
														
 
															-            ctx.src[cur] not in word_boundary)
														
 
															-
														
 
															-
														
 
															-def _is_eow(cur, ctx, word_boundary):
														
 
															-    return (
														
 
															-        cur == len(ctx.src) - 1
														
 
															-        or ctx.src[cur + 1] in word_boundary
														
 
															-    ) and (ctx.src[cur] not in word_boundary)
														
 
															-
														
 
															-
														
 
															-def _run_hook(hname, ctx):
														
 
															-    ret = None
														
 
															-    for hook_def in ctx.hooks.get(hname, []):
														
 
															-        fn = getattr(
														
 
															-                import_module("." + hook_def["module_name"], HOOK_PKG_PATH),
														
 
															-                hook_def["fn_name"])
														
 
															-        ret = fn(ctx, **hook_def["kwargs"])
														
 
															-        if ret in (BREAK, CONT):
														
 
															-            # This will stop parsing hooks functions and tell the caller to
														
 
															-            # break out of the outer loop or skip iteration.
														
 
															-            return ret
														
 
															-
														
 
															-    return ret
														
--- a/test/data/precomp_samples.csv
+++ b/test/data/precomp_samples.csv
@@ -0,0 +1,95 @@
 
															+À,À
														
 
															+Á,Á
														
 
															+Â,Â
														
 
															+Ã,Ã
														
 
															+Ā,Ā
														
 
															+Ă,Ă
														
 
															+Ȧ,Ȧ
														
 
															+Ä,Ä
														
 
															+Å,Å
														
 
															+Æ,Æ
														
 
															+Ç,Ç
														
 
															+È,È
														
 
															+É,É
														
 
															+Ê,Ê
														
 
															+Ē,Ē
														
 
															+Ĕ,Ĕ
														
 
															+Ė,Ė
														
 
															+Ë,Ë
														
 
															+Ì,Ì
														
 
															+Í,Í
														
 
															+Î,Î
														
 
															+Ĩ,Ĩ
														
 
															+Ī,Ī
														
 
															+Ĭ,Ĭ
														
 
															+İ,İ
														
 
															+Ï,Ï
														
 
															+Ð,Ð
														
 
															+Ñ,Ñ
														
 
															+Ò,Ò
														
 
															+Ó,Ó
														
 
															+Ô,Ô
														
 
															+Õ,Õ
														
 
															+Ō,Ō
														
 
															+Ŏ,Ŏ
														
 
															+Ȯ,Ȯ
														
 
															+Ö,Ö
														
 
															+Ø,Ø
														
 
															+Ù,Ù
														
 
															+Ú,Ú
														
 
															+Û,Û
														
 
															+Ũ,Ũ
														
 
															+Ū,Ū
														
 
															+Ŭ,Ŭ
														
 
															+Ü,Ü
														
 
															+Ý,Ý
														
 
															+Ÿ,Ÿ
														
 
															+Þ,Þ
														
 
															+ß,ß
														
 
															+à,à
														
 
															+á,á
														
 
															+â,â
														
 
															+ã,ã
														
 
															+ā,ā
														
 
															+ă,ă
														
 
															+ä,ä
														
 
															+å,å
														
 
															+æ,æ
														
 
															+ç,ç
														
 
															+è,è
														
 
															+é,é
														
 
															+ê,ê
														
 
															+ē,ē
														
 
															+ĕ,ĕ
														
 
															+ė,ė
														
 
															+ë,ë
														
 
															+ì,ì
														
 
															+í,í
														
 
															+î,î
														
 
															+ī,ī
														
 
															+ĭ,ĭ
														
 
															+ï,ï
														
 
															+ð,ð
														
 
															+ñ,ñ
														
 
															+ò,ò
														
 
															+ó,ó
														
 
															+ô,ô
														
 
															+õ,õ
														
 
															+ō,ō
														
 
															+ŏ,ŏ
														
 
															+ȯ,ȯ
														
 
															+ö,ö
														
 
															+ø,ø
														
 
															+ù,ù
														
 
															+ú,ú
														
 
															+û,û
														
 
															+ū,ū
														
 
															+ŭ,ŭ
														
 
															+ü,ü
														
 
															+ý,ý
														
 
															+þ,þ
														
 
															+ÿ,ÿ
														
 
															+Ġ,Ġ
														
 
															+ġ,ġ
														
 
															+Ś,Ś
														
 
															+ś,ś
														
--- a/test/data/script_samples/unittest.csv
+++ b/test/data/script_samples/unittest.csv
@@ -1,4 +1,4 @@
 
															-"inherited","abcd",9078,,
														
 
															+"inherited","abcd",9078,"s2r",
														
 
															 "inherited","TUVX","tuvx","r2s",
														
 
															 "rot3","defg","abcd","s2r",
														
 
															 "rot3","HIJK","KLMN","r2s",
														
@@ -6,3 +6,5 @@
 
															 "rot3","st uv","Vw Xy","r2s","{""capitalize"": ""all""}"
														
 
															 "regex","Hello abc","Hello 678","r2s",
														
 
															 "regex","Hullo abc","5u22o 678","r2s",
														
 
															+"word_boundaries","bab aa b.abc c, dae abada:ddd vb","<212> <11> 020.<123> 030, <41e <12141>:<444> v2>","r2s"
														
 
															+"word_boundaries","43 23432 455 4:3 51, 11","<dc> <bcdcb> <d55 0d0:0c0 5a>, <aa>","s2r"
														
--- a/test/integration.py
+++ b/test/integration.py
@@ -69,12 +69,12 @@ def _trans(script, lang, t_dir, opts, rom, deltas):
 
															             capitalize=opts.get("capitalize"), options=opts)
														
 
															     try:
														
 
															         assert trans == rom
														
 
															-    except AssertionError as e:
														
 
															+    except AssertionError:
														
 
															         if deltas is not None:
														
 
															             print("F", end="")
														
 
															             deltas.append((lang, script, ndiff([trans], [rom])))
														
 
															         else:
														
 
															-            raise e
														
 
															+            raise AssertionError(f"Result: {trans}\nExpected: {rom}")
														
 
															     else:
														
 
															         if deltas:
														
 
															             print(".", end="")
														
--- a/test/unittest/tables/index.yml
+++ b/test/unittest/tables/index.yml
@@ -15,3 +15,5 @@ regex:
 
															   name: inherited config + regex ignore.
														
 
															 rot3:
														
 
															   name: Test ROT3 hooks
														
 
															+word_boundaries:
														
 
															+  name: Word boundaries
														
--- a/test/unittest/test04_normalization.py
+++ b/test/unittest/test04_normalization.py
@@ -0,0 +1,30 @@
 
															+from csv import reader
														
 
															+from os import environ, path, unlink
														
 
															+from unittest import TestCase
														
 
															+
														
 
															+from scriptshifter.trans import Transliterator, FEAT_R2S
														
 
															+from scriptshifter.tables import init_db
														
 
															+
														
 
															+from test import TEST_DATA_DIR
														
 
															+
														
 
															+
														
 
															+def setUpModule():
														
 
															+    init_db()
														
 
															+
														
 
															+
														
 
															+def tearDownModule():
														
 
															+    unlink(environ["TXL_DB_PATH"])
														
 
															+
														
 
															+
														
 
															+class TestNormalization(TestCase):
														
 
															+    """ Source normalization tests. """
														
 
															+
														
 
															+    def test_norm_decompose_r2s(self):
														
 
															+        with open(path.join(
														
 
															+                TEST_DATA_DIR, "precomp_samples.csv"), newline="") as fh:
														
 
															+            data = reader(fh)
														
 
															+
														
 
															+            for precomp, decomp in data:
														
 
															+                with Transliterator("rot3", precomp, FEAT_R2S, {}) as ctx:
														
 
															+                    ctx.normalize_src()
														
 
															+                    self.assertEqual(ctx.src, decomp)
														
--- a/test/unittest/test05_rest_api.py
+++ b/test/unittest/test05_rest_api.py
+															+À,À
+															+Á,Á
+															+Â,Â
+															+Ã,Ã
+															+Ā,Ā
+															+Ă,Ă
+															+Ȧ,Ȧ
+															+Ä,Ä
+															+Å,Å
+															+Æ,Æ
+															+Ç,Ç
+															+È,È
+															+É,É
+															+Ê,Ê
+															+Ē,Ē
+															+Ĕ,Ĕ
+															+Ė,Ė
+															+Ë,Ë
+															+Ì,Ì
+															+Í,Í
+															+Î,Î
+															+Ĩ,Ĩ
+															+Ī,Ī
+															+Ĭ,Ĭ
+															+İ,İ
+															+Ï,Ï
+															+Ð,Ð
+															+Ñ,Ñ
+															+Ò,Ò
+															+Ó,Ó
+															+Ô,Ô
+															+Õ,Õ
+															+Ō,Ō
+															+Ŏ,Ŏ
+															+Ȯ,Ȯ
+															+Ö,Ö
+															+Ø,Ø
+															+Ù,Ù
+															+Ú,Ú
+															+Û,Û
+															+Ũ,Ũ
+															+Ū,Ū
+															+Ŭ,Ŭ
+															+Ü,Ü
+															+Ý,Ý
+															+Ÿ,Ÿ
+															+Þ,Þ
+															+ß,ß
+															+à,à
+															+á,á
+															+â,â
+															+ã,ã
+															+ā,ā
+															+ă,ă
+															+ä,ä
+															+å,å
+															+æ,æ
+															+ç,ç
+															+è,è
+															+é,é
+															+ê,ê
+															+ē,ē
+															+ĕ,ĕ
+															+ė,ė
+															+ë,ë
+															+ì,ì
+															+í,í
+															+î,î
+															+ī,ī
+															+ĭ,ĭ
+															+ï,ï
+															+ð,ð
+															+ñ,ñ
+															+ò,ò
+															+ó,ó
+															+ô,ô
+															+õ,õ
+															+ō,ō
+															+ŏ,ŏ
+															+ȯ,ȯ
+															+ö,ö
+															+ø,ø
+															+ù,ù
+															+ú,ú
+															+û,û
+															+ū,ū
+															+ŭ,ŭ
+															+ü,ü
+															+ý,ý
+															+þ,þ
+															+ÿ,ÿ
+															+Ġ,Ġ
+															+ġ,ġ
+															+Ś,Ś
+															+ś,ś