Browse Source

Do not mark word breaking characters as word boundaries.

Stefano Cossu 1 year ago
parent
commit
c5de4bbd6c
2 changed files with 10 additions and 6 deletions
  1. 1 1
      TODO.md
  2. 9 5
      transliterator/trans.py

+ 1 - 1
TODO.md

@@ -30,7 +30,7 @@ discussion, etc.); *X* = not implementing.
 - *D* Hooks documentation
 - *W* Tests
   - *W* Config parsing
-  - *P* Transliteration
+  - *W* Transliteration
   - *P* REST API
 - *W* Complete conversion of existing tables to YAML
   - *P* Arabic

+ 9 - 5
transliterator/trans.py

@@ -92,13 +92,17 @@ def transliterate(src, lang, r2s=False, capitalize=False):
     while ctx.cur < len(src):
         # Reset cursor position flags.
         ctx.cur_flags = 0
+        cur_char = src[ctx.cur]
 
         # Look for a word boundary and flag the word beginning/end if found.
-        if ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY:
+        if (ctx.cur == 0 or src[ctx.cur - 1] in WORD_BOUNDARY) and (
+                cur_char not in WORD_BOUNDARY):
             # Beginning of word.
             logger.debug(f"Beginning of word at position {ctx.cur}.")
             ctx.cur_flags |= CUR_BOW
-        if ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY:
+        if (ctx.cur == len(src) - 1 or src[ctx.cur + 1] in WORD_BOUNDARY) and (
+                cur_char not in WORD_BOUNDARY):
+            # Current character is not itself a word boundary.
             # End of word.
             logger.debug(f"End of word at position {ctx.cur}.")
             ctx.cur_flags |= CUR_EOW
@@ -163,7 +167,7 @@ def transliterate(src, lang, r2s=False, capitalize=False):
             # point value) than the current character, then break the loop
             # without a match, because we know there won't be any more match
             # due to the alphabetical ordering.
-            if ctx.src_tk[0] > src[ctx.cur]:
+            if ctx.src_tk[0] > cur_char:
                 logger.debug(
                         f"{ctx.src_tk} is after {src[ctx.cur:ctx.cur + step]}."
                         " Breaking loop.")
@@ -205,9 +209,9 @@ def transliterate(src, lang, r2s=False, capitalize=False):
 
             # No match found. Copy non-mapped character (one at a time).
             logger.info(
-                    f"Token {src[ctx.cur]} (\\u{hex(ord(src[ctx.cur]))[2:]}) "
+                    f"Token {cur_char} (\\u{hex(ord(cur_char))[2:]}) "
                     f"at position {ctx.cur} is not mapped.")
-            ctx.dest_ls.append(src[ctx.cur])
+            ctx.dest_ls.append(cur_char)
             ctx.cur += 1
         else:
             delattr(ctx, "match")