|
@@ -88,6 +88,7 @@ def transliterate(src, lang, r2s=False):
|
|
# Loop through source characters. The increment of each loop depends on
|
|
# Loop through source characters. The increment of each loop depends on
|
|
# the length of the token that eventually matches.
|
|
# the length of the token that eventually matches.
|
|
ignore_list = langsec.get("ignore", []) # Only present in R2S
|
|
ignore_list = langsec.get("ignore", []) # Only present in R2S
|
|
|
|
+ ignore_ptn_list = langsec.get("ignore_ptn", []) # Only present in R2S
|
|
ctx.cur = 0
|
|
ctx.cur = 0
|
|
while ctx.cur < len(src):
|
|
while ctx.cur < len(src):
|
|
# Reset cursor position flags.
|
|
# Reset cursor position flags.
|
|
@@ -113,11 +114,41 @@ def transliterate(src, lang, r2s=False):
|
|
logger.debug("Skipping scanning iteration from hook signal.")
|
|
logger.debug("Skipping scanning iteration from hook signal.")
|
|
continue
|
|
continue
|
|
|
|
|
|
- # Check ignore list. Find as many subsequent ignore tokens
|
|
|
|
|
|
+ # Check ignore lists. Find as many subsequent ignore tokens
|
|
# as possible before moving on to looking for match tokens.
|
|
# as possible before moving on to looking for match tokens.
|
|
ctx.tk = None
|
|
ctx.tk = None
|
|
while True:
|
|
while True:
|
|
ctx.ignoring = False
|
|
ctx.ignoring = False
|
|
|
|
+ # Ignore patterns.
|
|
|
|
+ for ctx.tk in ignore_ptn_list:
|
|
|
|
+ hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
|
|
|
|
+ if hret == BREAK:
|
|
|
|
+ break
|
|
|
|
+ if hret == CONT:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ step = len(ctx.tk)
|
|
|
|
+ # FIXME This is an issue if we want to specify
|
|
|
|
+ # beginning-of-word matches, as we aren't reading the
|
|
|
|
+ # previous token and we only know that from the CUR_BOW.
|
|
|
|
+ # Which means we would have to analyze the regexp to find if
|
|
|
|
+ # it's looking for BOW. Messy.
|
|
|
|
+ match = re.match(src[ctx.cur:])
|
|
|
|
+ if match:
|
|
|
|
+ # The position matches an ignore token.
|
|
|
|
+ hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
|
|
|
|
+ if hret == BREAK:
|
|
|
|
+ break
|
|
|
|
+ if hret == CONT:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ logger.info(f"Ignored token: {ctx.tk}")
|
|
|
|
+ ctx.dest_ls.append(ctx.tk)
|
|
|
|
+ ctx.cur += step
|
|
|
|
+ ctx.ignoring = True
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+ # Ignore plain strings.
|
|
for ctx.tk in ignore_list:
|
|
for ctx.tk in ignore_list:
|
|
hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
|
|
hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
|
|
if hret == BREAK:
|
|
if hret == BREAK:
|