Selaa lähdekoodia

WIP Initial ignore pattern implementation.

Stefano Cossu 2 vuotta sitten
vanhempi
commit
11cec31d39

+ 5 - 0
transliterator/rest_api.py

@@ -49,9 +49,14 @@ def dump_table(lang):
     tbl = deepcopy(load_table(lang))
     for sec_name in ("roman_to_script", "script_to_roman"):
         if sec_name in tbl:
+            # Serialize hook function pointers.
             for hname, fn_defs in tbl[sec_name].get("hooks", {}).items():
                 tbl[sec_name]["hooks"][hname] = [
                         (fn.__name__, kw) for (fn, kw) in fn_defs]
+            # Serialize regular expression patterns.
+            if "ignore_ptn" in tbl[sec_name]:
+                tbl[sec_name]["ignore_ptn"] = [
+                        ptn.pattern for ptn in tbl[sec_name]["ignore_ptn"]]
 
     return jsonify(tbl)
 

+ 18 - 1
transliterator/tables/__init__.py

@@ -1,4 +1,5 @@
 import logging
+import re
 
 from functools import cache
 from importlib import import_module
@@ -151,6 +152,22 @@ def load_table(tname):
         tdata["roman_to_script"]["map"] = tuple(
                 (k.content, tokens[k]) for k in sorted(tokens))
 
+        # Ignore regular expression patterns.
+        # Patterns are evaluated in the order they are listed in the config.
+        ignore_ptn = [
+                re.compile(ptn)
+                for ptn in tdata["roman_to_script"].get("ignore_ptn", [])]
+        for parent in parents:
+            parent_tdata = load_table(parent)
+            # NOTE: duplicates are not removed.
+            ignore_ptn = [
+                re.compile(ptn)
+                for ptn in parent_tdata.get(
+                        "roman_to_script", {}).get("ignore_ptn", [])
+            ] + ignore_ptn
+        tdata["roman_to_script"]["ignore_ptn"] = ignore_ptn
+
+        # Ignore plain strings.
         ignore = {
             Token(t)
             for t in tdata["roman_to_script"].get("ignore", [])
@@ -162,10 +179,10 @@ def load_table(tname):
                 Token(t) for t in parent_tdata.get(
                         "roman_to_script", {}).get("ignore", [])
             }
-
         tdata["roman_to_script"]["ignore"] = [
                 t.content for t in sorted(ignore)]
 
+        # Hooks.
         if "hooks" in tdata["roman_to_script"]:
             tdata["roman_to_script"]["hooks"] = load_hook_fn(
                     tname, tdata["script_to_roman"])

+ 106 - 104
transliterator/tables/data/_ignore_base.yml

@@ -2,12 +2,9 @@ general:
   name: Common ignore list.
 
 roman_to_script:
-  ignore:
-    - "at head of title"
-    - "colophon"
-    - "date of publication not identified"
-    - "place of publication not identified"
-    - "publisher not identified"
+  # Ignore regular expression patterns.
+  ignore_ptn:
+    # Roman numerals.
     # NOTE There is ambiguity about ignoring these
     # words. Note that the single-character Roman
     # numerals are not included on purpose.
@@ -15,105 +12,110 @@ roman_to_script:
     # dedicated U+2160÷U+216F (uppercase Roman
     # numerals) and/or U+2170÷U+217F (lower case Roman
     # numerals) ranges to avoid this ambiguity.
-    # TODO implement regular expressions for ignore patterns.
-    #- re: "I{2,3}"
-    #- re: "I(V|X)"
-    #- re: "LI{,3}"
-    #- re: "LI?(V|X)"
-    #- re: "L(V|X{1,3})I{,3}"
-    #- re: "LX{1,3}I?V"
-    #- re: "LX{1,3}VI{,3}"
-    #- re: "(V|X{1,3})I{,3}"
-    #- re: "X{1,3}I{,3}"
-    #- re: "X{1,3}I(V|X)"
-    #- re: "X{1,3}VI{,3}"
-    - "II"
-    - "III"
-    - "IV"
-    - "IX"
-    - "LI"
-    - "LII"
-    - "LIII"
-    - "LIV"
-    - "LIX"
-    - "LV"
-    - "LVI"
-    - "LVII"
-    - "LVIII"
-    - "LX"
-    - "LXI"
-    - "LXII"
-    - "LXIII"
-    - "LXIV"
-    - "LXIX"
-    - "LXV"
-    - "LXVI"
-    - "LXVII"
-    - "LXVIII"
-    - "LXX"
-    - "LXXI"
-    - "LXXII"
-    - "LXXIII"
-    - "LXXIV"
-    - "LXXIX"
-    - "LXXV"
-    - "LXXVI"
-    - "LXXVII"
-    - "LXXVIII"
-    - "LXXX"
-    - "LXXXI"
-    - "LXXXII"
-    - "LXXXIII"
-    - "LXXXIV"
-    - "LXXXIX"
-    - "LXXXV"
-    - "LXXXVI"
-    - "LXXXVII"
-    - "LXXXVIII"
-    - "VI"
-    - "VII"
-    - "VIII"
-    - "XI"
-    - "XII"
-    - "XIII"
-    - "XIV"
-    - "XIX"
-    - "XL"
-    - "XLI"
-    - "XLII"
-    - "XLIII"
-    - "XLIV"
-    - "XLIX"
-    - "XLV"
-    - "XLVI"
-    - "XLVII"
-    - "XLVIII"
-    - "XV"
-    - "XVI"
-    - "XVII"
-    - "XVIII"
-    - "XX"
-    - "XXI"
-    - "XXII"
-    - "XXIII"
-    - "XXIV"
-    - "XXIX"
-    - "XXV"
-    - "XXVI"
-    - "XXVII"
-    - "XXVIII"
-    - "XXX"
-    - "XXXI"
-    - "XXXII"
-    - "XXXIII"
-    - "XXXIV"
-    - "XXXIX"
-    - "XXXV"
-    - "XXXVI"
-    - "XXXVII"
-    - "XXXVIII"
+    - "\\<I{2,3}\\>"
+    - "\\<I(V|X)\\>"
+    - "\\<LI{,3}\\>"
+    - "\\<LI?(V|X)\\>"
+    - "\\<L(V|X{1,3})I{,3}\\>"
+    - "\\<LX{1,3}I?V\\>"
+    - "\\<LX{1,3}VI{,3}\\>"
+    - "\\<(V|X{1,3})I{,3}\\>"
+    - "\\<X{1,3}I{,3}\\>"
+    - "\\<X{1,3}I(V|X)\\>"
+    - "\\<X{1,3}VI{,3}\\>"
+    - "\\<and ([a-z]+ )?others\\>"
+  ignore:
+    - "at head of title"
+    - "colophon"
+    - "date of publication not identified"
+    - "place of publication not identified"
+    - "publisher not identified"
+    #- "II"
+    #- "III"
+    #- "IV"
+    #- "IX"
+    #- "LI"
+    #- "LII"
+    #- "LIII"
+    #- "LIV"
+    #- "LIX"
+    #- "LV"
+    #- "LVI"
+    #- "LVII"
+    #- "LVIII"
+    #- "LX"
+    #- "LXI"
+    #- "LXII"
+    #- "LXIII"
+    #- "LXIV"
+    #- "LXIX"
+    #- "LXV"
+    #- "LXVI"
+    #- "LXVII"
+    #- "LXVIII"
+    #- "LXX"
+    #- "LXXI"
+    #- "LXXII"
+    #- "LXXIII"
+    #- "LXXIV"
+    #- "LXXIX"
+    #- "LXXV"
+    #- "LXXVI"
+    #- "LXXVII"
+    #- "LXXVIII"
+    #- "LXXX"
+    #- "LXXXI"
+    #- "LXXXII"
+    #- "LXXXIII"
+    #- "LXXXIV"
+    #- "LXXXIX"
+    #- "LXXXV"
+    #- "LXXXVI"
+    #- "LXXXVII"
+    #- "LXXXVIII"
+    #- "VI"
+    #- "VII"
+    #- "VIII"
+    #- "XI"
+    #- "XII"
+    #- "XIII"
+    #- "XIV"
+    #- "XIX"
+    #- "XL"
+    #- "XLI"
+    #- "XLII"
+    #- "XLIII"
+    #- "XLIV"
+    #- "XLIX"
+    #- "XLV"
+    #- "XLVI"
+    #- "XLVII"
+    #- "XLVIII"
+    #- "XV"
+    #- "XVI"
+    #- "XVII"
+    #- "XVIII"
+    #- "XX"
+    #- "XXI"
+    #- "XXII"
+    #- "XXIII"
+    #- "XXIV"
+    #- "XXIX"
+    #- "XXV"
+    #- "XXVI"
+    #- "XXVII"
+    #- "XXVIII"
+    #- "XXX"
+    #- "XXXI"
+    #- "XXXII"
+    #- "XXXIII"
+    #- "XXXIV"
+    #- "XXXIX"
+    #- "XXXV"
+    #- "XXXVI"
+    #- "XXXVII"
+    #- "XXXVIII"
     - "and one other"
-    #- re: "and ([a-z0-9]+ )?others"
     - "et al."
 
 

+ 32 - 1
transliterator/trans.py

@@ -88,6 +88,7 @@ def transliterate(src, lang, r2s=False):
     # Loop through source characters. The increment of each loop depends on
     # the length of the token that eventually matches.
     ignore_list = langsec.get("ignore", [])  # Only present in R2S
+    ignore_ptn_list = langsec.get("ignore_ptn", [])  # Only present in R2S
     ctx.cur = 0
     while ctx.cur < len(src):
         # Reset cursor position flags.
@@ -113,11 +114,41 @@ def transliterate(src, lang, r2s=False):
             logger.debug("Skipping scanning iteration from hook signal.")
             continue
 
-        # Check ignore list. Find as many subsequent ignore tokens
+        # Check ignore lists. Find as many subsequent ignore tokens
         # as possible before moving on to looking for match tokens.
         ctx.tk = None
         while True:
             ctx.ignoring = False
+            # Ignore patterns.
+            for ctx.tk in ignore_ptn_list:
+                hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
+                if hret == BREAK:
+                    break
+                if hret == CONT:
+                    continue
+
+                step = len(ctx.tk)
+                # FIXME This is a problem if we want to specify
+                # beginning-of-word matches: we are not reading the previous
+                # token, so the only way to know we are at the beginning of
+                # a word is CUR_BOW. That means we would have to analyze the
+                # regexp to find out whether it is looking for BOW. Messy.
+                match = re.match(src[ctx.cur:])
+                if match:
+                    # The position matches an ignore pattern.
+                    hret = _run_hook("on_ignore_match", ctx, langsec_hooks)
+                    if hret == BREAK:
+                        break
+                    if hret == CONT:
+                        continue
+
+                    logger.info(f"Ignored token: {ctx.tk}")
+                    ctx.dest_ls.append(ctx.tk)
+                    ctx.cur += step
+                    ctx.ignoring = True
+                    break
+
+            # Ignore plain strings.
             for ctx.tk in ignore_list:
                 hret = _run_hook("pre_ignore_token", ctx, langsec_hooks)
                 if hret == BREAK: