Selaa lähdekoodia

Fix initial, medial, final handling; add tests.

scossu 2 kuukautta sitten
vanhempi
commit
dc74924a66

+ 9 - 7
scriptshifter/tables/__init__.py

@@ -84,12 +84,12 @@ class Token(str):
         # Standalone has precedence, then initial, then final, then medial.
         # This is somewhat arbitrary and may change if special cases arise.
         # WB markers are moved to flags to allow default comparison.
-        if self.content.endswith(TOKEN_WB_MARKER):
-            self.flags |= BOW
-            self.content = self.content.rstrip(TOKEN_WB_MARKER)
         if self.content.startswith(TOKEN_WB_MARKER):
-            self.flags |= EOW
+            self.flags |= BOW
             self.content = self.content.lstrip(TOKEN_WB_MARKER)
+        if self.content.endswith(TOKEN_WB_MARKER):
+            self.flags |= EOW
+            self.content = self.content.rstrip(TOKEN_WB_MARKER)
 
     def __lt__(self, other):
         """
@@ -115,9 +115,9 @@ class Token(str):
         if (
                 (self.flags > 0 or other.flags > 0)
                 and self.content == other.content):
-            logger.debug(f"{self.content} flags: {self.flags}")
-            logger.debug(f"{other.content} flags: {other.flags}")
-            logger.debug("Performing flags comparison.")
+            # logger.debug(f"{self.content} flags: {self.flags}")
+            # logger.debug(f"{other.content} flags: {other.flags}")
+            # logger.debug("Performing flags comparison.")
 
             return self.flags > other.flags
 
@@ -202,6 +202,8 @@ def populate_table(conn, tname, tdata):
 
     @param tdata(dict): Table data.
     """
+    logger.info(f"Populating table: {tname}")
+
     res = conn.execute(
         """INSERT INTO tbl_language (
             name, label, marc_code, description

+ 1 - 3
scriptshifter/tables/data/_ignore_base.yml

@@ -1,9 +1,9 @@
+---
 general:
   name: Common ignore list.
 
 roman_to_script:
   ignore:
-    - " "
     - "at head of title"
     - "colophon"
     - "date of publication not identified"
@@ -38,8 +38,6 @@ roman_to_script:
     - "\\b[\u2021$][0-9a-z]\\b"
 
 script_to_roman:
-  ignore:
-    - " "
   ignore_ptn:
     # MARC sub-field markers.
     - "\\b[\u2021$][0-9a-z]\\b"

+ 2 - 0
test/data/script_samples/unittest.csv

@@ -6,3 +6,5 @@
 "rot3","st uv","Vw Xy","r2s","{""capitalize"": ""all""}"
 "regex","Hello abc","Hello 678","r2s",
 "regex","Hullo abc","5u22o 678","r2s",
+"word_boundaries","bab aa b.abc c, dae abada:ddd vb","<212> <11> 020.<123> 030, <41e <12141>:<444> v2>","r2s"
+"word_boundaries","43 23432 455 4:3 51, 11","<dc> <bcdcb> <d55 0d0:0c0 5a>, <aa>","s2r"

+ 2 - 0
test/unittest/tables/index.yml

@@ -15,3 +15,5 @@ regex:
   name: inherited config + regex ignore.
 rot3:
   name: Test ROT3 hooks
+word_boundaries:
+  name: Word boundaries