浏览代码

WIP Still attempting to tame punctuation.

scossu 5 月之前
父节点
当前提交
06f9fdc27d
共有 1 个文件被更改，包括 16 次插入和 14 次删除
  1. 16 14
      scriptshifter/hooks/korean/romanizer.py

+ 16 - 14
scriptshifter/hooks/korean/romanizer.py

@@ -35,8 +35,8 @@ PWD = path.dirname(path.realpath(__file__))
 CP_MIN = 44032
 ALL_PUNCT_STR = (
     r'[\!"#$%&\'\(\)\*\+\,\-./:;<=>?・ǂ「」『』@\[\\\]\^_`{|}~‡‰‘’“”–—˜©·]')
-# Capture adjacent punctuation symbols and remove spacing in between.
-PUNCT_SPACING_RE = re.compile(f"({ALL_PUNCT_STR})\\s+({ALL_PUNCT_STR})")
+LEAD_PUNCT_RE = re.compile(r"([^\w\s])(\w)")
+TRAIL_PUNCT_RE = re.compile(r"(\w)([^\w\s])")
 
 # Build FKR index for better logging.
 with open(path.join(PWD, "FKR_index.csv"), newline='') as fh:
@@ -319,9 +319,9 @@ def _kor_corp_name_rom(src):
 
 
 def _romanize_oclc_auto(kor):
-    # FKR050: Starts preprocessing symbol
-    _fkr_log(50)
-    # kor = _replace_map(kor, KCONF["fkr050"])
+    # Insert a space between a word character and any punctuation directly
+    # attached to it, whether the punctuation leads or trails the word.
+    kor = LEAD_PUNCT_RE.sub("\\1 \\2", kor)
+    kor = TRAIL_PUNCT_RE.sub("\\1 \\2", kor)
 
     # See https://github.com/lcnetdev/scriptshifter/issues/19
     kor = re.sub("제([0-9])", "제 \\1", kor)
@@ -340,7 +340,6 @@ def _romanize_oclc_auto(kor):
     logger.debug(f"Korean before romanization: {kor}")
 
     rom_ls = []
-    breakpoint()
     for word in kor.split(" "):
         rom_ls.append(_kor_rom(word))
     rom = " ".join(rom_ls)
@@ -364,14 +363,9 @@ def _romanize_oclc_auto(kor):
         _fkr_log(i)
         rom = _replace_map(rom, KCONF[f"fkr{i:03}"])
 
-    # FKR066: Starts restore symbols
-    _fkr_log(66)
-    rom = _replace_map(rom, KCONF["fkr066"])
-    # Remove spacing between punctuation symbols.
-    rom = PUNCT_SPACING_RE.sub(r"\1\2", rom.strip())
-    # Remove spaces from before symbols.
-    rom = re.sub(r" (?=[,.;:?!])", "", rom)
-    rom = re.sub(r"\s{2,}", " ", rom)
+    rom = re.sub(r"\s{2,}", " ", rom.strip())
+    rom = re.sub(r" (?=[,.;:?!\]\)\}])", "", rom)
+    rom = re.sub(r"(?<=[\[\(\{]) ", "", rom)
 
     return rom
 
@@ -396,6 +390,9 @@ def _kor_rom(kor):
         if cp < CP_MIN:
             non_kor += 1
             kor = kor[1:]
+        else:
+            # Break as soon as a Korean code point is found.
+            break
 
     rom_ls = []
     if non_kor > 0:
@@ -403,6 +400,10 @@ def _kor_rom(kor):
         cpoints = tuple(ord(c) for c in kor)
     for i in range(len(kor)):
         cp = cpoints[i] - CP_MIN
+        if cp < 0:
+            # This accounts for punctuation attached to the end of the word.
+            rom_ls.append(kor[i])
+            continue
         ini = "i" + str(cp // 588)
         med = "m" + str((cp // 28) % 21)
         fin = "f" + str(cp % 28)
@@ -529,6 +530,7 @@ def _kor_rom(kor):
 
     # @TODO Move this to a generic normalization step (not only for K)
     rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})
+    logger.debug(f"Romanized token: {rom}")
 
     return rom