Browse Source

First round of debugging for korean_nonames.

scossu 1 year ago
parent
commit
6a725be434

+ 27 - 27
scriptshifter/hooks/korean/data.yml

@@ -2513,7 +2513,7 @@ fkr060:
   "-nyŏndae ": "-yŏndae "
   "-nyŏndae ": "-yŏndae "
   "-nyŏnsa ": "-yŏnsa "
-  "-nyŏnsaeng ": "-yŏnsaeng "]]
+  "-nyŏnsaeng ": "-yŏnsaeng "
 
 fkr061:
   " kangnamgu ": " Kangnam-gu "
@@ -4142,28 +4142,29 @@ fkr113-115:
   - "립스"
 
 fkr116:
-  - "로부"
-  - "로서"
-  - "로써"
-  - "로는"
-  - "라는"
-  - "라면"
-  - "라든"
-  - "라고"
-  - "라도"
-  - "라서"
-  - "라야"
-  - "로의"
-
-fkr116a:
-  "ra": "na"
-  "ro": "no"
-  "rw": "nw"
-  "ru": "nu"
-  "ri": "i"
-  "ry": "y"
-  "ni": "i"
-  "ny": "y"
+  particles:
+    - "로부"
+    - "로서"
+    - "로써"
+    - "로는"
+    - "라는"
+    - "라면"
+    - "라든"
+    - "라고"
+    - "라도"
+    - "라서"
+    - "라야"
+    - "로의"
+
+  replace_initials:
+    "ra": "na"
+    "ro": "no"
+    "rw": "nw"
+    "ru": "nu"
+    "ri": "i"
+    "ry": "y"
+    "ni": "i"
+    "ny": "y"
 
 fkr118:
   - "강종"
@@ -5629,7 +5630,7 @@ fkr146:
   "肹": "힐"
   "黠": "힐"
 
-fkr147a:
+fkr147:
   "家": "가"
   "歌": "가"
   "價": "가"
@@ -5880,8 +5881,7 @@ fkr147a:
   "鵙": "격"
   "鴃": "격"
   "見": "견"
-
-fkr147b:
+  # K-Romanizer set is split here
   "堅": "견"
   "犬": "견"
   "遣": "견"
@@ -11713,7 +11713,7 @@ fkr172-179:
   - "慄"  # FKR178
   - "栗"  # FKR179
 
-katakana:
+fkr180:
   " カン": " 간"
   " キャ": "갸"
   " キュ": "규"

+ 56 - 40
scriptshifter/hooks/korean/romanizer.py

@@ -51,41 +51,46 @@ def s2r_names_post_config(ctx):
     return BREAK
 
 
-def _romanize_nonames(src, capitalize=False, hancha=False):
+def _romanize_nonames(src, capitalize=False, hancha=True):
     """ Main Romanization function for non-name strings. """
 
     # FKR038: Convert Chinese characters to Hangul
     if hancha:
-        src = _hancha2hangul(_marc8_hancha(src))
+        kor = _hancha2hangul(_marc8_hancha(src))
+    else:
+        kor = src
 
-    data = f" {src} "
+    # Replace ideographic spaces with ASCII space.
+    kor = re.sub(r"\s+", " ", kor)
+    kor = f" {kor} "
 
     # FKR039: Replace Proper name with spaces in advance
     # FKR040: Replace Proper name with a hyphen in advance
     # FKR041: Romanize names of Hangul consonants
     for fkrcode in ("fkr039", "fkr040", "fkr041"):
         logger.debug(f"Applying {fkrcode}")
-        data = _replace_map(data, KCONF[fkrcode])
+        kor = _replace_map(kor, KCONF[fkrcode])
 
     # NOTE This is slightly different from LL 929-930 in that it doesn't
     # result in double spaces.
-    data = data.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
+    kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
     # This is more compact but I'm unsure if the replacement order is kept.
-    # data = data.replace({"\r\n": " ", "\r": " ", "\n": " "})
+    # kor = kor.replace({"\r\n": " ", "\r": " ", "\n": " "})
 
-    rom = _romanize_oclc_auto(data)
+    rom = _romanize_oclc_auto(kor)
 
     # FKR042: Capitalize all first letters
     if capitalize == "all":
-        rom = data.title()
+        rom = rom.title()
     # FKR043: Capitalize the first letter
     elif capitalize == "first":
-        rom = data.capitalize()
+        rom = rom.capitalize()
 
     # FKR044: Ambiguities
     ambi = re.sub("[,.\";: ]+", " ", rom)
 
     # @TODO Move this to a generic normalization step (not only for K)
+    breakpoint()
     rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})
 
     # TODO Decide what to do with these. There is no facility for outputting
@@ -99,7 +104,11 @@ def _romanize_nonames(src, capitalize=False, hancha=False):
 
 
 def _romanize_names(src):
-    """ Main Romanization function for names. """
+    """
+    Main Romanization function for names.
+
+    K-Romanizer: KorNameRom20
+    """
 
     warnings = []
 
@@ -126,7 +135,7 @@ def _romanize_names(src):
         warnings.append(f"{src} may not be a personal name.")
         return None, warnings
 
-    src, _warnings = _parse_kor_name(re.sub("\\W{2,}", " ", src.strip()))
+    src, _warnings = _parse_kor_name(re.sub(r"\s{2,}", " ", src.strip()))
 
     if len(_warnings):
         warnings += _warnings
@@ -209,7 +218,8 @@ def _kor_corp_name_rom(src):
 
 
 def _romanize_oclc_auto(data):
-    # FKR050: Starts preprocessing symbol 
+    breakpoint()
+    # FKR050: Starts preprocessing symbol
     for rname, rule in KCONF["fkr050"].items():
         logger.debug(f"Applying fkr050[{rname}]")
         data = _replace_map(data, rule)
@@ -223,7 +233,7 @@ def _romanize_oclc_auto(data):
         data = _replace_map(data, rule)
 
     # Strip end and multiple whitespace.
-    data = re.sub("\\W{2,}", " ", data.strip())
+    data = re.sub(r"\s{2,}", " ", data.strip())
 
     data = data.replace("^", " GLOTTAL ")
 
@@ -233,64 +243,67 @@ def _romanize_oclc_auto(data):
     data = " ".join(data_ls)
 
     # FKR059: Apply glottalization
-    data = f" {data.lstrip()} ".replace({" GLOTTAL ": "", "*": "", "^": ""})
+    data = _replace_map(
+            f" {data.strip()} ", {" GLOTTAL ": "", "*": "", "^": ""})
 
     # FKR060: Process number + -년/-년도/-년대
     # TODO Add leading whitespace as per L1221? L1202 already added one.
     data = _replace_map(data, KCONF["fkr060"])
 
-    data = re.sub("\\W{2,}", " ", f" {data.strip()} ")
+    data = re.sub(r"\s{2,}", " ", f" {data.strip()} ")
 
     # FKR061: Jurisdiction (시)
     # FKR063: Jurisdiction (국,도,군,구)
     # FKR064: Temple names of Kings, Queens, etc. (except 조/종)
     # FKR065: Frequent historical names
-    logger.debug("Applying FKR062-065")
-    data = _replace_map(
-            data,
-            KCONF["fkr061"] + KCONF["fkr063"] +
-            KCONF["fkr064"] + KCONF["fkr065"])
+    for fkrkey in ("fkr061", "fkr063", "fkr064", "fkr065"):
+        logger.debug(f"Applying {fkrkey.upper()}")
+        data = _replace_map(data, KCONF[fkrkey])
 
     # FKR066: Starts restore symbols
     for rname, rule in KCONF["fkr066"].items():
         logger.debug(f"Applying FKR066[{rname}]")
         data = _replace_map(data, rule)
 
-    data = re.sub("\\W{2,}", " ", data.strip())
+    data = re.sub(r"\s{2,}", " ", data.strip())
 
     return data
 
 
-def _kor_rom(data):
-    data = re.sub("\\W{2,}", " ", data.strip())
+# FKR068: Exceptions, Exceptions to initial sound law, Proper names
+def _kor_rom(kor):
+    kor = re.sub(r"\s{2,}", " ", kor.strip())
+    orig = kor
 
     # FKR069: Irregular sound change list
-    data = _replace_map(data, KCONF["fkr069"])
+    kor = _replace_map(kor, KCONF["fkr069"])
 
     # FKR070: [n] insertion position mark +
-    niun = data.find("+")
-    if niun:
-        data = data.replace("+", "")
-        orig = data
+    niun = kor.find("+")
+    if niun > -1:
+        kor = kor.replace("+", "")
+        orig = kor
 
     non_kor = 0
-    cpoints = tuple(ord(c) for c in data)
+    cpoints = tuple(ord(c) for c in kor)
     for cp in cpoints:
         if cp < CP_MIN:
             non_kor += 1
-            data = data[1:]
+            kor = kor[1:]
 
     rom_ls = []
-    for i in range(len(data)):
+    for i in range(len(kor)):
         cp = cpoints[i] - CP_MIN
         ini = "i" + str(cp // 588)
         med = "m" + str((cp // 28) % 21)
         fin = "f" + str(cp % 28)
         rom_ls.append("#".join((ini, med, fin)))
-    rom = "~".join(rom_ls) + "E"
+    rom = "~".join(rom_ls)
+    if len(rom):
+        rom = rom + "E"
 
     # FKR071: [n] insertion
-    if niun:
+    if niun > -1:
         rom_niun_a, rom_niun_b = rom[:niun - 1].split("~", 1)
         if re.match("ill#m(?:2|6|12|17|20)", rom_niun_b):
             logger.debug("Applying FKR071")
@@ -351,7 +364,7 @@ def _kor_rom(data):
         rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
 
     # FKR109: Convert everything else
-    for pos, data in KCONF["fkr109"]:
+    for pos, data in KCONF["fkr109"].items():
         logger.debug(f"Applying FKR109[{pos}]")
         rom = _replace_map(rom, data)
 
@@ -359,7 +372,9 @@ def _kor_rom(data):
     rom = _replace_map(rom, {"#": "", "~": ""})
 
     if non_kor > 0:
-        rom = f"{orig[:non_kor]}-{rom}"
+        # Modified from K-Romanizer:1727 in that it does not append a hyphen
+        # if the whole word is non-Korean.
+        rom = f"{orig[:non_kor]}-{rom}" if len(rom) else orig
 
     # FKR111: ㄹ + 모음/ㅎ/ㄹ, ["lr","ll"] must be in the last of the array
     rom = _replace_map(rom, KCONF["fkr111"])
@@ -369,16 +384,17 @@ def _kor_rom(data):
     # FKR113: Check loan words by the first 1 letter
     # FKR114: Check loan words by the first 2 letters
     # FKR115: Check loan words by the first 3 letters
-    if orig.strip().startswith(tuple(KCONF["fkr113-115"])):
+    if orig.startswith(tuple(KCONF["fkr113-115"])):
         is_non_kor = True
 
     # FKR116: Exceptions to initial sound law - particles
     is_particle = False
-    if orig.strip().startswith(tuple(KCONF["fkr116"])):
+    if orig.startswith(tuple(KCONF["fkr116"]["particles"])):
         is_particle = True
 
     if len(orig) > 1 and not is_non_kor and not is_particle:
-        rom = _replace_map(rom, KCONF["fkr116a"])
+        if rom.startswith(tuple(KCONF["fkr116"]["replace_initials"].keys())):
+            rom = _replace_map(rom, KCONF["fkr116"]["replace_initials"])
 
     # FKR117: Proper names _StringPoper Does not work because of breves
     if (
@@ -414,7 +430,7 @@ def _hancha2hangul(data):
     # FKR144: Apply initial sound law (Except: 列, 烈, 裂, 劣)
     # FKR145: Simplified characters, variants
     # FKR146: Some characters from expanded list
-    # FKR147: Chinese characters 1-250 車=차
+    # FKR147: Chinese characters 1-500 車=차
     # FKR148: Chinese characters 501-750 串=관
     # FKR149: Chinese characters 751-1000 金=금, 娘=랑
     # FKR150: Chinese characters 1001-1250
@@ -473,7 +489,7 @@ def _hancha2hangul(data):
     logger.debug("Applying FKR180")
     data = _replace_map(data, KCONF["fkr180"])
 
-    return re.sub("\\W{2,}", " ", data.strip())
+    return re.sub(r"\s{2,}", " ", data.strip())
 
 
 def _replace_map(src, rmap, *args, **kw):

+ 1 - 0
scriptshifter/rest_api.py

@@ -11,6 +11,7 @@ from scriptshifter.trans import transliterate
 
 
 logger = logging.getLogger(__name__)
+logging.basicConfig(level=environ.get("TXL_LOGLEVEL", logging.INFO))
 
 
 def create_app():