Browse Source

WIP complete _kor_rom()

scossu 1 year ago
parent
commit
1a148c9248
2 changed files with 280 additions and 190 deletions
  1. 198 172
      scriptshifter/hooks/korean/data.yml
  2. 82 18
      scriptshifter/hooks/korean/romanizer.py

+ 198 - 172
scriptshifter/hooks/korean/data.yml

@@ -3962,178 +3962,178 @@ fkr111:
   "lw": "rw"
   "lr": "ll"
 
-fkr113:
-  rule1:
-    - "냐"
-    - "뉴"
-    - "니"
-    - "랃"
-    - "랏"
-    - "랙"
-    - "랜"
-    - "랟"
-    - "랠"
-    - "램"
-    - "랩"
-    - "랫"
-    - "랴"
-    - "랸"
-    - "랻"
-    - "랼"
-    - "럄"
-    - "럅"
-    - "럇"
-    - "러"
-    - "럭"
-    - "런"
-    - "럴"
-    - "럼"
-    - "럽"
-    - "럿"
-    - "렁"
-    - "레"
-    - "렉"
-    - "렌"
-    - "렐"
-    - "렘"
-    - "렙"
-    - "렛"
-    - "렝"
-    - "렷"
-    - "롄"
-    - "롤"
-    - "롬"
-    - "롭"
-    - "롯"
-    - "롸"
-    - "뢴"
-    - "룀"
-    - "룩"
-    - "룬"
-    - "룰"
-    - "룸"
-    - "룹"
-    - "룻"
-    - "룽"
-    - "뤄"
-    - "뤼"
-    - "뤽"
-    - "륀"
-    - "르"
-    - "른"
-    - "릭"
-    - "릴"
-    - "릿"
-    - "링"
-
-  loan_w_set:
-    - "녀석"
-    - "라디"
-    - "라마"
-    - "라미"
-    - "라스"
-    - "라오스"
-    - "라운드"
-    - "라이"
-    - "라트비"
-    - "라틴"
-    - "래스"
-    - "레슨"
-    - "레터"
-    - "로고스"
-    - "로그"
-    - "로댕"
-    - "로데오"
-    - "로뎀"
-    - "로드"
-    - "로레"
-    - "로렌"
-    - "로마"
-    - "로만"
-    - "로망"
-    - "로맨"
-    - "로미"
-    - "로미오"
-    - "로버"
-    - "로베"
-    - "로보트"
-    - "로보틱"
-    - "로봇"
-    - "로비스"
-    - "로빈"
-    - "로스"
-    - "로얄"
-    - "로이"
-    - "로이드"
-    - "로자리"
-    - "로잔"
-    - "로제타"
-    - "로즈"
-    - "로지"
-    - "로직"
-    - "로칼"
-    - "로컬"
-    - "로터"
-    - "로테"
-    - "로펌"
-    - "록펠"
-    - "료마"
-    - "루마니"
-    - "루미"
-    - "루벤"
-    - "루스"
-    - "루이"
-    - "루트"
-    - "리눅"
-    - "리니"
-    - "리더"
-    - "리드"
-    - "리듬"
-    - "리디"
-    - "리딩"
-    - "리메"
-    - "리멤"
-    - "리모델"
-    - "리모트"
-    - "리미트"
-    - "리바"
-    - "리버"
-    - "리베"
-    - "리본"
-    - "리뷰"
-    - "리비아"
-    - "리빙"
-    - "리빠"
-    - "리사이"
-    - "리셋"
-    - "리스"
-    - "리싸"
-    - "리액"
-    - "리얼"
-    - "리우"
-    - "리움"
-    - "리조트"
-    - "리즈"
-    - "리차드"
-    - "리처드"
-    - "리커"
-    - "리코"
-    - "리콜"
-    - "리터"
-    - "리턴"
-    - "리토피"
-    - "리투아"
-    - "리트"
-    - "리튼"
-    - "리틀"
-    - "리퍼"
-    - "리포터"
-    - "리포트"
-    - "리플"
-    - "리허"
-    - "리히텐"
-    - "림프"
-    - "립스"
+fkr113-115:
+# rule1
+  - "냐"
+  - "뉴"
+  - "니"
+  - "랃"
+  - "랏"
+  - "랙"
+  - "랜"
+  - "랟"
+  - "랠"
+  - "램"
+  - "랩"
+  - "랫"
+  - "랴"
+  - "랸"
+  - "랻"
+  - "랼"
+  - "럄"
+  - "럅"
+  - "럇"
+  - "러"
+  - "럭"
+  - "런"
+  - "럴"
+  - "럼"
+  - "럽"
+  - "럿"
+  - "렁"
+  - "레"
+  - "렉"
+  - "렌"
+  - "렐"
+  - "렘"
+  - "렙"
+  - "렛"
+  - "렝"
+  - "렷"
+  - "롄"
+  - "롤"
+  - "롬"
+  - "롭"
+  - "롯"
+  - "롸"
+  - "뢴"
+  - "룀"
+  - "룩"
+  - "룬"
+  - "룰"
+  - "룸"
+  - "룹"
+  - "룻"
+  - "룽"
+  - "뤄"
+  - "뤼"
+  - "뤽"
+  - "륀"
+  - "르"
+  - "른"
+  - "릭"
+  - "릴"
+  - "릿"
+  - "링"
+
+# loan_w_set
+  - "녀석"
+  - "라디"
+  - "라마"
+  - "라미"
+  - "라스"
+  - "라오스"
+  - "라운드"
+  - "라이"
+  - "라트비"
+  - "라틴"
+  - "래스"
+  - "레슨"
+  - "레터"
+  - "로고스"
+  - "로그"
+  - "로댕"
+  - "로데오"
+  - "로뎀"
+  - "로드"
+  - "로레"
+  - "로렌"
+  - "로마"
+  - "로만"
+  - "로망"
+  - "로맨"
+  - "로미"
+  - "로미오"
+  - "로버"
+  - "로베"
+  - "로보트"
+  - "로보틱"
+  - "로봇"
+  - "로비스"
+  - "로빈"
+  - "로스"
+  - "로얄"
+  - "로이"
+  - "로이드"
+  - "로자리"
+  - "로잔"
+  - "로제타"
+  - "로즈"
+  - "로지"
+  - "로직"
+  - "로칼"
+  - "로컬"
+  - "로터"
+  - "로테"
+  - "로펌"
+  - "록펠"
+  - "료마"
+  - "루마니"
+  - "루미"
+  - "루벤"
+  - "루스"
+  - "루이"
+  - "루트"
+  - "리눅"
+  - "리니"
+  - "리더"
+  - "리드"
+  - "리듬"
+  - "리디"
+  - "리딩"
+  - "리메"
+  - "리멤"
+  - "리모델"
+  - "리모트"
+  - "리미트"
+  - "리바"
+  - "리버"
+  - "리베"
+  - "리본"
+  - "리뷰"
+  - "리비아"
+  - "리빙"
+  - "리빠"
+  - "리사이"
+  - "리셋"
+  - "리스"
+  - "리싸"
+  - "리액"
+  - "리얼"
+  - "리우"
+  - "리움"
+  - "리조트"
+  - "리즈"
+  - "리차드"
+  - "리처드"
+  - "리커"
+  - "리코"
+  - "리콜"
+  - "리터"
+  - "리턴"
+  - "리토피"
+  - "리투아"
+  - "리트"
+  - "리튼"
+  - "리틀"
+  - "리퍼"
+  - "리포터"
+  - "리포트"
+  - "리플"
+  - "리허"
+  - "리히텐"
+  - "림프"
+  - "립스"
 
 fkr116:
   - "로부"
@@ -4149,6 +4149,16 @@ fkr116:
   - "라야"
   - "로의"
 
+fkr116a:
+  "ra": "na"
+  "ro": "no"
+  "rw": "nw"
+  "ru": "nu"
+  "ri": "i"
+  "ry": "y"
+  "ni": "i"
+  "ny": "y"
+
 fkr118:
   - "강종"
   - "경종"
@@ -4396,6 +4406,22 @@ fkr119:
   - "한국"
   - "호주"
 
+fkr119_suffix:
+  - "말"
+  - "인"
+  - "제"
+  - "어"
+  - "학"
+  - "사"
+  - "식"
+  - "산"
+  - "령"
+  - "행"
+  - "군"
+  - "계"
+  - "화"
+  - "적"
+
 fkr120:
   - "로스앤젤레스"
   - "브로드웨이"

+ 82 - 18
scriptshifter/hooks/korean/romanizer.py

@@ -61,7 +61,7 @@ def _romanize_nonames(src, hancha=False):
     # FKR039, FKR040, FKR041
     for fkrcode in ("fkr039", "fkr040", "fkr041"):
         logger.debug(f"Applying {fkrcode}")
-        data = data.replace(KCONF[fkrcode])
+        data = _replace_map(data, KCONF[fkrcode])
 
     # NOTE This is slightly different from LL 929-930 in that it doesn't
     # result in double spaces.
@@ -85,7 +85,7 @@ def _romanize_nonames(src, hancha=False):
     no_oclc_breve = False
 
     if no_oclc_breve:
-        data = data.replace({"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})
+        data = _replace_map(data, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})
 
     # TODO Decide what to do with these. There is no facility for outputting
     # warnings or notes to the user yet.
@@ -106,7 +106,7 @@ def _romanize_oclc_auto(data):
     # FKR050
     for rname, rule in KCONF["fkr050"].items():
         logger.debug(f"Applying fkr050[{rname}]")
-        data = data.replace(rule)
+        data = _replace_map(data, rule)
 
     # NOTE: Is this meant to replace " 제" followed by a digit with " 제 "?
     # This may not yield the expected results as it could replace all
@@ -119,7 +119,7 @@ def _romanize_oclc_auto(data):
     # FKR052
     for rname, rule in KCONF["fkr052"].items():
         logger.debug(f"Applying fkr052[{rname}]")
-        data = data.replace(rule)
+        data = _replace_map(data, rule)
 
     # Strip end and multiple whitespace.
     data = re.sub("\\W{2,}", " ", data.strip())
@@ -136,19 +136,24 @@ def _romanize_oclc_auto(data):
 
     # FKR060
     # TODO Add leading whitespace as per L1221? L1202 already added one.
-    data = data.replace(KCONF["fkr060"])
+    data = _replace_map(data, KCONF["fkr060"])
 
     data = re.sub("\\W{2,}", " ", f" {data.strip()} ")
 
-    # FKR061 FKR063 FKR064 FKR065
+    # FKR061
+    # FKR063
+    # FKR064
+    # FKR065
     logger.debug("Applying FKR062-065")
-    data = data.replace(KCONF["fkr061"]).replace(KCONF["fkr063"]).replace(
-            KCONF["fkr064"]).replace(KCONF["fkr065"])
+    data = _replace_map(
+            data,
+            KCONF["fkr061"] + KCONF["fkr063"] +
+            KCONF["fkr064"] + KCONF["fkr065"])
 
     # FKR066
     for rname, rule in KCONF["fkr066"].items():
         logger.debug(f"Applying FKR066[{rname}]")
-        data = data.replace(rule)
+        data = _replace_map(data, rule)
 
     data = re.sub("\\W{2,}", " ", data.strip())
 
@@ -156,8 +161,10 @@ def _romanize_oclc_auto(data):
 
 
 def _kor_rom(data):
+    data = re.sub("\\W{2,}", " ", data.strip())
+
     # FKR069
-    data = data.replace(KCONF["fkr069"])
+    data = _replace_map(data, KCONF["fkr069"])
 
     # FKR070
     niun = data.find("+")
@@ -183,7 +190,7 @@ def _kor_rom(data):
 
     # FKR071
     if niun:
-        rom_niun_a, rom_niun_b = rom.split("~", 1)
+        rom_niun_a, rom_niun_b = rom[:niun - 1].split("~", 1)
         if re.match("ill#m(?:2|6|12|17|20)", rom_niun_b):
             logger.debug("Applying FKR071")
             rom_niun_b = rom_niun_b.replace("i11#m", "i2#m", 1)
@@ -200,21 +207,70 @@ def _kor_rom(data):
     for k, cmap in KCONF["fkr073-100"].items():
         if k in rom:
             logger.debug(f"Applying FKR{fkr_i:03}")
-            rom.replace(cmap)
+            _replace_map(rom, cmap)
         fkr_i += 1
 
     # FKR101-108
     for fkr_i in range(101, 109):
         logger.debug(f"Applying FKR{fkr_i:03}")
-        rom = rom.replace(KCONF[f"fkr{fkr_i:03}"])
-
-    return data
+        rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
+
+    # FKR109
+    for pos, data in KCONF["fkr109"]
+        logger.debug(f"Applying FKR109[{pos}]")
+        rom = _replace_map(rom, data)
+
+    # FKR110
+    rom = _replace_map(rom, {"#": "", "~": ""})
+
+    if non_kor > 0:
+        rom = f"{orig[:non_kor]}-{rom}"
+
+    # FKR111
+    rom = _replace_map(rom, KCONF["fkr111"])
+
+    # FKR112
+    is_non_kor = False
+    # FKR113
+    # FKR114
+    # FKR115
+    if orig.strip().startswith(tuple(KCONF["fkr113-115"])):
+        is_non_kor = True
+
+    # FKR116
+    is_particle = False
+    if orig.strip().startswith(tuple(KCONF["fkr116"])):
+        is_particle = True
+
+    if len(orig) > 1 and not is_non_kor and not is_particle:
+        rom = _replace_map(rom, KCONF["fkr116a"])
+
+    # FKR117
+    if (
+            # FKR118
+            orig in KCONF["fkr118"] or
+            # FKR119
+            orig in KCONF["fkr119"] or
+            orig.endswith(tuple(KCONF["fkr119_suffix"])):
+            # FKR120
+            orig.endswith(tuple(KCONF["fkr120"])):
+        rom = rom.capitalize()
+
+    # FKR121
+    # TODO Check global $ConvertR2L assigned in L17 and tested in L1849.
+    if f" {orig} " in KCONF["fkr121"]:
+        if rom.startswith("r"):
+            rom = "l" + rom[1:]
+        elif rom.startswith("R"):
+            rom = "L" + rom[1:]
+
+    return rom
 
 
 def _marc8_hancha(data):
     # FKR142
     logger.debug("Applying FKR142")
-    return data.replace(KCONF["fkr142"])
+    return _replace_map(data, KCONF["fkr142"])
 
 
 def _hancha2hangul(data):
@@ -223,7 +279,7 @@ def _hancha2hangul(data):
     # FKR143-170
     for i in range(143, 171):
         logger.debug(f"Applying FKR{i}")
-        data = data.replace(KCONF[f"fkr{i}"])
+        data = _replace_map(data, KCONF[f"fkr{i}"])
 
     # FKR171
     # Write down indices of occurrences of "不"
@@ -247,6 +303,14 @@ def _hancha2hangul(data):
 
     # FKR180
     logger.debug("Applying FKR180")
-    data = data.replace(KCONF["fkr180"])
+    data = _replace_map(data, KCONF["fkr180"])
 
     return re.sub("\\W{2,}", " ", data.strip())
+
+
+def _replace_map(src, rmap):
+    """ Replace occurrences in a string according to a map. """
+    for k, v in rmap:
+        src = src.replace(k, v)
+
+    return src