Browse Source

Fix Korean errors (#66)

* Fix Korean error with 2-syllable names.

* Relabel Korean (names only).

* Add tests for edge case Korean names.

* Fix some test and coda logic (to be verified).
Stefano Cossu 5 months ago
parent
commit
714e803ea5

+ 7 - 3
scriptshifter/hooks/korean/romanizer.py

@@ -177,7 +177,11 @@ def _romanize_name(src, options):
             lname, fname = parsed.split("~", 1)
             fname_rom = _kor_fname_rom(fname)
 
-            lname_rom_ls = [_kor_lname_rom(n) for n in lname.split("+")]
+            lname_rom_ls = []
+            for n in lname.split("+"):
+                _k = _kor_lname_rom(n)
+                if _k:
+                    lname_rom_ls.append(_k)
 
             if not any(lname_rom_ls):
                 warnings.append(f"{parsed} is not a recognized Korean name.")
@@ -566,9 +570,9 @@ def _hancha2hangul(data):
     for char in KCONF["fkr172-179"]:
         idx = [i for i, item in enumerate(data) if item == char]
         for i in idx:
-            val = ord(data[i + 1])
+            val = ord(data[i - 1])
             coda_value = (val - CP_MIN) % 28
-            if coda_value == 1 or coda_value == 4 or val < 100:  # TODO verify
+            if coda_value == 0 or coda_value == 4 or val < 100:  # TODO verify
                 data = data.replace(char, "열", 1)
             else:
                 data = data.replace(char, "렬", 1)

+ 2 - 2
scriptshifter/tables/data/index.yml

@@ -40,8 +40,8 @@ korean_nonames:
   name: Korean
   description: Korean S2R for strings NOT containing any personal names.
 korean_names:
-  name: Korean (names only)
-  description: Korean S2R for strings ONLY containing personal names. Separate multiple names with a comma or a center-dot (U+00B7).
+  name: Korean (last + first names only)
+  description: Korean S2R for strings ONLY containing personal names formatted as last + first name. Separate multiple names with a comma or a center-dot (U+00B7).
 kyrgyz:
   name: Kyrgyz (Cyrillic)
 mongolian:

+ 13 - 4
tests/__init__.py

@@ -1,6 +1,8 @@
 from csv import reader
 from difflib import ndiff
 from importlib import reload
+from json import loads as jloads
+from logging import getLogger
 from os import path
 
 import scriptshifter.tables
@@ -11,6 +13,8 @@ from scriptshifter.trans import transliterate
 TEST_DIR = path.dirname(path.realpath(__file__))
 TEST_DATA_DIR = path.join(TEST_DIR, "data")
 
+logger = getLogger(__name__)
+
 
 def reload_tables():
     reload(scriptshifter.tables)  # Reload new config dir.
@@ -38,17 +42,22 @@ def test_sample(dset):
         csv = reader(fh)
         for row in csv:
             lang, script, rom = row[:3]
-            opts = row[3] if len(row) > 3 and row[3] else {}
-            trans, warnings = transliterate(script, lang, "s2r", opts)
+            if not lang:
+                continue
+            opts = jloads(row[3]) if len(row) > 3 and row[3] else {}
+            trans, warnings = transliterate(
+                    script, lang, t_dir="s2r",
+                    capitalize=opts.get("capitalize"), options=opts)
             if (trans == rom):
                 print(".", end="")
             else:
                 print("F", end="")
-                deltas.append((script, ndiff([trans], [rom])))
+                deltas.append((lang, script, ndiff([trans], [rom])))
 
     with open(log_fpath, "w") as fh:
         # If no deltas, just truncate the file.
-        for script, delta in deltas:
+        for lang, script, delta in deltas:
+            fh.write(f"Language: {lang}\n")
             fh.write(f"Original: {script}\n")
             for dline in delta:
                 fh.write(dline.strip() + "\n")

+ 52 - 44
tests/data/script_samples/korean.csv

@@ -122,9 +122,9 @@ korean_names,고복장,Ko Pok-chang,,Hangul; from Y. Lee,
 korean_names,고사경,Ko Sa-gyŏng,,Hangul; from Y. Lee,
 korean_names,고사계,Ko Sa-gye,,Hangul; from Y. Lee,
 korean_names,고상안,Ko Sang-an,,Hangul; from Y. Lee,
-korean_names,고상은,Ko Sang-ŭn ,,Hangul; from Y. Lee,
+korean_names,고상은,Ko Sang-ŭn,,Hangul; from Y. Lee,
 korean_names,고상준,Ko Sang-jun,,Hangul; from Y. Lee,
-korean_names,고상현,Ko Sang-hyŏn ,,Hangul; from Y. Lee,
+korean_names,고상현,Ko Sang-hyŏn,,Hangul; from Y. Lee,
 korean_names,고성창,Ko Sŏng-ch'ang,,Hangul; from Y. Lee,
 korean_names,고순흠,Ko Sun-hŭm,,Hangul; from Y. Lee,
 korean_names,고시언,Ko Si-ŏn,,Hangul; from Y. Lee,
@@ -225,7 +225,7 @@ korean_names,채수,Ch'ae Su,,Hangul; from Y. Lee,
 korean_names,채수영,Ch'ae Su-yŏng,,Hangul; from Y. Lee,
 korean_names,채영찬,Ch'ae Yŏng-ch'an,,Hangul; from Y. Lee,
 korean_names,채원,Ch'ae Wŏn,,Hangul; from Y. Lee,
-korean_names,채원병,Ch'ae Wŏn-byŏng ,,Hangul; from Y. Lee,
+korean_names,채원병,Ch'ae Wŏn-byŏng,,Hangul; from Y. Lee,
 korean_names,채유후,Ch'ae Yu-hu,,Hangul; from Y. Lee,
 korean_names,채응언,Ch'ae Ŭng-ŏn,,Hangul; from Y. Lee,
 korean_names,채인규,Ch'ae In-gyu,,Hangul; from Y. Lee,
@@ -304,10 +304,10 @@ korean_names,조지서,Cho Chi-sŏ,,Hangul; from Y. Lee,
 korean_names,조지훈,Cho Chi-hun,,Hangul; from Y. Lee,
 korean_names,조인한,Cho In-hwan,,Hangul; from Y. Lee,
 korean_names,조일신,Cho Il-sin,,Hangul; from Y. Lee,
-korean_names,조일훈,Cho Ir-hun ,,Hangul; from Y. Lee,
+korean_names,조일훈,Cho Ir-hun,,Hangul; from Y. Lee,
 korean_names,조자일,Cho Cha-il,,Hangul; from Y. Lee,
 korean_names,조자지,Cho Cha-ji,,Hangul; from Y. Lee,
-korean_names,조자형,Cho Cha-hyŏng ,,Hangul; from Y. Lee,
+korean_names,조자형,Cho Cha-hyŏng,,Hangul; from Y. Lee,
 korean_names,조재원,Cho Chae-wŏn,,Hangul; from Y. Lee,
 korean_names,조재호,Cho Chae-ho,,Hangul; from Y. Lee,
 korean_names,조정구,Cho Chŏng-gu,,Hangul; from Y. Lee,
@@ -349,7 +349,7 @@ korean_names,조연현,Cho Yŏn-hyŏn,,Hangul; from Y. Lee,
 korean_names,조영국,Cho Yŏng-guk,,Hangul; from Y. Lee,
 korean_names,조영규,Cho Yŏng-gyu,,Hangul; from Y. Lee,
 korean_names,조영무,Cho Yŏng-mu,,Hangul; from Y. Lee,
-korean_names,조영복,Cho Yŏng-bok ,,Hangul; from Y. Lee,
+korean_names,조영복,Cho Yŏng-bok,,Hangul; from Y. Lee,
 korean_names,조영우,Cho Yŏng-sŏk,,Hangul; from Y. Lee,
 korean_names,조영인,Cho Yŏng-in,,Hangul; from Y. Lee,
 korean_names,조영진,Cho Yŏng-jin,,Hangul; from Y. Lee,
@@ -360,11 +360,11 @@ korean_names,조수삼,Cho Su-sam,,Hangul; from Y. Lee,
 korean_names,조수성,Cho Su-sŏng,,Hangul; from Y. Lee,
 korean_names,조수익,Cho Su-ik,,Hangul; from Y. Lee,
 korean_names,조순생,Cho Sun-saeng,,Hangul; from Y. Lee,
-korean_names,조순탁,Cho Sun-t'ak ,,Hangul; from Y. Lee,
+korean_names,조순탁,Cho Sun-t'ak,,Hangul; from Y. Lee,
 korean_names,조승훈,Cho Sŭng-hun,,Hangul; from Y. Lee,
 korean_names,조시원,Cho Si-wŏn,,Hangul; from Y. Lee,
 korean_names,조식,Cho Sik,,Hangul; from Y. Lee,
-korean_names,조서경,Cho Sŏ-gyŏng ,,Hangul; from Y. Lee,
+korean_names,조서경,Cho Sŏ-gyŏng,,Hangul; from Y. Lee,
 korean_names,조석문,Cho Sŏng-mun,,Hangul; from Y. Lee,
 korean_names,조석우,Cho Sŏg-u,,Hangul; from Y. Lee,
 korean_names,조석윤,Cho Sŏg-yun,,Hangul; from Y. Lee,
@@ -656,7 +656,7 @@ korean_names,안태국,An T'ae-guk,,Hangul; from Y. Lee,
 korean_names,안향,An Hyang,,Hangul; from Y. Lee,
 korean_names,안희제,An Hŭi-je,,Hangul; from Y. Lee,
 korean_names,양건록,Yang Kŏl-lok,,Hangul; from Y. Lee,
-korean_names,양규철,Yang Kyu-ch'ŏl ,,Hangul; from Y. Lee,
+korean_names,양규철,Yang Kyu-ch'ŏl,,Hangul; from Y. Lee,
 korean_names,양근환,Yang Kŭn-hwan,,Hangul; from Y. Lee,
 korean_names,양득중,Yang Tŭk-chung,,Hangul; from Y. Lee,
 korean_names,양백연,Yang Paeg-yŏn,,Hangul; from Y. Lee,
@@ -715,7 +715,7 @@ korean_names,朱基徹,Chu Ki-ch'ŏl,,Hancha; From Y. Lee,
 korean_names,朱德海,Chu Tŏk-hae,,Hancha; From Y. Lee,
 korean_names,朱思忠,Chu Sa-ch'ung,,Hancha; From Y. Lee,
 korean_names,朱成七,Chu Sŏng-ch'il,,Hancha; From Y. Lee,
-korean_names,朱斗烈,Chu Tu-yŏl ,,Hancha; From Y. Lee,
+korean_names,朱斗烈,Chu Tu-yŏl,,Hancha; From Y. Lee,
 korean_names,朱熹,Chu Hŭi,,Hancha; From Y. Lee,
 korean_names,徐命九,Sŏ Myŏng-gu,,Hancha; From Y. Lee,
 korean_names,徐命善,Sŏ Myŏng-sŏn,,Hancha; From Y. Lee,
@@ -765,7 +765,7 @@ korean_names,張雄,Chang Ung,,Hancha; From Y. Lee,
 korean_names,張雲翼,Chang Un-ik,,Hancha; From Y. Lee,
 korean_names,張順孫,Chang Sun-son,,Hancha; From Y. Lee,
 korean_names,張順明,Chang Sun-myŏng,,Hancha; From Y. Lee,
-korean_names,成俊慶,Sŏng Chun-gyŏng ,,Hancha; From Y. Lee,
+korean_names,成俊慶,Sŏng Chun-gyŏng,,Hancha; From Y. Lee,
 korean_names,成俔,Sŏng Hyŏn,,Hancha; From Y. Lee,
 korean_names,成允文,Sŏng Yun-mun,,Hancha; From Y. Lee,
 korean_names,成准得,Sŏng Chun-dŭk,,Hancha; From Y. Lee,
@@ -834,7 +834,7 @@ korean_names,許昕,Hŏ Hŭn,,Hancha; From Y. Lee,
 korean_names,許曄,Hŏ Yŏp,,Hancha; From Y. Lee,
 korean_names,許有全,Hŏ Yu-jŏn,,Hancha; From Y. Lee,
 korean_names,許楚姬,Hŏ Ch'o-hŭi,,Hancha; From Y. Lee,
-korean_names,趙一訓,Cho Ir-hun ,,Hancha; From Y. Lee,
+korean_names,趙一訓,Cho Ir-hun,,Hancha; From Y. Lee,
 korean_names,趙之瑞,Cho Chi-sŏ,,Hancha; From Y. Lee,
 korean_names,趙之遴,Cho Chi-rin,,Hancha; From Y. Lee,
 korean_names,趙云仡,Cho Un-hŭl,,Hancha; From Y. Lee,
@@ -941,39 +941,47 @@ korean_names,黃仲寶,Hwang Chung-bo,,Hancha; From Y. Lee,
 korean_names,黃俊良,Hwang Chun-nyang,,Hancha; From Y. Lee,
 korean_names,黃信德,Hwang Sin-dŏk,,Hancha; From Y. Lee,
 korean_names,黃允吉,Hwang Yun-gil,,Hancha; From Y. Lee,
+korean_names,남궁 억,Namgung Ŏk,,From Elaine Kim: 2 character last name + 1 character given name,
+korean_names,독고 영,Tokko Yŏng,,From Elaine Kim: 2 character last name + 1 character given name,
+korean_names,선우 정,Sŏnu Chŏng,,From Elaine Kim: 2 character last name + 1 character given name,
+korean_names,선우 종원,Sŏnu Chong-wŏn,,From Elaine Kim: 2 character last name + 2 character given name,
+korean_names,남궁 지영,Namgung Chi-yŏng,,From Elaine Kim: 2 character last name + 2 character given name,
+korean_names,선우 은숙,Sŏnu Ŭn-suk,,From Elaine Kim: 2 character last name + 2 character given name,
+korean_nonames,성철 선사,Sŏngch'ŏl Sŏnsa,"{""capitalize"": ""all""}",From Elaine Kim: No surname,
+korean_nonames,법현,Pŏphyŏn,"{""capitalize"": ""all""}",From Elaine Kim: No surname,
 korean_nonames,민주화 이후 국정 운영,Minjuhwa ihu kukchŏng unyŏng,"{""capitalize"": ""first""}",,
 korean_nonames,曉城 趙 明基 博士 追慕 佛教 史學 論文集,Hyosŏng Cho Myŏng-gi Paksa ch'umo Pulgyo sahak nonmunjip,"{""capitalize"": ""first""}",Not Chinese,
-korean_nonames,결단력,Kyŏltannyŏk,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,상견례,Sanggyŏnnye,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,신여성,Sinnyŏsŏng,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,의견란,Ŭigyŏnnan,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,만석꾼,Mansŏkkun,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,임진란,Imjinnan,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,임진록,Imjinnok,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,옛이야기,Yenniyagi,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,전달자,Chŏndalcha,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,독해법,Tokhaepŏp,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,방지법,Pangjipŏp,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,추진법,Ch'ujinpŏp,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,여행법,Yŏhaengpŏp,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,사랑법,Sarangpŏp,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,호박꽃,Hobakkot,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,공권력,Kongkwŏnnyŏk,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,생산량,Saengsannyang,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,이원론,Iwŏnnon,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,동원령,Tongwŏnnyŏng,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,한여름,Hannyŏrŭm,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,휘발유,Hwiballyu,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,꽃잎,Kkonnip,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,솔잎,Sollip,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,활동가,Hwaltongga,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,별일,Pyŏllil,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,노근리,Nogŭn-ni,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,창원군,Ch'angwŏn-gun,"{""capitalize"": ""first""}",From Elaine,
-korean_nonames,신여자,Sinnyoja,"{""capitalize"": ""first""}",from Elaine,
-korean_nonames,밀당,miltang,"{""capitalize"": ""first""}",from Elaine,
-korean_nonames,말직,malchik,"{""capitalize"": ""first""}",from Elaine,
-korean_nonames,토벌대,t'obŏltae,"{""capitalize"": ""first""}",from Elaine,
+korean_nonames,결단력,Kyŏltannyŏk,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,상견례,Sanggyŏnnye,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,신여성,Sinnyŏsŏng,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,의견란,Ŭigyŏnnan,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,만석꾼,Mansŏkkun,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,임진란,Imjinnan,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,임진록,Imjinnok,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,옛이야기,Yenniyagi,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,전달자,Chŏndalcha,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,독해법,Tokhaepŏp,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,방지법,Pangjipŏp,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,추진법,Ch'ujinpŏp,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,여행법,Yŏhaengpŏp,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,사랑법,Sarangpŏp,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,호박꽃,Hobakkot,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,공권력,Kongkwŏnnyŏk,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,생산량,Saengsannyang,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,이원론,Iwŏnnon,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,동원령,Tongwŏnnyŏng,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,한여름,Hannyŏrŭm,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,휘발유,Hwiballyu,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,꽃잎,Kkonnip,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,솔잎,Sollip,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,활동가,Hwaltongga,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,별일,Pyŏllil,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,노근리,Nogŭn-ni,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,창원군,Ch'angwŏn-gun,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,신여자,Sinnyoja,"{""capitalize"": ""first""}",From Elaine Kim,
+korean_nonames,밀당,miltang,,From Elaine Kim,
+korean_nonames,말직,malchik,,From Elaine Kim,
+korean_nonames,토벌대,t'obŏltae,,From Elaine Kim,
 korean_nonames,民法 과 法學 의 重要 問題,Minpŏp kwa pŏphak ŭi chungyo munje,"{""capitalize"": ""first""}",Not Chinese,
 korean_nonames,그래도 돈 버는 사람 은 있다,Kŭraedo ton pŏnŭn saram ŭn itta,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,근대 계몽기 문학 과 독자 의 발견,Kŭndae kyemonggi munhak kwa tokcha ŭi palgyŏn,"{""capitalize"": ""first""}",From K-Romanizer,
@@ -1339,7 +1347,7 @@ korean_nonames,아직 끝나지 않았다,Ajik kkŭnnaji anatta,"{""capitalize""
 korean_nonames,아직 할 말 이 남았습니다,Ajik hal mal i namatsŭmnida,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,안 중근 연구 의 기초,An Chung-gŭn yŏn'gu ŭi kich'o,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,안녕 D 기계치 도 사랑 한 디지털 노트,Annyŏng D kigyech'i to sarang han tijit'ŏl not'ŭ,"{""capitalize"": ""first""}",From K-Romanizer,
-korean_nonames,암 예방 : 5가지 색깔 을 먹어라,Am yebang: 5-kaji saekkkal ŭl mŏgŏra,"{""capitalize"": ""first""}",From K-Romanizer,
+korean_nonames,암 예방 : 5가지 색깔 을 먹어라,Am yebang: 5-kaji saekkal ŭl mŏgŏra,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,암 전이 재발 을 막아 주는 한방 신 치료 전략,Am chŏni chaebal ŭl maga chunŭn hanbang sin ch'iryo chŏllyak,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,앞 으로 3년 경매 가 답 이다,Ap ŭro 3-yŏn kyŏngmae ka tap ida,"{""capitalize"": ""first""}",From K-Romanizer,
 korean_nonames,애니메이션 기획론 : 2D 에서 3D 까지,Aenimeisyŏn kihoengnon: 2D esŏ 3D kkaji,"{""capitalize"": ""first""}",From K-Romanizer,