romanizer.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733
  1. # @package ext.korean
  2. #
  3. __doc__ = """
  4. Korean transcription functions.
  5. Ported from K-Romanizer: https://library.princeton.edu/eastasian/k-romanizer
  6. Only script-to-Roman is possible for Korean.
  7. Note that Korean Romanization must be done separately for strings containing
  8. only personal names and strings that do not contain personal names, due to
  9. ambiguities in the language. A non-deterministic approach using machine
  10. learning that separates words depending on context is being attempted by other
  11. parties, and it may be possible to eventually integrate such services here in
  12. the future, technology and licensing permitting. At the moment there are no
  13. such plans.
  14. Many thanks to Hyoungbae Lee for kindly providing the original K-Romanizer
  15. program and assistance in porting it to Python.
  16. """
  17. import logging
  18. import re
  19. from os import path
  20. from csv import reader
  21. from scriptshifter.exceptions import BREAK
  22. from scriptshifter.hooks.korean import KCONF
  23. PWD = path.dirname(path.realpath(__file__))
  24. CP_MIN = 44032
  25. # Buid FKR index for better logging.
  26. with open(path.join(PWD, "FKR_index.csv"), newline='') as fh:
  27. csv = reader(fh)
  28. FKR_IDX = {row[0]: row[2] for row in csv}
  29. logger = logging.getLogger(__name__)
  30. def s2r_nonames_post_config(ctx):
  31. """ Romanize a regular string NOT containing personal names. """
  32. ctx.dest, ctx.warnings = _romanize_nonames(
  33. ctx.src, ctx.options)
  34. return BREAK
  35. def s2r_names_post_config(ctx):
  36. """
  37. Romanize a string containing ONLY Korean personal names.
  38. One or more names can be transcribed. A comma or middle dot (U+00B7) is
  39. to be used as separator for multiple names.
  40. """
  41. ctx.dest, ctx.warnings = _romanize_names(ctx.src, ctx.options)
  42. return BREAK
  43. def _romanize_nonames(src, options):
  44. """ Main Romanization function for non-name strings. """
  45. # FKR038: Convert Chinese characters to Hangul
  46. if options.get("hancha", True):
  47. kor = _hancha2hangul(_marc8_hancha(src))
  48. else:
  49. kor = src
  50. # Replace ideographic spaces with ASCII space.
  51. kor = re.sub(r"\s+", " ", kor)
  52. kor = f" {kor} "
  53. # FKR039: Replace Proper name with spaces in advance
  54. # FKR040: Replace Proper name with a hyphen in advance
  55. # FKR041: Romanize names of Hangul consonants
  56. for i in range(39, 42):
  57. _fkr_log(i)
  58. kor = _replace_map(kor, KCONF[f"fkr{i:03}"])
  59. # NOTE This is slightly different from LL 929-930 in that it doesn't
  60. # result in double spaces.
  61. kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
  62. # This is more compact but I'm unsure if the replacement order is kept.
  63. # kor = kor.replace({"\r\n": " ", "\r": " ", "\n": " "})
  64. rom = _romanize_oclc_auto(kor)
  65. logger.debug(f"Before capitalization: {rom}")
  66. # FKR042: Capitalize all first letters
  67. if options["capitalize"] == "all":
  68. rom = _capitalize(rom)
  69. # FKR043: Capitalize the first letter
  70. elif options["capitalize"] == "first":
  71. rom = rom[0].upper() + rom[1:]
  72. # FKR044: Ambiguities
  73. ambi = re.sub("[,.\";: ]+", " ", rom)
  74. # TODO Decide what to do with these. There is no facility for outputting
  75. # warnings or notes to the user yet.
  76. warnings = []
  77. _fkr_log(45)
  78. for exp, warn in KCONF["fkr045"].items():
  79. if exp in ambi:
  80. warnings.append(ambi if warn == "" else warn)
  81. if rom:
  82. rom = rom.replace("kkk", "kk")
  83. return rom, warnings
  84. def _romanize_names(src, options):
  85. """
  86. Main Romanization function for names.
  87. Separate and romanize multiple names sepearated by comma or middle dot.
  88. K-Romanizer: KorNameRom20
  89. """
  90. rom_ls = []
  91. warnings = []
  92. if "," in src and "·" in src:
  93. warnings.append(
  94. "both commas and middle dots are being used to separate "
  95. "names. Only one of the two types should be used, or "
  96. "unexpected results may occur.")
  97. kor_ls = src.split(",") if "," in src else src.split("·")
  98. for kor in kor_ls:
  99. rom, _warnings = _romanize_name(kor.strip(), options)
  100. rom_ls.append(rom)
  101. warnings.extend(_warnings)
  102. return ", ".join(rom_ls), warnings
  103. def _romanize_name(src, options):
  104. warnings = []
  105. # FKR001: Conversion, Family names in Chinese (dealing with 金 and 李)
  106. # FKR002: Family names, Initial sound law
  107. replaced = False
  108. for ss, r in KCONF["fkr001-002"]:
  109. if replaced:
  110. break
  111. for s in ss:
  112. if src.startswith(s):
  113. src = r + src[1:]
  114. replaced = True
  115. break
  116. # FKR003: First name, Chinese Character Conversion
  117. src = _hancha2hangul(_marc8_hancha(src))
  118. if re.search("[a-zA-Z0-9]", src):
  119. warnings.append(f"{src} is not a recognized personal name.")
  120. return "", warnings
  121. # `parsed` can either be a modified Korean string with markers, or in case
  122. # of a foreign name, the final romanized name.
  123. parsed, _warnings = _parse_kor_name(
  124. re.sub(r"\s{2,}", " ", src.strip()),
  125. options)
  126. if len(_warnings):
  127. warnings += _warnings
  128. if parsed:
  129. if "~" in parsed:
  130. lname, fname = parsed.split("~", 1)
  131. fname_rom = _kor_fname_rom(fname)
  132. lname_rom_ls = []
  133. for n in lname.split("+"):
  134. _k = _kor_lname_rom(n)
  135. if _k:
  136. lname_rom_ls.append(_k)
  137. if not any(lname_rom_ls):
  138. warnings.append(f"{parsed} is not a recognized Korean name.")
  139. return "", warnings
  140. lname_rom = " ".join(lname_rom_ls)
  141. # Add comma after the last name for certain MARC fields.
  142. marc_field = options.get("marc_field")
  143. if marc_field in ("100", "600", "700", "800"):
  144. rom = f"{lname_rom}, {fname_rom}"
  145. else:
  146. rom = f"{lname_rom} {fname_rom}"
  147. if False:
  148. # TODO add option for authoritative name.
  149. rom_ls = rom.rsplit(" ", 1)
  150. rom = ", ".join(rom_ls)
  151. return rom, warnings
  152. else:
  153. warnings.append("Romanized as a foreign name.")
  154. return parsed, warnings
  155. warnings.append(f"{src} is not a recognized Korean name.")
  156. return "", warnings
  157. def _parse_kor_name(src, options):
  158. parsed = None
  159. warnings = []
  160. # FKR004: Check first two characters. Two-syllable family name or not?
  161. two_syl_fname = False
  162. for ptn in KCONF["fkr004"]:
  163. if src.startswith(ptn):
  164. two_syl_fname = True
  165. break
  166. src_len = len(src)
  167. # FKR005: Error if more than 7 syllables
  168. if src_len > 7 or src_len < 2 or src.find(" ") > 2:
  169. if options.get("foreign_name"):
  170. return _kor_corp_name_rom(src), warnings
  171. else:
  172. warnings.append("ERROR: not a Korean name.")
  173. return None, warnings
  174. ct_spaces = src.count(" ")
  175. # FKR0006: Error if more than 2 spaces
  176. if ct_spaces > 2:
  177. warnings.append("ERROR: not a name (too many spaces)")
  178. return None, warnings
  179. # FKR007: 2 spaces (two family names)
  180. if ct_spaces == 2:
  181. parsed = src.replace(" ", "+", 1).replace(" ", "~", 1)
  182. elif ct_spaces == 1:
  183. # FKR008: 1 space (2nd position)
  184. if src[1] == " ":
  185. parsed = src.replace(" ", "~")
  186. # FKR009: 1 space (3nd position)
  187. if src[2] == " ":
  188. if two_syl_fname:
  189. parsed = "+" + src.replace(" ", "~")
  190. # FKR010: When there is no space
  191. else:
  192. if src_len == 2:
  193. parsed = src[0] + "~" + src[1:]
  194. elif src_len > 2:
  195. if two_syl_fname:
  196. parsed = src[:1] + "~" + src[2:]
  197. else:
  198. parsed = src[0] + "~" + src[1:]
  199. return parsed, warnings
  200. def _kor_corp_name_rom(src):
  201. chu = yu = 0
  202. if src.startswith("(주) "):
  203. src = src[4:]
  204. chu = "L"
  205. if src.endswith(" (주)"):
  206. src = src[:-4]
  207. chu = "R"
  208. if src.startswith("(유) "):
  209. src = src[4:]
  210. yu = "L"
  211. if src.endswith(" (유)"):
  212. src = src[:-4]
  213. yu = "R"
  214. rom_tok = []
  215. for tok in src.split(" "):
  216. rom_tok.append(_romanize_oclc_auto(tok))
  217. rom = _capitalize(" ".join(rom_tok))
  218. if chu == "L":
  219. rom = "(Chu) " + rom
  220. elif chu == "R":
  221. rom = rom + " (Chu)"
  222. if yu == "L":
  223. rom = "(Yu) " + rom
  224. elif yu == "R":
  225. rom = rom + " (Yu)"
  226. # FKR035: Replace established names
  227. rom = _replace_map(rom, KCONF["fkr035"])
  228. return rom
  229. def _romanize_oclc_auto(kor):
  230. # FKR050: Starts preprocessing symbol
  231. _fkr_log(50)
  232. for rname, rule in KCONF["fkr050"].items():
  233. logger.debug(f"Applying fkr050[{rname}]")
  234. kor = _replace_map(kor, rule)
  235. # See https://github.com/lcnetdev/scriptshifter/issues/19
  236. kor = re.sub("제([0-9])", "제 \\1", kor)
  237. # FKR052: Replace Che+number
  238. _fkr_log(52)
  239. for rname, rule in KCONF["fkr052"].items():
  240. logger.debug(f"Applying fkr052[{rname}]")
  241. kor = _replace_map(kor, rule)
  242. # Strip end and multiple whitespace.
  243. kor = re.sub(r"\s{2,}", " ", kor.strip())
  244. kor = kor.replace("^", " GLOTTAL ")
  245. logger.debug(f"Korean before romanization: {kor}")
  246. rom_ls = []
  247. for word in kor.split(" "):
  248. rom_ls.append(_kor_rom(word))
  249. rom = " ".join(rom_ls)
  250. # FKR059: Apply glottalization
  251. rom = _replace_map(
  252. f" {rom.strip()} ", {" GLOTTAL ": "", "*": "", "^": ""})
  253. # FKR060: Process number + -년/-년도/-년대
  254. # TODO Add leading whitespace as per L1221? L1202 already added one.
  255. rom = _replace_map(rom, KCONF["fkr060"])
  256. rom = re.sub(r"\s{2,}", " ", f" {rom.strip()} ")
  257. # FKR061: Jurisdiction (시)
  258. # FKR062: Historical place names
  259. # FKR063: Jurisdiction (국,도,군,구)
  260. # FKR064: Temple names of Kings, Queens, etc. (except 조/종)
  261. # FKR065: Frequent historical names
  262. for i in range(61, 66):
  263. _fkr_log(i)
  264. rom = _replace_map(rom, KCONF[f"fkr{i:03}"])
  265. # FKR066: Starts restore symbols
  266. _fkr_log(66)
  267. for rname, rule in KCONF["fkr066"].items():
  268. logger.debug(f"Applying FKR066[{rname}]")
  269. rom = _replace_map(rom, rule)
  270. # Remove spaces from before punctuation signs.
  271. rom = re.sub(r" (?=[,.;:?!])", "", rom.strip())
  272. rom = re.sub(r"\s{2,}", " ", rom)
  273. return rom
  274. # FKR068: Exceptions, Exceptions to initial sound law, Proper names
  275. def _kor_rom(kor):
  276. kor = re.sub(r"\s{2,}", " ", kor.strip())
  277. orig = kor
  278. # FKR069: Irregular sound change list
  279. kor = _replace_map(kor, KCONF["fkr069"])
  280. # FKR070: [n] insertion position mark +
  281. niun = kor.find("+")
  282. if niun > -1:
  283. kor = kor.replace("+", "")
  284. orig = kor
  285. non_kor = 0
  286. cpoints = tuple(ord(c) for c in kor)
  287. for cp in cpoints:
  288. if cp < CP_MIN:
  289. non_kor += 1
  290. kor = kor[1:]
  291. rom_ls = []
  292. if non_kor > 0:
  293. # Rebuild code point list with non_kor removed.
  294. cpoints = tuple(ord(c) for c in kor)
  295. for i in range(len(kor)):
  296. cp = cpoints[i] - CP_MIN
  297. ini = "i" + str(cp // 588)
  298. med = "m" + str((cp // 28) % 21)
  299. fin = "f" + str(cp % 28)
  300. rom_ls.append("#".join((ini, med, fin)))
  301. rom = "~".join(rom_ls)
  302. if len(rom):
  303. rom = rom + "E"
  304. # FKR071: [n] insertion
  305. if niun > -1:
  306. niun_loc = rom.find("~")
  307. # Advance until the niun'th occurrence of ~
  308. # If niun is 0 or 1 the loop will be skipped.
  309. for i in range(niun - 1):
  310. niun_loc = rom.find("~", niun_loc + 1)
  311. rom_niun_a = rom[:niun_loc]
  312. rom_niun_b = rom[niun_loc + 1:]
  313. if re.match("i11#m(?:2|6|12|17|20)", rom_niun_b):
  314. _fkr_log(71)
  315. rom_niun_b = rom_niun_b.replace("i11#m", "i2#m", 1)
  316. # FKR072: [n]+[l] >[l] + [l]
  317. if rom_niun_b.startswith("i5#") and rom_niun_a.endswith("f4"):
  318. _fkr_log(72)
  319. rom_niun_b = rom_niun_b.replace("i5#", "i2", 1)
  320. rom = f"{rom_niun_a}~{rom_niun_b}"
  321. # FKR073: Palatalization: ㄷ+이,ㄷ+여,ㄷ+히,ㄷ+혀
  322. # FKR074: Palatalization: ㅌ+이,ㅌ+히,ㅌ+히,ㅌ+혀
  323. # FKR075: Consonant assimilation ㄱ
  324. # FKR076: Consonant assimilation ㄲ
  325. # FKR077: Consonant assimilation ㄳ : ㄱ,ㄴ,ㄹ,ㅁ,ㅇ
  326. # FKR078: Consonant assimilation ㄴ
  327. # FKR079: Consonant assimilation ㄵ: ㄱ,ㄴ,ㄷ,ㅈ"
  328. # FKR080: Consonant assimilation ㄶ : ㄱ,ㄴ,ㄷ,ㅈ
  329. # FKR081: Consonant assimilation ㄷ
  330. # FKR082: Consonant assimilation ㄹ
  331. # FKR083: Consonant assimilation ㄺ : ㄱ,ㄴ,ㄷ,ㅈ
  332. # FKR084: Consonant assimilation ㄻ : ㄱ,ㄴ,ㄷ,ㅈ
  333. # FKR085: Consonant assimilation ㄼ : ㄱ,ㄴ,ㄷ,ㅈ
  334. # FKR086: Consonant assimilation ㄾ : ㄱ,ㄴ,ㄷ,ㅈ
  335. # FKR087: Consonant assimilation ㄿ : ㄱ,ㄴ,ㄷ,ㅈ
  336. # FKR088: Consonant assimilation ㅀ : ㄱ,ㄴ,ㄷ,ㅈ
  337. # FKR089: Consonant assimilation ㅁ
  338. # FKR090: Consonant assimilation ㅂ
  339. # FKR091: Consonant assimilation ㅄ
  340. # FKR092: Consonant assimilation ㅅ
  341. # FKR093: Consonant assimilation ㅆ
  342. # FKR094: Consonant assimilation ㅇ
  343. # FKR095: Consonant assimilation ㅈ
  344. # FKR096: Consonant assimilation ㅊ
  345. # FKR097: Consonant assimilation ㅋ
  346. # FKR098: Consonant assimilation ㅌ
  347. # FKR099: Consonant assimilation ㅍ
  348. # FKR100: Consonant assimilation ㅎ
  349. # FKR101: digraphic coda + ㅇ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,ㄽ,ㄾ,ㄿ,ㅀ
  350. # FKR102: digraphic coda + ㅎ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,(ㄽ),ㄾ,ㄿ,ㅀ
  351. # FKR103: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ 제외) voiced + unvoiced
  352. # FKR104: Vocalization 2 (except ㄹ+ㄷ, ㄹ+ㅈ 제외) unvoiced + voiced
  353. # FKR105: Vocalization 3 (ㄹ+ㄷ, ㄹ+ㅈ)
  354. # FKR106: Final sound law
  355. # FKR107: Exception for '쉬' = shi
  356. # FKR108: Exception for 'ㄴㄱ'= n'g
  357. for fkr_i in range(73, 109):
  358. _fkr_log(fkr_i)
  359. _bk = rom
  360. rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
  361. if _bk != rom:
  362. logger.debug(f"FKR{fkr_i} substitution: {rom} (was: {_bk})")
  363. # FKR109: Convert everything else
  364. _fkr_log(109)
  365. for pos, data in KCONF["fkr109"].items():
  366. rom = _replace_map(rom, data)
  367. # FKR110: Convert symbols
  368. rom = _replace_map(rom, {"#": "", "~": ""})
  369. if non_kor > 0:
  370. # Modified from K-Romanizer:1727 in that it does not append a hyphen
  371. # if the whole word is non-Korean.
  372. rom = f"{orig[:non_kor]}-{rom}" if len(rom) else orig
  373. # FKR111: ㄹ + 모음/ㅎ/ㄹ, ["lr","ll"] must be in the last of the array
  374. rom = _replace_map(rom, KCONF["fkr111"])
  375. # FKR112: Exceptions to initial sound law
  376. is_non_kor = False
  377. # FKR113: Check loan words by the first 1 letter
  378. # FKR114: Check loan words by the first 2 letters
  379. # FKR115: Check loan words by the first 3 letters
  380. if orig.startswith(tuple(KCONF["fkr113-115"])):
  381. is_non_kor = True
  382. # FKR116: Exceptions to initial sound law - particles
  383. is_particle = False
  384. if orig.startswith(tuple(KCONF["fkr116"]["particles"])):
  385. is_particle = True
  386. if len(orig) > 1 and not is_non_kor and not is_particle:
  387. if rom.startswith(tuple(KCONF["fkr116"]["replace_initials"].keys())):
  388. rom = _replace_map(rom, KCONF["fkr116"]["replace_initials"])
  389. # FKR117: Proper names _StringPoper Does not work because of breves
  390. if (
  391. # FKR118
  392. orig in KCONF["fkr118"] or
  393. # FKR119
  394. orig in KCONF["fkr119"]["word"] or
  395. (
  396. orig[:-1] in KCONF["fkr119"]["word"] and
  397. orig.endswith(tuple(KCONF["fkr119"]["suffix"]))
  398. ) or
  399. # FKR120
  400. orig.endswith(tuple(KCONF["fkr120"]))):
  401. rom = rom[0].upper() + rom[1:]
  402. # FKR121: Loan words beginning with L
  403. if f" {orig} " in KCONF["fkr121"]:
  404. rom = _replace_map(rom[0], {"R": "L", "r": "l"}) + rom[1:]
  405. # @TODO Move this to a generic normalization step (not only for K)
  406. rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})
  407. return rom
  408. def _marc8_hancha(data):
  409. # FKR142: Chinese character list
  410. _fkr_log(142)
  411. return _replace_map(data, KCONF["fkr142"])
  412. def _hancha2hangul(data):
  413. data = " " + data.replace("\n", "\n ")
  414. # FKR143: Process exceptions first
  415. # FKR144: Apply initial sound law (Except: 列, 烈, 裂, 劣)
  416. # FKR145: Simplified characters, variants
  417. # FKR146: Some characters from expanded list
  418. # FKR147: Chinese characters 1-500 車=차
  419. # FKR148: Chinese characters 501-750 串=관
  420. # FKR149: Chinese characters 751-1000 金=금, 娘=랑
  421. # FKR150: Chinese characters 1001-1250
  422. # FKR151: Chinese characters 1251-1500 제외: 列, 烈, 裂, 劣
  423. # FKR152: Chinese characters 1501-1750 제외: 律, 率, 栗, 慄
  424. # FKR153: Chinese characters 1751-2000
  425. # FKR154: 不,Chinese characters 2001-2250 제외: 不
  426. # FKR155: Chinese characters 2251-2500 塞=색
  427. # FKR156: Chinese characters 2501-2750
  428. # FKR157: Chinese characters 2751-3000
  429. # FKR158: Chinese characters 3001-2250
  430. # FKR159: Chinese characters 3251-3500
  431. # FKR160: Chinese characters 3501-3750
  432. # FKR161: Chinese characters 3751-4000
  433. # FKR162: Chinese characters 4001-4250
  434. # FKR163: Chinese characters 4251-4500
  435. # FKR164: Chinese characters 4501-4750
  436. # FKR165: Chinese characters 4751-5000
  437. # FKR166: Chinese characters 5001-5250
  438. # FKR167: Chinese characters 5251-5500
  439. # FKR168: Chinese characters 5501-5750
  440. # FKR169: Chinese characters 5751-5978
  441. # FKR170: Chinese characters 일본Chinese characters
  442. for i in range(143, 171):
  443. _fkr_log(i)
  444. data = _replace_map(data, KCONF[f"fkr{i}"])
  445. # FKR171: Chinese characters 不(부)의 발음 처리
  446. # Write down indices of occurrences of "不"
  447. idx = [i for i, item in enumerate(data) if item == "不"]
  448. for i in idx:
  449. val = ord(data[i + 1])
  450. if (val > 45795 and val < 46384) or (val > 51087 and val < 51676):
  451. data = data.replace("不", "부", 1)
  452. else:
  453. data = data.replace("不", "불", 1)
  454. # FKR172: Chinese characters 列(렬)의 발음 처리
  455. # FKR173: Chinese characters 烈(렬)의 발음 처리
  456. # FKR174: Chinese characters 裂(렬)의 발음 처리
  457. # FKR175: Chinese characters 劣(렬)의 발음 처리
  458. # FKR176: Chinese characters 律(률)의 발음 처리
  459. # FKR177: Chinese characters 率(률)의 발음 처리
  460. # FKR178: Chinese characters 慄(률)의 발음 처리
  461. # FKR179: Chinese characters 栗(률)의 발음 처리
  462. for char in KCONF["fkr172-179"]:
  463. idx = [i for i, item in enumerate(data) if item == char]
  464. for i in idx:
  465. val = ord(data[i - 1])
  466. coda_value = (val - CP_MIN) % 28
  467. if coda_value == 0 or coda_value == 4 or val < 100: # TODO verify
  468. data = data.replace(char, "열", 1)
  469. else:
  470. data = data.replace(char, "렬", 1)
  471. # FKR180: Katakana
  472. _fkr_log(180)
  473. data = _replace_map(data, KCONF["fkr180"])
  474. return re.sub(r"\s{2,}", " ", data.strip())
  475. def _replace_map(src, rmap, *args, **kw):
  476. """ Replace occurrences in a string according to a map. """
  477. for k, v in rmap.items():
  478. src = src.replace(k, v, *args, **kw)
  479. return src
  480. def _kor_fname_rom(fname):
  481. rom_ls = []
  482. cpoints = tuple(ord(c) for c in fname)
  483. for i in range(len(fname)):
  484. cp = cpoints[i] - CP_MIN
  485. ini = "i" + str(cp // 588)
  486. med = "m" + str((cp // 28) % 21)
  487. fin = "f" + str(cp % 28)
  488. rom_ls.append("#".join((ini, med, fin)))
  489. rom = "~".join(rom_ls) + "E"
  490. # FKR011: Check native Korean name, by coda
  491. origin_by_fin = "sino"
  492. for tok in KCONF["fkr011"]["nat_fin"]:
  493. if tok in rom:
  494. origin_by_fin = "native"
  495. break
  496. j = False
  497. for tok in KCONF["fkr011"]["nat_ini"]:
  498. if tok in rom:
  499. j = True
  500. k = False
  501. for tok in KCONF["fkr011"]["sino_ini"]:
  502. if tok in rom:
  503. k = True
  504. if j:
  505. if k:
  506. origin_by_ini = "sino"
  507. else:
  508. origin_by_ini = "native"
  509. else:
  510. origin_by_ini = "sino"
  511. # FKR012: Check native Korean name, by vowel & coda
  512. origin_by_med = "sino"
  513. for tok in KCONF["fkr011"]:
  514. if tok in rom:
  515. origin_by_med = "native"
  516. break
  517. # FKR013: Check native Korean name, by ㅢ
  518. if "m19#" in rom:
  519. if "의" in fname or "희" in fname:
  520. origin_by_med = "sino"
  521. else:
  522. origin_by_med = "native"
  523. # FKR014: Consonant assimilation ㄱ
  524. # FKR015: Consonant assimilation ㄲ
  525. # FKR016: Consonant assimilation ㄴ
  526. # FKR017: Consonant assimilation ㄷ
  527. # FKR018: Consonant assimilation ㄹ
  528. # FKR019: Consonant assimilation ㅁ
  529. # FKR020: Consonant assimilation ㅂ
  530. # FKR021: Consonant assimilation ㅅ
  531. # FKR022: Consonant assimilation ㅆ
  532. # FKR023: Consonant assimilation ㅇ
  533. # FKR024: Consonant assimilation ㅈ
  534. # FKR025: Consonant assimilation ㅊ
  535. # FKR026: Consonant assimilation ㅎ
  536. # FKR027: Final sound law
  537. # FKR028: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ): voiced+unvoiced
  538. # FKR029: Vocalization 2 unvoiced+voiced
  539. for i in range(14, 30):
  540. _fkr_log(i)
  541. rom = _replace_map(rom, KCONF[f"fkr{i:03}"])
  542. # FKR030: Convert everything else
  543. _fkr_log(30)
  544. for k, cmap in KCONF["fkr030"].items():
  545. logger.debug(f"Applying FKR030[\"{k}\"]")
  546. rom = _replace_map(rom, cmap)
  547. rom = _replace_map(rom.replace("#", ""), {"swi": "shwi", "Swi": "Shwi"}, 1)
  548. if len(fname) == 2:
  549. rom = rom.replace("~", "-")
  550. else:
  551. rom = _replace_map(rom, {"n~g": "n'g", "~": ""})
  552. # FKR031: ㄹ + vowels/ㅎ/ㄹ ["l-r","l-l"] does not work USE alternative
  553. _fkr_log(31)
  554. for k, cmap in KCONF["fkr031"].items():
  555. logger.debug(f"Applying FKR031[\"{k}\"]")
  556. rom = _replace_map(rom, cmap)
  557. # FKR032: Capitalization
  558. rom = rom[0].upper() + rom[1:]
  559. # FKR033: Remove hyphen in bisyllabic native Korean first name
  560. if (
  561. len(fname) == 2
  562. and "native" in (origin_by_ini, origin_by_fin, origin_by_med)):
  563. rom = _replace_map(rom, {"n-g": "n'g", "-": ""})
  564. # FKR034: First name, initial sound law
  565. for k, v in KCONF["fkr034"].items():
  566. if rom.startswith(k):
  567. rom = rom.replace(k, v)
  568. return rom
  569. def _kor_lname_rom(lname):
  570. if len(lname) == 2:
  571. # FKR181: 2-charater names.
  572. _fkr_log(181)
  573. rom = _replace_map(lname, KCONF["fkr181"])
  574. else:
  575. # FKR182: 1-charater Chinese names.
  576. _fkr_log(182)
  577. lname = _replace_map(lname, KCONF["fkr182"])
  578. # FKR183: 1-charater names.
  579. _fkr_log(183)
  580. rom = _replace_map(lname, KCONF["fkr183"])
  581. return rom if lname != rom else False
  582. def _capitalize(src):
  583. """ Only capitalize first word and words preceded by space."""
  584. orig_ls = src.split(" ")
  585. cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
  586. return " ".join(cap_ls)
  587. def _fkr_log(fkr_i):
  588. fkr_k = f"FKR{fkr_i:03}"
  589. logger.debug(f"Applying {fkr_k}: {FKR_IDX[fkr_k]}")