romanizer.py 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735
  1. # @package ext.korean
  2. #
  3. __doc__ = """
  4. Korean transcription functions.
  5. Ported from K-Romanizer: https://library.princeton.edu/eastasian/k-romanizer
  6. Only script-to-Roman is possible for Korean.
  7. Note that Korean Romanization must be done separately for strings containing
  8. only personal names and strings that do not contain personal names, due to
  9. ambiguities in the language. A non-deterministic approach using machine
  10. learning that separates words depending on context is being attempted by other
  11. parties, and it may be possible to eventually integrate such services here in
  12. the future, technology and licensing permitting. At the moment there are no
  13. such plans.
  14. Many thanks to Hyoungbae Lee for kindly providing the original K-Romanizer
  15. program and assistance in porting it to Python.
  16. """
  17. import logging
  18. import re
  19. from os import path
  20. from csv import reader
  21. from scriptshifter.exceptions import BREAK
  22. from scriptshifter.hooks.korean import KCONF
  23. from scriptshifter.tools import capitalize
  24. PWD = path.dirname(path.realpath(__file__))
  25. CP_MIN = 44032
  26. # Buid FKR index for better logging.
  27. with open(path.join(PWD, "FKR_index.csv"), newline='') as fh:
  28. csv = reader(fh)
  29. FKR_IDX = {row[0]: row[2] for row in csv}
  30. logger = logging.getLogger(__name__)
  31. def s2r_nonames_post_config(ctx):
  32. """ Romanize a regular string NOT containing personal names. """
  33. ctx.dest, ctx.warnings = _romanize_nonames(
  34. ctx.src, ctx.options)
  35. return BREAK
  36. def s2r_names_post_config(ctx):
  37. """
  38. Romanize a string containing ONLY Korean personal names.
  39. One or more names can be transcribed. A comma or middle dot (U+00B7) is
  40. to be used as separator for multiple names.
  41. """
  42. ctx.dest, ctx.warnings = _romanize_names(ctx.src, ctx.options)
  43. return BREAK
  44. def _romanize_nonames(src, options):
  45. """ Main Romanization function for non-name strings. """
  46. # FKR038: Convert Chinese characters to Hangul
  47. if options.get("hancha", True):
  48. kor = _hancha2hangul(_marc8_hancha(src))
  49. else:
  50. kor = src
  51. # Replace ideographic spaces with ASCII space.
  52. kor = re.sub(r"\s+", " ", kor)
  53. kor = f" {kor} "
  54. # FKR039: Replace Proper name with spaces in advance
  55. # FKR040: Replace Proper name with a hyphen in advance
  56. # FKR041: Romanize names of Hangul consonants
  57. for i in range(39, 42):
  58. _fkr_log(i)
  59. kor = _replace_map(kor, KCONF[f"fkr{i:03}"])
  60. # NOTE This is slightly different from LL 929-930 in that it doesn't
  61. # result in double spaces.
  62. kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
  63. # This is more compact but I'm unsure if the replacement order is kept.
  64. # kor = kor.replace({"\r\n": " ", "\r": " ", "\n": " "})
  65. rom = _romanize_oclc_auto(kor)
  66. logger.debug(f"Before capitalization: {rom}")
  67. # FKR042: Capitalize all first letters
  68. if options["capitalize"] == "all":
  69. rom = capitalize(rom)
  70. # FKR043: Capitalize the first letter
  71. elif options["capitalize"] == "first":
  72. rom = rom[0].upper() + rom[1:]
  73. # FKR044: Ambiguities
  74. ambi = re.sub("[,.\";: ]+", " ", rom)
  75. # TODO Decide what to do with these. There is no facility for outputting
  76. # warnings or notes to the user yet.
  77. warnings = []
  78. _fkr_log(45)
  79. for exp, warn in KCONF["fkr045"].items():
  80. if exp in ambi:
  81. warnings.append(ambi if warn == "" else warn)
  82. if rom:
  83. rom = rom.replace("kkk", "kk")
  84. return rom, warnings
  85. def _romanize_names(src, options):
  86. """
  87. Main Romanization function for names.
  88. Separate and romanize multiple names sepearated by comma or middle dot.
  89. K-Romanizer: KorNameRom20
  90. """
  91. rom_ls = []
  92. warnings = []
  93. if "," in src and "·" in src:
  94. warnings.append(
  95. "both commas and middle dots are being used to separate "
  96. "names. Only one of the two types should be used, or "
  97. "unexpected results may occur.")
  98. kor_ls = src.split(",") if "," in src else src.split("·")
  99. for kor in kor_ls:
  100. rom, _warnings = _romanize_name(kor.strip(), options)
  101. rom_ls.append(rom)
  102. warnings.extend(_warnings)
  103. return ", ".join(rom_ls), warnings
  104. def _romanize_name(src, options):
  105. warnings = []
  106. # FKR001: Conversion, Family names in Chinese (dealing with 金 and 李)
  107. # FKR002: Family names, Initial sound law
  108. replaced = False
  109. for ss, r in KCONF["fkr001-002"]:
  110. if replaced:
  111. break
  112. for s in ss:
  113. if src.startswith(s):
  114. src = r + src[1:]
  115. replaced = True
  116. break
  117. # FKR003: First name, Chinese Character Conversion
  118. src = _hancha2hangul(_marc8_hancha(src))
  119. if re.search("[a-zA-Z0-9]", src):
  120. warnings.append(f"{src} is not a recognized personal name.")
  121. return "", warnings
  122. # `parsed` can either be a modified Korean string with markers, or in case
  123. # of a foreign name, the final romanized name.
  124. parsed, _warnings = _parse_kor_name(
  125. re.sub(r"\s{2,}", " ", src.strip()),
  126. options)
  127. logger.debug(f"Parsed Korean name: {parsed}")
  128. if len(_warnings):
  129. warnings += _warnings
  130. if parsed:
  131. if "~" in parsed:
  132. lname, fname = parsed.split("~", 1)
  133. logger.debug(f"First name: {fname}; Last name: {lname}")
  134. fname_rom = _kor_fname_rom(fname)
  135. lname_rom_ls = []
  136. for n in lname.split("+"):
  137. _k = _kor_lname_rom(n)
  138. logger.debug(f"Split last name part: {n}")
  139. logger.debug(f"Split last name part romanized: {_k}")
  140. if _k:
  141. lname_rom_ls.append(_k)
  142. if not any(lname_rom_ls):
  143. warnings.append(f"{parsed} is not a recognized Korean name.")
  144. return "", warnings
  145. lname_rom = " ".join(lname_rom_ls)
  146. # Add comma after the last name for certain MARC fields.
  147. marc_field = options.get("marc_field")
  148. if marc_field in ("100", "600", "700", "800"):
  149. rom = f"{lname_rom}, {fname_rom}"
  150. else:
  151. rom = f"{lname_rom} {fname_rom}"
  152. if False:
  153. # TODO add option for authoritative name.
  154. rom_ls = rom.rsplit(" ", 1)
  155. rom = ", ".join(rom_ls)
  156. return rom, warnings
  157. else:
  158. warnings.append("Romanized as a foreign name.")
  159. return parsed, warnings
  160. warnings.append(f"{src} is not a recognized Korean name.")
  161. return "", warnings
  162. def _parse_kor_name(src, options):
  163. parsed = None
  164. warnings = []
  165. # FKR004: Check first two characters. Two-syllable family name or not?
  166. two_syl_lname = False
  167. for ptn in KCONF["fkr004"]:
  168. if src.startswith(ptn):
  169. two_syl_lname = True
  170. logger.debug("Name has a 2-syllable last name.")
  171. break
  172. src_len = len(src)
  173. # FKR005: Error if more than 7 syllables
  174. if src_len > 7 or src_len < 2 or src.find(" ") > 2:
  175. if options.get("foreign_name"):
  176. return _kor_corp_name_rom(src), warnings
  177. else:
  178. warnings.append("ERROR: not a Korean name.")
  179. return None, warnings
  180. ct_spaces = src.count(" ")
  181. # FKR0006: Error if more than 2 spaces
  182. if ct_spaces > 2:
  183. warnings.append("ERROR: not a name (too many spaces)")
  184. return None, warnings
  185. # FKR007: 2 spaces (two family names)
  186. if ct_spaces == 2:
  187. logger.debug(f"Name {src} has 2 spaces.")
  188. parsed = src.replace(" ", "+", 1).replace(" ", "~", 1)
  189. elif ct_spaces == 1:
  190. # FKR008: 1 space (2nd position)
  191. if src[1] == " ":
  192. logger.debug(f"Name {src} has 1 space in the 2nd position.")
  193. parsed = src.replace(" ", "~")
  194. # FKR009: 1 space (3nd position)
  195. if src[2] == " ":
  196. logger.debug(f"Name {src} has 1 space in the 3rd position.")
  197. if two_syl_lname:
  198. parsed = "+" + src.replace(" ", "~")
  199. # FKR010: When there is no space
  200. else:
  201. logger.debug(f"Name {src} has no spaces.")
  202. if src_len == 2:
  203. logger.debug("Name has 2 characters.")
  204. parsed = src[0] + "~" + src[1:]
  205. elif src_len > 2:
  206. logger.debug("Name has more than 2 characters.")
  207. if two_syl_lname:
  208. logger.debug("Last name has 2 syllables.")
  209. parsed = src[:2] + "~" + src[2:]
  210. else:
  211. logger.debug("Last name has 1 syllable.")
  212. parsed = src[0] + "~" + src[1:]
  213. return parsed, warnings
  214. def _kor_corp_name_rom(src):
  215. chu = yu = 0
  216. if src.startswith("(주) "):
  217. src = src[4:]
  218. chu = "L"
  219. if src.endswith(" (주)"):
  220. src = src[:-4]
  221. chu = "R"
  222. if src.startswith("(유) "):
  223. src = src[4:]
  224. yu = "L"
  225. if src.endswith(" (유)"):
  226. src = src[:-4]
  227. yu = "R"
  228. rom_tok = []
  229. for tok in src.split(" "):
  230. rom_tok.append(_romanize_oclc_auto(tok))
  231. rom = capitalize(" ".join(rom_tok))
  232. if chu == "L":
  233. rom = "(Chu) " + rom
  234. elif chu == "R":
  235. rom = rom + " (Chu)"
  236. if yu == "L":
  237. rom = "(Yu) " + rom
  238. elif yu == "R":
  239. rom = rom + " (Yu)"
  240. # FKR035: Replace established names
  241. rom = _replace_map(rom, KCONF["fkr035"])
  242. return rom
  243. def _romanize_oclc_auto(kor):
  244. # FKR050: Starts preprocessing symbol
  245. _fkr_log(50)
  246. for rname, rule in KCONF["fkr050"].items():
  247. logger.debug(f"Applying fkr050[{rname}]")
  248. kor = _replace_map(kor, rule)
  249. # See https://github.com/lcnetdev/scriptshifter/issues/19
  250. kor = re.sub("제([0-9])", "제 \\1", kor)
  251. # FKR052: Replace Che+number
  252. _fkr_log(52)
  253. for rname, rule in KCONF["fkr052"].items():
  254. logger.debug(f"Applying fkr052[{rname}]")
  255. kor = _replace_map(kor, rule)
  256. # Strip end and multiple whitespace.
  257. kor = re.sub(r"\s{2,}", " ", kor.strip())
  258. kor = kor.replace("^", " GLOTTAL ")
  259. logger.debug(f"Korean before romanization: {kor}")
  260. rom_ls = []
  261. for word in kor.split(" "):
  262. rom_ls.append(_kor_rom(word))
  263. rom = " ".join(rom_ls)
  264. # FKR059: Apply glottalization
  265. rom = _replace_map(
  266. f" {rom.strip()} ", {" GLOTTAL ": "", "*": "", "^": ""})
  267. # FKR060: Process number + -년/-년도/-년대
  268. # TODO Add leading whitespace as per L1221? L1202 already added one.
  269. rom = _replace_map(rom, KCONF["fkr060"])
  270. rom = re.sub(r"\s{2,}", " ", f" {rom.strip()} ")
  271. # FKR061: Jurisdiction (시)
  272. # FKR062: Historical place names
  273. # FKR063: Jurisdiction (국,도,군,구)
  274. # FKR064: Temple names of Kings, Queens, etc. (except 조/종)
  275. # FKR065: Frequent historical names
  276. for i in range(61, 66):
  277. _fkr_log(i)
  278. rom = _replace_map(rom, KCONF[f"fkr{i:03}"])
  279. # FKR066: Starts restore symbols
  280. _fkr_log(66)
  281. for rname, rule in KCONF["fkr066"].items():
  282. logger.debug(f"Applying FKR066[{rname}]")
  283. rom = _replace_map(rom, rule)
  284. # Remove spaces from before punctuation signs.
  285. rom = re.sub(r" (?=[,.;:?!])", "", rom.strip())
  286. rom = re.sub(r"\s{2,}", " ", rom)
  287. return rom
# FKR068: Exceptions, Exceptions to initial sound law, Proper names
def _kor_rom(kor):
    """
    Romanize one space-delimited token.

    The token is encoded syllable by syllable into an intermediate notation
    (`i<initial>#m<medial>#f<final>`, syllables joined by "~", terminated by
    "E"), which the FKR071-FKR110 replacement maps then rewrite into Roman
    letters. Word-level exceptions (FKR111-FKR121) are applied last.

    :param str kor: Token to romanize.

    :return: Romanized token.
    :rtype: str
    """
    kor = re.sub(r"\s{2,}", " ", kor.strip())
    # `orig` keeps the pre-romanization token for the word-level checks
    # further down.
    orig = kor

    # FKR069: Irregular sound change list
    kor = _replace_map(kor, KCONF["fkr069"])

    # FKR070: [n] insertion position mark +
    niun = kor.find("+")
    if niun > -1:
        kor = kor.replace("+", "")
        orig = kor

    # Count the code points below CP_MIN and drop one leading character for
    # each of them.
    # NOTE(review): the count covers the whole token while the trimming is
    # done from the front; presumably such characters are only expected as a
    # prefix — confirm.
    non_kor = 0
    cpoints = tuple(ord(c) for c in kor)
    for cp in cpoints:
        if cp < CP_MIN:
            non_kor += 1
            kor = kor[1:]

    rom_ls = []
    if non_kor > 0:
        # Rebuild code point list with non_kor removed.
        cpoints = tuple(ord(c) for c in kor)
    for i in range(len(kor)):
        # Offset from CP_MIN, split into initial (588 = 21 * 28 code points
        # per initial), medial (21 values) and final (28 values) indices.
        cp = cpoints[i] - CP_MIN
        ini = "i" + str(cp // 588)
        med = "m" + str((cp // 28) % 21)
        fin = "f" + str(cp % 28)
        rom_ls.append("#".join((ini, med, fin)))
    rom = "~".join(rom_ls)

    # "E" terminates the encoded word; nothing is appended to an empty one.
    if len(rom):
        rom = rom + "E"

    # FKR071: [n] insertion
    if niun > -1:
        niun_loc = rom.find("~")
        # Advance until the niun'th occurrence of ~
        # If niun is 0 or 1 the loop will be skipped.
        for i in range(niun - 1):
            niun_loc = rom.find("~", niun_loc + 1)
        # Encoded syllables before / after the insertion point.
        rom_niun_a = rom[:niun_loc]
        rom_niun_b = rom[niun_loc + 1:]
        if re.match("i11#m(?:2|6|12|17|20)", rom_niun_b):
            _fkr_log(71)
            rom_niun_b = rom_niun_b.replace("i11#m", "i2#m", 1)

        # FKR072: [n]+[l] >[l] + [l]
        if rom_niun_b.startswith("i5#") and rom_niun_a.endswith("f4"):
            _fkr_log(72)
            # NOTE(review): the replacement drops the "#" that follows the
            # initial everywhere else in this notation — confirm intended.
            rom_niun_b = rom_niun_b.replace("i5#", "i2", 1)

        rom = f"{rom_niun_a}~{rom_niun_b}"

    # FKR073: Palatalization: ㄷ+이,ㄷ+여,ㄷ+히,ㄷ+혀
    # FKR074: Palatalization: ㅌ+이,ㅌ+히,ㅌ+히,ㅌ+혀
    # FKR075: Consonant assimilation ㄱ
    # FKR076: Consonant assimilation ㄲ
    # FKR077: Consonant assimilation ㄳ : ㄱ,ㄴ,ㄹ,ㅁ,ㅇ
    # FKR078: Consonant assimilation ㄴ
    # FKR079: Consonant assimilation ㄵ: ㄱ,ㄴ,ㄷ,ㅈ"
    # FKR080: Consonant assimilation ㄶ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR081: Consonant assimilation ㄷ
    # FKR082: Consonant assimilation ㄹ
    # FKR083: Consonant assimilation ㄺ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR084: Consonant assimilation ㄻ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR085: Consonant assimilation ㄼ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR086: Consonant assimilation ㄾ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR087: Consonant assimilation ㄿ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR088: Consonant assimilation ㅀ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR089: Consonant assimilation ㅁ
    # FKR090: Consonant assimilation ㅂ
    # FKR091: Consonant assimilation ㅄ
    # FKR092: Consonant assimilation ㅅ
    # FKR093: Consonant assimilation ㅆ
    # FKR094: Consonant assimilation ㅇ
    # FKR095: Consonant assimilation ㅈ
    # FKR096: Consonant assimilation ㅊ
    # FKR097: Consonant assimilation ㅋ
    # FKR098: Consonant assimilation ㅌ
    # FKR099: Consonant assimilation ㅍ
    # FKR100: Consonant assimilation ㅎ
    # FKR101: digraphic coda + ㅇ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,ㄽ,ㄾ,ㄿ,ㅀ
    # FKR102: digraphic coda + ㅎ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,(ㄽ),ㄾ,ㄿ,ㅀ
    # FKR103: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ) voiced + unvoiced
    # FKR104: Vocalization 2 (except ㄹ+ㄷ, ㄹ+ㅈ) unvoiced + voiced
    # FKR105: Vocalization 3 (ㄹ+ㄷ, ㄹ+ㅈ)
    # FKR106: Final sound law
    # FKR107: Exception for '쉬' = shi
    # FKR108: Exception for 'ㄴㄱ'= n'g
    # The maps are applied in ascending rule order; each change is logged.
    for fkr_i in range(73, 109):
        _fkr_log(fkr_i)
        _bk = rom
        rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
        if _bk != rom:
            logger.debug(f"FKR{fkr_i} substitution: {rom} (was: {_bk})")

    # FKR109: Convert everything else
    _fkr_log(109)
    # FKR109 is a mapping of replacement maps; only the values are used.
    for pos, data in KCONF["fkr109"].items():
        rom = _replace_map(rom, data)

    # FKR110: Convert symbols
    rom = _replace_map(rom, {"#": "", "~": ""})

    if non_kor > 0:
        # Modified from K-Romanizer:1727 in that it does not append a hyphen
        # if the whole word is non-Korean.
        rom = f"{orig[:non_kor]}-{rom}" if len(rom) else orig

    # FKR111: ㄹ + vowel/ㅎ/ㄹ, ["lr","ll"] must be in the last of the array
    rom = _replace_map(rom, KCONF["fkr111"])

    # FKR112: Exceptions to initial sound law
    is_non_kor = False
    # FKR113: Check loan words by the first 1 letter
    # FKR114: Check loan words by the first 2 letters
    # FKR115: Check loan words by the first 3 letters
    if orig.startswith(tuple(KCONF["fkr113-115"])):
        is_non_kor = True

    # FKR116: Exceptions to initial sound law - particles
    is_particle = False
    if orig.startswith(tuple(KCONF["fkr116"]["particles"])):
        is_particle = True

    # The initial replacements are skipped for single-character words, loan
    # words and particles.
    if len(orig) > 1 and not is_non_kor and not is_particle:
        if rom.startswith(tuple(KCONF["fkr116"]["replace_initials"].keys())):
            # NOTE(review): the map is applied to the whole word, not only
            # to the matched prefix — confirm intended.
            rom = _replace_map(rom, KCONF["fkr116"]["replace_initials"])

    # FKR117: Proper names _StringPoper Does not work because of breves
    if (
            # FKR118
            orig in KCONF["fkr118"] or
            # FKR119
            orig in KCONF["fkr119"]["word"] or
            (
                orig[:-1] in KCONF["fkr119"]["word"] and
                orig.endswith(tuple(KCONF["fkr119"]["suffix"]))
            ) or
            # FKR120
            orig.endswith(tuple(KCONF["fkr120"]))):
        # Upper-case the first letter of the romanized word.
        # NOTE(review): raises IndexError if `rom` is empty here.
        rom = rom[0].upper() + rom[1:]

    # FKR121: Loan words beginning with L
    if f" {orig} " in KCONF["fkr121"]:
        rom = _replace_map(rom[0], {"R": "L", "r": "l"}) + rom[1:]

    # @TODO Move this to a generic normalization step (not only for K)
    # NOTE(review): keys and values render identically; presumably the keys
    # are the decomposed forms (letter + combining breve) and the values the
    # precomposed ones. Make sure editors and tools do not normalize this
    # line — confirm the bytes.
    rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})

    return rom
  422. def _marc8_hancha(data):
  423. # FKR142: Chinese character list
  424. _fkr_log(142)
  425. return _replace_map(data, KCONF["fkr142"])
  426. def _hancha2hangul(data):
  427. data = " " + data.replace("\n", "\n ")
  428. # FKR143: Process exceptions first
  429. # FKR144: Apply initial sound law (Except: 列, 烈, 裂, 劣)
  430. # FKR145: Simplified characters, variants
  431. # FKR146: Some characters from expanded list
  432. # FKR147: Chinese characters 1-500 車=차
  433. # FKR148: Chinese characters 501-750 串=관
  434. # FKR149: Chinese characters 751-1000 金=금, 娘=랑
  435. # FKR150: Chinese characters 1001-1250
  436. # FKR151: Chinese characters 1251-1500 제외: 列, 烈, 裂, 劣
  437. # FKR152: Chinese characters 1501-1750 제외: 律, 率, 栗, 慄
  438. # FKR153: Chinese characters 1751-2000
  439. # FKR154: 不,Chinese characters 2001-2250 제외: 不
  440. # FKR155: Chinese characters 2251-2500 塞=색
  441. # FKR156: Chinese characters 2501-2750
  442. # FKR157: Chinese characters 2751-3000
  443. # FKR158: Chinese characters 3001-2250
  444. # FKR159: Chinese characters 3251-3500
  445. # FKR160: Chinese characters 3501-3750
  446. # FKR161: Chinese characters 3751-4000
  447. # FKR162: Chinese characters 4001-4250
  448. # FKR163: Chinese characters 4251-4500
  449. # FKR164: Chinese characters 4501-4750
  450. # FKR165: Chinese characters 4751-5000
  451. # FKR166: Chinese characters 5001-5250
  452. # FKR167: Chinese characters 5251-5500
  453. # FKR168: Chinese characters 5501-5750
  454. # FKR169: Chinese characters 5751-5978
  455. # FKR170: Chinese characters 일본Chinese characters
  456. for i in range(143, 171):
  457. _fkr_log(i)
  458. data = _replace_map(data, KCONF[f"fkr{i}"])
  459. # FKR171: Chinese characters 不(부)의 발음 처리
  460. # Write down indices of occurrences of "不"
  461. idx = [i for i, item in enumerate(data) if item == "不"]
  462. for i in idx:
  463. val = ord(data[i + 1])
  464. if (val > 45795 and val < 46384) or (val > 51087 and val < 51676):
  465. data = data.replace("不", "부", 1)
  466. else:
  467. data = data.replace("不", "불", 1)
  468. # FKR172: Chinese characters 列(렬)의 발음 처리
  469. # FKR173: Chinese characters 烈(렬)의 발음 처리
  470. # FKR174: Chinese characters 裂(렬)의 발음 처리
  471. # FKR175: Chinese characters 劣(렬)의 발음 처리
  472. # FKR176: Chinese characters 律(률)의 발음 처리
  473. # FKR177: Chinese characters 率(률)의 발음 처리
  474. # FKR178: Chinese characters 慄(률)의 발음 처리
  475. # FKR179: Chinese characters 栗(률)의 발음 처리
  476. for char in KCONF["fkr172-179"]:
  477. idx = [i for i, item in enumerate(data) if item == char]
  478. for i in idx:
  479. val = ord(data[i - 1])
  480. coda_value = (val - CP_MIN) % 28
  481. if coda_value == 0 or coda_value == 4 or val < 100: # TODO verify
  482. data = data.replace(char, "열", 1)
  483. else:
  484. data = data.replace(char, "렬", 1)
  485. # FKR180: Katakana
  486. _fkr_log(180)
  487. data = _replace_map(data, KCONF["fkr180"])
  488. return re.sub(r"\s{2,}", " ", data.strip())
  489. def _replace_map(src, rmap, *args, **kw):
  490. """ Replace occurrences in a string according to a map. """
  491. for k, v in rmap.items():
  492. src = src.replace(k, v, *args, **kw)
  493. return src
def _kor_fname_rom(fname):
    """
    Romanize the first (given) name part of a Korean personal name.

    The name is encoded syllable by syllable into an intermediate notation
    (`i<initial>#m<medial>#f<final>`, syllables joined by "~", terminated by
    "E"), which the FKR014-FKR031 replacement maps rewrite into Roman
    letters. The syllables of a two-character name are joined with a hyphen,
    unless one of the FKR011-FKR013 checks flags the name as native Korean.

    :param str fname: First name in Hangul.

    :return: Romanized first name, with the first letter capitalized.
    :rtype: str
    """
    rom_ls = []
    cpoints = tuple(ord(c) for c in fname)
    for i in range(len(fname)):
        # Offset from CP_MIN, split into initial (588 = 21 * 28 code points
        # per initial), medial (21 values) and final (28 values) indices.
        cp = cpoints[i] - CP_MIN
        ini = "i" + str(cp // 588)
        med = "m" + str((cp // 28) % 21)
        fin = "f" + str(cp % 28)
        rom_ls.append("#".join((ini, med, fin)))
    rom = "~".join(rom_ls) + "E"
    logger.debug(f"Encoded first name: {rom}")

    # FKR011: Check native Korean name, by coda
    native_by_fin = False
    for tok in KCONF["fkr011"]["nat_fin"]:
        if tok in rom:
            native_by_fin = True
            break

    # j: a "nat_ini" token occurs in the encoded name;
    # k: a "sino_ini" token occurs in the Hangul name.
    j = k = False
    for tok in KCONF["fkr011"]["nat_ini"]:
        if tok in rom:
            j = True
            break
    for tok in KCONF["fkr011"]["sino_ini"]:
        if tok in fname:
            k = True
            break
    native_by_ini = j and not k

    # FKR012: Check native Korean name, by vowel & coda
    native_by_med = False
    # NOTE(review): this iterates the top-level entries of KCONF["fkr011"],
    # whose "nat_fin"/"nat_ini"/"sino_ini" members are used above; the
    # comment refers to FKR012 — confirm which table is intended.
    for tok in KCONF["fkr011"]:
        if tok in rom:
            native_by_med = True
            break

    # FKR013: Check native Korean name, by ㅢ
    # Medial 19 overrides the FKR012 result: native unless the name contains
    # 의 or 희.
    if "m19#" in rom:
        native_by_med = "의" not in fname and "희" not in fname

    # FKR014: Consonant assimilation ㄱ
    # FKR015: Consonant assimilation ㄲ
    # FKR016: Consonant assimilation ㄴ
    # FKR017: Consonant assimilation ㄷ
    # FKR018: Consonant assimilation ㄹ
    # FKR019: Consonant assimilation ㅁ
    # FKR020: Consonant assimilation ㅂ
    # FKR021: Consonant assimilation ㅅ
    # FKR022: Consonant assimilation ㅆ
    # FKR023: Consonant assimilation ㅇ
    # FKR024: Consonant assimilation ㅈ
    # FKR025: Consonant assimilation ㅊ
    # FKR026: Consonant assimilation ㅎ
    # FKR027: Final sound law
    # FKR028: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ): voiced+unvoiced
    # FKR029: Vocalization 2 unvoiced+voiced
    for i in range(14, 30):
        _fkr_log(i)
        rom = _replace_map(rom, KCONF[f"fkr{i:03}"])

    # FKR030: Convert everything else
    _fkr_log(30)
    # From here on `k` is reused as a loop variable; `native_by_ini` has
    # already been computed.
    for k, cmap in KCONF["fkr030"].items():
        logger.debug(f"Applying FKR030[\"{k}\"]")
        rom = _replace_map(rom, cmap)

    # Drop the "#" separators; at most one "swi"/"Swi" each is respelled.
    rom = _replace_map(rom.replace("#", ""), {"swi": "shwi", "Swi": "Shwi"}, 1)

    logger.debug(f"Partly romanized first name: {rom}")
    logger.debug(f"fname: {fname} ({len(fname)})")
    # Two-character names get a hyphen between the syllables; any other
    # length is joined without a separator.
    if len(fname) == 2:
        rom = rom.replace("~", "-")
    else:
        rom = _replace_map(rom, {"n~g": "n'g", "~": ""})

    # FKR031: ㄹ + vowels/ㅎ/ㄹ ["l-r","l-l"] does not work USE alternative
    _fkr_log(31)
    for k, cmap in KCONF["fkr031"].items():
        logger.debug(f"Applying FKR031[\"{k}\"]")
        rom = _replace_map(rom, cmap)

    # FKR032: Capitalization
    _fkr_log(32)
    # NOTE(review): raises IndexError if `rom` is empty at this point.
    rom = rom[0].upper() + rom[1:]

    # FKR033: Remove hyphen in bisyllabic native Korean first name
    _fkr_log(33)
    if (
            len(fname) == 2
            and any((native_by_ini, native_by_fin, native_by_med))):
        logger.debug("First name is native.")
        rom = _replace_map(rom, {"n-g": "n'g", "-": ""})

    # FKR034: First name, initial sound law
    for k, v in KCONF["fkr034"].items():
        if rom.startswith(k):
            # NOTE(review): replaces every occurrence of `k`, not only the
            # leading one — confirm intended.
            rom = rom.replace(k, v)

    return rom
  581. def _kor_lname_rom(lname):
  582. if len(lname) == 2:
  583. # FKR181: 2-character names.
  584. _fkr_log(181)
  585. rom = _replace_map(lname, KCONF["fkr181"])
  586. else:
  587. # FKR182: 1-character Chinese names.
  588. _fkr_log(182)
  589. lname = _replace_map(lname, KCONF["fkr182"])
  590. # FKR183: 1-character names.
  591. _fkr_log(183)
  592. rom = _replace_map(lname, KCONF["fkr183"])
  593. return rom if lname != rom else False
  594. def _fkr_log(fkr_i):
  595. fkr_k = f"FKR{fkr_i:03}"
  596. logger.debug(f"Applying {fkr_k}: {FKR_IDX[fkr_k]}")