# @package ext.korean
#

__doc__ = """
Korean transcription functions.

Ported from K-Romanizer: https://library.princeton.edu/eastasian/k-romanizer

Only script-to-Roman is possible for Korean.

Note that Korean Romanization must be done separately for strings containing
only personal names and strings that do not contain personal names, due to
ambiguities in the language. A non-deterministic approach using machine
learning that separates words depending on context is being attempted by other
parties, and it may be possible to eventually integrate such services here in
the future, technology and licensing permitting. At the moment there are no
such plans.

Many thanks to Hyoungbae Lee for kindly providing the original K-Romanizer
program and assistance in porting it to Python.
"""

import logging
import re

from csv import reader
from os import path

from scriptshifter.exceptions import BREAK
from scriptshifter.hooks.korean import KCONF
from scriptshifter.tools import capitalize


PWD = path.dirname(path.realpath(__file__))

CP_MIN = 44032
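# NOTE: 44032 is U+AC00 ("가"), the first precomposed Hangul syllable. Each
# syllable code point decomposes as CP_MIN + (initial * 21 + medial) * 28 +
# final, which is what the `// 588`, `// 28 % 21`, and `% 28` arithmetic used
# below relies on (588 = 21 * 28).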

ALL_PUNCT_STR = r'[!"#$%&\'()*+,-.:;<=>?・ǂ「」『』@[\\]^_`{|}~‡‰‘’“”–—˜©·]'

# Separator symbols for coded tokens.
# Using esoteric characters unlikely to be found in cataloging records.
INI = "🜁"  # Initial prefix (was: i).
MED = "🜊"  # Medial prefix (was: m).
FIN = "🜔"  # Final prefix (was: f).
EOP = "🜿"  # End of part (was: #).
EOT = "🝎"  # End of token (was: ~).
EON = "🜹"  # First-last name separator (was: +).
EOD = "🝥"  # End of document (was: E).
GLT = "🜄"  # Glottal (was: ^).

# Build FKR index for better logging.
with open(path.join(PWD, "FKR_index.csv"), newline='') as fh:
    csv = reader(fh)
    FKR_IDX = {row[0]: row[2] for row in csv}
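# FKR_IDX maps each rule ID (first CSV column) to what is presumably its
# description (third column); it is only used by _fkr_log() for debug output.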

logger = logging.getLogger(__name__)


def s2r_nonames_post_config(ctx):
    """ Romanize a regular string NOT containing personal names. """
    ctx.dest, ctx.warnings = _romanize_nonames(
            ctx.src, ctx.options)

    return BREAK


def s2r_names_post_config(ctx):
    """
    Romanize a string containing ONLY Korean personal names.

    One or more names can be transcribed. A comma or middle dot (U+00B7) is
    to be used as a separator for multiple names.
    """
    ctx.dest, ctx.warnings = _romanize_names(ctx.src, ctx.options)

    return BREAK

def _romanize_nonames(src, options):
    """ Main Romanization function for non-name strings. """
    # FKR038: Convert Chinese characters to Hangul
    if options.get("hancha", True):
        kor = _hancha2hangul(_marc8_hancha(src))
    else:
        kor = src

    # Replace ideographic spaces with ASCII space.
    kor = re.sub(r"\s+", " ", kor)
    kor = f" {kor} "

    # FKR039: Replace Proper name with spaces in advance
    # FKR040: Replace Proper name with a hyphen in advance
    # FKR041: Romanize names of Hangul consonants
    for i in range(39, 42):
        _fkr_log(i)
        kor = _replace_map(kor, KCONF[f"fkr{i:03}"])

    # NOTE This is slightly different from LL 929-930 in that it doesn't
    # result in double spaces.
    kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
    # This is more compact but I'm unsure if the replacement order is kept.
    # kor = kor.replace({"\r\n": " ", "\r": " ", "\n": " "})

    rom = _romanize_oclc_auto(kor)

    logger.debug(f"Before capitalization: {rom}")

    # FKR042: Capitalize all first letters
    if options["capitalize"] == "all":
        rom = capitalize(rom)
    # FKR043: Capitalize the first letter
    elif options["capitalize"] == "first":
        rom = rom[0].upper() + rom[1:]

    # FKR044: Ambiguities
    ambi = re.sub("[,.\";: ]+", " ", rom)
    # TODO Decide what to do with these. There is no facility for outputting
    # warnings or notes to the user yet.
    warnings = []
    _fkr_log(45)
    for exp, warn in KCONF["fkr045"].items():
        if exp in ambi:
            warnings.append(ambi if warn == "" else warn)

    if rom:
        rom = rom.replace("kkk", "kk")

    return rom, warnings

def _romanize_names(src, options):
    """
    Main Romanization function for names.

    Separate and romanize multiple names separated by a comma or middle dot.

    K-Romanizer: KorNameRom20
    """
    rom_ls = []
    warnings = []

    if "," in src and "·" in src:
        warnings.append(
                "both commas and middle dots are being used to separate "
                "names. Only one of the two types should be used, or "
                "unexpected results may occur.")

    kor_ls = src.split(",") if "," in src else src.split("·")

    for kor in kor_ls:
        rom, _warnings = _romanize_name(kor.strip(), options)
        rom_ls.append(rom)
        warnings.extend(_warnings)

    return ", ".join(rom_ls), warnings

def _romanize_name(src, options):
    warnings = []

    # FKR001: Conversion, Family names in Chinese (dealing with 金 and 李)
    # FKR002: Family names, Initial sound law
    replaced = False
    for ss, r in KCONF["fkr001-002"]:
        if replaced:
            break
        for s in ss:
            if src.startswith(s):
                src = r + src[1:]
                replaced = True
                break

    # FKR003: First name, Chinese Character Conversion
    src = _hancha2hangul(_marc8_hancha(src))

    if re.search("[a-zA-Z0-9]", src):
        warnings.append(f"{src} is not a recognized personal name.")
        return "", warnings

    # `parsed` can either be a modified Korean string with markers, or in case
    # of a foreign name, the final romanized name.
    parsed, _warnings = _parse_kor_name(
            re.sub(r"\s{2,}", " ", src.strip()),
            options)
    logger.debug(f"Parsed Korean name: {parsed}")
    if len(_warnings):
        warnings += _warnings

    if parsed:
        if EOT in parsed:
            lname, fname = parsed.split(EOT, 1)
            logger.debug(f"First name: {fname}; Last name: {lname}")
            fname_rom = _kor_fname_rom(fname)
            lname_rom_ls = []
            for n in lname.split(EON):
                _k = _kor_lname_rom(n)
                logger.debug(f"Split last name part: {n}")
                logger.debug(f"Split last name part romanized: {_k}")
                if _k:
                    lname_rom_ls.append(_k)
            if not any(lname_rom_ls):
                warnings.append(f"{parsed} is not a recognized Korean name.")
                return "", warnings
            lname_rom = " ".join(lname_rom_ls)

            # Add comma after the last name for certain MARC fields.
            marc_field = options.get("marc_field")
            if marc_field in ("100", "600", "700", "800"):
                rom = f"{lname_rom}, {fname_rom}"
            else:
                rom = f"{lname_rom} {fname_rom}"

            if False:
                # TODO add option for authoritative name.
                rom_ls = rom.rsplit(" ", 1)
                rom = ", ".join(rom_ls)

            return rom, warnings
        else:
            warnings.append("Romanized as a foreign name.")
            return parsed, warnings

    warnings.append(f"{src} is not a recognized Korean name.")
    return "", warnings

def _parse_kor_name(src, options):
    parsed = None
    warnings = []

    # FKR004: Check first two characters. Two-syllable family name or not?
    two_syl_lname = False
    for ptn in KCONF["fkr004"]:
        if src.startswith(ptn):
            two_syl_lname = True
            logger.debug("Name has a 2-syllable last name.")
            break

    src_len = len(src)
    # FKR005: Error if more than 7 syllables
    if src_len > 7 or src_len < 2 or src.find(" ") > 2:
        if options.get("foreign_name"):
            return _kor_corp_name_rom(src), warnings
        else:
            warnings.append("ERROR: not a Korean name.")
            return None, warnings

    ct_spaces = src.count(" ")
    # FKR006: Error if more than 2 spaces
    if ct_spaces > 2:
        warnings.append("ERROR: not a name (too many spaces)")
        return None, warnings

    # FKR007: 2 spaces (two family names)
    if ct_spaces == 2:
        logger.debug(f"Name {src} has 2 spaces.")
        parsed = src.replace(" ", EON, 1).replace(" ", EOT, 1)
    elif ct_spaces == 1:
        # FKR008: 1 space (2nd position)
        if src[1] == " ":
            logger.debug(f"Name {src} has 1 space in the 2nd position.")
            parsed = src.replace(" ", EOT)
        # FKR009: 1 space (3rd position)
        if src[2] == " ":
            logger.debug(f"Name {src} has 1 space in the 3rd position.")
            if two_syl_lname:
                parsed = EON + src.replace(" ", EOT)
    # FKR010: When there is no space
    else:
        logger.debug(f"Name {src} has no spaces.")
        if src_len == 2:
            logger.debug("Name has 2 characters.")
            parsed = src[0] + EOT + src[1:]
        elif src_len > 2:
            logger.debug("Name has more than 2 characters.")
            if two_syl_lname:
                logger.debug("Last name has 2 syllables.")
                parsed = src[:2] + EOT + src[2:]
            else:
                logger.debug("Last name has 1 syllable.")
                parsed = src[0] + EOT + src[1:]

    return parsed, warnings

def _kor_corp_name_rom(src):
    chu = yu = 0

    if src.startswith("(주) "):
        src = src[4:]
        chu = "L"
    if src.endswith(" (주)"):
        src = src[:-4]
        chu = "R"
    if src.startswith("(유) "):
        src = src[4:]
        yu = "L"
    if src.endswith(" (유)"):
        src = src[:-4]
        yu = "R"

    rom_tok = []
    for tok in src.split(" "):
        rom_tok.append(_romanize_oclc_auto(tok))
    rom = capitalize(" ".join(rom_tok))

    if chu == "L":
        rom = "(Chu) " + rom
    elif chu == "R":
        rom = rom + " (Chu)"
    if yu == "L":
        rom = "(Yu) " + rom
    elif yu == "R":
        rom = rom + " (Yu)"

    # FKR035: Replace established names
    rom = _replace_map(rom, KCONF["fkr035"])

    return rom

def _romanize_oclc_auto(kor):
    # See https://github.com/lcnetdev/scriptshifter/issues/19
    kor = re.sub("제([0-9])", "제 \\1", kor)

    # FKR052: Replace Che+number
    _fkr_log(52)
    for rname, rule in KCONF["fkr052"].items():
        logger.debug(f"Applying fkr052[{rname}]")
        kor = _replace_map(kor, rule)

    # Strip end and multiple whitespace.
    kor = re.sub(r"\s{2,}", " ", kor.strip())
    kor = kor.replace(GLT, " GLOTTAL ")

    logger.debug(f"Korean before romanization: {kor}")

    rom_ls = []
    for word in kor.split(" "):
        rom_ls.append(_kor_rom(word))
    rom = " ".join(rom_ls)

    # FKR059: Apply glottalization
    rom = _replace_map(
            f" {rom.strip()} ", {" GLOTTAL ": "", "*": "", "^": ""})

    # FKR060: Process number + -년/-년도/-년대
    # TODO Add leading whitespace as per L1221? L1202 already added one.
    rom = _replace_map(rom, KCONF["fkr060"])
    rom = re.sub(r"\s{2,}", " ", f" {rom.strip()} ")

    # FKR061: Jurisdiction (시)
    # FKR062: Historical place names
    # FKR063: Jurisdiction (국,도,군,구)
    # FKR064: Temple names of Kings, Queens, etc. (except 조/종)
    # FKR065: Frequent historical names
    for i in range(61, 66):
        _fkr_log(i)
        rom = _replace_map(rom, KCONF[f"fkr{i:03}"])

    # Replace Korean punctuation.
    rom = _replace_map(rom, {"・": ", ", "·": ", "})

    # Normalize punctuation spacing.
    rom = re.sub(r"\s{2,}", " ", rom.strip())
    rom = re.sub(r" (?=[,.;:?!\]\)\}’”])", "", rom)
    rom = re.sub(r"(?<=[\[\(\{‘“]) ", "", rom)

    return rom

# FKR068: Exceptions, Exceptions to initial sound law, Proper names
def _kor_rom(kor):
    kor = re.sub(r"\s{2,}", " ", kor.strip())

    orig = kor

    # FKR069: Irregular sound change list
    kor = _replace_map(kor, KCONF["fkr069"])

    # FKR070: [n] insertion position mark +
    niun = kor.find(EON)
    if niun > -1:
        kor = kor.replace(EON, "")
        orig = kor

    non_kor = 0
    cpoints = tuple(ord(c) for c in kor)
    for cp in cpoints:
        if cp < CP_MIN:
            non_kor += 1
            kor = kor[1:]
        else:
            # Break as soon as a Korean code point is found.
            break

    rom_ls = []
    if non_kor > 0:
        # Rebuild code point list with non_kor removed.
        cpoints = tuple(ord(c) for c in kor)
    for i in range(len(kor)):
        cp = cpoints[i] - CP_MIN
        if cp < 0:
            # This accounts for punctuation attached to the end of the word.
            rom_ls.append(kor[i])
            continue
        ini = INI + str(cp // 588)
        med = MED + str((cp // 28) % 21)
        fin = FIN + str(cp % 28)
        rom_ls.append(EOP.join((ini, med, fin)))
    rom = EOT.join(rom_ls)
    if len(rom):
        rom = rom + EOD
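    # Illustration (derived from the arithmetic above): "한국" is encoded as
    # "🜁18🜿🜊0🜿🜔4🝎🜁0🜿🜊13🜿🜔1🝥", i.e. ㅎ+ㅏ+ㄴ, then ㄱ+ㅜ+ㄱ.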
    logger.debug(f"Coded romanization before replacements: {rom}")

    # FKR071: [n] insertion
    if niun > -1:
        niun_loc = rom.find(EOT)
        # Advance until the niun'th occurrence of EOT.
        # If niun is 0 or 1 the loop will be skipped.
        for i in range(niun - 1):
            niun_loc = rom.find(EOT, niun_loc + 1)
        rom_niun_a = rom[:niun_loc]
        rom_niun_b = rom[niun_loc + 1:]
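        # In the coded string, initial 11 is the silent ㅇ and medials
        # 2/6/12/17/20 are the iotized vowels (ㅑ, ㅕ, ㅛ, ㅠ) and ㅣ; the match
        # below swaps in initial 2 (ㄴ) to realize the inserted [n].
        # For FKR072 further down, initial 5 is ㄹ and final 4 is ㄴ.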
        if re.match(
                f"{INI}11{EOP}"
                f"{MED}(?:2|6|12|17|20)", rom_niun_b):
            _fkr_log(71)
            rom_niun_b = rom_niun_b.replace(
                    f"{INI}11{EOP}{MED}", f"{INI}2{EOP}{MED}", 1)

        # FKR072: [n] + [l] > [l] + [l]
        if (
                rom_niun_b.startswith(f"{INI}5{EOP}")
                and rom_niun_a.endswith(f"{FIN}4")):
            _fkr_log(72)
            rom_niun_b = rom_niun_b.replace(f"{INI}5{EOP}", f"{INI}2", 1)

        rom = f"{rom_niun_a}{EOT}{rom_niun_b}"

    # FKR073: Palatalization: ㄷ+이, ㄷ+여, ㄷ+히, ㄷ+혀
    # FKR074: Palatalization: ㅌ+이, ㅌ+히, ㅌ+히, ㅌ+혀
    # FKR075: Consonant assimilation ㄱ
    # FKR076: Consonant assimilation ㄲ
    # FKR077: Consonant assimilation ㄳ: ㄱ, ㄴ, ㄹ, ㅁ, ㅇ
    # FKR078: Consonant assimilation ㄴ
    # FKR079: Consonant assimilation ㄵ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR080: Consonant assimilation ㄶ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR081: Consonant assimilation ㄷ
    # FKR082: Consonant assimilation ㄹ
    # FKR083: Consonant assimilation ㄺ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR084: Consonant assimilation ㄻ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR085: Consonant assimilation ㄼ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR086: Consonant assimilation ㄾ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR087: Consonant assimilation ㄿ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR088: Consonant assimilation ㅀ: ㄱ, ㄴ, ㄷ, ㅈ
    # FKR089: Consonant assimilation ㅁ
    # FKR090: Consonant assimilation ㅂ
    # FKR091: Consonant assimilation ㅄ
    # FKR092: Consonant assimilation ㅅ
    # FKR093: Consonant assimilation ㅆ
    # FKR094: Consonant assimilation ㅇ
    # FKR095: Consonant assimilation ㅈ
    # FKR096: Consonant assimilation ㅊ
    # FKR097: Consonant assimilation ㅋ
    # FKR098: Consonant assimilation ㅌ
    # FKR099: Consonant assimilation ㅍ
    # FKR100: Consonant assimilation ㅎ
    # FKR101: Digraphic coda + ㅇ: ㄵ, ㄶ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ
    # FKR102: Digraphic coda + ㅎ: ㄵ, ㄶ, ㄺ, ㄻ, ㄼ, (ㄽ), ㄾ, ㄿ, ㅀ
    # FKR103: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ): voiced + unvoiced
    # FKR104: Vocalization 2 (except ㄹ+ㄷ, ㄹ+ㅈ): unvoiced + voiced
    # FKR105: Vocalization 3 (ㄹ+ㄷ, ㄹ+ㅈ)
    # FKR106: Final sound law
    # FKR107: Exception for '쉬' = shi
    # FKR108: Exception for 'ㄴㄱ' = n'g
    for fkr_i in range(73, 109):
        _bk = rom
        rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
        if _bk != rom:
            _fkr_log(fkr_i)
            logger.debug(f"FKR{fkr_i} substitution: {rom} (was: {_bk})")

    logger.debug(f"Coded romanization after replacements: {rom}")

    # FKR109: Convert everything else
    _fkr_log(109)
    for pos, data in KCONF["fkr109"].items():
        rom = _replace_map(rom, data)

    # FKR110: Convert leftover separator symbols
    rom = _replace_map(rom, {EOP: "", EOT: "", EOD: ""})

    if non_kor > 0:
        logger.debug(f"Non-Korean part: {orig[:non_kor]}")
        # Modified from K-Romanizer:1727 in that it does not append a hyphen
        # if the whole word is non-Korean or if the last non-Korean character
        # is a punctuation symbol.
        if orig[non_kor - 1] in ALL_PUNCT_STR:
            rom = f"{orig[:non_kor]}{rom}"
        elif len(rom):
            rom = f"{orig[:non_kor]}-{rom}"
        else:
            rom = orig

    # FKR111: ㄹ + vowel/ㅎ/ㄹ; ["lr", "ll"] must be last in the array
    rom = _replace_map(rom, KCONF["fkr111"])

    # FKR112: Exceptions to initial sound law
    is_non_kor = False
    # FKR113: Check loan words by the first 1 letter
    # FKR114: Check loan words by the first 2 letters
    # FKR115: Check loan words by the first 3 letters
    if orig.startswith(tuple(KCONF["fkr113-115"])):
        is_non_kor = True

    # FKR116: Exceptions to initial sound law - particles
    is_particle = False
    if orig.startswith(tuple(KCONF["fkr116"]["particles"])):
        is_particle = True

    if len(orig) > 1 and not is_non_kor and not is_particle:
        if rom.startswith(tuple(KCONF["fkr116"]["replace_initials"].keys())):
            rom = _replace_map(rom, KCONF["fkr116"]["replace_initials"])

    # FKR117: Proper names. _StringPoper does not work because of breves.
    if (
            # FKR118
            orig in KCONF["fkr118"] or
            # FKR119
            orig in KCONF["fkr119"]["word"] or
            (
                orig[:-1] in KCONF["fkr119"]["word"] and
                orig.endswith(tuple(KCONF["fkr119"]["suffix"]))
            ) or
            # FKR120
            orig in KCONF["fkr120"]):
        rom = rom[0].upper() + rom[1:]

    # FKR121: Loan words beginning with L
    if f" {orig} " in KCONF["fkr121"]:
        rom = _replace_map(rom[0], {"R": "L", "r": "l"}) + rom[1:]

    # @TODO Move this to a generic normalization step (not only for K).
    rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})

    logger.debug(f"Romanized token: {rom}")

    return rom

def _marc8_hancha(data):
    # FKR142: Chinese character list
    _fkr_log(142)

    return _replace_map(data, KCONF["fkr142"])

def _hancha2hangul(data):
    data = " " + data.replace("\n", "\n ")

    # FKR143: Process exceptions first
    # FKR144: Apply initial sound law (except: 列, 烈, 裂, 劣)
    # FKR145: Simplified characters, variants
    # FKR146: Some characters from expanded list
    # FKR147: Chinese characters 1-500 車=차
    # FKR148: Chinese characters 501-750 串=관
    # FKR149: Chinese characters 751-1000 金=금, 娘=랑
    # FKR150: Chinese characters 1001-1250
    # FKR151: Chinese characters 1251-1500, except: 列, 烈, 裂, 劣
    # FKR152: Chinese characters 1501-1750, except: 律, 率, 栗, 慄
    # FKR153: Chinese characters 1751-2000
    # FKR154: Chinese characters 2001-2250, except: 不
    # FKR155: Chinese characters 2251-2500 塞=색
    # FKR156: Chinese characters 2501-2750
    # FKR157: Chinese characters 2751-3000
    # FKR158: Chinese characters 3001-3250
    # FKR159: Chinese characters 3251-3500
    # FKR160: Chinese characters 3501-3750
    # FKR161: Chinese characters 3751-4000
    # FKR162: Chinese characters 4001-4250
    # FKR163: Chinese characters 4251-4500
    # FKR164: Chinese characters 4501-4750
    # FKR165: Chinese characters 4751-5000
    # FKR166: Chinese characters 5001-5250
    # FKR167: Chinese characters 5251-5500
    # FKR168: Chinese characters 5501-5750
    # FKR169: Chinese characters 5751-5978
    # FKR170: Japanese Chinese characters
    for i in range(143, 171):
        _fkr_log(i)
        data = _replace_map(data, KCONF[f"fkr{i}"])

    # FKR171: Pronunciation handling of 不 (부/불)
    # Write down indices of occurrences of "不".
    idx = [i for i, item in enumerate(data) if item == "不"]
    for i in idx:
        val = ord(data[i + 1])
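        # Code points 45796-46383 are syllables with initial ㄷ and
        # 51088-51675 those with initial ㅈ: 不 reads 부 before ㄷ/ㅈ,
        # otherwise 불.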
        if (val > 45795 and val < 46384) or (val > 51087 and val < 51676):
            data = data.replace("不", "부", 1)
        else:
            data = data.replace("不", "불", 1)

    # FKR172: Pronunciation handling of 列 (렬/열)
    # FKR173: Pronunciation handling of 烈 (렬/열)
    # FKR174: Pronunciation handling of 裂 (렬/열)
    # FKR175: Pronunciation handling of 劣 (렬/열)
    for char in KCONF["fkr172-175"]:
        idx = [i for i, item in enumerate(data) if item == char]
        for i in idx:
            val = ord(data[i - 1])
            coda_value = (val - CP_MIN) % 28
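            # A coda value of 0 means no final consonant and 4 means ㄴ;
            # val < 100 catches a preceding ASCII character (e.g. the padding
            # space added above). The same test is reused for the 률/율 group
            # below.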
            if coda_value == 0 or coda_value == 4 or val < 100:
                data = data.replace(char, "열", 1)
            else:
                data = data.replace(char, "렬", 1)

    # FKR176: Pronunciation handling of 律 (률/율)
    # FKR177: Pronunciation handling of 率 (률/율)
    # FKR178: Pronunciation handling of 慄 (률/율)
    # FKR179: Pronunciation handling of 栗 (률/율)
    for char in KCONF["fkr176-179"]:
        idx = [i for i, item in enumerate(data) if item == char]
        for i in idx:
            val = ord(data[i - 1])
            coda_value = (val - CP_MIN) % 28
            if coda_value == 0 or coda_value == 4 or val < 100:
                data = data.replace(char, "율", 1)
            else:
                data = data.replace(char, "률", 1)

    # FKR180: Katakana
    _fkr_log(180)
    data = _replace_map(data, KCONF["fkr180"])

    return re.sub(r"\s{2,}", " ", data.strip())

def _replace_map(src, rmap, *args, **kw):
    """ Replace occurrences in a string according to a map. """
    for k, v in rmap.items():
        src = src.replace(k, v, *args, **kw)

    return src

def _kor_fname_rom(fname):
    rom_ls = []
    cpoints = tuple(ord(c) for c in fname)
    for i in range(len(fname)):
        cp = cpoints[i] - CP_MIN
        ini = INI + str(cp // 588)
        med = MED + str((cp // 28) % 21)
        fin = FIN + str(cp % 28)
        rom_ls.append(EOP.join((ini, med, fin)))
    rom = EOT.join(rom_ls) + EOD
    logger.debug(f"Encoded first name: {rom}")

    # FKR011: Check native Korean name, by coda
    native_by_fin = False
    for tok in KCONF["fkr011"]["nat_fin"]:
        if tok in rom:
            native_by_fin = True
            break

    j = k = False
    for tok in KCONF["fkr011"]["nat_ini"]:
        if tok in rom:
            j = True
            break
    for tok in KCONF["fkr011"]["sino_ini"]:
        if tok in fname:
            k = True
            break
    native_by_ini = j and not k

    # FKR012: Check native Korean name, by vowel & coda
    native_by_med = False
    for tok in KCONF["fkr011"]:
        if tok in rom:
            native_by_med = True
            break

    # FKR013: Check native Korean name, by ㅢ
    if f"{MED}19{EOP}" in rom:
        native_by_med = "의" not in fname and "희" not in fname

    # FKR014: Consonant assimilation ㄱ
    # FKR015: Consonant assimilation ㄲ
    # FKR016: Consonant assimilation ㄴ
    # FKR017: Consonant assimilation ㄷ
    # FKR018: Consonant assimilation ㄹ
    # FKR019: Consonant assimilation ㅁ
    # FKR020: Consonant assimilation ㅂ
    # FKR021: Consonant assimilation ㅅ
    # FKR022: Consonant assimilation ㅆ
    # FKR023: Consonant assimilation ㅇ
    # FKR024: Consonant assimilation ㅈ
    # FKR025: Consonant assimilation ㅊ
    # FKR026: Consonant assimilation ㅎ
    # FKR027: Final sound law
    # FKR028: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ): voiced + unvoiced
    # FKR029: Vocalization 2: unvoiced + voiced
    for i in range(14, 30):
        _fkr_log(i)
        rom = _replace_map(rom, KCONF[f"fkr{i:03}"])

    # FKR030: Convert everything else
    _fkr_log(30)
    for k, cmap in KCONF["fkr030"].items():
        logger.debug(f"Applying FKR030[\"{k}\"]")
        rom = _replace_map(rom, cmap)
    rom = _replace_map(rom.replace(EOP, ""), {"swi": "shwi", "Swi": "Shwi"}, 1)
    logger.debug(f"Partly romanized first name: {rom}")

    logger.debug(f"fname: {fname} ({len(fname)})")
    if len(fname) == 2:
        rom = _replace_map(rom, {EOT: "-", EOD: ""})
    else:
        rom = _replace_map(rom, {f"n{EOT}g": "n'g", EOT: "", EOD: ""})

    # FKR031: ㄹ + vowels/ㅎ/ㄹ; ["l-r", "l-l"] does not work, use alternative
    _fkr_log(31)
    for k, cmap in KCONF["fkr031"].items():
        logger.debug(f"Applying FKR031[\"{k}\"]")
        rom = _replace_map(rom, cmap)

    # FKR032: Capitalization
    _fkr_log(32)
    rom = rom[0].upper() + rom[1:]

    # FKR033: Remove hyphen in bisyllabic native Korean first name
    _fkr_log(33)
    if (
            len(fname) == 2
            and any((native_by_ini, native_by_fin, native_by_med))):
        _fkr_log(33)
        logger.debug("First name is native.")
        rom = _replace_map(rom, {"n-g": "n'g", "-": ""})

    # FKR034: First name, initial sound law
    if len(fname) > 1:
        _fkr_log(34)
        for k, v in KCONF["fkr034"].items():
            if rom.startswith(k):
                rom = rom.replace(k, v)

    return rom

def _kor_lname_rom(lname):
    if len(lname) == 2:
        # FKR181: 2-character names.
        _fkr_log(181)
        rom = _replace_map(lname, KCONF["fkr181"])
    else:
        # FKR182: 1-character Chinese names.
        _fkr_log(182)
        lname = _replace_map(lname, KCONF["fkr182"])
        # FKR183: 1-character names.
        _fkr_log(183)
        rom = _replace_map(lname, KCONF["fkr183"])
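
    # Returning False rather than "" signals that no surname table matched;
    # the caller treats such parts as unrecognized.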
    return rom if lname != rom else False


def _fkr_log(fkr_i):
    fkr_k = f"FKR{fkr_i:03}"
    logger.debug(f"Applying {fkr_k}: {FKR_IDX[fkr_k]}")
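

if __name__ == "__main__":
    # Minimal manual smoke test (an added illustration, not part of the hook
    # API). It assumes the scriptshifter package and its Korean configuration
    # data (KCONF, FKR_index.csv) are available; the sample strings are
    # arbitrary.
    logging.basicConfig(level=logging.INFO)
    demo_opts = {"hancha": True, "capitalize": "first"}
    rom, warnings = _romanize_nonames("한국 역사", demo_opts)
    print(f"Text romanization: {rom!r}, warnings: {warnings}")
    rom, warnings = _romanize_names("김정호", {"marc_field": "100"})
    print(f"Name romanization: {rom!r}, warnings: {warnings}")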