12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717 |
- # @package ext.korean
- #
- __doc__ = """
- Korean transcription functions.
- Ported from K-Romanizer: https://library.princeton.edu/eastasian/k-romanizer
- Only script-to-Roman is possible for Korean.
- Note that Korean Romanization must be done separately for strings containing
- only personal names and strings that do not contain personal names, due to
- ambiguities in the language. A non-deterministic approach using machine
- learning that separates words depending on context is being attempted by other
- parties, and it may be possible to eventually integrate such services here in
- the future, technology and licensing permitting. At the moment there are no
- such plans.
- Many thanks to Hyoungbae Lee for kindly providing the original K-Romanizer
- program and assistance in porting it to Python.
- """
- import logging
- import re
- from os import path
- from csv import reader
- from scriptshifter.exceptions import BREAK
- from scriptshifter.hooks.korean import KCONF
# Directory containing this module; the FKR index CSV is looked up here.
PWD = path.dirname(path.realpath(__file__))

# Code point of U+AC00, the first precomposed Hangul syllable. Syllables are
# decomposed into initial / medial / final indices relative to this value.
CP_MIN = 44032

# Build the FKR index for better logging: first CSV column -> third column.
# The encoding is given explicitly so that loading does not depend on the
# platform locale, and blank or short rows are skipped rather than raising
# IndexError at import time.
with open(
        path.join(PWD, "FKR_index.csv"),
        newline="", encoding="utf-8") as fh:
    csv = reader(fh)
    FKR_IDX = {row[0]: row[2] for row in csv if len(row) > 2}

logger = logging.getLogger(__name__)
def s2r_nonames_post_config(ctx):
    """
    Romanize a regular string NOT containing personal names.

    Reads ``ctx.src`` and ``ctx.options["capitalize"]``, stores the
    romanized text in ``ctx.dest`` and the warning list in ``ctx.warnings``,
    then returns ``BREAK``.
    """
    source = ctx.src
    cap_mode = ctx.options["capitalize"]

    romanized, warnings = _romanize_nonames(source, cap_mode)
    ctx.dest = romanized
    ctx.warnings = warnings

    return BREAK
def s2r_names_post_config(ctx):
    """
    Romanize a string containing ONLY Korean personal names.

    One or more names can be transcribed. A comma or middle dot (U+00B7) is
    to be used as separator for multiple names.

    Reads ``ctx.src``, stores the romanized text in ``ctx.dest`` and the
    warning list in ``ctx.warnings``, then returns ``BREAK``.
    """
    romanized, warnings = _romanize_names(ctx.src)
    ctx.dest = romanized
    ctx.warnings = warnings

    return BREAK
def _romanize_nonames(src, capitalize="first", hancha=True):
    """
    Main Romanization function for non-name strings.

    Args:
        src: Korean text to romanize.
        capitalize: ``"all"`` capitalizes every space-separated word,
            ``"first"`` capitalizes only the first character of the result;
            any other value leaves the case untouched.
        hancha: when true, Chinese characters are converted to Hangul before
            romanization.

    Returns:
        A 2-tuple ``(romanized string, list of warnings)``.
    """
    # FKR038: Convert Chinese characters to Hangul
    kor = _hancha2hangul(_marc8_hancha(src)) if hancha else src

    # Collapse every run of whitespace (ideographic spaces included) into a
    # single ASCII space, then pad both ends with one space.
    kor = re.sub(r"\s+", " ", kor)
    kor = f" {kor} "

    # FKR039: Replace Proper name with spaces in advance
    # FKR040: Replace Proper name with a hyphen in advance
    # FKR041: Romanize names of Hangul consonants
    for i in range(39, 42):
        _fkr_log(i)
        kor = _replace_map(kor, KCONF[f"fkr{i:03}"])

    # NOTE This is slightly different from LL 929-930 in that it doesn't
    # result in double spaces.
    kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")

    rom = _romanize_oclc_auto(kor)

    logger.debug(f"Before capitalization: {rom}")

    # The romanized string may be empty, in which case there is nothing to
    # capitalize and indexing its first character would raise IndexError.
    if rom:
        # FKR042: Capitalize all first letters
        if capitalize == "all":
            rom = _capitalize(rom)
        # FKR043: Capitalize the first letter
        elif capitalize == "first":
            rom = rom[0].upper() + rom[1:]

    # FKR044: Ambiguities
    ambi = re.sub("[,.\";: ]+", " ", rom)

    # TODO Decide what to do with these. There is no facility for outputting
    # warnings or notes to the user yet.
    warnings = []
    _fkr_log(45)
    for exp, warn in KCONF["fkr045"].items():
        if exp in ambi:
            # An empty warning text in the configuration means "report the
            # ambiguous string itself".
            warnings.append(ambi if warn == "" else warn)

    if rom:
        rom = rom.replace("kkk", "kk")

    return rom, warnings
def _romanize_names(src):
    """
    Main Romanization function for names.

    Separate and romanize multiple names separated by comma or middle dot.

    K-Romanizer: KorNameRom20

    Args:
        src: one or more Korean personal names. If the string contains a
            comma, it is split on commas; otherwise on middle dots.

    Returns:
        A 2-tuple ``(romanized names joined by ", ", list of warnings)``.
        Names that could not be romanized are left out of the string; the
        warning list explains why.
    """
    warnings = []

    if "," in src and "·" in src:
        warnings.append(
            "both commas and middle dots are being used to separate "
            "names. Only one of the two types should be used, or "
            "unexpected results may occur.")

    separator = "," if "," in src else "·"

    rom_ls = []
    for kor in src.split(separator):
        rom, name_warnings = _romanize_name(kor.strip())
        warnings.extend(name_warnings)
        # _romanize_name() returns None for names it does not recognize;
        # passing None to str.join() would raise TypeError.
        if rom is not None:
            rom_ls.append(rom)

    return ", ".join(rom_ls), warnings
def _romanize_name(src):
    """
    Romanize a single personal name.

    Args:
        src: one Korean personal name (Hangul and/or Chinese characters).

    Returns:
        A 2-tuple ``(romanized name, list of warnings)``. The first item is
        ``None`` when the string is not recognized as a name.
    """
    warnings = []

    # FKR001: Conversion, Family names in Chinese (dealing with 金 and 李)
    # FKR002: Family names, Initial sound law
    # Only the first matching rule is applied.
    for prefixes, replacement in KCONF["fkr001-002"]:
        if any(src.startswith(prefix) for prefix in prefixes):
            src = replacement + src[1:]
            break

    # FKR003: First name, Chinese Character Conversion
    src = _hancha2hangul(_marc8_hancha(src))

    if re.search("[a-zA-Z0-9]", src):
        warnings.append(f"{src} is not a recognized personal name.")
        return None, warnings

    # `parsed` can either be a modified Korean string with markers, or in case
    # of a foreign name, the final romanized name.
    parsed, parse_warnings = _parse_kor_name(
        re.sub(r"\s{2,}", " ", src.strip()))
    warnings.extend(parse_warnings)

    if not parsed:
        warnings.append(f"{src} is not a recognized Korean name.")
        return None, warnings

    if "~" not in parsed:
        warnings.append("Romanized as a foreign name.")
        return parsed, warnings

    # "~" separates family name(s) from the given name; "+" separates
    # multiple family names.
    lname, fname = parsed.split("~", 1)
    fname_rom = _kor_fname_rom(fname)

    lname_rom_ls = [_kor_lname_rom(n) for n in lname.split("+")]

    if not any(lname_rom_ls):
        warnings.append(f"{parsed} is not a recognized Korean name.")
        return None, warnings

    # _kor_lname_rom() returns False for parts it cannot romanize, which
    # includes the empty part left of a leading "+" marker. Those must be
    # dropped, or str.join() raises TypeError.
    lname_rom = " ".join(n for n in lname_rom_ls if n)

    # TODO add option for authoritative name ("Family, Given" form).
    rom = f"{lname_rom} {fname_rom}"

    return rom, warnings
def _parse_kor_name(src):
    """
    Mark the boundary between family name(s) and given name.

    Args:
        src: a name in Hangul. Callers pass it stripped and with whitespace
            runs collapsed.

    Returns:
        A 2-tuple ``(parsed, warnings)``. ``parsed`` is one of:

        - the name with "~" inserted between family and given name, and "+"
          marking family-name boundaries;
        - the output of ``_kor_corp_name_rom()`` (no "~" marker) when the
          string does not have the shape of a personal name;
        - ``None`` when no rule applies.
    """
    warnings = []

    # FKR004: Check first two characters. Two-syllable family name or not?
    two_syl_fname = any(src.startswith(ptn) for ptn in KCONF["fkr004"])

    src_len = len(src)

    # FKR005: Error if more than 7 syllables
    if src_len > 7 or src_len < 2 or " " in src[3:]:
        return _kor_corp_name_rom(src), warnings

    ct_spaces = src.count(" ")

    # FKR006: Error if more than 2 spaces
    if ct_spaces > 2:
        warnings.append("ERROR: not a name (too many spaces)")
        return None, warnings

    parsed = None

    # FKR007: 2 spaces (two family names)
    if ct_spaces == 2:
        parsed = src.replace(" ", "+", 1).replace(" ", "~", 1)

    elif ct_spaces == 1:
        # FKR008: 1 space (2nd position)
        if src[1] == " ":
            parsed = src.replace(" ", "~")
        # FKR009: 1 space (3rd position)
        # Sliced rather than indexed so that a 2-character string does not
        # raise IndexError.
        if src[2:3] == " " and two_syl_fname:
            parsed = "+" + src.replace(" ", "~")

    # FKR010: When there is no space
    elif src_len > 2 and two_syl_fname:
        # Keep BOTH syllables of the family name. The previous
        # `src[:1] + "~" + src[2:]` silently dropped the second one.
        parsed = src[:2] + "~" + src[2:]
    else:
        parsed = src[0] + "~" + src[1:]

    return parsed, warnings
def _kor_corp_name_rom(src):
    """
    Romanize a string that is not shaped like a personal name.

    A "(주)" or "(유)" marker at either end of the string is removed before
    romanization and re-attached afterwards as "(Chu)" or "(Yu)" on the same
    side. The remaining text is romanized token by token, capitalized, and
    passed through the FKR035 replacements.
    """
    # (Korean prefix, Korean suffix, romanized prefix, romanized suffix)
    markers = (
        ("(주) ", " (주)", "(Chu) ", " (Chu)"),
        ("(유) ", " (유)", "(Yu) ", " (Yu)"),
    )

    # Side on which each marker was found: "L", "R" or None.
    sides = []
    for kor_prefix, kor_suffix, _, _ in markers:
        side = None
        if src.startswith(kor_prefix):
            src = src[len(kor_prefix):]
            side = "L"
        if src.endswith(kor_suffix):
            src = src[:-len(kor_suffix)]
            side = "R"
        sides.append(side)

    rom = _capitalize(
        " ".join(_romanize_oclc_auto(tok) for tok in src.split(" ")))

    for side, (_, _, rom_prefix, rom_suffix) in zip(sides, markers):
        if side == "L":
            rom = rom_prefix + rom
        elif side == "R":
            rom = rom + rom_suffix

    # FKR035: Replace established names
    return _replace_map(rom, KCONF["fkr035"])
def _romanize_oclc_auto(kor):
    """
    Romanize a Korean string that contains no personal names.

    Symbols are pre-processed (FKR050, FKR052), each space-separated word is
    romanized with ``_kor_rom()``, and the rule sets FKR059 to FKR066 are
    then applied to the joined result. Returns the romanized string.
    """
    # FKR050: Starts preprocessing symbol
    _fkr_log(50)
    for label, mapping in KCONF["fkr050"].items():
        logger.debug(f"Applying fkr050[{label}]")
        kor = _replace_map(kor, mapping)

    # See https://github.com/lcnetdev/scriptshifter/issues/19
    kor = re.sub("제([0-9])", "제 \\1", kor)

    # FKR052: Replace Che+number
    _fkr_log(52)
    for label, mapping in KCONF["fkr052"].items():
        logger.debug(f"Applying fkr052[{label}]")
        kor = _replace_map(kor, mapping)

    # Strip end and multiple whitespace, then expand the glottal marker.
    kor = re.sub(r"\s{2,}", " ", kor.strip()).replace("^", " GLOTTAL ")

    logger.debug(f"Korean before romanization: {kor}")

    rom = " ".join(_kor_rom(word) for word in kor.split(" "))

    # FKR059: Apply glottalization
    padded = f" {rom.strip()} "
    rom = _replace_map(padded, {" GLOTTAL ": "", "*": "", "^": ""})

    # FKR060: Process number + -년/-년도/-년대
    # TODO Add leading whitespace as per L1221? L1202 already added one.
    rom = _replace_map(rom, KCONF["fkr060"])

    rom = re.sub(r"\s{2,}", " ", f" {rom.strip()} ")

    # FKR061: Jurisdiction (시)
    # FKR062: Historical place names
    # FKR063: Jurisdiction (국,도,군,구)
    # FKR064: Temple names of Kings, Queens, etc. (except 조/종)
    # FKR065: Frequent historical names
    for rule_no in range(61, 66):
        _fkr_log(rule_no)
        rom = _replace_map(rom, KCONF[f"fkr{rule_no:03}"])

    # FKR066: Starts restore symbols
    _fkr_log(66)
    for label, mapping in KCONF["fkr066"].items():
        logger.debug(f"Applying FKR066[{label}]")
        rom = _replace_map(rom, mapping)

    # Remove spaces from before punctuation signs.
    rom = re.sub(r" (?=[,.;:?!])", "", rom.strip())

    return re.sub(r"\s{2,}", " ", rom)
# FKR068: Exceptions, Exceptions to initial sound law, Proper names
def _kor_rom(kor):
    """
    Romanize a single word.

    Each Hangul syllable is decomposed into initial / medial / final indices
    relative to ``CP_MIN`` and encoded as ``i<n>#m<n>#f<n>``. Syllables are
    joined with "~" and the word is terminated with "E". The FKR069 to FKR121
    rule tables then rewrite that encoding into Roman letters.

    Args:
        kor: one word; the caller splits its input on spaces.

    Returns:
        The romanized word.
    """
    kor = re.sub(r"\s{2,}", " ", kor.strip())
    orig = kor

    # FKR069: Irregular sound change list
    kor = _replace_map(kor, KCONF["fkr069"])

    # FKR070: [n] insertion position mark +
    niun = kor.find("+")
    if niun > -1:
        kor = kor.replace("+", "")
        orig = kor

    # Count the code points below the Hangul syllable block and strip that
    # many characters from the front of the word.
    # NOTE(review): the count covers non-Korean characters anywhere in the
    # word, but the characters are always removed from the front; this
    # presumably expects them to occur only as a prefix. Confirm.
    non_kor = sum(1 for char in kor if ord(char) < CP_MIN)
    kor = kor[non_kor:]

    rom_ls = []
    for char in kor:
        cp = ord(char) - CP_MIN
        ini = "i" + str(cp // 588)
        med = "m" + str((cp // 28) % 21)
        fin = "f" + str(cp % 28)
        rom_ls.append("#".join((ini, med, fin)))
    rom = "~".join(rom_ls)
    if rom:
        rom = rom + "E"

    # FKR071: [n] insertion
    if niun > -1:
        niun_loc = rom.find("~")
        # Advance until the niun'th occurrence of ~
        # If niun is 0 or 1 the loop will be skipped.
        for _ in range(niun - 1):
            niun_loc = rom.find("~", niun_loc + 1)

        # Without a syllable boundary at the marked position, find() returns
        # -1; slicing with it would duplicate and corrupt the string, so the
        # insertion is skipped.
        if niun_loc > -1:
            rom_niun_a = rom[:niun_loc]
            rom_niun_b = rom[niun_loc + 1:]
            if re.match("i11#m(?:2|6|12|17|20)", rom_niun_b):
                _fkr_log(71)
                rom_niun_b = rom_niun_b.replace("i11#m", "i2#m", 1)

            # FKR072: [n]+[l] >[l] + [l]
            # NOTE(review): unlike FKR071, this replacement drops the "#"
            # separator ("i5#" -> "i2"). Confirm that this is intended.
            if rom_niun_b.startswith("i5#") and rom_niun_a.endswith("f4"):
                _fkr_log(72)
                rom_niun_b = rom_niun_b.replace("i5#", "i2", 1)

            rom = f"{rom_niun_a}~{rom_niun_b}"

    # FKR073: Palatalization: ㄷ+이,ㄷ+여,ㄷ+히,ㄷ+혀
    # FKR074: Palatalization: ㅌ+이,ㅌ+히,ㅌ+히,ㅌ+혀
    # FKR075: Consonant assimilation ㄱ
    # FKR076: Consonant assimilation ㄲ
    # FKR077: Consonant assimilation ㄳ : ㄱ,ㄴ,ㄹ,ㅁ,ㅇ
    # FKR078: Consonant assimilation ㄴ
    # FKR079: Consonant assimilation ㄵ: ㄱ,ㄴ,ㄷ,ㅈ
    # FKR080: Consonant assimilation ㄶ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR081: Consonant assimilation ㄷ
    # FKR082: Consonant assimilation ㄹ
    # FKR083: Consonant assimilation ㄺ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR084: Consonant assimilation ㄻ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR085: Consonant assimilation ㄼ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR086: Consonant assimilation ㄾ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR087: Consonant assimilation ㄿ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR088: Consonant assimilation ㅀ : ㄱ,ㄴ,ㄷ,ㅈ
    # FKR089: Consonant assimilation ㅁ
    # FKR090: Consonant assimilation ㅂ
    # FKR091: Consonant assimilation ㅄ
    # FKR092: Consonant assimilation ㅅ
    # FKR093: Consonant assimilation ㅆ
    # FKR094: Consonant assimilation ㅇ
    # FKR095: Consonant assimilation ㅈ
    # FKR096: Consonant assimilation ㅊ
    # FKR097: Consonant assimilation ㅋ
    # FKR098: Consonant assimilation ㅌ
    # FKR099: Consonant assimilation ㅍ
    # FKR100: Consonant assimilation ㅎ
    # FKR101: digraphic coda + ㅇ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,ㄽ,ㄾ,ㄿ,ㅀ
    # FKR102: digraphic coda + ㅎ: ㄵ,ㄶ,ㄺ,ㄻ,ㄼ,(ㄽ),ㄾ,ㄿ,ㅀ
    # FKR103: Vocalization 1 (ㄹ+ㄷ, ㄹ+ㅈ excluded) voiced + unvoiced
    # FKR104: Vocalization 2 (ㄹ+ㄷ, ㄹ+ㅈ excluded) unvoiced + voiced
    # FKR105: Vocalization 3 (ㄹ+ㄷ, ㄹ+ㅈ)
    # FKR106: Final sound law
    # FKR107: Exception for '쉬' = shi
    # FKR108: Exception for 'ㄴㄱ'= n'g
    for fkr_i in range(73, 109):
        _fkr_log(fkr_i)
        _bk = rom
        rom = _replace_map(rom, KCONF[f"fkr{fkr_i:03}"])
        if _bk != rom:
            logger.debug(f"FKR{fkr_i} substitution: {rom} (was: {_bk})")

    # FKR109: Convert everything else
    _fkr_log(109)
    for data in KCONF["fkr109"].values():
        rom = _replace_map(rom, data)

    # FKR110: Convert symbols
    rom = _replace_map(rom, {"#": "", "~": ""})

    if non_kor > 0:
        # Modified from K-Romanizer:1727 in that it does not append a hyphen
        # if the whole word is non-Korean.
        rom = f"{orig[:non_kor]}-{rom}" if rom else orig

    # FKR111: ㄹ + vowel/ㅎ/ㄹ, ["lr","ll"] must be in the last of the array
    rom = _replace_map(rom, KCONF["fkr111"])

    # FKR112: Exceptions to initial sound law
    # FKR113: Check loan words by the first 1 letter
    # FKR114: Check loan words by the first 2 letters
    # FKR115: Check loan words by the first 3 letters
    is_non_kor = orig.startswith(tuple(KCONF["fkr113-115"]))

    # FKR116: Exceptions to initial sound law - particles
    is_particle = orig.startswith(tuple(KCONF["fkr116"]["particles"]))

    if len(orig) > 1 and not is_non_kor and not is_particle:
        initials = KCONF["fkr116"]["replace_initials"]
        if rom.startswith(tuple(initials.keys())):
            # NOTE(review): this rewrites every occurrence of the keys, not
            # only the initial one. Confirm that this is intended.
            rom = _replace_map(rom, initials)

    # FKR117: Proper names _StringPoper Does not work because of breves
    if (
            # FKR118
            orig in KCONF["fkr118"] or
            # FKR119
            orig in KCONF["fkr119"]["word"] or
            (
                orig[:-1] in KCONF["fkr119"]["word"] and
                orig.endswith(tuple(KCONF["fkr119"]["suffix"]))
            ) or
            # FKR120
            orig.endswith(tuple(KCONF["fkr120"]))):
        # Sliced rather than indexed so that an empty result does not raise
        # IndexError.
        rom = rom[:1].upper() + rom[1:]

    # FKR121: Loan words beginning with L
    if f" {orig} " in KCONF["fkr121"]:
        rom = _replace_map(rom[:1], {"R": "L", "r": "l"}) + rom[1:]

    # @TODO Move this to a generic normalization step (not only for K)
    rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})

    return rom
def _marc8_hancha(data):
    """ Apply the FKR142 character replacement table to `data`. """
    # FKR142: Chinese character list
    _fkr_log(142)
    hancha_map = KCONF["fkr142"]

    return _replace_map(data, hancha_map)
def _hancha2hangul(data):
    """
    Convert Chinese characters (Hancha) in a string to Hangul.

    The FKR143 to FKR170 tables are applied first. Characters whose reading
    depends on a neighbouring character (不 and the FKR172-179 list) are then
    resolved one occurrence at a time, and finally the FKR180 (Katakana)
    table is applied.

    Args:
        data: text that may contain Chinese characters.

    Returns:
        The converted text, stripped, with whitespace runs collapsed to a
        single space.
    """
    # Prepend a space to the string and to every line in it.
    data = " " + data.replace("\n", "\n ")

    # FKR143: Process exceptions first
    # FKR144: Apply initial sound law (Except: 列, 烈, 裂, 劣)
    # FKR145: Simplified characters, variants
    # FKR146: Some characters from expanded list
    # FKR147: Chinese characters 1-500 車=차
    # FKR148: Chinese characters 501-750 串=관
    # FKR149: Chinese characters 751-1000 金=금, 娘=랑
    # FKR150: Chinese characters 1001-1250
    # FKR151: Chinese characters 1251-1500 except: 列, 烈, 裂, 劣
    # FKR152: Chinese characters 1501-1750 except: 律, 率, 栗, 慄
    # FKR153: Chinese characters 1751-2000
    # FKR154: 不, Chinese characters 2001-2250 except: 不
    # FKR155: Chinese characters 2251-2500 塞=색
    # FKR156: Chinese characters 2501-2750
    # FKR157: Chinese characters 2751-3000
    # FKR158: Chinese characters 3001-3250
    # FKR159: Chinese characters 3251-3500
    # FKR160: Chinese characters 3501-3750
    # FKR161: Chinese characters 3751-4000
    # FKR162: Chinese characters 4001-4250
    # FKR163: Chinese characters 4251-4500
    # FKR164: Chinese characters 4501-4750
    # FKR165: Chinese characters 4751-5000
    # FKR166: Chinese characters 5001-5250
    # FKR167: Chinese characters 5251-5500
    # FKR168: Chinese characters 5501-5750
    # FKR169: Chinese characters 5751-5978
    # FKR170: Japanese Chinese characters
    for i in range(143, 171):
        _fkr_log(i)
        data = _replace_map(data, KCONF[f"fkr{i}"])

    # FKR171: Pronunciation of 不 (부)
    # Write down indices of occurrences of "不". Every replacement below is
    # one character for one character, so the indices stay valid.
    idx = [i for i, item in enumerate(data) if item == "不"]
    for i in idx:
        # A character at the very end of the string has no successor; it is
        # treated as if followed by a space instead of raising IndexError.
        val = ord(data[i + 1:i + 2] or " ")
        # 45796-46383 and 51088-51675 are the syllables with initial ㄷ and
        # initial ㅈ respectively.
        if (val > 45795 and val < 46384) or (val > 51087 and val < 51676):
            data = data.replace("不", "부", 1)
        else:
            data = data.replace("不", "불", 1)

    # FKR172: Pronunciation of 列 (렬)
    # FKR173: Pronunciation of 烈 (렬)
    # FKR174: Pronunciation of 裂 (렬)
    # FKR175: Pronunciation of 劣 (렬)
    # FKR176: Pronunciation of 律 (률)
    # FKR177: Pronunciation of 率 (률)
    # FKR178: Pronunciation of 慄 (률)
    # FKR179: Pronunciation of 栗 (률)
    # NOTE(review): FKR176-179 are described as 률 readings, yet every
    # character in the list is replaced with 열/렬. The test below also
    # inspects the FOLLOWING character. Confirm both against K-Romanizer.
    for char in KCONF["fkr172-179"]:
        idx = [i for i, item in enumerate(data) if item == char]
        for i in idx:
            # Same end-of-string handling as for 不 above.
            val = ord(data[i + 1:i + 2] or " ")
            coda_value = (val - CP_MIN) % 28
            if coda_value == 1 or coda_value == 4 or val < 100:  # TODO verify
                data = data.replace(char, "열", 1)
            else:
                data = data.replace(char, "렬", 1)

    # FKR180: Katakana
    _fkr_log(180)
    data = _replace_map(data, KCONF["fkr180"])

    return re.sub(r"\s{2,}", " ", data.strip())
- def _replace_map(src, rmap, *args, **kw):
- """ Replace occurrences in a string according to a map. """
- for k, v in rmap.items():
- src = src.replace(k, v, *args, **kw)
- return src
def _kor_fname_rom(fname):
    """
    Romanize the given-name part of a Korean personal name.

    Each syllable is decomposed into initial / medial / final indices
    relative to ``CP_MIN`` and encoded as ``i<n>#m<n>#f<n>``; the FKR011 to
    FKR034 rules then decide whether the name looks native Korean or
    Sino-Korean and rewrite the encoding into Roman letters.

    Args:
        fname: given name in Hangul.

    Returns:
        The romanized given name, with its first letter capitalized.
    """
    rom_ls = []
    for char in fname:
        cp = ord(char) - CP_MIN
        ini = "i" + str(cp // 588)
        med = "m" + str((cp // 28) % 21)
        fin = "f" + str(cp % 28)
        rom_ls.append("#".join((ini, med, fin)))
    rom = "~".join(rom_ls) + "E"

    # FKR011: Check native Korean name, by coda
    fkr011 = KCONF["fkr011"]
    origin_by_fin = (
        "native" if any(tok in rom for tok in fkr011["nat_fin"]) else "sino")

    # A native initial only counts when no Sino-Korean initial is present.
    has_nat_ini = any(tok in rom for tok in fkr011["nat_ini"])
    has_sino_ini = any(tok in rom for tok in fkr011["sino_ini"])
    origin_by_ini = "native" if has_nat_ini and not has_sino_ini else "sino"

    # FKR012: Check native Korean name, by vowel & coda
    # NOTE(review): this iterates KCONF["fkr011"] itself, i.e. the same
    # object that holds the "nat_fin" / "nat_ini" / "sino_ini" entries above,
    # not a separate FKR012 token list. It looks like a copy-paste slip;
    # confirm which configuration key was intended. Behavior is unchanged.
    origin_by_med = (
        "native" if any(tok in rom for tok in KCONF["fkr011"]) else "sino")

    # FKR013: Check native Korean name, by ㅢ
    if "m19#" in rom:
        if "의" in fname or "희" in fname:
            origin_by_med = "sino"
        else:
            origin_by_med = "native"

    # FKR014: Consonant assimilation ㄱ
    # FKR015: Consonant assimilation ㄲ
    # FKR016: Consonant assimilation ㄴ
    # FKR017: Consonant assimilation ㄷ
    # FKR018: Consonant assimilation ㄹ
    # FKR019: Consonant assimilation ㅁ
    # FKR020: Consonant assimilation ㅂ
    # FKR021: Consonant assimilation ㅅ
    # FKR022: Consonant assimilation ㅆ
    # FKR023: Consonant assimilation ㅇ
    # FKR024: Consonant assimilation ㅈ
    # FKR025: Consonant assimilation ㅊ
    # FKR026: Consonant assimilation ㅎ
    # FKR027: Final sound law
    # FKR028: Vocalization 1 (except ㄹ+ㄷ, ㄹ+ㅈ): voiced+unvoiced
    # FKR029: Vocalization 2 unvoiced+voiced
    for i in range(14, 30):
        _fkr_log(i)
        rom = _replace_map(rom, KCONF[f"fkr{i:03}"])

    # FKR030: Convert everything else
    _fkr_log(30)
    for k, cmap in KCONF["fkr030"].items():
        logger.debug(f"Applying FKR030[\"{k}\"]")
        rom = _replace_map(rom, cmap)

    rom = _replace_map(rom.replace("#", ""), {"swi": "shwi", "Swi": "Shwi"}, 1)

    # Two-syllable names are hyphenated; in other names the syllable
    # separators are removed.
    if len(fname) == 2:
        rom = rom.replace("~", "-")
    else:
        rom = _replace_map(rom, {"n~g": "n'g", "~": ""})

    # FKR031: ㄹ + vowels/ㅎ/ㄹ ["l-r","l-l"] does not work USE alternative
    _fkr_log(31)
    for k, cmap in KCONF["fkr031"].items():
        logger.debug(f"Applying FKR031[\"{k}\"]")
        rom = _replace_map(rom, cmap)

    # FKR032: Capitalization
    # Sliced rather than indexed so that an empty result does not raise
    # IndexError.
    rom = rom[:1].upper() + rom[1:]

    # FKR033: Remove hyphen in bisyllabic native Korean first name
    if (
            len(fname) == 2
            and "native" in (origin_by_ini, origin_by_fin, origin_by_med)):
        rom = _replace_map(rom, {"n-g": "n'g", "-": ""})

    # FKR034: First name, initial sound law
    # Only the leading occurrence is rewritten; str.replace() would also
    # change the same letters further into the name.
    for k, v in KCONF["fkr034"].items():
        if rom.startswith(k):
            rom = v + rom[len(k):]

    return rom
def _kor_lname_rom(lname):
    """
    Romanize one family name.

    Two-character names go through the FKR181 table; all others go through
    FKR182 followed by FKR183. Returns the romanized name, or False when the
    last table applied left the string unchanged.
    """
    if len(lname) != 2:
        # FKR182: 1-character Chinese names.
        _fkr_log(182)
        lname = _replace_map(lname, KCONF["fkr182"])
        # FKR183: 1-character names.
        _fkr_log(183)
        rom = _replace_map(lname, KCONF["fkr183"])
    else:
        # FKR181: 2-character names.
        _fkr_log(181)
        rom = _replace_map(lname, KCONF["fkr181"])

    if rom == lname:
        return False

    return rom
- def _capitalize(src):
- """ Only capitalize first word and words preceded by space."""
- orig_ls = src.split(" ")
- cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
- return " ".join(cap_ls)
def _fkr_log(fkr_i):
    """
    Log at DEBUG level that a numbered FKR rule is being applied.

    Args:
        fkr_i: rule number; it is zero-padded to three digits to build the
            ``FKRnnn`` key looked up in ``FKR_IDX``.
    """
    fkr_k = f"FKR{fkr_i:03}"
    # This is a logging aid only: a rule missing from the index must not
    # abort the transliteration with a KeyError.
    label = FKR_IDX.get(fkr_k, "<no description>")
    logger.debug("Applying %s: %s", fkr_k, label)
|