12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717 |
- __doc__ = """
- Korean transcription functions.
- Ported from K-Romanizer: https://library.princeton.edu/eastasian/k-romanizer
- Only script-to-Roman is possible for Korean.
- Note that Korean Romanization must be done separately for strings containing
- only personal names and strings that do not contain personal names, due to
- ambiguities in the language. A non-deterministic approach using machine
- learning that separates words depending on context is being attempted by other
- parties, and it may be possible to eventually integrate such services here in
- the future, technology and licensing permitting. At the moment there are no
- such plans.
- Many thanks to Hyoungbae Lee for kindly providing the original K-Romanizer
- program and assistance in porting it to Python.
- """
- import logging
- import re
- from os import path
- from csv import reader
- from scriptshifter.exceptions import BREAK
- from scriptshifter.hooks.korean import KCONF
# Directory containing this module; data files are resolved relative to it.
PWD = path.dirname(path.realpath(__file__))

# 44032 == 0xAC00: code points are rebased on this value before being split
# into initial / medial / final indices elsewhere in this module.
CP_MIN = 44032

# Index of rule identifiers (first CSV column) to the text in the third CSV
# column; `_fkr_log` reads it to label debug messages.
# The encoding is stated explicitly so that loading the file does not depend
# on the locale default of the host platform.
with open(
        path.join(PWD, "FKR_index.csv"), newline="", encoding="utf-8") as fh:
    csv = reader(fh)
    FKR_IDX = {row[0]: row[2] for row in csv}

logger = logging.getLogger(__name__)
def s2r_nonames_post_config(ctx):
    """
    Hook: romanize a regular string that does NOT contain personal names.

    Reads `ctx.src` and `ctx.options["capitalize"]`, stores the romanized
    text in `ctx.dest` and the collected warnings in `ctx.warnings`, then
    returns `BREAK`.
    """
    source = ctx.src
    cap_mode = ctx.options["capitalize"]

    romanized, warnings = _romanize_nonames(source, cap_mode)
    ctx.dest = romanized
    ctx.warnings = warnings

    return BREAK
def s2r_names_post_config(ctx):
    """
    Hook: romanize a string made up ONLY of Korean personal names.

    Several names may be given at once, separated by a comma or a middle dot
    (U+00B7). The romanized text is stored in `ctx.dest`, the collected
    warnings in `ctx.warnings`, and `BREAK` is returned.
    """
    romanized, warnings = _romanize_names(ctx.src)
    ctx.dest = romanized
    ctx.warnings = warnings

    return BREAK
def _romanize_nonames(src, capitalize="first", hancha=True):
    """
    Main Romanization function for non-name strings.

    Args:
        src: source string.
        capitalize: "all" capitalizes every space-separated word, "first"
            capitalizes only the first character of the result; any other
            value leaves the case untouched.
        hancha: when true, the source is passed through `_marc8_hancha` and
            `_hancha2hangul` before romanization.

    Returns:
        A `(romanized, warnings)` tuple. `romanized` may be an empty string
        (e.g. for empty or whitespace-only input); `warnings` is a list of
        strings.
    """
    kor = _hancha2hangul(_marc8_hancha(src)) if hancha else src

    # Collapse every whitespace run to one ASCII space and pad both ends, so
    # that the replacement tables can match on surrounding spaces.
    kor = re.sub(r"\s+", " ", kor)
    kor = f" {kor} "

    # Apply rules FKR039 through FKR041 in order.
    for i in range(39, 42):
        _fkr_log(i)
        kor = _replace_map(kor, KCONF[f"fkr{i:03}"])

    # Line breaks may have been reintroduced by the replacements above.
    kor = kor.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")

    rom = _romanize_oclc_auto(kor)
    logger.debug(f"Before capitalization: {rom}")

    # An empty result has nothing to capitalize; indexing it would raise
    # IndexError.
    if rom:
        if capitalize == "all":
            rom = _capitalize(rom)
        elif capitalize == "first":
            rom = rom[0].upper() + rom[1:]

    # Punctuation-free copy used only to look up the FKR045 entries.
    ambi = re.sub("[,.\";: ]+", " ", rom)

    warnings = []
    _fkr_log(45)
    for exp, warn in KCONF["fkr045"].items():
        if exp in ambi:
            # An entry with an empty message reports the matched text itself.
            warnings.append(ambi if warn == "" else warn)

    rom = rom.replace("kkk", "kk")

    return rom, warnings
def _romanize_names(src):
    """
    Main Romanization function for names.

    Separate and romanize multiple names separated by comma or middle dot.
    If the string contains a comma, commas are used as the separator;
    otherwise the middle dot is used.

    K-Romanizer: KorNameRom20

    Returns:
        A `(romanized, warnings)` tuple. `romanized` is the comma-joined
        list of the names that could be romanized; names that could not are
        left out and are reported through `warnings` only.
    """
    rom_ls = []
    warnings = []

    if "," in src and "·" in src:
        warnings.append(
            "both commas and middle dots are being used to separate "
            "names. Only one of the two types should be used, or "
            "unexpected results may occur.")

    kor_ls = src.split(",") if "," in src else src.split("·")
    for kor in kor_ls:
        rom, _warnings = _romanize_name(kor.strip())
        warnings.extend(_warnings)
        # `_romanize_name` returns None for names it does not recognize;
        # joining None into the result would raise TypeError.
        if rom is not None:
            rom_ls.append(rom)

    return ", ".join(rom_ls), warnings
def _romanize_name(src):
    """
    Romanize one personal name.

    Returns:
        A `(romanized, warnings)` tuple. `romanized` is None when the input
        contains ASCII letters or digits, or when it cannot be parsed or
        matched as a Korean name; the reason is appended to `warnings`.
    """
    warnings = []

    # FKR001-002: each entry pairs a group of prefixes with a replacement.
    # Only the first matching prefix of the first matching entry is applied,
    # and it replaces the first character of the name.
    replaced = False
    for ss, r in KCONF["fkr001-002"]:
        if replaced:
            break
        for s in ss:
            if src.startswith(s):
                src = r + src[1:]
                replaced = True
                break

    src = _hancha2hangul(_marc8_hancha(src))

    if re.search("[a-zA-Z0-9]", src):
        warnings.append(f"{src} is not a recognized personal name.")
        return None, warnings

    # `parsed` is either the Korean string with "+" / "~" markers inserted,
    # or (when no "~" is present) an already romanized string.
    parsed, _warnings = _parse_kor_name(re.sub(r"\s{2,}", " ", src.strip()))
    warnings.extend(_warnings)

    if not parsed:
        warnings.append(f"{src} is not a recognized Korean name.")
        return None, warnings

    if "~" not in parsed:
        warnings.append("Romanized as a foreign name.")
        return parsed, warnings

    lname, fname = parsed.split("~", 1)
    fname_rom = _kor_fname_rom(fname)

    # `_kor_lname_rom` returns False for parts it cannot match (including
    # the empty part produced by a leading "+"). Those are dropped here,
    # because joining False into the result would raise TypeError.
    lname_rom_ls = [
        part_rom
        for part_rom in (_kor_lname_rom(n) for n in lname.split("+"))
        if part_rom
    ]

    if not lname_rom_ls:
        warnings.append(f"{parsed} is not a recognized Korean name.")
        return None, warnings

    lname_rom = " ".join(lname_rom_ls)

    return f"{lname_rom} {fname_rom}", warnings
def _parse_kor_name(src):
    """
    Mark the boundaries between the parts of a personal name.

    In the marked string, "~" separates the leading name part(s) from the
    rest and "+" separates (or precedes) the leading parts. Strings shorter
    than 2 or longer than 7 characters, or with a space after the third
    character, are not treated as personal names and are handed to
    `_kor_corp_name_rom` instead.

    Returns:
        A `(parsed, warnings)` tuple. `parsed` is the marked string, the
        output of `_kor_corp_name_rom`, or None when no rule applies.
    """
    parsed = None
    warnings = []

    # FKR004: does the name start with one of the listed two-character
    # prefixes?
    two_syl_lname = any(src.startswith(ptn) for ptn in KCONF["fkr004"])

    src_len = len(src)

    if src_len > 7 or src_len < 2 or " " in src[3:]:
        return _kor_corp_name_rom(src), warnings

    ct_spaces = src.count(" ")

    if ct_spaces > 2:
        warnings.append("ERROR: not a name (too many spaces)")
        return parsed, warnings

    if ct_spaces == 2:
        # First space separates two leading parts, second one ends them.
        parsed = src.replace(" ", "+", 1).replace(" ", "~", 1)

    elif ct_spaces == 1:
        # Slices instead of indices: they cannot raise on short input.
        if src[1:2] == " ":
            parsed = src.replace(" ", "~")

        if src[2:3] == " " and two_syl_lname:
            parsed = "+" + src.replace(" ", "~")

    else:
        if src_len > 2 and two_syl_lname:
            # Keep both characters of the two-character prefix. The previous
            # `src[:1]` slice dropped the character at index 1.
            parsed = src[:2] + "~" + src[2:]
        else:
            # `src_len` is at least 2 here (see the early return above).
            parsed = src[0] + "~" + src[1:]

    return parsed, warnings
def _kor_corp_name_rom(src):
    """
    Romanize a string that `_parse_kor_name` did not accept as a personal
    name.

    A leading "(주) " / "(유) " or trailing " (주)" / " (유)" marker is cut
    off before romanization and put back on the same side as "(Chu)" /
    "(Yu)". Each space-separated token is romanized separately, every word
    is capitalized, and the FKR035 replacements are applied last.
    """
    # Side on which each marker was found: "L", "R", or None if absent.
    chu_side = None
    yu_side = None

    if src.startswith("(주) "):
        src = src[4:]
        chu_side = "L"
    if src.endswith(" (주)"):
        src = src[:-4]
        chu_side = "R"
    if src.startswith("(유) "):
        src = src[4:]
        yu_side = "L"
    if src.endswith(" (유)"):
        src = src[:-4]
        yu_side = "R"

    rom_tokens = [_romanize_oclc_auto(token) for token in src.split(" ")]
    rom = _capitalize(" ".join(rom_tokens))

    if chu_side == "L":
        rom = f"(Chu) {rom}"
    elif chu_side == "R":
        rom = f"{rom} (Chu)"

    if yu_side == "L":
        rom = f"(Yu) {rom}"
    elif yu_side == "R":
        rom = f"{rom} (Yu)"

    return _replace_map(rom, KCONF["fkr035"])
def _romanize_oclc_auto(kor):
    """
    Romanize a whole string word by word.

    Pre-processing tables (FKR050, FKR052) are applied to the Korean text,
    each space-separated word is romanized with `_kor_rom`, and the
    post-processing tables (FKR060 through FKR066) are applied to the
    romanized text. The result is stripped and has no doubled spaces.
    """
    _fkr_log(50)
    for rule_name, table in KCONF["fkr050"].items():
        logger.debug(f"Applying fkr050[{rule_name}]")
        kor = _replace_map(kor, table)

    # Detach a digit that directly follows 제.
    kor = re.sub("제([0-9])", "제 \\1", kor)

    _fkr_log(52)
    for rule_name, table in KCONF["fkr052"].items():
        logger.debug(f"Applying fkr052[{rule_name}]")
        kor = _replace_map(kor, table)

    # Trim and collapse whitespace, then turn "^" into a standalone marker
    # word that travels through the per-word romanization.
    kor = re.sub(r"\s{2,}", " ", kor.strip())
    kor = kor.replace("^", " GLOTTAL ")
    logger.debug(f"Korean before romanization: {kor}")

    rom = " ".join([_kor_rom(word) for word in kor.split(" ")])

    # Remove the marker word and the leftover "*" and "^" symbols.
    padded = f" {rom.strip()} "
    rom = _replace_map(padded, {" GLOTTAL ": "", "*": "", "^": ""})

    rom = _replace_map(rom, KCONF["fkr060"])
    rom = re.sub(r"\s{2,}", " ", f" {rom.strip()} ")

    # Apply rules FKR061 through FKR065 in order.
    for rule_no in range(61, 66):
        _fkr_log(rule_no)
        rom = _replace_map(rom, KCONF[f"fkr{rule_no:03}"])

    _fkr_log(66)
    for rule_name, table in KCONF["fkr066"].items():
        logger.debug(f"Applying FKR066[{rule_name}]")
        rom = _replace_map(rom, table)

    # Drop the space in front of punctuation, then collapse doubled spaces.
    rom = re.sub(r" (?=[,.;:?!])", "", rom.strip())
    return re.sub(r"\s{2,}", " ", rom)
def _kor_rom(kor):
    """
    Romanize a single word.

    The word is rewritten into an intermediate notation in which every
    character becomes `i<initial>#m<medial>#f<final>` (indices obtained by
    rebasing the code point on `CP_MIN`), characters are joined by "~" and
    the word is closed by "E". The FKR rule tables then rewrite that
    notation into Latin text. Characters below `CP_MIN` are counted, and
    that many leading characters of the word are kept verbatim in front of
    the romanized remainder, separated by a hyphen.
    """
    kor = re.sub(r"\s{2,}", " ", kor.strip())
    orig = kor

    kor = _replace_map(kor, KCONF["fkr069"])

    # A "+" marks a position handled by FKR071/FKR072 below. Remember where
    # the first one was, then remove all of them.
    niun = kor.find("+")
    has_niun = niun > -1
    if has_niun:
        kor = kor.replace("+", "")
        orig = kor

    # One leading character is dropped for every character below CP_MIN,
    # wherever in the word that character occurs.
    non_kor = sum(1 for char in kor if ord(char) < CP_MIN)
    kor = kor[non_kor:]

    syllables = []
    for char in kor:
        offset = ord(char) - CP_MIN
        syllables.append(
            f"i{offset // 588}#m{(offset // 28) % 21}#f{offset % 28}")
    rom = "~".join(syllables)
    if rom:
        rom += "E"

    if has_niun:
        # Walk to the niun-th "~"; the loop is skipped for niun of 0 or 1.
        boundary = rom.find("~")
        for _ in range(niun - 1):
            boundary = rom.find("~", boundary + 1)
        head = rom[:boundary]
        tail = rom[boundary + 1:]

        if re.match("i11#m(?:2|6|12|17|20)", tail):
            _fkr_log(71)
            tail = tail.replace("i11#m", "i2#m", 1)

        # NOTE(review): unlike FKR071, this replacement drops the "#" after
        # the initial. Kept as is; confirm that this is intended.
        if tail.startswith("i5#") and head.endswith("f4"):
            _fkr_log(72)
            tail = tail.replace("i5#", "i2", 1)

        rom = f"{head}~{tail}"

    # Apply rules FKR073 through FKR108 in order, logging each change.
    for fkr_i in range(73, 109):
        _fkr_log(fkr_i)
        before = rom
        rom = _replace_map(before, KCONF[f"fkr{fkr_i:03}"])
        if rom != before:
            logger.debug(f"FKR{fkr_i} substitution: {rom} (was: {before})")

    _fkr_log(109)
    for _pos, table in KCONF["fkr109"].items():
        rom = _replace_map(rom, table)

    # Remove the separators of the intermediate notation.
    rom = _replace_map(rom, {"#": "", "~": ""})

    if non_kor > 0:
        # No hyphen is added when nothing was romanized.
        if rom:
            rom = f"{orig[:non_kor]}-{rom}"
        else:
            rom = orig

    rom = _replace_map(rom, KCONF["fkr111"])

    # Words starting with an FKR113-115 entry or an FKR116 particle, and
    # one-character words, keep their initial as is.
    is_non_kor = orig.startswith(tuple(KCONF["fkr113-115"]))
    is_particle = orig.startswith(tuple(KCONF["fkr116"]["particles"]))
    if len(orig) > 1 and not (is_non_kor or is_particle):
        if rom.startswith(tuple(KCONF["fkr116"]["replace_initials"].keys())):
            rom = _replace_map(rom, KCONF["fkr116"]["replace_initials"])

    # Capitalize words listed in FKR118/FKR119 (the latter optionally
    # followed by a one-character suffix) or ending in an FKR120 entry.
    needs_capital = (
        orig in KCONF["fkr118"]
        or orig in KCONF["fkr119"]["word"]
        or (
            orig[:-1] in KCONF["fkr119"]["word"]
            and orig.endswith(tuple(KCONF["fkr119"]["suffix"]))
        )
        or orig.endswith(tuple(KCONF["fkr120"]))
    )
    if needs_capital:
        rom = rom[0].upper() + rom[1:]

    # FKR121 words start with L instead of R.
    if f" {orig} " in KCONF["fkr121"]:
        rom = _replace_map(rom[0], {"R": "L", "r": "l"}) + rom[1:]

    # NOTE(review): keys and values of this map render identically;
    # presumably they differ in Unicode normalization form. Verify that the
    # literal survived copy/paste intact.
    rom = _replace_map(rom, {"ŏ": "ŏ", "ŭ": "ŭ", "Ŏ": "Ŏ", "Ŭ": "Ŭ"})

    return rom
def _marc8_hancha(data):
    """ Return `data` with the FKR142 replacement table applied. """
    _fkr_log(142)
    table = KCONF["fkr142"]

    return _replace_map(data, table)
def _hancha2hangul(data):
    """
    Replace Chinese characters with Hangul.

    Most characters are converted through the FKR143 to FKR170 and FKR180
    replacement tables. 不 and the characters listed under FKR172-179 have
    two possible readings each; the reading is chosen by looking at the
    character that follows. The end of the string is treated like a
    following space.

    Returns:
        The converted string, stripped and with whitespace runs of two or
        more characters collapsed to a single space.
    """
    def following_cp(text, pos):
        """ Code point after `pos`; a space stands in past the end. """
        nxt = pos + 1
        return ord(text[nxt]) if nxt < len(text) else ord(" ")

    # Pad the start of the string and of every line with a space.
    data = " " + data.replace("\n", "\n ")

    # Apply rules FKR143 through FKR170 in order.
    for i in range(143, 171):
        _fkr_log(i)
        data = _replace_map(data, KCONF[f"fkr{i}"])

    # Each replacement below swaps one character for one character, so the
    # positions collected up front stay valid, and replacing the first
    # remaining occurrence always hits the position being examined.
    idx = [i for i, item in enumerate(data) if item == "不"]
    for i in idx:
        val = following_cp(data, i)
        # 45796-46383 and 51088-51675 are the code points whose initial
        # index (rebased code point // 588) is 3 or 12 respectively.
        if 45795 < val < 46384 or 51087 < val < 51676:
            data = data.replace("不", "부", 1)
        else:
            data = data.replace("不", "불", 1)

    for char in KCONF["fkr172-179"]:
        idx = [i for i, item in enumerate(data) if item == char]
        for i in idx:
            val = following_cp(data, i)
            coda_value = (val - CP_MIN) % 28
            if coda_value == 1 or coda_value == 4 or val < 100:
                data = data.replace(char, "열", 1)
            else:
                data = data.replace(char, "렬", 1)

    _fkr_log(180)
    data = _replace_map(data, KCONF["fkr180"])

    return re.sub(r"\s{2,}", " ", data.strip())
- def _replace_map(src, rmap, *args, **kw):
- """ Replace occurrences in a string according to a map. """
- for k, v in rmap.items():
- src = src.replace(k, v, *args, **kw)
- return src
def _kor_fname_rom(fname):
    """
    Romanize the part of a parsed name that follows the "~" marker.

    The name is rewritten into the `i<n>#m<n>#f<n>` notation (characters
    joined by "~", closed by "E") and then converted by the FKR014 to
    FKR034 tables. Two-character names keep a hyphen between the syllables
    unless one of the origin checks classifies the name as "native".
    """
    syllables = []
    for char in fname:
        offset = ord(char) - CP_MIN
        syllables.append(
            f"i{offset // 588}#m{(offset // 28) % 21}#f{offset % 28}")
    rom = "~".join(syllables) + "E"

    # Origin by final: native if any listed token occurs.
    fin_is_native = any(tok in rom for tok in KCONF["fkr011"]["nat_fin"])
    origin_by_fin = "native" if fin_is_native else "sino"

    # Origin by initial: native only if a native token occurs and no
    # sino token does.
    has_nat_ini = any(tok in rom for tok in KCONF["fkr011"]["nat_ini"])
    has_sino_ini = any(tok in rom for tok in KCONF["fkr011"]["sino_ini"])
    origin_by_ini = "native" if has_nat_ini and not has_sino_ini else "sino"

    # NOTE(review): this iterates over KCONF["fkr011"] itself, not over one
    # of its sub-lists. Kept as is; confirm that this is the intended table.
    med_is_native = any(tok in rom for tok in KCONF["fkr011"])
    origin_by_med = "native" if med_is_native else "sino"

    # Medial 19 overrides the check above: sino only with 의 or 희.
    if "m19#" in rom:
        if "의" in fname or "희" in fname:
            origin_by_med = "sino"
        else:
            origin_by_med = "native"

    # Apply rules FKR014 through FKR029 in order.
    for rule_no in range(14, 30):
        _fkr_log(rule_no)
        rom = _replace_map(rom, KCONF[f"fkr{rule_no:03}"])

    _fkr_log(30)
    for name, cmap in KCONF["fkr030"].items():
        logger.debug(f"Applying FKR030[\"{name}\"]")
        rom = _replace_map(rom, cmap)

    unmarked = rom.replace("#", "")
    rom = _replace_map(unmarked, {"swi": "shwi", "Swi": "Shwi"}, 1)

    is_bisyllabic = len(fname) == 2
    if is_bisyllabic:
        rom = rom.replace("~", "-")
    else:
        rom = _replace_map(rom, {"n~g": "n'g", "~": ""})

    _fkr_log(31)
    for name, cmap in KCONF["fkr031"].items():
        logger.debug(f"Applying FKR031[\"{name}\"]")
        rom = _replace_map(rom, cmap)

    rom = rom[0].upper() + rom[1:]

    origins = (origin_by_ini, origin_by_fin, origin_by_med)
    if is_bisyllabic and "native" in origins:
        rom = _replace_map(rom, {"n-g": "n'g", "-": ""})

    for prefix, replacement in KCONF["fkr034"].items():
        if rom.startswith(prefix):
            rom = rom.replace(prefix, replacement)

    return rom
def _kor_lname_rom(lname):
    """
    Romanize the part of a parsed name that precedes the "~" marker.

    Two-character names go through the FKR181 table; all other lengths go
    through FKR182 followed by FKR183.

    Returns:
        The romanized string, or False if the last table applied left its
        input unchanged.
    """
    is_two_char = len(lname) == 2

    if not is_two_char:
        _fkr_log(182)
        lname = _replace_map(lname, KCONF["fkr182"])

    _fkr_log(181 if is_two_char else 183)
    rom = _replace_map(lname, KCONF["fkr181" if is_two_char else "fkr183"])

    if rom == lname:
        return False
    return rom
- def _capitalize(src):
- """ Only capitalize first word and words preceded by space."""
- orig_ls = src.split(" ")
- cap_ls = [orig[0].upper() + orig[1:] for orig in orig_ls]
- return " ".join(cap_ls)
def _fkr_log(fkr_i):
    """
    Emit a debug message announcing rule number `fkr_i`.

    The number is zero-padded to three digits to build the `FKR_IDX` key;
    the text stored under that key is included in the message.
    """
    rule_id = f"FKR{fkr_i:03}"
    description = FKR_IDX[rule_id]
    logger.debug(f"Applying {rule_id}: {description}")
|