yaejunyang/packages/utils/saferKorean.ts

595 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import CallingNumberKorean from "./callingNumberKorean.js";
import FloatKorean from "./floatKorean.js";
import IntegerKorean from "./integerKorean.js";
import PhoneNumberKorean from "./phoneNumberKorean.js";
import EmojiDescriptions from "./emoji-descriptions.json" with { type: "json" };
/**
 * Removes punctuation that carries no spoken sound.
 *
 * The steps run in this order:
 * 1. a run of `.` / `,` at the very end of the input is dropped;
 * 2. every run of two or more `.` / `,` is dropped;
 * 3. a single `.` / `,` followed by a whitespace character is replaced,
 *    together with that whitespace character, by one space;
 * 4. every bracket character `(`, `)`, `{`, `}`, `[`, `]` is replaced by one
 *    space.
 *
 * @param input Text to clean up.
 * @returns The text with the punctuation above removed or replaced.
 */
export function processUnsounds(input: string): string {
  return (
    input
      // Trailing dots / commas.
      .replace(/[.,]+$/, "")
      // Ellipsis-like runs anywhere in the text.
      .replace(/[.,]{2,}/g, "")
      // Sentence punctuation followed by whitespace.
      .replace(/[.,]\s/g, " ")
      // `]` has to be escaped inside the class. Unescaped, it closes the class
      // early and the pattern only matches a bracket immediately followed by
      // a literal `]`, so lone brackets were never replaced.
      .replace(/[(){}[\]]/g, " ")
  );
}
// Filters prompts that try to make the voice produce odd noises or screams.
// Examples of the kind of input this is aimed at:
// 흐아..하아아..
// 아!아!아!아!아!
// 흐으..흐아아..헤...하아..
// 혀어어어어어어어엉........ 핫. 혀엉..... 흑... 하앗... 흐윽... 형. 하앙.
// 혀엉.... 하앙... 흐윽... 항. 항. 형... 하앙. 흐으윽... 형... 흡... 혀엉..
// 하아아앗. 혀엉.. 흡... 흐읍... 형.. 하앗. 하아앙... 형... 하앙... 흐윽...
// 혀어어엉.. 하앙. 항. 형... 하앙. 혀엉.... 하앙. 흑... 항. 형... 흡 하앗.
// 혀엉..... 흑. 흣
// .. 하앗!. 하!아앙~형..~. 하!~앙... 흐윽...
// 혀!어어엉.. !앙. 항. !형...~ 하앙.!... 하앙~흑!... 항. 형..~! 흡 하앗.
// 혀엉...!.. 흑. 흣!
/**
 * Shortens drawn-out interjections.
 *
 * Two passes are applied:
 * 1. one of the lead syllables 흐/하/해/헤/혀/형 followed by one or more filler
 *    syllables or punctuation marks is cut down to the lead syllable alone;
 * 2. any run matched by {@link processCensor.StrangeRepeatableRegex} is cut
 *    down to its first three UTF-16 code units.
 *
 * @param input Text to filter.
 * @returns The text with the matched runs shortened.
 */
export function processCensor(input: string): string {
  const leadOnly = input.replace(
    /([흐하해헤혀형][아으앙응앗웅응ㅡ!?.,><~'"/]+)/g,
    (matched: string) => matched.charAt(0),
  );
  return leadOnly.replace(
    processCensor.StrangeRepeatableRegex,
    (run: string) => run.slice(0, 3),
  );
}
export namespace processCensor {
  /** Characters that count as "repeatable noise" when they appear in a run. */
  // prettier-ignore
  export const StrangeRepeatable = [
    "아", "ㅏ", "어", "ㅓ", "으", "ㅡ", "우", "ㅜ",
    "에", "오", "ㅗ", "야", "ㅑ", "읍", "앙", "읏",
    "웃", "엉", "앗", "엣", "웅", "응", "흐", "해",
    "헤", "헼", "헥", "하", "형", "혀", "흡", "흑",
    "협", "혓", "핫", "헵", "햅", "잇",
    "あ", "ア", "う", "お", "ー",
    "a", "A", "o", "O", "u", "U",
  ];
  // All repeatable characters glued together for use inside a character class.
  const repeatableClass = StrangeRepeatable.join("");
  /**
   * One repeatable character followed by at least two more characters that
   * are either repeatable or one of the punctuation marks `!.,><~'"/`.
   */
  export const StrangeRepeatableRegex = new RegExp(
    "[" + repeatableClass + "][" + repeatableClass + "!.,><~'\"/]{2,}",
    "g",
  );
}
// NOTE(review): the original note here only lists unit names — hectopascal,
// bar, hectare, AU (acre, inch, feet, yard) — presumably units still to be
// supported; confirm with the author.
/**
 * Expands bare Korean consonants (choseong shorthand) into speakable text.
 *
 * Every run of consonant jamo is rewritten in three passes:
 * 1. known mixed abbreviations are replaced through
 *    `processKorean.DoubleMixedChoseongMap` (e.g. ㅇㅋ => 오키, ㅇㄴ => 아니);
 * 2. repeated consonants (ㄴㄴ, ㄱㄱ, ㅋㅋ, ㄷㄷ, ...) are replaced through
 *    `processKorean.RepeatedChoseongMap`, whose entries are either a fixed
 *    string or a function of the matched run;
 * 3. every consonant still left is replaced by its entry in
 *    `processKorean.ChoseongMap`, or kept as-is when there is none.
 *
 * @param input Text that may contain consonant-only shorthand.
 * @returns The text with the consonant runs expanded.
 */
export function processKorean(input: string): string {
  // Pass 1: mixed abbreviation -> phrase.
  const expandMixed = (abbreviation: string) =>
    processKorean.DoubleMixedChoseongMap[
      abbreviation as keyof typeof processKorean.DoubleMixedChoseongMap
    ];

  // Pass 2: repeated consonant -> fixed string or computed string.
  const expandRepeated = (run: string) => {
    const lookupKey = (run[0] ??
      "") as keyof typeof processKorean.RepeatedChoseongMap;
    const entry = processKorean.RepeatedChoseongMap[lookupKey];
    if (typeof entry === "string") {
      return entry;
    }
    if (typeof entry === "function") {
      return entry(run);
    }
    return run;
  };

  // Pass 3: single consonant -> its mapped text, if any.
  const spellOut = (jamo: string) =>
    processKorean.ChoseongMap[
      jamo as keyof typeof processKorean.ChoseongMap
    ] ?? jamo;

  return input.replace(/[ㄱ-ㅎㄲㄸㅃㅆㅉ]+/g, (jamoRun) => {
    const afterMixed = jamoRun.replace(
      processKorean.DoubleMixedChoseongMapRegex,
      (abbreviation: string) => expandMixed(abbreviation),
    );
    const afterRepeated = afterMixed.replace(
      processKorean.RepeatedChoseongMapRegex,
      (run: string) => expandRepeated(run),
    );
    return afterRepeated.replace(/[ㄱ-ㅎㄲㄸㅃㅆㅉ]/g, (jamo: string) =>
      spellOut(jamo),
    );
  });
}
// Lookup tables and derived regular expressions used by processKorean().
// NOTE(review): in this view every property key of the three maps below is
// blank. The file viewer warns about "ambiguous Unicode characters", so the
// keys were presumably stripped on display — verify against the raw file
// before editing any entry.
export namespace processKorean {
// Mixed consonant abbreviation -> phrase to speak instead.
// The regex below is built from these keys in declaration order, so earlier
// entries are tried first at any given position.
export const DoubleMixedChoseongMap = {
: "좋은사랑하세요",
: "이쁜사랑하세요",
: "좋은사랑하세요",
: "이쁜사랑하세요",
: "이쁜사랑",
: "이지랄",
: "지랄노",
: "모야",
: "하이",
: "싫어",
: "기달",
: "제발",
: "몰라",
: "시바",
: "어디",
: "노잼",
: "바바",
: "바이",
: "죄송",
: "아니",
: "빨리",
: "인정",
: "노노",
: "감사",
: "쯧쯧",
: "지랄",
: "리얼",
: "아하",
: "오키",
: "추카",
: "꺼져",
: "잠깐만",
: "존나",
: "가능",
};
// Global alternation of every DoubleMixedChoseongMap key, each wrapped in a
// non-capturing group.
export const DoubleMixedChoseongMapRegex = new RegExp(
Object.keys(DoubleMixedChoseongMap)
.map((k) => `(?:${k})`)
.join("|"),
"g",
);
// Repeated consonant -> replacement. An entry is either a fixed string or a
// function that receives the whole matched run and returns the replacement.
// processKorean() looks entries up by the first character of the match, so
// keys are presumably single characters — confirm against the raw file.
export const RepeatedChoseongMap = {
: "틔틔",
: "덜덜",
: "노노",
: "응응",
: "추추",
: "유유",
: "야야",
// One "크" per matched character.
: (content: string) => "크".repeat(content.length),
// One "흐" per matched character.
: (content: string) => "흐".repeat(content.length),
// Runs of exactly two or three characters are replaced; longer runs are
// returned unchanged.
: (content: string) => {
if (content.length == 2) {
return "고고";
} else if (content.length == 3) {
return "고고고";
}
return content;
},
};
// Global alternation matching any RepeatedChoseongMap key repeated two or
// more times (`{2,}` is appended directly to each key).
export const RepeatedChoseongMapRegex = new RegExp(
Object.keys(RepeatedChoseongMap)
.map((k) => `${k}{2,}`)
.join("|"),
"g",
);
// Single consonant -> text spoken for it (기역, 니은, ...); used as the final
// fallback pass in processKorean().
// prettier-ignore
export const ChoseongMap = {
: "기역", : "니은", : "디귿", : "리을", : "미음", : "비읍",
: "시옷", : "이응", : "지읒", : "치읓", : "키읔", : "티읕",
: "피읖", : "히읗", : "쌍기역", : "쌍디귿", : "쌍비읍",
: "쌍시옷", : "쌍지읒",
};
}
/**
 * Rewrites numeric expressions (10km, 1,000, 1.1, phone numbers, versions,
 * ...) into Korean words.
 *
 * Five passes run in a fixed order; each one works on the output of the
 * previous one:
 * 1. digit runs containing `-` (optionally led by `+<digits>`) are handed to
 *    `PhoneNumberKorean.convert`;
 * 2. numbers followed by a byte / litre / metre unit and one non-letter
 *    character are spelled out with the unit name;
 * 3. numbers followed by a Korean counter word (개, 살, 시, 명, ...) use
 *    `CallingNumberKorean` when it can convert the value, otherwise
 *    `IntegerKorean`;
 * 4. comma-grouped numbers (1,000) are converted with `IntegerKorean`;
 * 5. whatever digit/dot runs remain are read as an integer, a decimal, a
 *    version string, or digit by digit.
 *
 * @param input Text that may contain numbers.
 * @returns The text with the numeric expressions replaced.
 */
export function processNumber(input: string): string {
  // Pass 1 — phone numbers. A match without any "-" is put back untouched.
  const afterPhones = input.replace(
    /(\+\d+[\s-]+)?([\d-]+)/g,
    (_whole, countryCode: string | undefined, digits: string) => {
      const joined = (countryCode ?? "") + digits;
      return digits.includes("-") ? PhoneNumberKorean.convert(joined) : joined;
    },
  );

  // Pass 2 — data sizes, litres and metres.
  const afterUnits = afterPhones.replace(
    /([\d,]+)(?:(?<prefix>[kKMmgGtTpPeEzZyY][iI]?)(?<unit>[bB])|(?<prefix>[m]?)(?<unit>[lL])|(?<prefix>[mck]?)(?<unit>m))(?<tail>[^a-zA-Z])/g,
    (_whole, num: string, ...rest: unknown[]): string => {
      // The named-groups object is always the last callback argument.
      const groups = rest[rest.length - 1] as {
        prefix: string;
        unit: string;
        tail: string;
      };
      const { prefix, tail } = groups;
      const unit = groups.unit.toLocaleLowerCase();
      const spoken = IntegerKorean.convertFromString(num);
      switch (unit) {
        case "b": {
          // 10kib => 십키비바이트
          const name =
            processNumber.DatasizePrefix[
              prefix.toLowerCase() as keyof typeof processNumber.DatasizePrefix
            ];
          return `${spoken} ${name}바이트 ${tail}`;
        }
        case "l": {
          // 10l => 십리터
          const name =
            processNumber.LiterPrefix[
              prefix.toLowerCase() as keyof typeof processNumber.LiterPrefix
            ];
          return `${spoken} ${name}리터 ${tail}`;
        }
        case "m": {
          // 10m => 십미터
          const name =
            processNumber.MeterPrefix[
              prefix as keyof typeof processNumber.MeterPrefix
            ];
          return `${spoken} ${name}미터 ${tail}`;
        }
        default:
          return `${num}${prefix}${unit}${tail}`;
      }
    },
  );

  // Pass 3 — counters, e.g. 10명 => 열명. Values containing "." are left alone.
  const afterCounters = afterUnits.replace(
    /([\d.,]+)\s*([개살시평명자벌장달병잔번채])/g,
    (_whole, num: string, counter: string) => {
      if (num.includes(".")) {
        return num + counter;
      }
      const value = parseInt(num.replace(/,/g, ""), 10);
      const spoken = CallingNumberKorean.canConvert(value)
        ? CallingNumberKorean.convert(value)
        : IntegerKorean.convertFromString(num);
      return spoken + counter;
    },
  );

  // Pass 4 — comma-grouped numbers, e.g. 1,000 원 => 천원.
  const afterGrouped = afterCounters.replace(/[\d,]+/g, (num: string) =>
    num.includes(",") ? IntegerKorean.convertFromString(num) : num,
  );

  // Pass 5 — remaining digit/dot runs with an optional leading "v" and an
  // optional trailing "a" / "b".
  return afterGrouped.replace(
    /(v?)([\d.]+)([ab]?)/g,
    (_whole, marker: string, num: string, letter: string) => {
      const dots = num.split(".").length - 1;
      const unmarked = marker === "";
      if (unmarked && dots === 0) {
        // Plain number: read as an integer.
        return IntegerKorean.convertFromString(num) + letter;
      }
      if (unmarked && dots === 1) {
        // Decimal: integer reader before the dot, float reader after it.
        const [whole, fraction] = num.split(".");
        return (
          IntegerKorean.convertFromString(whole ?? "") +
          "쩜" +
          FloatKorean.convert(fraction ?? "") +
          letter
        );
      }
      if ((marker === "v" || letter.length > 0) && dots > 1) {
        // Version notation: prefixed with "버전", a/b suffix spoken by name.
        return (
          "버전" +
          FloatKorean.convert(num) +
          (processNumber.VersionPostfix[
            letter as keyof typeof processNumber.VersionPostfix
          ] ?? "")
        );
      }
      // Anything else is read through the float reader — per the original
      // author's note, digit by digit (e.g. 111.111.111.111 ip address).
      return FloatKorean.convert(num) + letter;
    },
  );
}
/** Unit-prefix and version-suffix tables used by processNumber(). */
export namespace processNumber {
  /** Lower-cased data-size prefix (SI and binary) -> spoken Korean prefix. */
  export const DatasizePrefix = {
    k: "킬로",
    ki: "키비",
    m: "메가",
    mi: "메비",
    g: "기가",
    gi: "기비",
    t: "테라",
    ti: "테비",
    p: "페타",
    pi: "페비",
    e: "엑사",
    ei: "엑시",
    z: "제타",
    zi: "제비",
    y: "요타",
    yi: "요비",
  };

  /** Litre prefix -> spoken Korean prefix; the empty key is the bare unit. */
  export const LiterPrefix = {
    m: "밀리",
    "": "",
  };

  /** Metre prefix -> spoken Korean prefix; the empty key is the bare unit. */
  export const MeterPrefix = {
    m: "밀리",
    c: "센치",
    "": "",
    k: "킬로",
  };

  /** Letter that may trail a version number -> spoken name. */
  export const VersionPostfix = {
    a: "알파",
    b: "베타",
  };
}
/**
 * Replaces Unicode symbols and emoji with speakable text.
 *
 * Three passes run in order:
 * 1. characters listed in `processEmoji.UnicodeSymbols` are replaced by their
 *    Korean names;
 * 2. every `Extended_Pictographic` character is replaced by its entry in the
 *    imported `EmojiDescriptions` table, or kept when the table has none;
 * 3. every remaining character with the Unicode `Emoji` property is replaced
 *    by a space.
 *
 * @param input Text that may contain symbols or emoji.
 * @returns The text with symbols named and leftover emoji blanked out.
 */
export function processEmoji(input: string): string {
  return (
    input
      .replace(
        processEmoji.UnicodeSymbolsRegex,
        (symbol: string) =>
          processEmoji.UnicodeSymbols[
            symbol as keyof typeof processEmoji.UnicodeSymbols
          ] ?? symbol,
      )
      .replace(
        /\p{Extended_Pictographic}/gu,
        (emoji: string) =>
          EmojiDescriptions[emoji as keyof typeof EmojiDescriptions] ?? emoji,
      )
      // Two fixes over the previous `/\p{Emoji}/u`:
      //  - the `g` flag, so every leftover emoji is blanked, not only the
      //    first one;
      //  - ASCII digits, `#` and `*` are excluded. They carry the `Emoji`
      //    property (keycap bases), so the first one in the text used to be
      //    turned into a space.
      .replace(/(?![0-9#*])\p{Emoji}/gu, " ")
  );
}
export namespace processEmoji {
  /** Single Unicode symbol -> Korean text spoken in its place. */
  export const UnicodeSymbols = {
    "㎢": "제곱킬로미터",
    "㎡": "제곱미터",
    "↑": "위쪽 화살표",
    "↓": "아래쪽 화살표",
    "←": "왼쪽 화살표",
    "→": "오른쪽 화살표",
    "↔": "좌우 화살표",
    "↖": "왼쪽 위 화살표",
    "↗": "오른쪽 위 화살표",
    "↘": "오른쪽 아래 화살표",
    "↙": "왼쪽 아래 화살표",
    "™": "트레이드마크",
  };
  /**
   * Character class matching any key of {@link UnicodeSymbols}.
   * The keys are joined with an empty separator; the default `join()`
   * separator would add "," to the class as an unintended member.
   */
  export const UnicodeSymbolsRegex = new RegExp(
    "[" + Object.keys(UnicodeSymbols).join("") + "]",
    "gu",
  );
}
/**
 * Replaces fenced code blocks and http(s) links with short spoken labels.
 *
 * - A fenced block (three backticks on each side) becomes "코드블럭". When
 *   its first line is exactly one of the `processMarkdown.LangPrefixes` keys
 *   (case-insensitive), the label is preceded by that language's spoken name.
 * - A link becomes its `processMarkdown.GIFMap` entry when the URL (without
 *   the scheme) is listed there. Otherwise it becomes a site-specific label
 *   for a handful of known hosts, and "링크 " for everything else.
 *
 * @param input Message text.
 * @returns The text with code blocks and links replaced.
 */
export function processMarkdown(input: string): string {
  return input
    .replace(/```([\s\S]*?)```/g, (_, content: string) => {
      // Only the head of the block is needed to detect the language tag.
      // "+ 1" keeps the newline that follows the longest key; without it the
      // longest key (e.g. "ghostscript") could never match `key + "\n"`.
      const head = content
        .substring(0, processMarkdown.LangPrefixMaxLength + 1)
        .toLowerCase();
      let lang = "";
      for (const [key, value] of Object.entries(processMarkdown.LangPrefixes)) {
        if (head.startsWith(key + "\n")) {
          lang = value + " ";
          break;
        }
      }
      return lang + "코드블럭";
    })
    .replace(/[hH][tT]{2}[pP][sS]?:\/\/(\S+)/g, (_, url: string) => {
      // Own-property check: a plain `GIFMap[url]` also finds inherited
      // members, so a URL such as "http://toString" used to yield a function
      // whose source text ended up in the output.
      const mapped = Object.prototype.hasOwnProperty.call(
        processMarkdown.GIFMap,
        url,
      )
        ? processMarkdown.GIFMap[url as keyof typeof processMarkdown.GIFMap]
        : undefined;
      if (mapped) return mapped;
      if (
        url.startsWith("tenor.com/view") ||
        url.startsWith("images-ext-1.discordapp.net/external/")
      ) {
        return "움짤! ";
      }
      if (
        url.startsWith("www.youtube.com/") ||
        url.startsWith("youtube.com/") ||
        url.startsWith("youtu.be/")
      ) {
        return "유튜브 영상! ";
      }
      if (url.startsWith("www.reddit.com/") || url.startsWith("reddit.com/")) {
        return "레딧 링크! ";
      }
      if (
        url.startsWith("www.instagram.com/") ||
        url.startsWith("instagram.com/")
      ) {
        return "인스타 링크! ";
      }
      if (url.startsWith("x.com/")) {
        return "엑스 링크! ";
      }
      if (url.startsWith("github.com/")) {
        return "깃허브 링크! ";
      }
      if (url.startsWith("store.steampowered.com")) {
        return "스팀 스토어 링크! ";
      }
      if (url.startsWith("steamcommunity.com")) {
        return "스팀 커뮤니티 링크! ";
      }
      return "링크 ";
    });
}
export namespace processMarkdown {
  /** Lower-case code-fence language tag -> spoken Korean name. */
  export const LangPrefixes = {
    typescript: "타입스크립트",
    javascript: "자바스크립트",
    java: "자바",
    kotlin: "코틀린",
    rust: "러스트",
    lua: "루아",
    json: "제이슨",
    yaml: "야믈",
    yml: "야믈",
    toml: "토믈",
    xml: "엑스엠엘",
    julia: "줄리아",
    matlab: "매트랩",
    erlang: "얼랭",
    // "elxir" is kept for backward compatibility; "elixir" is the tag that
    // is actually written on code fences.
    elxir: "엘릭서",
    elixir: "엘릭서",
    zig: "지그",
    txt: "텍스트",
    vim: "빔",
    perl: "펄",
    php: "피에이치피",
    lisp: "리스프",
    postscript: "포스트스크립트",
    ghostscript: "고스트스크립트",
    fortran: "포트란",
    algol: "알골",
    scala: "스칼라",
    haskell: "하스켈",
    basic: "베이직",
    cpp: "씨플플",
    "c++": "씨플플",
    csharp: "씨샵",
    cs: "씨샵",
    "c#": "씨샵",
    c: "씨",
    h: "헤더",
    d: "디",
    awk: "에이더블류케이",
    pl: "펄",
    pwsh: "파워쉘",
    powershell: "파워쉘",
    cmd: "씨엠디",
    sh: "쉘",
    ps1: "파워셀",
    bat: "배치파일",
    bash: "베시스크립트",
    tex: "텍",
    dart: "다트",
    go: "고랭",
    python: "파이썬",
    swift: "스위프트",
    css: "씨에스에스",
    html: "에이치티엠엘",
    latex: "레이텍",
    md: "마크다운",
    markdown: "마크다운",
    py: "파이썬",
    hs: "하스켈",
    rs: "러스트",
    kt: "코틀린",
    js: "자스",
    ts: "타스",
    tsx: "리액트 타입스크립트",
    jsx: "리액트 자바스크립트",
    an: "에이엔",
    parlance: "팔렌스",
  };
  /** Length of the longest key in {@link LangPrefixes}. */
  export const LangPrefixMaxLength = (() => {
    let max = 0;
    for (const key in LangPrefixes) {
      max = Math.max(key.length, max);
    }
    return max;
  })();
  /** Exact URL (without scheme) of a known GIF/clip -> spoken description. */
  export const GIFMap = {
    "tenor.com/view/majo-no-tabitabi-the-journey-of-elaina-elaina-windy-hair-gif-19187698":
      "화난 일레이나",
    "tenor.com/view/majo-no-tabitabi-the-journey-of-elaina-elaina-sparkle-amazed-gif-18827847":
      "일레이나 반짝반짝!",
    "images-ext-1.discordapp.net/external/C3xPFuUxs16jY25AR3NvsIDezaOtib9wozhLBWejZk4/https/media.tenor.com/bUd8mk4ufwsAAAPo/anime-disappointment.mp4":
      "일레이나 절래절래",
    "images-ext-1.discordapp.net/external/SXv4qgpy2r1Gx-dNxhcfJle6AXDaH_SToRjEBYYaup0/https/media.tenor.com/nDDxJc4FDwEAAAPo/cute.mp4":
      "일레이나 끄덕",
    "tenor.com/view/majo-no-tabitabi-the-journey-of-elaina-elaina-what-gif-19011602":
      "당황한 일레이나",
    "images-ext-1.discordapp.net/external/2R41WcvNJwYMD69UKls2cDa_hEL-rzCRCFvOi2DDOVo/https/media.tenor.com/sU3RCOixDbgAAAPo/majo-no-tabitabi-the-journey-of-elaina.mp4":
      "일레이나 손짓",
  };
}
/**
 * Turns ASCII symbols (%, $, *, &, ...) into readable Korean.
 *
 * Three passes run in order:
 * 1. every character listed in `processSymbol.SymbolMap` is replaced by its
 *    mapped text (some symbols map to the empty string and are dropped);
 * 2. every run of `?` / `!` is reduced to its first character;
 * 3. every run of spaces, tabs, form feeds and carriage returns is collapsed
 *    into one space (line feeds are left alone).
 *
 * @param input Text that may contain symbols.
 * @returns The text with the symbols spelled out.
 */
export function processSymbol(input: string): string {
  return input
    .replace(
      processSymbol.SymbolMapRegExp,
      // Fall back to the matched character so a character without a map
      // entry can never be rendered as the text "undefined".
      (symbol) =>
        processSymbol.SymbolMap[
          symbol as keyof typeof processSymbol.SymbolMap
        ] ?? symbol,
    )
    .replace(/([?!]+)/g, (_, content: string): string => content[0] ?? "")
    .replace(/[ \t\f\r]+/g, " ");
}
export namespace processSymbol {
  /** Symbol -> Korean text spoken in its place ("" drops the symbol). */
  export const SymbolMap = {
    "%": "퍼센트",
    $: "달러",
    "^": "캐럿",
    "&": "엔드",
    "*": "스타",
    "#": "샵",
    "@": "엣",
    ".": "쩜",
    "-": "마이너스",
    "+": "플러스",
    _: "언더바",
    "=": "이퀄",
    "/": "슬래쉬",
    "~": "물결표",
    "\\": "역슬래쉬",
    "♡": "하트 ",
    "|": "",
    ">": "",
    "<": "",
    ":": "콜론",
    ";": "세미콜론",
  };
  /**
   * Character class matching any key of {@link SymbolMap}; every key is
   * backslash-escaped so it is taken literally inside the class.
   * The keys are joined with an empty separator. The default `join()`
   * separator put "," into the class; a comma has no map entry, so the
   * replacement callback produced the text "undefined" for it.
   */
  export const SymbolMapRegExp = new RegExp(
    "[" +
      Object.keys(SymbolMap)
        .map((symbol) => "\\" + symbol)
        .join("") +
      "]",
    "g",
  );
}
/**
 * Handles messages that consist of nothing but punctuation, plus the `||`
 * and `&&` operators.
 *
 * The steps run in order:
 * 1. input made up only of question marks (optionally padded with spaces)
 *    becomes "어?";
 * 2. input made up only of `?`, `!`, `'` and `"` (optionally padded with
 *    spaces) is spelled out mark by mark through
 *    `processIsolatedSymbol.IsolatedSymbolMap`;
 * 3. `||` surrounded by whitespace becomes " 오얼 ";
 * 4. `&&` surrounded by whitespace becomes " 엔드 ".
 *
 * @param input Message text.
 * @returns The text with isolated symbols replaced.
 */
export function processIsolatedSymbol(input: string): string {
  return (
    input
      // The previous pattern `^ * (\?+) *$` demanded at least one leading
      // space, so a bare "?" never matched and was spelled out by the next
      // step instead.
      .replace(/^ *\?+ *$/, "어?")
      .replace(/^ *[?!'"]+ *$/, (total) =>
        [...total]
          .map(
            (element) =>
              // Padding spaces have no entry; join() renders them as "".
              processIsolatedSymbol.IsolatedSymbolMap[
                element as keyof typeof processIsolatedSymbol.IsolatedSymbolMap
              ],
          )
          .join(""),
      )
      .replace(/\s\|\|\s/g, " 오얼 ")
      .replace(/\s&&\s/g, " 엔드 ")
  );
}
export namespace processIsolatedSymbol {
  /** Punctuation mark -> its spoken Korean name. */
  export const IsolatedSymbolMap = {
    "?": "물음표",
    "!": "느낌표",
    "'": "쿼트",
    '"': "더블쿼트",
  };
}
/**
 * Converts full-width digits (U+FF10 – U+FF19) to their ASCII counterparts
 * so that later number handling sees ordinary digits.
 *
 * The digits are written as `\u` escapes on purpose: the literal full-width
 * characters are easily lost or flagged by tools that treat them as
 * ambiguous Unicode, which leaves an empty character class and empty map
 * keys behind.
 *
 * @param input Text that may contain full-width digits.
 * @returns The text with every full-width digit replaced by `0`–`9`.
 */
export function processFullWidth(input: string): string {
  return input.replace(
    /[\uFF10-\uFF19]/g,
    (digit) =>
      processFullWidth.FullWidthNumberMap[
        digit as keyof typeof processFullWidth.FullWidthNumberMap
      ] ?? digit,
  );
}
export namespace processFullWidth {
  /** Full-width digit -> ASCII digit. */
  // prettier-ignore
  export const FullWidthNumberMap = {
    "\uFF10": "0", "\uFF11": "1", "\uFF12": "2", "\uFF13": "3", "\uFF14": "4",
    "\uFF15": "5", "\uFF16": "6", "\uFF17": "7", "\uFF18": "8", "\uFF19": "9",
  };
}
/**
 * Runs the whole sanitising pipeline over one message.
 *
 * The input is Unicode-normalised and padded with one trailing space, then
 * passed through the processors in this order: censor, unsounds, full-width,
 * isolated symbols, markdown, Korean shorthand, numbers, symbols, emoji.
 * Finally every whitespace run is collapsed to one space and the result is
 * trimmed.
 *
 * NOTE(review): `.let` is not defined in this file. It is presumably a
 * project-provided String extension that calls the given function with the
 * string and returns its result — confirm where it is declared.
 *
 * @param input Raw message text.
 * @returns The processed text.
 */
export function saferKorean(input: string): string {
  // The trailing space is added before any processor runs.
  const padded = input.normalize() + " ";

  const processed = padded
    .let(processCensor)
    .let(processUnsounds)
    .let(processFullWidth)
    .let(processIsolatedSymbol)
    .let(processMarkdown)
    .let(processKorean)
    .let(processNumber)
    .let(processSymbol)
    .let(processEmoji);

  const collapsed = processed.replace(/\s+/g, " ");
  return collapsed.trim();
}