From 5ee42ede565afe5acf77175331ac4f2669794d9e Mon Sep 17 00:00:00 2001 From: kimpure Date: Tue, 19 May 2026 15:59:17 +0000 Subject: [PATCH] add supertonic --- crates/yaejuyang-supertonic/.dockerignore | 5 ++ crates/yaejuyang-supertonic/Cargo.toml | 7 +- crates/yaejuyang-supertonic/Dockerfile | 21 +++++ .../docker-compose.example.yml | 12 +++ crates/yaejuyang-supertonic/src/main.rs | 4 +- crates/yaejuyang-supertonic/src/tts/engine.rs | 6 ++ packages/bot/tts.ts | 12 ++- packages/db/generated/prisma/enums.ts | 3 +- .../db/generated/prisma/internal/class.ts | 2 +- packages/tts/papago.ts | 6 ++ packages/tts/supertonic.ts | 63 ++++++++++++++ packages/tts/typecast.ts | 82 +----------------- packages/utils/saferKorean.ts | 83 +++++++++++++++++++ .../migration.sql | 2 + prisma/schema.prisma | 1 + 15 files changed, 223 insertions(+), 86 deletions(-) create mode 100644 crates/yaejuyang-supertonic/.dockerignore create mode 100644 crates/yaejuyang-supertonic/Dockerfile create mode 100644 crates/yaejuyang-supertonic/docker-compose.example.yml create mode 100644 packages/tts/supertonic.ts create mode 100644 packages/utils/saferKorean.ts create mode 100644 prisma/migrations/20260519155510_add_supertonic/migration.sql diff --git a/crates/yaejuyang-supertonic/.dockerignore b/crates/yaejuyang-supertonic/.dockerignore new file mode 100644 index 0000000..7c15446 --- /dev/null +++ b/crates/yaejuyang-supertonic/.dockerignore @@ -0,0 +1,5 @@ +.env +assets +Dockerfile +docker-compose.yml +.dockerignore diff --git a/crates/yaejuyang-supertonic/Cargo.toml b/crates/yaejuyang-supertonic/Cargo.toml index a23ee5e..6033a49 100644 --- a/crates/yaejuyang-supertonic/Cargo.toml +++ b/crates/yaejuyang-supertonic/Cargo.toml @@ -3,8 +3,13 @@ name = "yaejuyang-supertonic" version = "0.1.0" edition = "2024" +[profile.release] +opt-level = "z" +strip = true +lto = true + [features] -default = ["webgpu"] +default = [] webgpu = [ "ort/webgpu" ] cuda = [ "ort/cuda" ] diff --git a/crates/yaejuyang-supertonic/Dockerfile b/crates/yaejuyang-supertonic/Dockerfile new file mode 100644 index 0000000..82b38b8 --- /dev/null +++ b/crates/yaejuyang-supertonic/Dockerfile @@ -0,0 +1,21 @@ +FROM alpine:latest AS assets + +RUN apk add git git-lfs +WORKDIR /app +RUN git clone https://huggingface.co/Supertone/supertonic-3 assets && rm -r assets/.git + +FROM rust:trixie AS builder + +RUN apt-get update && apt-get install -y --no-install-recommends \ + pkg-config libssl-dev +WORKDIR /app +ADD . . +RUN cargo build --profile=release + +FROM debian:trixie-slim AS runtime + +RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg +WORKDIR /app +COPY --from=assets /app/assets /app/assets +COPY --from=builder /app/target/release/yaejuyang-supertonic /app/ +ENV SUPERTONIC_MODEL_DIR="/app/assets" diff --git a/crates/yaejuyang-supertonic/docker-compose.example.yml b/crates/yaejuyang-supertonic/docker-compose.example.yml new file mode 100644 index 0000000..b87c0a9 --- /dev/null +++ b/crates/yaejuyang-supertonic/docker-compose.example.yml @@ -0,0 +1,12 @@ +services: + yaejuyang-supertonic: + build: . + command: /app/yaejuyang-supertonic + container_name: "yaejuyang-supertonic" + ports: + - 3000:80 + environment: + SUPERTONIC_WORKERS: "1" + ENABLED_BACKENDS: "" + WEBGPU_DEVICE_ID: "0" + RUST_LOG: "info,ort=warn" diff --git a/crates/yaejuyang-supertonic/src/main.rs b/crates/yaejuyang-supertonic/src/main.rs index a02f30b..42f067b 100644 --- a/crates/yaejuyang-supertonic/src/main.rs +++ b/crates/yaejuyang-supertonic/src/main.rs @@ -12,8 +12,8 @@ async fn main() -> Result<(), Box> { dotenvy::dotenv().ok(); tracing_subscriber::fmt::init(); - let model_dir = std::env::var("SUPERTONIC_MODEL_DIR") - .unwrap_or_else(|_| "./assets/supertonic-3".to_string()); + let model_dir = + std::env::var("SUPERTONIC_MODEL_DIR").unwrap_or_else(|_| "./assets".to_string()); let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE") .unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json")); let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string()); diff --git a/crates/yaejuyang-supertonic/src/tts/engine.rs b/crates/yaejuyang-supertonic/src/tts/engine.rs index 04832cf..bea3c85 100644 --- a/crates/yaejuyang-supertonic/src/tts/engine.rs +++ b/crates/yaejuyang-supertonic/src/tts/engine.rs @@ -382,6 +382,12 @@ fn load_backends(config: &HashMap) -> Vec>(); enabled_backends.iter().filter_map(|name| { + let name = name.trim(); + + if name.is_empty() { + return None + } + #[cfg(feature = "cuda")] if name == "cuda" { return load_backend_cuda(config) diff --git a/packages/bot/tts.ts b/packages/bot/tts.ts index bd31a48..2be890d 100644 --- a/packages/bot/tts.ts +++ b/packages/bot/tts.ts @@ -8,6 +8,7 @@ import TTSModelBase from "../tts"; import { DiscordUserProfile } from "../db/generated/prisma/client"; import { nyaize } from "../utils/nyaize"; import { OutputHandler } from "../utils/outputHandler"; +import TTSSupertonicModel from "../tts/supertonic"; export async function createVoiceBuffer(voice: Voice, text: string): Promise { if (voice == "TypeCast") { @@ -19,8 +20,17 @@ export async function createVoiceBuffer(voice: Voice, text: string): Promise { protected cachedVoice: Map> @@ -9,6 +10,11 @@ export class TTSPapagoModel extends TTSModelBase { super() this.cachedVoice = new Map(); } + ttsify(input: string): string { + return super.ttsify(saferKorean( + input + )) + } public getVoicePath(id: TTSPapagoModel.RequestId): string { const audioFileName = TTSModelBase.hashAudioFile(id.text, `.${id.speaker}.${id.speed.replace(/\-/g, "_")}`); const audioPath = join( diff --git a/packages/tts/supertonic.ts b/packages/tts/supertonic.ts new file mode 100644 index 0000000..328a77d --- /dev/null +++ b/packages/tts/supertonic.ts @@ -0,0 +1,63 @@ +import { join } from "path"; +import fetch from "../utils/fetch"; +import TTSModelBase from "."; +import { saferKorean } from "../utils/saferKorean"; + +export class TTSSupertonicModel extends TTSModelBase { + protected override cachedVoice: Map> + constructor() { + super() + this.cachedVoice = new Map(); + } + override ttsify(input: string): string { + return super.ttsify(saferKorean( + input + )) + } + private async getSupertonicResponse(voiceId: TTSSupertonicModel.RequestId) { + const payload = { + text: voiceId.text, + lang: "ko", + }; + + if (!process.env.SUPERTONIC_API_URL) { + throw Error("process.env.SUPERTONIC_API_URL not set"); + } + + return await fetch(process.env.SUPERTONIC_API_URL, { + method: "POST", + headers: { + "Content-Type": "application/json" + }, + body: JSON.stringify(payload) + }); + } + async getVoiceBuffer(voiceId: TTSSupertonicModel.RequestId): Promise { + let response: Response | undefined; + + response = await this.getSupertonicResponse(voiceId) as Response; + if (response.ok) + return await response.arrayBuffer(); + + throw new Error(`invalid supertonic response ${await response.text()}`); + } + public getVoicePath(id: TTSSupertonicModel.RequestId): string { + const audioFileName = TTSModelBase.hashAudioFile(id.text); + const audioPath = join( + TTSSupertonicModel.SupertonicAudioCachePath, + audioFileName + ); + return audioPath; + } + public createRequestId(text: string): TTSSupertonicModel.RequestId { + return { + text, + }; + } +} +export namespace TTSSupertonicModel { + export const instance = new TTSSupertonicModel(); + export type RequestId = { text: string }; + export const SupertonicAudioCachePath = join(TTSModelBase.AudioCachePath, "supertonic"); +} +export default TTSSupertonicModel; diff --git a/packages/tts/typecast.ts b/packages/tts/typecast.ts index fd36a70..ba5a62d 100644 --- a/packages/tts/typecast.ts +++ b/packages/tts/typecast.ts @@ -2,11 +2,9 @@ import { join } from "path"; import { TYPECAST_TOKENS } from "../env"; import fetch from "../utils/fetch"; import TTSModelBase from "."; -import CallingNumberKorean from "../utils/callingNumberKorean"; -import IntegerKorean from "../utils/integerKorean"; -import FloatKorean from "../utils/floatKorean"; import { readFileSync, writeFileSync } from "fs"; import { cwd, env } from "process"; +import { saferKorean } from "../utils/saferKorean"; export class TTSTypecastModel extends TTSModelBase { protected cachedVoice: Map> @@ -19,61 +17,12 @@ export class TTSTypecastModel extends TTSModelBase { ttsify(input: string): string { return super.ttsify( - input - .replace(/\.+$/, "") - .replace(/\.\.+/g, "") - .replace(/\.[ \t]/g, " ") - .replace(/^[\?\!\'\"]+$/, (total)=>( - [...total].map(element => TTSTypecastModel.IsolatedSymbolMap[ - element as keyof typeof TTSTypecastModel.IsolatedSymbolMap - ]).join("") - )) - .replace(/\`\`\`.+?\`\`\`/g, "코드블럭") - .replace(/https\S+/g, "링크") + saferKorean(input) .replace(/ㄴㄴ/g, "노노") .replace(/ㅇㅋ/g, "오키") .replace(/ㅜㅜ/g, "눙물") .replace(/빵/g, "빵 크크") .replace(/[\?]+ *ㄴ/g, "물음표ㄴ") - .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{ - const intNum = parseInt(num) - if (CallingNumberKorean.canConvert(intNum)) { - return CallingNumberKorean.convert(intNum) + postfix; - } else { - return IntegerKorean.convertFromString(num) + postfix; - } - }) - .replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => { - const dotCount = [...num.matchAll(/\./g)].length; - const hasNoSuffix = suffix == ""; - - if (hasNoSuffix && dotCount == 0) { - return IntegerKorean.convertFromString(num) + postfix; - } else if (hasNoSuffix && dotCount == 1) { - const [intPart, floatPart] = num.split(/\./); - return ( - IntegerKorean.convertFromString(intPart) - + "쩜" - + FloatKorean.convert(floatPart) - + postfix - ) - } else if (suffix == "v") { - return ( - "버전" - + FloatKorean.convert(num) - + (TTSTypecastModel.VersionPostfix[ - postfix as keyof typeof TTSTypecastModel.VersionPostfix - ] ?? "") - ); - } else { - return FloatKorean.convert(num) + postfix; - } - }) - .replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => ( - TTSTypecastModel.SymbolMap[t as keyof typeof TTSTypecastModel.SymbolMap] - )) - .replace(/\?+/g, "?") - .replace(/\!+/g, "!") ) } private async getTypecastResponse(apiKey: string, voiceId: TTSTypecastModel.RequestId) { @@ -140,33 +89,6 @@ export class TTSTypecastModel extends TTSModelBase { } } export namespace TTSTypecastModel { - export const IsolatedSymbolMap = { - "?": "물음표", - "!": "느낌표", - "'": "쿼트", - "\"": "더블쿼트", - } - export const SymbolMap = { - "%": "퍼센트", - "$": "달러싸인", - "^": "캐럿", - "&": "엠퍼센드", - "*": "스타", - "#": "해시", - "@": "엣", - ".": "쩜", - "-": "마이너스", - "+": "플러스", - "_": "언더바", - "=": "이퀄", - "/": "슬래쉬", - "\\": "역슬래쉬", - "♡": "하투 ", - }; - export const VersionPostfix = { - "a": "알파", - "b": "베타", - }; export const instance = new TTSTypecastModel(); export type RequestId = { text: string, voiceId: string }; export const TypecastAudioCachePath = join(TTSModelBase.AudioCachePath, "typecast"); diff --git a/packages/utils/saferKorean.ts b/packages/utils/saferKorean.ts new file mode 100644 index 0000000..13f5d16 --- /dev/null +++ b/packages/utils/saferKorean.ts @@ -0,0 +1,83 @@ +import CallingNumberKorean from "./callingNumberKorean"; +import FloatKorean from "./floatKorean"; +import IntegerKorean from "./integerKorean"; + +export const IsolatedSymbolMap = { + "?": "물음표", + "!": "느낌표", + "'": "쿼트", + "\"": "더블쿼트", +} +export const SymbolMap = { + "%": "퍼센트", + "$": "달러싸인", + "^": "캐럿", + "&": "엠퍼센드", + "*": "스타", + "#": "해시", + "@": "엣", + ".": "쩜", + "-": "마이너스", + "+": "플러스", + "_": "언더바", + "=": "이퀄", + "/": "슬래쉬", + "\\": "역슬래쉬", + "♡": "하투 ", +}; +export const VersionPostfix = { + "a": "알파", + "b": "베타", +}; + +export function saferKorean(input: string): string { + return input.replace(/\.+$/, "") + .replace(/\.\.+/g, "") + .replace(/\.[ \t]/g, " ") + .replace(/^[\?\!\'\"]+$/, (total)=>( + [...total].map(element => IsolatedSymbolMap[ + element as keyof typeof IsolatedSymbolMap + ]).join("") + )) + .replace(/\`\`\`.+?\`\`\`/g, "코드블럭") + .replace(/https\S+/g, "링크") + .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{ + const intNum = parseInt(num) + if (CallingNumberKorean.canConvert(intNum)) { + return CallingNumberKorean.convert(intNum) + postfix; + } else { + return IntegerKorean.convertFromString(num) + postfix; + } + }) + .replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => { + const dotCount = [...num.matchAll(/\./g)].length; + const hasNoSuffix = suffix == ""; + + if (hasNoSuffix && dotCount == 0) { + return IntegerKorean.convertFromString(num) + postfix; + } else if (hasNoSuffix && dotCount == 1) { + const [intPart, floatPart] = num.split(/\./); + return ( + IntegerKorean.convertFromString(intPart) + + "쩜" + + FloatKorean.convert(floatPart) + + postfix + ) + } else if (suffix == "v") { + return ( + "버전" + + FloatKorean.convert(num) + + (VersionPostfix[ + postfix as keyof typeof VersionPostfix + ] ?? "") + ); + } else { + return FloatKorean.convert(num) + postfix; + } + }) + .replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => ( + SymbolMap[t as keyof typeof SymbolMap] + )) + .replace(/\?+/g, "?") + .replace(/\!+/g, "!") +} diff --git a/prisma/migrations/20260519155510_add_supertonic/migration.sql b/prisma/migrations/20260519155510_add_supertonic/migration.sql new file mode 100644 index 0000000..c61409a --- /dev/null +++ b/prisma/migrations/20260519155510_add_supertonic/migration.sql @@ -0,0 +1,2 @@ +-- AlterEnum +ALTER TYPE "Voice" ADD VALUE 'Supertonic'; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 19f6643..94f112f 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -25,4 +25,5 @@ model DiscordGuildProfile { enum Voice { TypeCast Papago + Supertonic }