add supertonic

2026-05-19 15:59:17 +00:00 · 2026-05-19 15:59:17 +00:00 · 5ee42ede56
commit 5ee42ede56
parent 69ec38d16b
15 changed files with 223 additions and 86 deletions
--- a/crates/yaejuyang-supertonic/.dockerignore
+++ b/crates/yaejuyang-supertonic/.dockerignore
@ -0,0 +1,5 @@
+.env
+assets
+Dockerfile
+docker-compose.yml
+.dockerignore
--- a/crates/yaejuyang-supertonic/Cargo.toml
+++ b/crates/yaejuyang-supertonic/Cargo.toml
@ -3,8 +3,13 @@ name = "yaejuyang-supertonic"
 version = "0.1.0"
 edition = "2024"

+[profile.release]
+opt-level = "z"
+strip = true
+lto = true
+
 [features]
-default = ["webgpu"]
+default = []
 webgpu = [ "ort/webgpu" ]
 cuda = [ "ort/cuda" ]

--- a/crates/yaejuyang-supertonic/Dockerfile
+++ b/crates/yaejuyang-supertonic/Dockerfile
@ -0,0 +1,21 @@
+# Stage 1: fetch the Supertonic model repository from Hugging Face.
+FROM alpine:latest AS assets
+
+# --no-cache keeps the apk package index out of the layer.
+RUN apk add --no-cache git git-lfs
+WORKDIR /app
+# `git lfs install` registers the LFS filters before cloning, so LFS-tracked
+# files are downloaded as real content rather than left as pointer stubs.
+# .git is dropped because only the checked-out files are copied further on.
+RUN git lfs install \
+    && git clone https://huggingface.co/Supertone/supertonic-3 assets \
+    && rm -r assets/.git
+
+# Stage 2: compile the crate in release mode.
+FROM rust:trixie AS builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    pkg-config libssl-dev
+WORKDIR /app
+# COPY is the plain local-context copy; ADD's extra features are not needed here.
+COPY . .
+RUN cargo build --profile=release
+
+# Stage 3: minimal runtime image with the binary and the model files.
+FROM debian:trixie-slim AS runtime
+
+# Remove the apt package lists afterwards so they do not bloat the final image.
+RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+COPY --from=assets /app/assets /app/assets
+COPY --from=builder /app/target/release/yaejuyang-supertonic /app/
+ENV SUPERTONIC_MODEL_DIR="/app/assets"
+# Default command so the image can be run without an explicit `command:`.
+CMD ["/app/yaejuyang-supertonic"]
--- a/crates/yaejuyang-supertonic/docker-compose.example.yml
+++ b/crates/yaejuyang-supertonic/docker-compose.example.yml
@ -0,0 +1,12 @@
+# Example compose file for the yaejuyang-supertonic service.
+services:
+  yaejuyang-supertonic:
+    build: .
+    # Path of the binary that the Dockerfile copies into /app.
+    command: /app/yaejuyang-supertonic
+    container_name: "yaejuyang-supertonic"
+    ports:
+      # HOST:CONTAINER — host port 3000 is mapped to container port 80.
+      - 3000:80
+    environment:
+      SUPERTONIC_WORKERS: "1"
+      # Left empty here. NOTE(review): presumably read by load_backends in
+      # src/tts/engine.rs, which trims names and skips empty ones — verify.
+      ENABLED_BACKENDS: ""
+      WEBGPU_DEVICE_ID: "0"
+      RUST_LOG: "info,ort=warn"
--- a/crates/yaejuyang-supertonic/src/main.rs
+++ b/crates/yaejuyang-supertonic/src/main.rs
@ -12,8 +12,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    dotenvy::dotenv().ok();
    tracing_subscriber::fmt::init();

-    let model_dir = std::env::var("SUPERTONIC_MODEL_DIR")
-        .unwrap_or_else(|_| "./assets/supertonic-3".to_string());
+    let model_dir =
+        std::env::var("SUPERTONIC_MODEL_DIR").unwrap_or_else(|_| "./assets".to_string());
    let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE")
        .unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json"));
    let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string());
--- a/crates/yaejuyang-supertonic/src/tts/engine.rs
+++ b/crates/yaejuyang-supertonic/src/tts/engine.rs
@ -382,6 +382,12 @@ fn load_backends(config: &HashMap<String, String>) -> Vec<ExecutionProviderDispa
        .collect::<Vec<String>>();

    enabled_backends.iter().filter_map(|name| {
+        let name = name.trim();
+
+        if name.is_empty() {
+            return None
+        }
+        
        #[cfg(feature = "cuda")]
        if name == "cuda" {
            return load_backend_cuda(config)
--- a/packages/bot/tts.ts
+++ b/packages/bot/tts.ts
@ -8,6 +8,7 @@ import TTSModelBase from "../tts";
 import { DiscordUserProfile } from "../db/generated/prisma/client";
 import { nyaize } from "../utils/nyaize";
 import { OutputHandler } from "../utils/outputHandler";
+import TTSSupertonicModel from "../tts/supertonic";

 export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buffer> {
    if (voice == "TypeCast") {
@ -19,8 +20,17 @@ export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buf
        return await TTSTypecastModel.instance.getMemcachedVoice(
            TTSTypecastModel.instance.createRequestId(content)
        );
+    } else if (voice == "Supertonic") {
+        const content = TTSSupertonicModel.instance.ttsify(text);
+
+        if (!content.length)
+            throw new Error("Empty content");
+
+        return await TTSSupertonicModel.instance.getMemcachedVoice(
+            TTSSupertonicModel.instance.createRequestId(content)
+        );
    } else if (voice == "Papago") {
-        const content = TTSTypecastModel.instance.ttsify(text);
+        const content = TTSPapagoModel.instance.ttsify(text);
        if (!content.length)
            throw new Error("Empty content");

--- a/packages/db/generated/prisma/enums.ts
+++ b/packages/db/generated/prisma/enums.ts
@ -11,7 +11,8 @@

 export const Voice = {
  TypeCast: 'TypeCast',
-  Papago: 'Papago'
+  Papago: 'Papago',
+  Supertonic: 'Supertonic'
 } as const

 export type Voice = (typeof Voice)[keyof typeof Voice]
--- a/packages/db/generated/prisma/internal/class.ts
+++ b/packages/db/generated/prisma/internal/class.ts
@ -20,7 +20,7 @@ const config: runtime.GetPrismaClientConfig = {
  "clientVersion": "7.3.0",
  "engineVersion": "9d6ad21cbbceab97458517b147a6a09ff43aa735",
  "activeProvider": "postgresql",
-  "inlineSchema": "generator client {\n  provider   = \"prisma-client\"\n  output     = \"../packages/db/generated/prisma\"\n  specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n  provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n  id          String  @id @default(cuid())\n  userId      String  @unique\n  voice       Voice   @default(Papago)\n  nya         Boolean @default(false)\n  canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n  id          String   @id @default(cuid())\n  guildId     String   @unique\n  readChannel String[] @default([])\n}\n\nenum Voice {\n  TypeCast\n  Papago\n}\n",
+  "inlineSchema": "generator client {\n  provider   = \"prisma-client\"\n  output     = \"../packages/db/generated/prisma\"\n  specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n  provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n  id          String  @id @default(cuid())\n  userId      String  @unique\n  voice       Voice   @default(Papago)\n  nya         Boolean @default(false)\n  canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n  id          String   @id @default(cuid())\n  guildId     String   @unique\n  readChannel String[] @default([])\n}\n\nenum Voice {\n  TypeCast\n  Papago\n  Supertonic\n}\n",
  "runtimeDataModel": {
    "models": {},
    "enums": {},
--- a/packages/tts/papago.ts
+++ b/packages/tts/papago.ts
@ -2,6 +2,7 @@ import { createHmac } from "crypto";
 import { join } from "path";
 import fetch from "../utils/fetch";
 import TTSModelBase from ".";
+import { saferKorean } from "../utils/saferKorean";

 export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
    protected cachedVoice: Map<String, Promise<Buffer>>
@ -9,6 +10,11 @@ export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
        super()
        this.cachedVoice = new Map();
    }
+    /**
+     * Passes the input through `saferKorean` and then through the base
+     * class `ttsify`.
+     *
+     * @param input Raw text to prepare for synthesis.
+     * @returns The result of the base class `ttsify` on the rewritten text.
+     */
+    ttsify(input: string): string {
+        const normalized = saferKorean(input);
+        return super.ttsify(normalized);
+    }
    public getVoicePath(id: TTSPapagoModel.RequestId): string {
        const audioFileName = TTSModelBase.hashAudioFile(id.text, `.${id.speaker}.${id.speed.replace(/\-/g, "_")}`);
        const audioPath = join(
--- a/packages/tts/supertonic.ts
+++ b/packages/tts/supertonic.ts
@ -0,0 +1,63 @@
+import { join } from "path";
+import fetch from "../utils/fetch";
+import TTSModelBase from ".";
+import { saferKorean } from "../utils/saferKorean";
+
+/**
+ * TTS model that requests synthesized speech from a Supertonic HTTP endpoint.
+ *
+ * The endpoint URL is read from `process.env.SUPERTONIC_API_URL`, and every
+ * request is sent with `lang: "ko"`.
+ */
+export class TTSSupertonicModel extends TTSModelBase<TTSSupertonicModel.RequestId> {
+    // NOTE(review): `String` is the boxed wrapper type; `string` is the idiomatic key type.
+    // Kept as-is because this property overrides the one on TTSModelBase — confirm there first.
+    protected override cachedVoice: Map<String, Promise<Buffer>>
+    constructor() {
+        super()
+        this.cachedVoice = new Map();
+    }
+    /**
+     * Passes the input through `saferKorean` and then through the base
+     * class `ttsify`.
+     */
+    override ttsify(input: string): string {
+        return super.ttsify(saferKorean(
+            input
+        ))
+    }
+    /**
+     * POSTs the request text as JSON to the configured Supertonic endpoint.
+     *
+     * @throws Error when `process.env.SUPERTONIC_API_URL` is not set.
+     */
+    private async getSupertonicResponse(voiceId: TTSSupertonicModel.RequestId) {
+        // Fail before building anything if the endpoint is not configured.
+        const apiUrl = process.env.SUPERTONIC_API_URL;
+        if (!apiUrl) {
+            throw new Error("process.env.SUPERTONIC_API_URL not set");
+        }
+
+        const payload = {
+            text: voiceId.text,
+            lang: "ko",
+        };
+
+        return await fetch(apiUrl, {
+            method: "POST",
+            headers: {
+                "Content-Type": "application/json"
+            },
+            body: JSON.stringify(payload)
+        });
+    }
+    /**
+     * Requests the audio for `voiceId` and returns the raw response body.
+     *
+     * @returns The response body as an `ArrayBuffer` when the response is OK.
+     * @throws Error carrying the HTTP status and body text when it is not OK.
+     */
+    async getVoiceBuffer(voiceId: TTSSupertonicModel.RequestId): Promise<ArrayBuffer> {
+        const response = await this.getSupertonicResponse(voiceId) as Response;
+        if (response.ok)
+            return await response.arrayBuffer();
+
+        // Include the status line: the body alone may be empty or unhelpful.
+        throw new Error(
+            `invalid supertonic response ${response.status} ${response.statusText}: ${await response.text()}`
+        );
+    }
+    /**
+     * Builds the cache file path for a request: the hashed file name of the
+     * text, inside `TTSSupertonicModel.SupertonicAudioCachePath`.
+     */
+    public getVoicePath(id: TTSSupertonicModel.RequestId): string {
+        const audioFileName = TTSModelBase.hashAudioFile(id.text);
+        const audioPath = join(
+            TTSSupertonicModel.SupertonicAudioCachePath,
+            audioFileName
+        );
+        return audioPath;
+    }
+    /** Wraps the text in a `RequestId`; the text is its only field. */
+    public createRequestId(text: string): TTSSupertonicModel.RequestId {
+        return {
+            text,
+        };
+    }
+}
+// Namespace merged with the class above: shared instance, request type and cache path.
+export namespace TTSSupertonicModel {
+    /** Instance created once when this module is loaded. */
+    export const instance = new TTSSupertonicModel();
+    /** Identifies one synthesis request; the text is the only field. */
+    export type RequestId = { text: string };
+    /** The "supertonic" subdirectory of `TTSModelBase.AudioCachePath`. */
+    export const SupertonicAudioCachePath = join(TTSModelBase.AudioCachePath, "supertonic");
+}
+export default TTSSupertonicModel;
--- a/packages/tts/typecast.ts
+++ b/packages/tts/typecast.ts
@ -2,11 +2,9 @@ import { join } from "path";
 import { TYPECAST_TOKENS } from "../env";
 import fetch from "../utils/fetch";
 import TTSModelBase from ".";
-import CallingNumberKorean from "../utils/callingNumberKorean";
-import IntegerKorean from "../utils/integerKorean";
-import FloatKorean from "../utils/floatKorean";
 import { readFileSync, writeFileSync } from "fs";
 import { cwd, env } from "process";
+import { saferKorean } from "../utils/saferKorean";

 export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
    protected cachedVoice: Map<String, Promise<Buffer>>
@ -19,61 +17,12 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
    ttsify(input: string): string {
        
        return super.ttsify(
-            input
-            .replace(/\.+$/, "")
-            .replace(/\.\.+/g, "")
-            .replace(/\.[ \t]/g, " ")
-            .replace(/^[\?\!\'\"]+$/, (total)=>(
-                [...total].map(element => TTSTypecastModel.IsolatedSymbolMap[
-                    element as keyof typeof TTSTypecastModel.IsolatedSymbolMap
-                ]).join("")
-            ))
-            .replace(/\`\`\`.+?\`\`\`/g, "코드블럭")
-            .replace(/https\S+/g, "링크")
+            saferKorean(input)
            .replace(/ㄴㄴ/g, "노노")
            .replace(/ㅇㅋ/g, "오키")
            .replace(/ㅜㅜ/g, "눙물")
            .replace(/빵/g, "빵 크크")
            .replace(/[\?]+ *ㄴ/g, "물음표ㄴ")
-            .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{
-                const intNum = parseInt(num)
-                if (CallingNumberKorean.canConvert(intNum)) {
-                    return CallingNumberKorean.convert(intNum) + postfix;
-                } else {
-                    return IntegerKorean.convertFromString(num) + postfix;
-                }
-            })
-            .replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
-                const dotCount = [...num.matchAll(/\./g)].length;
-                const hasNoSuffix = suffix == "";
-
-                if (hasNoSuffix && dotCount == 0) {
-                    return IntegerKorean.convertFromString(num) + postfix;
-                } else if (hasNoSuffix && dotCount == 1) {
-                    const [intPart, floatPart] = num.split(/\./);
-                    return (
-                        IntegerKorean.convertFromString(intPart)
-                        + "쩜"
-                        + FloatKorean.convert(floatPart)
-                        + postfix
-                    )
-                } else if (suffix == "v") {
-                    return (
-                        "버전"
-                        + FloatKorean.convert(num)
-                        + (TTSTypecastModel.VersionPostfix[
-                            postfix as keyof typeof TTSTypecastModel.VersionPostfix
-                        ] ?? "")
-                    );
-                } else {
-                    return FloatKorean.convert(num) + postfix;
-                }
-            })
-            .replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => (
-                TTSTypecastModel.SymbolMap[t as keyof typeof TTSTypecastModel.SymbolMap]
-            ))
-            .replace(/\?+/g, "?")
-            .replace(/\!+/g, "!")
        )
    }
    private async getTypecastResponse(apiKey: string, voiceId: TTSTypecastModel.RequestId) {
@ -140,33 +89,6 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
    }
 }
 export namespace TTSTypecastModel {
-    export const IsolatedSymbolMap = {
-        "?": "물음표",
-        "!": "느낌표",
-        "'": "쿼트",
-        "\"": "더블쿼트",
-    }
-    export const SymbolMap = {
-        "%": "퍼센트",
-        "$": "달러싸인",
-        "^": "캐럿",
-        "&": "엠퍼센드",
-        "*": "스타",
-        "#": "해시",
-        "@": "엣",
-        ".": "쩜",
-        "-": "마이너스",
-        "+": "플러스",
-        "_": "언더바",
-        "=": "이퀄",
-        "/": "슬래쉬",
-        "\\": "역슬래쉬",
-        "♡": "하투 ",
-    };
-    export const VersionPostfix = {
-        "a": "알파",
-        "b": "베타",
-    };
    export const instance = new TTSTypecastModel();
    export type RequestId = { text: string, voiceId: string };
    export const TypecastAudioCachePath = join(TTSModelBase.AudioCachePath, "typecast");
--- a/packages/utils/saferKorean.ts
+++ b/packages/utils/saferKorean.ts
@ -0,0 +1,83 @@
+import CallingNumberKorean from "./callingNumberKorean";
+import FloatKorean from "./floatKorean";
+import IntegerKorean from "./integerKorean";
+
+/**
+ * Replacement words used when the whole input consists only of these
+ * punctuation marks; `saferKorean` substitutes each character with its entry.
+ */
+export const IsolatedSymbolMap = {
+    "?": "물음표",
+    "!": "느낌표",
+    "'": "쿼트",
+    "\"": "더블쿼트",
+}
+/**
+ * Replacement words for individual symbols; `saferKorean` looks up every
+ * character matched by its symbol regex in this table.
+ */
+export const SymbolMap = {
+    "%": "퍼센트",
+    "$": "달러싸인",
+    "^": "캐럿",
+    "&": "엠퍼센드",
+    "*": "스타",
+    "#": "해시",
+    "@": "엣",
+    ".": "쩜",
+    "-": "마이너스",
+    "+": "플러스",
+    "_": "언더바",
+    "=": "이퀄",
+    "/": "슬래쉬",
+    "\\": "역슬래쉬",
+    "♡": "하투 ",
+};
+/**
+ * Words appended by `saferKorean` for an `a` / `b` suffix that follows a
+ * `v`-prefixed number.
+ */
+export const VersionPostfix = {
+    "a": "알파",
+    "b": "베타",
+};
+
+/**
+ * Rewrites chat text into a form intended to be read aloud by a Korean TTS voice.
+ *
+ * The replacements run in the order written and the order matters: dots are
+ * trimmed first, fenced code blocks and links are collapsed into a single
+ * word before their contents can be expanded, digits are handed to the
+ * CallingNumberKorean / IntegerKorean / FloatKorean helpers next, and the
+ * remaining symbols are replaced through `SymbolMap` last.
+ *
+ * @param input Raw message text.
+ * @returns The rewritten text.
+ */
+export function saferKorean(input: string): string {
+    // Drop trailing dots, remove runs of two or more dots, and turn ". " into a space.
+    return input.replace(/\.+$/, "")
+        .replace(/\.\.+/g, "")
+        .replace(/\.[ \t]/g, " ")
+        // Input made up only of ? ! ' " is spelled out symbol by symbol.
+        .replace(/^[\?\!\'\"]+$/, (total)=>(
+            [...total].map(element => IsolatedSymbolMap[
+                element as keyof typeof IsolatedSymbolMap
+            ]).join("")
+        ))
+        // [\s\S] instead of "." so that a fenced block spanning several lines is matched too.
+        .replace(/\`\`\`[\s\S]+?\`\`\`/g, "코드블럭")
+        // Matches both http:// and https:// links.
+        .replace(/https?:\/\/\S+/g, "링크")
+        // A number followed by the counter 개 or 살.
+        .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{
+            const intNum = parseInt(num, 10)
+            if (CallingNumberKorean.canConvert(intNum)) {
+                return CallingNumberKorean.convert(intNum) + postfix;
+            } else {
+                return IntegerKorean.convertFromString(num) + postfix;
+            }
+        })
+        // NOTE(review): [\d\.]+ also matches a lone "." (e.g. in "a.b"), in which case the
+        // helpers below receive empty strings — confirm they handle that.
+        .replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
+            const dotCount = [...num.matchAll(/\./g)].length;
+            const hasNoSuffix = suffix == "";
+
+            if (hasNoSuffix && dotCount == 0) {
+                return IntegerKorean.convertFromString(num) + postfix;
+            } else if (hasNoSuffix && dotCount == 1) {
+                const [intPart, floatPart] = num.split(/\./);
+                return (
+                    IntegerKorean.convertFromString(intPart)
+                    + "쩜"
+                    + FloatKorean.convert(floatPart)
+                    + postfix
+                )
+            } else if (suffix == "v") {
+                // "v"-prefixed number: an a/b postfix is looked up in VersionPostfix.
+                return (
+                    "버전"
+                    + FloatKorean.convert(num)
+                    + (VersionPostfix[
+                        postfix as keyof typeof VersionPostfix
+                    ] ?? "")
+                );
+            } else {
+                return FloatKorean.convert(num) + postfix;
+            }
+        })
+        // Every character in this class has an entry in SymbolMap.
+        .replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => (
+            SymbolMap[t as keyof typeof SymbolMap]
+        ))
+        // Collapse repeated ? and ! into a single mark.
+        .replace(/\?+/g, "?")
+        .replace(/\!+/g, "!")
+}
--- a/prisma/migrations/20260519155510_add_supertonic/migration.sql
+++ b/prisma/migrations/20260519155510_add_supertonic/migration.sql
@ -0,0 +1,2 @@
+-- AlterEnum
+ALTER TYPE "Voice" ADD VALUE 'Supertonic';
--- a/prisma/schema.prisma
+++ b/prisma/schema.prisma
@ -25,4 +25,5 @@ model DiscordGuildProfile {
 enum Voice {
  TypeCast
  Papago
+  Supertonic
 }