add supertonic

This commit is contained in:
kimpure 2026-05-19 15:59:17 +00:00
parent 69ec38d16b
commit 5ee42ede56
No known key found for this signature in database
15 changed files with 223 additions and 86 deletions

View file

@ -0,0 +1,5 @@
.env
assets
Dockerfile
docker-compose.yml
.dockerignore

View file

@ -3,8 +3,13 @@ name = "yaejuyang-supertonic"
version = "0.1.0"
edition = "2024"
[profile.release]
opt-level = "z"
strip = true
lto = true
[features]
default = ["webgpu"]
default = []
webgpu = [ "ort/webgpu" ]
cuda = [ "ort/cuda" ]

View file

@ -0,0 +1,21 @@
# Stage 1 (assets): download the Supertonic model files.
FROM alpine:latest AS assets
# --no-cache keeps the apk package index out of the image layer.
RUN apk add --no-cache git git-lfs
WORKDIR /app
# Only the checked-out files are needed: clone without history and drop the
# .git directory so it is not copied into the runtime image.
RUN git clone --depth 1 https://huggingface.co/Supertone/supertonic-3 assets && rm -r assets/.git

# Stage 2 (builder): compile the server binary with the release profile.
FROM rust:trixie AS builder
RUN apt-get update && apt-get install -y --no-install-recommends \
    pkg-config libssl-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# COPY is preferred over ADD for plain local files (no archive/URL handling needed).
COPY . .
RUN cargo build --profile=release

# Stage 3 (runtime): slim image holding only the binary, the model assets and ffmpeg.
FROM debian:trixie-slim AS runtime
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY --from=assets /app/assets /app/assets
COPY --from=builder /app/target/release/yaejuyang-supertonic /app/
# Location of the model files copied from the assets stage.
ENV SUPERTONIC_MODEL_DIR="/app/assets"
# Default command so the image also starts the server under plain `docker run`;
# an explicit compose `command:` still overrides it.
CMD ["/app/yaejuyang-supertonic"]

View file

@ -0,0 +1,12 @@
# Compose definition for the yaejuyang-supertonic container.
services:
yaejuyang-supertonic:
# Build the image using the current directory as the build context.
build: .
# Path of the server binary inside the image.
command: /app/yaejuyang-supertonic
container_name: "yaejuyang-supertonic"
ports:
# Publish container port 80 on host port 3000.
- 3000:80
environment:
SUPERTONIC_WORKERS: "1"
# Empty value: no backend names are listed here.
ENABLED_BACKENDS: ""
WEBGPU_DEVICE_ID: "0"
# NOTE(review): presumably read by the tracing subscriber as a log filter
# (info by default, warn for `ort`) - confirm.
RUST_LOG: "info,ort=warn"

View file

@ -12,8 +12,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
dotenvy::dotenv().ok();
tracing_subscriber::fmt::init();
let model_dir = std::env::var("SUPERTONIC_MODEL_DIR")
.unwrap_or_else(|_| "./assets/supertonic-3".to_string());
let model_dir =
std::env::var("SUPERTONIC_MODEL_DIR").unwrap_or_else(|_| "./assets".to_string());
let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE")
.unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json"));
let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string());

View file

@ -382,6 +382,12 @@ fn load_backends(config: &HashMap<String, String>) -> Vec<ExecutionProviderDispa
.collect::<Vec<String>>();
enabled_backends.iter().filter_map(|name| {
let name = name.trim();
if name.is_empty() {
return None
}
#[cfg(feature = "cuda")]
if name == "cuda" {
return load_backend_cuda(config)

View file

@ -8,6 +8,7 @@ import TTSModelBase from "../tts";
import { DiscordUserProfile } from "../db/generated/prisma/client";
import { nyaize } from "../utils/nyaize";
import { OutputHandler } from "../utils/outputHandler";
import TTSSupertonicModel from "../tts/supertonic";
export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buffer> {
if (voice == "TypeCast") {
@ -19,8 +20,17 @@ export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buf
return await TTSTypecastModel.instance.getMemcachedVoice(
TTSTypecastModel.instance.createRequestId(content)
);
} else if (voice == "Supertonic") {
const content = TTSSupertonicModel.instance.ttsify(text);
if (!content.length)
throw new Error("Empty content");
return await TTSSupertonicModel.instance.getMemcachedVoice(
TTSSupertonicModel.instance.createRequestId(content)
);
} else if (voice == "Papago") {
const content = TTSTypecastModel.instance.ttsify(text);
const content = TTSPapagoModel.instance.ttsify(text);
if (!content.length)
throw new Error("Empty content");

View file

@ -11,7 +11,8 @@
export const Voice = {
TypeCast: 'TypeCast',
Papago: 'Papago'
Papago: 'Papago',
Supertonic: 'Supertonic'
} as const
export type Voice = (typeof Voice)[keyof typeof Voice]

View file

@ -20,7 +20,7 @@ const config: runtime.GetPrismaClientConfig = {
"clientVersion": "7.3.0",
"engineVersion": "9d6ad21cbbceab97458517b147a6a09ff43aa735",
"activeProvider": "postgresql",
"inlineSchema": "generator client {\n provider = \"prisma-client\"\n output = \"../packages/db/generated/prisma\"\n specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n id String @id @default(cuid())\n userId String @unique\n voice Voice @default(Papago)\n nya Boolean @default(false)\n canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n id String @id @default(cuid())\n guildId String @unique\n readChannel String[] @default([])\n}\n\nenum Voice {\n TypeCast\n Papago\n}\n",
"inlineSchema": "generator client {\n provider = \"prisma-client\"\n output = \"../packages/db/generated/prisma\"\n specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n id String @id @default(cuid())\n userId String @unique\n voice Voice @default(Papago)\n nya Boolean @default(false)\n canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n id String @id @default(cuid())\n guildId String @unique\n readChannel String[] @default([])\n}\n\nenum Voice {\n TypeCast\n Papago\n Supertonic\n}\n",
"runtimeDataModel": {
"models": {},
"enums": {},

View file

@ -2,6 +2,7 @@ import { createHmac } from "crypto";
import { join } from "path";
import fetch from "../utils/fetch";
import TTSModelBase from ".";
import { saferKorean } from "../utils/saferKorean";
export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
protected cachedVoice: Map<String, Promise<Buffer>>
@ -9,6 +10,11 @@ export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
super()
this.cachedVoice = new Map();
}
ttsify(input: string): string {
return super.ttsify(saferKorean(
input
))
}
public getVoicePath(id: TTSPapagoModel.RequestId): string {
const audioFileName = TTSModelBase.hashAudioFile(id.text, `.${id.speaker}.${id.speed.replace(/\-/g, "_")}`);
const audioPath = join(

View file

@ -0,0 +1,63 @@
import { join } from "path";
import fetch from "../utils/fetch";
import TTSModelBase from ".";
import { saferKorean } from "../utils/saferKorean";
/**
 * TTS model that obtains audio from a Supertonic HTTP endpoint.
 *
 * The endpoint URL is taken from `process.env.SUPERTONIC_API_URL` each time a
 * request is made.
 */
export class TTSSupertonicModel extends TTSModelBase<TTSSupertonicModel.RequestId> {
    /** Map of pending/finished voice buffers; created empty in the constructor. */
    protected override cachedVoice: Map<String, Promise<Buffer>>

    constructor() {
        super()
        this.cachedVoice = new Map();
    }

    /**
     * Passes the input through `saferKorean` and then through the base-class
     * `ttsify`.
     *
     * @param input Raw text.
     * @returns The text returned by the base-class `ttsify`.
     */
    override ttsify(input: string): string {
        const normalized = saferKorean(input);
        return super.ttsify(normalized);
    }

    /**
     * POSTs the request text as JSON (`{ text, lang: "ko" }`) to the Supertonic
     * endpoint.
     *
     * @throws Error when `process.env.SUPERTONIC_API_URL` is not set.
     */
    private async getSupertonicResponse(voiceId: TTSSupertonicModel.RequestId) {
        const requestBody = { text: voiceId.text, lang: "ko" };

        const endpoint = process.env.SUPERTONIC_API_URL;
        if (!endpoint) {
            throw Error("process.env.SUPERTONIC_API_URL not set");
        }

        const reply = await fetch(endpoint, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify(requestBody),
        });
        return reply;
    }

    /**
     * Requests the audio for `voiceId` and returns the raw response body.
     *
     * @throws Error containing the response text when the response is not OK.
     */
    async getVoiceBuffer(voiceId: TTSSupertonicModel.RequestId): Promise<ArrayBuffer> {
        const response = (await this.getSupertonicResponse(voiceId)) as Response;
        if (!response.ok) {
            throw new Error(`invalid supertonic response ${await response.text()}`);
        }
        return await response.arrayBuffer();
    }

    /**
     * Builds the cache file path for a request: the hashed file name for the
     * request text, placed under `SupertonicAudioCachePath`.
     */
    public getVoicePath(id: TTSSupertonicModel.RequestId): string {
        const hashedName = TTSModelBase.hashAudioFile(id.text);
        return join(TTSSupertonicModel.SupertonicAudioCachePath, hashedName);
    }

    /** Wraps the text in a `RequestId` object. */
    public createRequestId(text: string): TTSSupertonicModel.RequestId {
        return { text };
    }
}
// Namespace merged with the TTSSupertonicModel class: holds the shared
// instance, the request-id type and the cache directory.
export namespace TTSSupertonicModel {
// Shared instance created when this module is loaded.
export const instance = new TTSSupertonicModel();
// A request is identified by its text alone.
export type RequestId = { text: string };
// "supertonic" sub-directory of the base audio cache path; used by getVoicePath.
export const SupertonicAudioCachePath = join(TTSModelBase.AudioCachePath, "supertonic");
}
export default TTSSupertonicModel;

View file

@ -2,11 +2,9 @@ import { join } from "path";
import { TYPECAST_TOKENS } from "../env";
import fetch from "../utils/fetch";
import TTSModelBase from ".";
import CallingNumberKorean from "../utils/callingNumberKorean";
import IntegerKorean from "../utils/integerKorean";
import FloatKorean from "../utils/floatKorean";
import { readFileSync, writeFileSync } from "fs";
import { cwd, env } from "process";
import { saferKorean } from "../utils/saferKorean";
export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
protected cachedVoice: Map<String, Promise<Buffer>>
@ -19,61 +17,12 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
ttsify(input: string): string {
return super.ttsify(
input
.replace(/\.+$/, "")
.replace(/\.\.+/g, "")
.replace(/\.[ \t]/g, " ")
.replace(/^[\?\!\'\"]+$/, (total)=>(
[...total].map(element => TTSTypecastModel.IsolatedSymbolMap[
element as keyof typeof TTSTypecastModel.IsolatedSymbolMap
]).join("")
))
.replace(/\`\`\`.+?\`\`\`/g, "코드블럭")
.replace(/https\S+/g, "링크")
saferKorean(input)
.replace(/ㄴㄴ/g, "노노")
.replace(/ㅇㅋ/g, "오키")
.replace(/ㅜㅜ/g, "눙물")
.replace(/빵/g, "빵 크크")
.replace(/[\?]+ *ㄴ/g, "물음표ㄴ")
.replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{
const intNum = parseInt(num)
if (CallingNumberKorean.canConvert(intNum)) {
return CallingNumberKorean.convert(intNum) + postfix;
} else {
return IntegerKorean.convertFromString(num) + postfix;
}
})
.replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
const dotCount = [...num.matchAll(/\./g)].length;
const hasNoSuffix = suffix == "";
if (hasNoSuffix && dotCount == 0) {
return IntegerKorean.convertFromString(num) + postfix;
} else if (hasNoSuffix && dotCount == 1) {
const [intPart, floatPart] = num.split(/\./);
return (
IntegerKorean.convertFromString(intPart)
+ "쩜"
+ FloatKorean.convert(floatPart)
+ postfix
)
} else if (suffix == "v") {
return (
"버전"
+ FloatKorean.convert(num)
+ (TTSTypecastModel.VersionPostfix[
postfix as keyof typeof TTSTypecastModel.VersionPostfix
] ?? "")
);
} else {
return FloatKorean.convert(num) + postfix;
}
})
.replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => (
TTSTypecastModel.SymbolMap[t as keyof typeof TTSTypecastModel.SymbolMap]
))
.replace(/\?+/g, "?")
.replace(/\!+/g, "!")
)
}
private async getTypecastResponse(apiKey: string, voiceId: TTSTypecastModel.RequestId) {
@ -140,33 +89,6 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
}
}
export namespace TTSTypecastModel {
export const IsolatedSymbolMap = {
"?": "물음표",
"!": "느낌표",
"'": "쿼트",
"\"": "더블쿼트",
}
export const SymbolMap = {
"%": "퍼센트",
"$": "달러싸인",
"^": "캐럿",
"&": "엠퍼센드",
"*": "스타",
"#": "해시",
"@": "엣",
".": "쩜",
"-": "마이너스",
"+": "플러스",
"_": "언더바",
"=": "이퀄",
"/": "슬래쉬",
"\\": "역슬래쉬",
"♡": "하투 ",
};
export const VersionPostfix = {
"a": "알파",
"b": "베타",
};
export const instance = new TTSTypecastModel();
export type RequestId = { text: string, voiceId: string };
export const TypecastAudioCachePath = join(TTSModelBase.AudioCachePath, "typecast");

View file

@ -0,0 +1,83 @@
import CallingNumberKorean from "./callingNumberKorean";
import FloatKorean from "./floatKorean";
import IntegerKorean from "./integerKorean";
/**
 * Korean readings for punctuation marks, used by `saferKorean` when the whole
 * input consists only of these characters.
 */
export const IsolatedSymbolMap = {
    '?': '물음표',
    '!': '느낌표',
    "'": '쿼트',
    '"': '더블쿼트',
};

/** Korean readings substituted by `saferKorean` for individual symbols. */
export const SymbolMap = {
    '%': '퍼센트',
    '$': '달러싸인',
    '^': '캐럿',
    '&': '엠퍼센드',
    '*': '스타',
    '#': '해시',
    '@': '엣',
    '.': '쩜',
    '-': '마이너스',
    '+': '플러스',
    '_': '언더바',
    '=': '이퀄',
    '/': '슬래쉬',
    '\\': '역슬래쉬',
    // The trailing space in this reading is intentional and preserved.
    '♡': '하투 ',
};

/** Korean readings for the `a` / `b` postfix of a `v`-prefixed number. */
export const VersionPostfix = {
    a: '알파',
    b: '베타',
};
/**
 * Rewrites chat text so that dots, symbols, numbers, links and code blocks are
 * replaced with Korean words before the text is handed to a TTS engine.
 *
 * The replacements run in a fixed order; later steps see the output of earlier
 * ones. Number wording is delegated to `CallingNumberKorean`, `IntegerKorean`
 * and `FloatKorean`.
 *
 * @param input Raw text.
 * @returns The rewritten text.
 */
export function saferKorean(input: string): string {
    return input
        // Drop dots at the very end of the text.
        .replace(/\.+$/, "")
        // Remove runs of two or more dots.
        .replace(/\.\.+/g, "")
        // A dot followed by a space or tab becomes a single space.
        .replace(/\.[ \t]/g, " ")
        // Text made up only of ? ! ' " is spelled out symbol by symbol.
        .replace(/^[\?\!\'\"]+$/, (total) => (
            [...total].map(element => IsolatedSymbolMap[
                element as keyof typeof IsolatedSymbolMap
            ]).join("")
        ))
        // Fenced code blocks. [\s\S] is used instead of "." so that blocks
        // spanning several lines are matched as well.
        .replace(/\`\`\`[\s\S]+?\`\`\`/g, "코드블럭")
        // Links, both http:// and https://, up to the next whitespace.
        .replace(/https?:\/\/\S*/g, "링크")
        // A number followed by the counter 개 or 살: use the counting form when
        // CallingNumberKorean supports the value, otherwise the integer form.
        .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string) => {
            const intNum = parseInt(num, 10);
            if (CallingNumberKorean.canConvert(intNum)) {
                return CallingNumberKorean.convert(intNum) + postfix;
            } else {
                return IntegerKorean.convertFromString(num) + postfix;
            }
        })
        // Remaining digit/dot runs, with an optional "v" prefix and "a"/"b" postfix.
        .replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
            const dotCount = [...num.matchAll(/\./g)].length;
            const hasNoSuffix = suffix == "";
            if (hasNoSuffix && dotCount == 0) {
                // Plain integer.
                return IntegerKorean.convertFromString(num) + postfix;
            } else if (hasNoSuffix && dotCount == 1) {
                // Decimal number: integer part, "쩜", then the fractional digits.
                const [intPart, floatPart] = num.split(/\./);
                return (
                    IntegerKorean.convertFromString(intPart)
                    + "쩜"
                    + FloatKorean.convert(floatPart)
                    + postfix
                );
            } else if (suffix == "v") {
                // "v"-prefixed number: "버전" + digits + spoken a/b postfix.
                return (
                    "버전"
                    + FloatKorean.convert(num)
                    + (VersionPostfix[
                        postfix as keyof typeof VersionPostfix
                    ] ?? "")
                );
            } else {
                // Two or more dots without a "v" prefix.
                return FloatKorean.convert(num) + postfix;
            }
        })
        // Spell out each remaining symbol that has an entry in SymbolMap.
        .replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => (
            SymbolMap[t as keyof typeof SymbolMap]
        ))
        // Collapse repeated ? and ! into a single character.
        .replace(/\?+/g, "?")
        .replace(/\!+/g, "!");
}

View file

@ -0,0 +1,2 @@
-- AlterEnum
ALTER TYPE "Voice" ADD VALUE 'Supertonic';

View file

@ -25,4 +25,5 @@ model DiscordGuildProfile {
// TTS voice options; used as the type of DiscordUserProfile.voice.
enum Voice {
TypeCast
Papago
// Added by the migration that introduces the Supertonic voice.
Supertonic
}