add supertonic
This commit is contained in:
parent
69ec38d16b
commit
5ee42ede56
15 changed files with 223 additions and 86 deletions
5
crates/yaejuyang-supertonic/.dockerignore
Normal file
5
crates/yaejuyang-supertonic/.dockerignore
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
.env
|
||||
assets
|
||||
Dockerfile
|
||||
docker-compose.yml
|
||||
.dockerignore
|
||||
|
|
@ -3,8 +3,13 @@ name = "yaejuyang-supertonic"
|
|||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[profile.release]
|
||||
opt-level = "z"
|
||||
strip = true
|
||||
lto = true
|
||||
|
||||
[features]
|
||||
default = ["webgpu"]
|
||||
default = []
|
||||
webgpu = [ "ort/webgpu" ]
|
||||
cuda = [ "ort/cuda" ]
|
||||
|
||||
|
|
|
|||
21
crates/yaejuyang-supertonic/Dockerfile
Normal file
21
crates/yaejuyang-supertonic/Dockerfile
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# Stage 1: download the Supertonic model repository from Hugging Face.
FROM alpine:latest AS assets

# --no-cache keeps the apk package index out of the image layer.
RUN apk add --no-cache git git-lfs
WORKDIR /app
# `git lfs install` registers the LFS filters so the checkout contains the real
# files instead of pointer stubs (a no-op if the filters are already set up).
# The history is deleted right after cloning, so a shallow clone is enough.
RUN git lfs install \
    && git clone --depth 1 https://huggingface.co/Supertone/supertonic-3 assets \
    && rm -r assets/.git

# Stage 2: compile the service binary.
FROM rust:trixie AS builder

# Remove the apt package lists in the same layer so they do not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        pkg-config libssl-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
# COPY is the recommended instruction for plain local files; ADD's extra
# behaviours (remote URLs, archive extraction) are not needed here.
COPY . .
RUN cargo build --profile=release

# Stage 3: runtime image containing ffmpeg, the model assets and the binary.
FROM debian:trixie-slim AS runtime

RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY --from=assets /app/assets /app/assets
COPY --from=builder /app/target/release/yaejuyang-supertonic /app/
# Points the service at the assets copied above.
ENV SUPERTONIC_MODEL_DIR="/app/assets"
# Default command so the image is runnable on its own; a compose `command:`
# entry still overrides it.
CMD ["/app/yaejuyang-supertonic"]
|
||||
12
crates/yaejuyang-supertonic/docker-compose.example.yml
Normal file
12
crates/yaejuyang-supertonic/docker-compose.example.yml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
services:
|
||||
yaejuyang-supertonic:
|
||||
build: .
|
||||
command: /app/yaejuyang-supertonic
|
||||
container_name: "yaejuyang-supertonic"
|
||||
ports:
|
||||
- 3000:80
|
||||
environment:
|
||||
SUPERTONIC_WORKERS: "1"
|
||||
ENABLED_BACKENDS: ""
|
||||
WEBGPU_DEVICE_ID: "0"
|
||||
RUST_LOG: "info,ort=warn"
|
||||
|
|
@ -12,8 +12,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
|||
dotenvy::dotenv().ok();
|
||||
tracing_subscriber::fmt::init();
|
||||
|
||||
let model_dir = std::env::var("SUPERTONIC_MODEL_DIR")
|
||||
.unwrap_or_else(|_| "./assets/supertonic-3".to_string());
|
||||
let model_dir =
|
||||
std::env::var("SUPERTONIC_MODEL_DIR").unwrap_or_else(|_| "./assets".to_string());
|
||||
let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE")
|
||||
.unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json"));
|
||||
let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string());
|
||||
|
|
|
|||
|
|
@ -382,6 +382,12 @@ fn load_backends(config: &HashMap<String, String>) -> Vec<ExecutionProviderDispa
|
|||
.collect::<Vec<String>>();
|
||||
|
||||
enabled_backends.iter().filter_map(|name| {
|
||||
let name = name.trim();
|
||||
|
||||
if name.is_empty() {
|
||||
return None
|
||||
}
|
||||
|
||||
#[cfg(feature = "cuda")]
|
||||
if name == "cuda" {
|
||||
return load_backend_cuda(config)
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import TTSModelBase from "../tts";
|
|||
import { DiscordUserProfile } from "../db/generated/prisma/client";
|
||||
import { nyaize } from "../utils/nyaize";
|
||||
import { OutputHandler } from "../utils/outputHandler";
|
||||
import TTSSupertonicModel from "../tts/supertonic";
|
||||
|
||||
export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buffer> {
|
||||
if (voice == "TypeCast") {
|
||||
|
|
@ -19,8 +20,17 @@ export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buf
|
|||
return await TTSTypecastModel.instance.getMemcachedVoice(
|
||||
TTSTypecastModel.instance.createRequestId(content)
|
||||
);
|
||||
} else if (voice == "Supertonic") {
|
||||
const content = TTSSupertonicModel.instance.ttsify(text);
|
||||
|
||||
if (!content.length)
|
||||
throw new Error("Empty content");
|
||||
|
||||
return await TTSSupertonicModel.instance.getMemcachedVoice(
|
||||
TTSSupertonicModel.instance.createRequestId(content)
|
||||
);
|
||||
} else if (voice == "Papago") {
|
||||
const content = TTSTypecastModel.instance.ttsify(text);
|
||||
const content = TTSPapagoModel.instance.ttsify(text);
|
||||
if (!content.length)
|
||||
throw new Error("Empty content");
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,8 @@
|
|||
|
||||
export const Voice = {
|
||||
TypeCast: 'TypeCast',
|
||||
Papago: 'Papago'
|
||||
Papago: 'Papago',
|
||||
Supertonic: 'Supertonic'
|
||||
} as const
|
||||
|
||||
export type Voice = (typeof Voice)[keyof typeof Voice]
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ const config: runtime.GetPrismaClientConfig = {
|
|||
"clientVersion": "7.3.0",
|
||||
"engineVersion": "9d6ad21cbbceab97458517b147a6a09ff43aa735",
|
||||
"activeProvider": "postgresql",
|
||||
"inlineSchema": "generator client {\n provider = \"prisma-client\"\n output = \"../packages/db/generated/prisma\"\n specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n id String @id @default(cuid())\n userId String @unique\n voice Voice @default(Papago)\n nya Boolean @default(false)\n canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n id String @id @default(cuid())\n guildId String @unique\n readChannel String[] @default([])\n}\n\nenum Voice {\n TypeCast\n Papago\n}\n",
|
||||
"inlineSchema": "generator client {\n provider = \"prisma-client\"\n output = \"../packages/db/generated/prisma\"\n specifying = [\"prismaSchemaFolder\"]\n}\n\ndatasource db {\n provider = \"postgresql\"\n}\n\nmodel DiscordUserProfile {\n id String @id @default(cuid())\n userId String @unique\n voice Voice @default(Papago)\n nya Boolean @default(false)\n canTypecast Boolean @default(false)\n}\n\nmodel DiscordGuildProfile {\n id String @id @default(cuid())\n guildId String @unique\n readChannel String[] @default([])\n}\n\nenum Voice {\n TypeCast\n Papago\n Supertonic\n}\n",
|
||||
"runtimeDataModel": {
|
||||
"models": {},
|
||||
"enums": {},
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import { createHmac } from "crypto";
|
|||
import { join } from "path";
|
||||
import fetch from "../utils/fetch";
|
||||
import TTSModelBase from ".";
|
||||
import { saferKorean } from "../utils/saferKorean";
|
||||
|
||||
export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
|
||||
protected cachedVoice: Map<String, Promise<Buffer>>
|
||||
|
|
@ -9,6 +10,11 @@ export class TTSPapagoModel extends TTSModelBase<TTSPapagoModel.RequestId> {
|
|||
super()
|
||||
this.cachedVoice = new Map();
|
||||
}
|
||||
ttsify(input: string): string {
|
||||
return super.ttsify(saferKorean(
|
||||
input
|
||||
))
|
||||
}
|
||||
public getVoicePath(id: TTSPapagoModel.RequestId): string {
|
||||
const audioFileName = TTSModelBase.hashAudioFile(id.text, `.${id.speaker}.${id.speed.replace(/\-/g, "_")}`);
|
||||
const audioPath = join(
|
||||
|
|
|
|||
63
packages/tts/supertonic.ts
Normal file
63
packages/tts/supertonic.ts
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import { join } from "path";
|
||||
import fetch from "../utils/fetch";
|
||||
import TTSModelBase from ".";
|
||||
import { saferKorean } from "../utils/saferKorean";
|
||||
|
||||
/**
 * TTS model that obtains audio by POSTing text to a Supertonic HTTP endpoint.
 *
 * The endpoint is taken from `process.env.SUPERTONIC_API_URL`, and every
 * request is sent with `lang: "ko"`.
 */
export class TTSSupertonicModel extends TTSModelBase<TTSSupertonicModel.RequestId> {
    protected override cachedVoice: Map<String, Promise<Buffer>>

    constructor() {
        super()
        this.cachedVoice = new Map();
    }

    /**
     * Normalises the text with `saferKorean` and then hands the result to the
     * base class `ttsify`.
     *
     * @param input Raw text to prepare for synthesis.
     * @returns The text returned by the base class after normalisation.
     */
    override ttsify(input: string): string {
        const normalized = saferKorean(input);
        return super.ttsify(normalized);
    }

    /**
     * Sends the synthesis request as a JSON POST to the configured endpoint.
     *
     * @param voiceId Request whose `text` is sent to the service.
     * @returns Whatever the project `fetch` helper resolves to.
     * @throws Error when `SUPERTONIC_API_URL` is not set.
     */
    private async getSupertonicResponse(voiceId: TTSSupertonicModel.RequestId) {
        const endpoint = process.env.SUPERTONIC_API_URL;
        if (!endpoint) {
            throw new Error("process.env.SUPERTONIC_API_URL not set");
        }

        const body = JSON.stringify({
            text: voiceId.text,
            lang: "ko",
        });

        return await fetch(endpoint, {
            method: "POST",
            headers: {
                "Content-Type": "application/json"
            },
            body
        });
    }

    /**
     * Requests audio for the given text and returns the raw response body.
     *
     * @param voiceId Request describing the text to synthesise.
     * @returns The response body as an `ArrayBuffer`.
     * @throws Error containing the response text when the response is not OK.
     */
    async getVoiceBuffer(voiceId: TTSSupertonicModel.RequestId): Promise<ArrayBuffer> {
        const response = await this.getSupertonicResponse(voiceId) as Response;

        if (!response.ok) {
            const detail = await response.text();
            throw new Error(`invalid supertonic response ${detail}`);
        }

        return await response.arrayBuffer();
    }

    /**
     * Builds the cache file path for a request from the hash of its text.
     *
     * @param id Request whose `text` is hashed into the file name.
     * @returns A path inside `TTSSupertonicModel.SupertonicAudioCachePath`.
     */
    public getVoicePath(id: TTSSupertonicModel.RequestId): string {
        return join(
            TTSSupertonicModel.SupertonicAudioCachePath,
            TTSModelBase.hashAudioFile(id.text)
        );
    }

    /**
     * Wraps already-prepared text in a request identifier.
     *
     * @param text Text to synthesise.
     * @returns A request id carrying only the text.
     */
    public createRequestId(text: string): TTSSupertonicModel.RequestId {
        return { text };
    }
}

export namespace TTSSupertonicModel {
    /** Shared instance used by callers. */
    export const instance = new TTSSupertonicModel();
    /** Identifies one synthesis request; the text is the only field. */
    export type RequestId = { text: string };
    /** Directory for Supertonic audio, nested under the base audio cache path. */
    export const SupertonicAudioCachePath = join(TTSModelBase.AudioCachePath, "supertonic");
}
|
||||
export default TTSSupertonicModel;
|
||||
|
|
@ -2,11 +2,9 @@ import { join } from "path";
|
|||
import { TYPECAST_TOKENS } from "../env";
|
||||
import fetch from "../utils/fetch";
|
||||
import TTSModelBase from ".";
|
||||
import CallingNumberKorean from "../utils/callingNumberKorean";
|
||||
import IntegerKorean from "../utils/integerKorean";
|
||||
import FloatKorean from "../utils/floatKorean";
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { cwd, env } from "process";
|
||||
import { saferKorean } from "../utils/saferKorean";
|
||||
|
||||
export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
|
||||
protected cachedVoice: Map<String, Promise<Buffer>>
|
||||
|
|
@ -19,61 +17,12 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
|
|||
ttsify(input: string): string {
|
||||
|
||||
return super.ttsify(
|
||||
input
|
||||
.replace(/\.+$/, "")
|
||||
.replace(/\.\.+/g, "")
|
||||
.replace(/\.[ \t]/g, " ")
|
||||
.replace(/^[\?\!\'\"]+$/, (total)=>(
|
||||
[...total].map(element => TTSTypecastModel.IsolatedSymbolMap[
|
||||
element as keyof typeof TTSTypecastModel.IsolatedSymbolMap
|
||||
]).join("")
|
||||
))
|
||||
.replace(/\`\`\`.+?\`\`\`/g, "코드블럭")
|
||||
.replace(/https\S+/g, "링크")
|
||||
saferKorean(input)
|
||||
.replace(/ㄴㄴ/g, "노노")
|
||||
.replace(/ㅇㅋ/g, "오키")
|
||||
.replace(/ㅜㅜ/g, "눙물")
|
||||
.replace(/빵/g, "빵 크크")
|
||||
.replace(/[\?]+ *ㄴ/g, "물음표ㄴ")
|
||||
.replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string)=>{
|
||||
const intNum = parseInt(num)
|
||||
if (CallingNumberKorean.canConvert(intNum)) {
|
||||
return CallingNumberKorean.convert(intNum) + postfix;
|
||||
} else {
|
||||
return IntegerKorean.convertFromString(num) + postfix;
|
||||
}
|
||||
})
|
||||
.replace(/(v?)([\d\.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
|
||||
const dotCount = [...num.matchAll(/\./g)].length;
|
||||
const hasNoSuffix = suffix == "";
|
||||
|
||||
if (hasNoSuffix && dotCount == 0) {
|
||||
return IntegerKorean.convertFromString(num) + postfix;
|
||||
} else if (hasNoSuffix && dotCount == 1) {
|
||||
const [intPart, floatPart] = num.split(/\./);
|
||||
return (
|
||||
IntegerKorean.convertFromString(intPart)
|
||||
+ "쩜"
|
||||
+ FloatKorean.convert(floatPart)
|
||||
+ postfix
|
||||
)
|
||||
} else if (suffix == "v") {
|
||||
return (
|
||||
"버전"
|
||||
+ FloatKorean.convert(num)
|
||||
+ (TTSTypecastModel.VersionPostfix[
|
||||
postfix as keyof typeof TTSTypecastModel.VersionPostfix
|
||||
] ?? "")
|
||||
);
|
||||
} else {
|
||||
return FloatKorean.convert(num) + postfix;
|
||||
}
|
||||
})
|
||||
.replace(/[\%\^\&\*\#\@\.\-\+\_\=\/\\♡\$]/g, (t) => (
|
||||
TTSTypecastModel.SymbolMap[t as keyof typeof TTSTypecastModel.SymbolMap]
|
||||
))
|
||||
.replace(/\?+/g, "?")
|
||||
.replace(/\!+/g, "!")
|
||||
)
|
||||
}
|
||||
private async getTypecastResponse(apiKey: string, voiceId: TTSTypecastModel.RequestId) {
|
||||
|
|
@ -140,33 +89,6 @@ export class TTSTypecastModel extends TTSModelBase<TTSTypecastModel.RequestId> {
|
|||
}
|
||||
}
|
||||
export namespace TTSTypecastModel {
|
||||
export const IsolatedSymbolMap = {
|
||||
"?": "물음표",
|
||||
"!": "느낌표",
|
||||
"'": "쿼트",
|
||||
"\"": "더블쿼트",
|
||||
}
|
||||
export const SymbolMap = {
|
||||
"%": "퍼센트",
|
||||
"$": "달러싸인",
|
||||
"^": "캐럿",
|
||||
"&": "엠퍼센드",
|
||||
"*": "스타",
|
||||
"#": "해시",
|
||||
"@": "엣",
|
||||
".": "쩜",
|
||||
"-": "마이너스",
|
||||
"+": "플러스",
|
||||
"_": "언더바",
|
||||
"=": "이퀄",
|
||||
"/": "슬래쉬",
|
||||
"\\": "역슬래쉬",
|
||||
"♡": "하투 ",
|
||||
};
|
||||
export const VersionPostfix = {
|
||||
"a": "알파",
|
||||
"b": "베타",
|
||||
};
|
||||
export const instance = new TTSTypecastModel();
|
||||
export type RequestId = { text: string, voiceId: string };
|
||||
export const TypecastAudioCachePath = join(TTSModelBase.AudioCachePath, "typecast");
|
||||
|
|
|
|||
83
packages/utils/saferKorean.ts
Normal file
83
packages/utils/saferKorean.ts
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
import CallingNumberKorean from "./callingNumberKorean";
|
||||
import FloatKorean from "./floatKorean";
|
||||
import IntegerKorean from "./integerKorean";
|
||||
|
||||
/** Spoken names used when the whole message consists only of these symbols. */
export const IsolatedSymbolMap = {
    "?": "물음표",
    "!": "느낌표",
    "'": "쿼트",
    "\"": "더블쿼트",
}

/** Spoken names substituted for individual symbols anywhere in the text. */
export const SymbolMap = {
    "%": "퍼센트",
    "$": "달러싸인",
    "^": "캐럿",
    "&": "엠퍼센드",
    "*": "스타",
    "#": "해시",
    "@": "엣",
    ".": "쩜",
    "-": "마이너스",
    "+": "플러스",
    "_": "언더바",
    "=": "이퀄",
    "/": "슬래쉬",
    "\\": "역슬래쉬",
    "♡": "하투 ",
};

/** Spoken names for the `a` / `b` postfix of a `v`-prefixed version number. */
export const VersionPostfix = {
    "a": "알파",
    "b": "베타",
};

/**
 * Rewrites chat text into a form that reads well when synthesised as Korean
 * speech.
 *
 * The replacements run in a fixed order, and later steps depend on earlier ones:
 * 1. trailing dots and runs of two or more dots are removed; a dot followed
 *    by a space or tab becomes a space;
 * 2. a message made up solely of `? ! ' "` is spelled out symbol by symbol;
 * 3. fenced code blocks become "코드블럭" and http(s) URLs become "링크";
 * 4. numbers are converted to Korean words (counter words 개/살, integers,
 *    decimals, and `v`-prefixed versions);
 * 5. remaining symbols are replaced via `SymbolMap`, and repeated `?` / `!`
 *    are collapsed to a single character.
 *
 * @param input Raw message text.
 * @returns The normalised text.
 */
export function saferKorean(input: string): string {
    return input
        .replace(/\.+$/, "")
        .replace(/\.\.+/g, "")
        .replace(/\.[ \t]/g, " ")
        .replace(/^[?!'"]+$/, (total) => (
            [...total]
                .map((element) => IsolatedSymbolMap[element as keyof typeof IsolatedSymbolMap])
                .join("")
        ))
        // [\s\S] instead of "." so that fenced blocks spanning several lines
        // are collapsed too ("." does not match newlines).
        .replace(/```[\s\S]+?```/g, "코드블럭")
        // Match both http:// and https:// and require the scheme separator,
        // so ordinary words that merely start with "https" are left alone.
        .replace(/https?:\/\/\S+/g, "링크")
        .replace(/(\d+)[ \t\n]*([개살])/g, (_, num: string, postfix: string) => {
            const intNum = parseInt(num, 10);
            if (CallingNumberKorean.canConvert(intNum)) {
                return CallingNumberKorean.convert(intNum) + postfix;
            }
            return IntegerKorean.convertFromString(num) + postfix;
        })
        .replace(/(v?)([\d.]+)([ab]?)/g, (_, suffix: string, num: string, postfix: string) => {
            const dotCount = [...num.matchAll(/\./g)].length;
            const hasNoSuffix = suffix == "";

            if (hasNoSuffix && dotCount == 0) {
                return IntegerKorean.convertFromString(num) + postfix;
            } else if (hasNoSuffix && dotCount == 1) {
                // Exactly one dot means split() yields two parts; the defaults
                // only keep the types total.
                const [intPart = "", floatPart = ""] = num.split(".");
                return (
                    IntegerKorean.convertFromString(intPart)
                    + "쩜"
                    + FloatKorean.convert(floatPart)
                    + postfix
                );
            } else if (suffix == "v") {
                return (
                    "버전"
                    + FloatKorean.convert(num)
                    + (VersionPostfix[postfix as keyof typeof VersionPostfix] ?? "")
                );
            } else {
                return FloatKorean.convert(num) + postfix;
            }
        })
        .replace(/[%^&*#@.\-+_=\/\\♡$]/g, (t) => (
            SymbolMap[t as keyof typeof SymbolMap]
        ))
        .replace(/\?+/g, "?")
        .replace(/!+/g, "!");
}
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
-- AlterEnum
|
||||
ALTER TYPE "Voice" ADD VALUE 'Supertonic';
|
||||
|
|
@ -25,4 +25,5 @@ model DiscordGuildProfile {
|
|||
enum Voice {
|
||||
TypeCast
|
||||
Papago
|
||||
Supertonic
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue