add multi voice support

2026-05-19 17:47:17 +00:00 · 2026-05-19 17:47:17 +00:00 · caa6db37ce
commit caa6db37ce
parent 6380daa4a4
7 changed files with 132 additions and 97 deletions
--- a/crates/yaejuyang-supertonic/src/api.rs
+++ b/crates/yaejuyang-supertonic/src/api.rs
@ -9,11 +9,14 @@ use serde::{Deserialize, Serialize};
 pub struct Body {
    text: String,
    lang: String,
    style_id: String,
 }
 pub async fn handler(
    State(state): State<Arc<TtsPool>>,
    Json(payload): Json<Body>,
 ) -> Result<Vec<u8>, String> {
-    Ok(state.synthesize(payload.text, payload.lang).await?)
+    Ok(state
        .synthesize(payload.text, payload.lang, payload.style_id)
        .await?)
 }
--- a/crates/yaejuyang-supertonic/src/main.rs
+++ b/crates/yaejuyang-supertonic/src/main.rs
@ -1,12 +1,14 @@
 use axum::{Router, routing::post};
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::{collections::HashMap, path::PathBuf};
 pub mod api;
 pub mod tts;
 use tts::{TtsOpts, TtsPool, load_text_to_speech, load_voice_style};
 use crate::tts::engine::load_voice_style_map;
 #[tokio::main(flavor = "multi_thread")]
 async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    dotenvy::dotenv().ok();
@ -15,7 +17,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let model_dir =
        std::env::var("SUPERTONIC_MODEL_DIR").unwrap_or_else(|_| "./assets".to_string());
    let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE")
-        .unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json"));
+        .unwrap_or_else(|_| format!("F1={model_dir}/voice_styles/F1.json"));
    let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string());
    let total_step: usize = std::env::var("SUPERTONIC_TOTAL_STEP")
        .ok()
@ -40,15 +42,26 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    tts::assets::ensure_assets(&model_path, &hf_repo)?;
    let onnx_dir_for_init = model_path.join("onnx").to_string_lossy().into_owned();
-    let voice_style_for_init = voice_style_path.clone();
+    let voice_style_for_init = voice_style_path
        .split(",")
        .filter_map(|i| {
            i.split_once("=").or_else(|| {
                tracing::error!("Voice style '{i}' is not valid.");
                None
            })
        })
        .map(|(k, v)| (k.to_owned(), PathBuf::from(v)))
        .collect::<HashMap<_, _>>();
    let voice_style_map = Arc::new(load_voice_style_map(&voice_style_for_init)?);
    let pool = Arc::new(TtsPool::spawn(
        workers,
        move |id| {
            let span = tracing::info_span!("worker", worker_id = id);
            let _enter = span.enter();
            let tts = load_text_to_speech(&onnx_dir_for_init)?;
-            let style = load_voice_style(std::slice::from_ref(&voice_style_for_init), false)?;
+
-            Ok((tts, style))
+            Ok((tts, voice_style_map.clone()))
        },
        TtsOpts {
            total_step,
--- a/crates/yaejuyang-supertonic/src/tts/engine.rs
+++ b/crates/yaejuyang-supertonic/src/tts/engine.rs
@ -8,7 +8,7 @@ use serde::Deserialize;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::BufReader;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 #[derive(Debug, Clone, Deserialize)]
 pub struct Config {
@ -282,62 +282,59 @@ fn sample_noisy_latent(
    (noisy_latent, latent_mask)
 }
-pub fn load_voice_style(voice_style_paths: &[String], verbose: bool) -> Result<Style> {
+pub fn load_voice_style_map(voice_style_path_map: &HashMap<String, PathBuf>) -> Result<HashMap<String, Style>> {
-    let bsz = voice_style_paths.len();
+    let mut map = HashMap::new();
-    let first_file =
+    for (name, path) in voice_style_path_map.iter() {
-        File::open(&voice_style_paths[0]).context("Failed to open voice style file")?;
+        map.insert(name.to_owned(), load_voice_style(path)?);
-    let first_reader = BufReader::new(first_file);
+        tracing::info!("Voice style {name} loaded")
-    let first_data: VoiceStyleData = serde_json::from_reader(first_reader)?;
+    }
-    let ttl_dims = &first_data.style_ttl.dims;
+    Ok(map)
-    let dp_dims = &first_data.style_dp.dims;
+}
 pub fn load_voice_style(voice_style_path: &Path) -> Result<Style> {
    let file =
        File::open(&voice_style_path).context("Failed to open voice style file")?;
    let file_reader = BufReader::new(file);
    let file_data: VoiceStyleData = serde_json::from_reader(file_reader)?;
    let ttl_dims = &file_data.style_ttl.dims;
    let dp_dims = &file_data.style_dp.dims;
    let ttl_dim1 = ttl_dims[1];
    let ttl_dim2 = ttl_dims[2];
    let dp_dim1 = dp_dims[1];
    let dp_dim2 = dp_dims[2];
-    let ttl_size = bsz * ttl_dim1 * ttl_dim2;
+    let ttl_size = ttl_dim1 * ttl_dim2;
-    let dp_size = bsz * dp_dim1 * dp_dim2;
+    let dp_size = dp_dim1 * dp_dim2;
    let mut ttl_flat = vec![0.0f32; ttl_size];
    let mut dp_flat = vec![0.0f32; dp_size];
-    for (i, path) in voice_style_paths.iter().enumerate() {
+    let mut idx = 0;
-        let file = File::open(path).context("Failed to open voice style file")?;
+    for batch in &file_data.style_ttl.data {
-        let reader = BufReader::new(file);
+        for row in batch {
-        let data: VoiceStyleData = serde_json::from_reader(reader)?;
+            for &val in row {
-
+                ttl_flat[idx] = val;
-        let ttl_offset = i * ttl_dim1 * ttl_dim2;
+                idx += 1;
        let mut idx = 0;
        for batch in &data.style_ttl.data {
            for row in batch {
                for &val in row {
                    ttl_flat[ttl_offset + idx] = val;
                    idx += 1;
                }
            }
        }
        let dp_offset = i * dp_dim1 * dp_dim2;
        idx = 0;
        for batch in &data.style_dp.data {
            for row in batch {
                for &val in row {
                    dp_flat[dp_offset + idx] = val;
                    idx += 1;
                }
            }
        }
    }
-    let ttl_style = Array3::from_shape_vec((bsz, ttl_dim1, ttl_dim2), ttl_flat)?;
+    idx = 0;
-    let dp_style = Array3::from_shape_vec((bsz, dp_dim1, dp_dim2), dp_flat)?;
+    for batch in &file_data.style_dp.data {
-
+        for row in batch {
-    if verbose {
+            for &val in row {
-        tracing::info!("Loaded {} voice styles", bsz);
+                dp_flat[idx] = val;
                idx += 1;
            }
        }
    }
    let ttl_style = Array3::from_shape_vec((1, ttl_dim1, ttl_dim2), ttl_flat)?;
    let dp_style = Array3::from_shape_vec((1, dp_dim1, dp_dim2), dp_flat)?;
    Ok(Style {
        ttl: ttl_style,
        dp: dp_style,
--- a/crates/yaejuyang-supertonic/src/tts/pool.rs
+++ b/crates/yaejuyang-supertonic/src/tts/pool.rs
@ -1,5 +1,6 @@
 use anyhow::{Context, Result};
 use crossbeam_channel::{Sender, bounded};
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::thread;
 use std::time::Instant;
@ -18,6 +19,7 @@ pub struct TtsOpts {
 pub struct TtsJob {
    pub text: String,
    pub lang: String,
    pub style_id: String,
    pub reply: oneshot::Sender<Result<Vec<u8>, String>>,
 }
@ -28,7 +30,7 @@ pub struct TtsPool {
 impl TtsPool {
    pub fn spawn<I>(workers: usize, init: I, opts: TtsOpts) -> Result<Self>
    where
-        I: Fn(u32) -> Result<(TextToSpeech, Style)> + Send + Sync + 'static,
+        I: Fn(u32) -> Result<(TextToSpeech, Arc<HashMap<String, Style>>)> + Send + Sync + 'static,
    {
        let workers = workers.max(1);
        let (tx, rx) = bounded::<TtsJob>(workers * 4);
@ -40,7 +42,7 @@ impl TtsPool {
            thread::Builder::new()
                .name(format!("supertonic-tts-{worker_id}"))
                .spawn(move || {
-                    let (mut tts, style) = match init(worker_id as u32) {
+                    let (mut tts, style_map) = match init(worker_id as u32) {
                        Ok(pair) => pair,
                        Err(e) => {
                            tracing::error!("worker {worker_id} init failed: {e:?}");
@ -52,6 +54,12 @@ impl TtsPool {
                    while let Ok(job) = rx.recv() {
                        let start_at = Instant::now();
                        let Some(style) = style_map.get(&job.style_id) else {
                            job.reply
                                .send(Err(format!("Voice style {} not found", job.style_id)));
                            continue;
                        };
                        let result = (|| -> Result<Vec<u8>, String> {
                            let (wav, _dur) = tts
                                .synthesize(
@ -77,11 +85,17 @@ impl TtsPool {
        Ok(TtsPool { tx })
    }
-    pub async fn synthesize(&self, text: String, lang: String) -> Result<Vec<u8>, String> {
+    pub async fn synthesize(
        &self,
        text: String,
        lang: String,
        style_id: String,
    ) -> Result<Vec<u8>, String> {
        let (reply_tx, reply_rx) = oneshot::channel();
        let job = TtsJob {
            text,
            lang,
            style_id,
            reply: reply_tx,
        };
--- a/packages/bot/events/readChannel.ts
+++ b/packages/bot/events/readChannel.ts
@ -1,7 +1,7 @@
 import { getOrCreateVoiceConnection } from "../util";
 import { getUserProfile, hasGuildReadChannel } from "../db";
 import { defineEvent } from "../event";
-import { playVoice } from "../tts";
+import { playVoice, PlayVoiceOptions } from "../tts";
 import { Voice } from "../../db/generated/prisma/enums";
 export default defineEvent("messageCreate", async (message) => {
@ -20,18 +20,25 @@ export default defineEvent("messageCreate", async (message) => {
    let content = message.cleanContent;
    let voice: Voice | null = null
    let options: PlayVoiceOptions | undefined = undefined;
    let matched: RegExpMatchArray | null = null;
    if (content.startsWith("$t ")) {
        voice = "TypeCast";
    } else if (content.startsWith("$p ")) {
        voice = "Papago";
-    } else if (content.startsWith("$s ")) {
+    } else if (matched = content.match(/^\$s(\S*) /)) {
        voice = "Supertonic";
        let style: string | undefined = undefined
        if (matched[1].length) {
            style = matched[1]
        }
        options = { supertonicStyleId: style };
    } else if (content.match(/^\$\s/)) {
        return;
    }
    if (voice) {
-        content = content.replace(/^\$[^ ]+ +/, "")
+        content = content.replace(/^\$\S+\s+/, "")
    } else {
        voice = profile.voice;
    }
@ -61,5 +68,6 @@ export default defineEvent("messageCreate", async (message) => {
    } catch(err) {
        message.reply("말이 꼬이네요 ㅜ.ㅜ");
        console.log("playVoice failed. ", err);
    }
 })
--- a/packages/bot/tts.ts
+++ b/packages/bot/tts.ts
@ -10,38 +10,6 @@ import { nyaize } from "../utils/nyaize";
 import { OutputHandler } from "../utils/outputHandler";
 import TTSSupertonicModel from "../tts/supertonic";
 export async function createVoiceBuffer(voice: Voice, text: string): Promise<Buffer> {
    if (voice == "TypeCast") {
        const content = TTSTypecastModel.instance.ttsify(text);
        if (!content.length)
            throw new Error("Empty content");
        return await TTSTypecastModel.instance.getMemcachedVoice(
            TTSTypecastModel.instance.createRequestId(content)
        );
    } else if (voice == "Supertonic") {
        const content = TTSSupertonicModel.instance.ttsify(text);
        if (!content.length)
            throw new Error("Empty content");
        return await TTSSupertonicModel.instance.getMemcachedVoice(
            TTSSupertonicModel.instance.createRequestId(content)
        );
    } else if (voice == "Papago") {
        const content = TTSPapagoModel.instance.ttsify(text);
        if (!content.length)
            throw new Error("Empty content");
        return await TTSPapagoModel.instance.getMemcachedVoice(
            TTSPapagoModel.instance.createRequestId(content)
        );
    } else {
        throw new Error(`Unknown voice type: ${voice}`);
    }
 }
 class VoiceQueue {
    private connection: VoiceConnection;
    private list: AudioResource[];
@ -77,11 +45,16 @@ class VoiceQueue {
    }
 }
 export type PlayVoiceOptions = {
    supertonicStyleId?: string,
 };
 export async function playVoice(
    guild: Guild,
    profile: DiscordUserProfile,
    voice: Voice,
-    text: string
+    text: string,
    options?: PlayVoiceOptions,
 ) {
    if (profile.nya)
        text = nyaize(text);
@ -91,16 +64,41 @@ export async function playVoice(
        if (!connection)
            throw new Error("Yaeju is not joined VoiceChat");
-        let voiceBuffer: Buffer;
+        if (voice == "TypeCast" && profile.canTypecast) {
-        if (voice == "TypeCast") {
+            throw new Error(`the user ${profile.userId} is can't use typecast voice`);
            if (profile.canTypecast) {
                voiceBuffer = await createVoiceBuffer(voice, text);
            } else {
                throw new Error(`the user ${profile.userId} is can't use typecast voice`);
            }
        } else {
            voiceBuffer = await createVoiceBuffer(voice, text);
        }
        let voiceBuffer: Buffer
        if (voice == "TypeCast") {
            const content = TTSTypecastModel.instance.ttsify(text);
            if (!content.length)
                throw new Error("Empty content");
            voiceBuffer =  await TTSTypecastModel.instance.getMemcachedVoice(
                TTSTypecastModel.instance.createRequestId(content)
            );
        } else if (voice == "Supertonic") {
            const content = TTSSupertonicModel.instance.ttsify(text);
            if (!content.length)
                throw new Error("Empty content");
            voiceBuffer = await TTSSupertonicModel.instance.getMemcachedVoice(
                TTSSupertonicModel.instance.createRequestId(content, options?.supertonicStyleId)
            );
        } else if (voice == "Papago") {
            const content = TTSPapagoModel.instance.ttsify(text);
            if (!content.length)
                throw new Error("Empty content");
            voiceBuffer = await TTSPapagoModel.instance.getMemcachedVoice(
                TTSPapagoModel.instance.createRequestId(content)
            );
        } else {
            throw new Error(`Unknown voice type: ${voice}`);
        }
        VoiceQueue.fromConnection(connection).enqueue(
            TTSModelBase.bufferToAudioResource(voiceBuffer)
--- a/packages/tts/supertonic.ts
+++ b/packages/tts/supertonic.ts
@ -18,6 +18,7 @@ export class TTSSupertonicModel extends TTSModelBase<TTSSupertonicModel.RequestI
        const payload = {
            text: voiceId.text,
            lang: "ko",
            style_id: voiceId.styleId
        };
        if (!process.env.SUPERTONIC_API_URL) {
@ -42,22 +43,23 @@ export class TTSSupertonicModel extends TTSModelBase<TTSSupertonicModel.RequestI
        throw new Error(`invalid supertonic response ${await response.text()}`);
    }
    public getVoicePath(id: TTSSupertonicModel.RequestId): string {
-        const audioFileName = TTSModelBase.hashAudioFile(id.text);
+        const audioFileName = TTSModelBase.hashAudioFile(id.text + id.styleId);
        const audioPath = join(
            TTSSupertonicModel.SupertonicAudioCachePath,
            audioFileName
        );
        return audioPath;
    }
-    public createRequestId(text: string): TTSSupertonicModel.RequestId {
+    public createRequestId(text: string, styleId?: string): TTSSupertonicModel.RequestId {
        return {
            text,
            styleId: styleId ?? "F1"
        };
    }
 }
 export namespace TTSSupertonicModel {
    export const instance = new TTSSupertonicModel();
-    export type RequestId = { text: string };
+    export type RequestId = { text: string, styleId: string };
    export const SupertonicAudioCachePath = join(TTSModelBase.AudioCachePath, "supertonic");
 }
 export default TTSSupertonicModel;