add supertonic

2026-05-19 14:37:27 +00:00 · 2026-05-19 14:37:27 +00:00 · 69ec38d16b
commit 69ec38d16b
parent a69c90c822
13 changed files with 2850 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,6 @@ packages/generated
 cache
 db
 docker-compose.yml
+
+target/
+test.wav
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,3 @@
+[workspace]
+resolver = "2"
+members = ["crates/yaejuyang-supertonic"]
--- a/crates/yaejuyang-supertonic/.gitignore
+++ b/crates/yaejuyang-supertonic/.gitignore
@ -0,0 +1,2 @@
+assets/
+.env
--- a/crates/yaejuyang-supertonic/Cargo.toml
+++ b/crates/yaejuyang-supertonic/Cargo.toml
@ -0,0 +1,30 @@
+[package]
+name = "yaejuyang-supertonic"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+default = ["webgpu"]
+webgpu = [ "ort/webgpu" ]
+cuda = [ "ort/cuda" ]
+
+[dependencies]
+anyhow = "1.0.102"
+axum = "0.8.9"
+clap = "4.6.1"
+crossbeam-channel = "0.5.15"
+dotenvy = "0.15.7"
+hound = "3.5.1"
+ndarray = "0.17.2"
+ort = "2.0.0-rc.12"
+qwreey-utility-rs = "0.1.9"
+rand = "0.10.1"
+chacha20 = { version = "0.10.0-rc.5" }
+rand_distr = "0.6.0"
+regex = "1.12.3"
+serde = { version = "1.0.228", features = ["derive"] }
+serde_json = "1.0.149"
+tokio = { version = "1.52.3", features = ["full"] }
+tracing = "0.1.44"
+tracing-subscriber = "0.3.23"
+unicode-normalization = "0.1.25"
--- a/crates/yaejuyang-supertonic/src/api.rs
+++ b/crates/yaejuyang-supertonic/src/api.rs
@ -0,0 +1,19 @@
+use std::sync::Arc;
+
+use axum::{Json, extract::State};
+
+use crate::tts::TtsPool;
+use serde::{Deserialize, Serialize};
+
+#[derive(Serialize, Deserialize)]
+pub struct Body {
+    text: String,
+    lang: String,
+}
+
+pub async fn handler(
+    State(state): State<Arc<TtsPool>>,
+    Json(payload): Json<Body>,
+) -> Result<Vec<u8>, String> {
+    Ok(state.synthesize(payload.text, payload.lang).await?)
+}
--- a/crates/yaejuyang-supertonic/src/main.rs
+++ b/crates/yaejuyang-supertonic/src/main.rs
@ -0,0 +1,69 @@
+use axum::{Router, routing::post};
+use std::path::PathBuf;
+use std::sync::Arc;
+
+pub mod api;
+pub mod tts;
+
+use tts::{TtsOpts, TtsPool, load_text_to_speech, load_voice_style};
+
+#[tokio::main(flavor = "multi_thread")]
+async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    dotenvy::dotenv().ok();
+    tracing_subscriber::fmt::init();
+
+    let model_dir = std::env::var("SUPERTONIC_MODEL_DIR")
+        .unwrap_or_else(|_| "./assets/supertonic-3".to_string());
+    let voice_style_path = std::env::var("SUPERTONIC_VOICE_STYLE")
+        .unwrap_or_else(|_| format!("{model_dir}/voice_styles/M1.json"));
+    let lang = std::env::var("SUPERTONIC_LANG").unwrap_or_else(|_| "en".to_string());
+    let total_step: usize = std::env::var("SUPERTONIC_TOTAL_STEP")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(8);
+    let speed: f32 = std::env::var("SUPERTONIC_SPEED")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(1.05);
+    let silence_duration: f32 = std::env::var("SUPERTONIC_SILENCE_DUR")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(0.3);
+    let workers: usize = std::env::var("SUPERTONIC_WORKERS")
+        .ok()
+        .and_then(|v| v.parse().ok())
+        .unwrap_or(2);
+    let hf_repo = std::env::var("SUPERTONIC_HF_REPO")
+        .unwrap_or_else(|_| "https://huggingface.co/Supertone/supertonic-3".to_string());
+
+    let model_path = PathBuf::from(&model_dir);
+    tts::assets::ensure_assets(&model_path, &hf_repo)?;
+
+    let onnx_dir_for_init = model_path.join("onnx").to_string_lossy().into_owned();
+    let voice_style_for_init = voice_style_path.clone();
+    let pool = Arc::new(TtsPool::spawn(
+        workers,
+        move |id| {
+            let span = tracing::info_span!("worker", worker_id = id);
+            let _enter = span.enter();
+            let tts = load_text_to_speech(&onnx_dir_for_init)?;
+            let style = load_voice_style(std::slice::from_ref(&voice_style_for_init), false)?;
+            Ok((tts, style))
+        },
+        TtsOpts {
+            total_step,
+            speed,
+            silence_duration,
+        },
+    )?);
+
+    let app = Router::new()
+        .route("/", post(api::handler))
+        .with_state(pool);
+
+    let addr = std::env::var("ADDR").unwrap_or_else(|_| "0.0.0.0:80".to_string());
+    let listener = tokio::net::TcpListener::bind(addr).await?;
+    axum::serve(listener, app).await?;
+
+    Ok(())
+}
--- a/crates/yaejuyang-supertonic/src/tts/assets.rs
+++ b/crates/yaejuyang-supertonic/src/tts/assets.rs
@ -0,0 +1,77 @@
+use anyhow::{Context, Result, bail};
+use std::path::Path;
+use std::process::Command;
+
+/// Ensure the Supertonic model directory is present.
+///
+/// `model_dir` is the unpacked HuggingFace repo root — it should contain
+/// `onnx/tts.json` and `voice_styles/*.json` after the clone. If
+/// `model_dir/onnx/tts.json` already exists, this is a no-op (typical inside
+/// containers that bake assets in at build time). Otherwise the HF repo is
+/// cloned into `model_dir`, which must not already exist or must be empty.
+///
+/// Requires `git` and `git-lfs` on PATH.
+pub fn ensure_assets(model_dir: &Path, hf_repo: &str) -> Result<()> {
+    if model_dir.join("onnx").join("tts.json").exists() {
+        tracing::info!(
+            model_dir = %model_dir.display(),
+            "supertonic assets already present"
+        );
+        return Ok(());
+    }
+
+    if model_dir.exists() {
+        let is_empty = std::fs::read_dir(model_dir)
+            .with_context(|| format!("failed to read {}", model_dir.display()))?
+            .next()
+            .is_none();
+        if !is_empty {
+            bail!(
+                "SUPERTONIC_MODEL_DIR={} already exists and is not empty but does not \
+                 contain onnx/tts.json — delete it or point SUPERTONIC_MODEL_DIR \
+                 somewhere fresh",
+                model_dir.display()
+            );
+        }
+    } else if let Some(parent) = model_dir.parent() {
+        if !parent.as_os_str().is_empty() {
+            std::fs::create_dir_all(parent)
+                .with_context(|| format!("failed to create {}", parent.display()))?;
+        }
+    }
+
+    tracing::info!(
+        repo = hf_repo,
+        target = %model_dir.display(),
+        "cloning supertonic assets from HuggingFace"
+    );
+
+    let status = Command::new("git")
+        .args(["clone", "--depth=1", hf_repo])
+        .arg(model_dir)
+        .status()
+        .context("failed to invoke git — is git installed?")?;
+    if !status.success() {
+        bail!("git clone of {hf_repo} failed with status {status}");
+    }
+
+    let lfs_status = Command::new("git")
+        .args(["-C"])
+        .arg(model_dir)
+        .args(["lfs", "pull"])
+        .status()
+        .context("failed to invoke git lfs — is git-lfs installed?")?;
+    if !lfs_status.success() {
+        bail!("git lfs pull failed with status {lfs_status}");
+    }
+
+    if !model_dir.join("onnx").join("tts.json").exists() {
+        bail!(
+            "expected {} to exist after clone — the HF repo layout may have changed",
+            model_dir.join("onnx").join("tts.json").display()
+        );
+    }
+
+    tracing::info!("supertonic assets ready");
+    Ok(())
+}
--- a/crates/yaejuyang-supertonic/src/tts/engine.rs
+++ b/crates/yaejuyang-supertonic/src/tts/engine.rs
@ -0,0 +1,460 @@
+use super::text::{UnicodeProcessor, VoiceStyleData, chunk_text, length_to_mask};
+use anyhow::{Context, Result, anyhow};
+use ndarray::{Array, Array3};
+use ort::value::Value;
+use ort::{ep::ExecutionProviderDispatch, session::Session};
+use rand_distr::{Distribution, Normal};
+use serde::Deserialize;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::Path;
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct Config {
+    pub ae: AEConfig,
+    pub ttl: TTLConfig,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct AEConfig {
+    pub sample_rate: i32,
+    pub base_chunk_size: i32,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct TTLConfig {
+    pub chunk_compress_factor: i32,
+    pub latent_dim: i32,
+}
+
+pub fn load_cfgs<P: AsRef<Path>>(onnx_dir: P) -> Result<Config> {
+    let cfg_path = onnx_dir.as_ref().join("tts.json");
+    let file =
+        File::open(&cfg_path).with_context(|| format!("failed to open {}", cfg_path.display()))?;
+    let reader = BufReader::new(file);
+    let cfgs: Config = serde_json::from_reader(reader)?;
+    Ok(cfgs)
+}
+
+pub struct Style {
+    pub ttl: Array3<f32>,
+    pub dp: Array3<f32>,
+}
+
+pub struct TextToSpeech {
+    cfgs: Config,
+    text_processor: UnicodeProcessor,
+    dp_ort: Session,
+    text_enc_ort: Session,
+    vector_est_ort: Session,
+    vocoder_ort: Session,
+    pub sample_rate: i32,
+}
+
+impl TextToSpeech {
+    pub fn new(
+        cfgs: Config,
+        text_processor: UnicodeProcessor,
+        dp_ort: Session,
+        text_enc_ort: Session,
+        vector_est_ort: Session,
+        vocoder_ort: Session,
+    ) -> Self {
+        let sample_rate = cfgs.ae.sample_rate;
+        TextToSpeech {
+            cfgs,
+            text_processor,
+            dp_ort,
+            text_enc_ort,
+            vector_est_ort,
+            vocoder_ort,
+            sample_rate,
+        }
+    }
+
+    fn infer(
+        &mut self,
+        text_list: &[String],
+        lang_list: &[String],
+        style: &Style,
+        total_step: usize,
+        speed: f32,
+    ) -> Result<(Vec<f32>, Vec<f32>)> {
+        let bsz = text_list.len();
+
+        let (text_ids, text_mask) = self.text_processor.call(text_list, lang_list)?;
+
+        let text_ids_array = {
+            let text_ids_shape = (bsz, text_ids[0].len());
+            let mut flat = Vec::new();
+            for row in &text_ids {
+                flat.extend_from_slice(row);
+            }
+            Array::from_shape_vec(text_ids_shape, flat)?
+        };
+
+        let text_ids_value = Value::from_array(text_ids_array)?;
+        let text_mask_value = Value::from_array(text_mask.clone())?;
+        let style_dp_value = Value::from_array(style.dp.clone())?;
+
+        let dp_outputs = self.dp_ort.run(ort::inputs! {
+            "text_ids" => &text_ids_value,
+            "style_dp" => &style_dp_value,
+            "text_mask" => &text_mask_value
+        })?;
+
+        let (_, duration_data) = dp_outputs["duration"].try_extract_tensor::<f32>()?;
+        let mut duration: Vec<f32> = duration_data.to_vec();
+
+        for dur in duration.iter_mut() {
+            *dur /= speed;
+        }
+
+        let style_ttl_value = Value::from_array(style.ttl.clone())?;
+        let text_enc_outputs = self.text_enc_ort.run(ort::inputs! {
+            "text_ids" => &text_ids_value,
+            "style_ttl" => &style_ttl_value,
+            "text_mask" => &text_mask_value
+        })?;
+
+        let (text_emb_shape, text_emb_data) =
+            text_enc_outputs["text_emb"].try_extract_tensor::<f32>()?;
+        let text_emb = Array3::from_shape_vec(
+            (
+                text_emb_shape[0] as usize,
+                text_emb_shape[1] as usize,
+                text_emb_shape[2] as usize,
+            ),
+            text_emb_data.to_vec(),
+        )?;
+
+        let (mut xt, latent_mask) = sample_noisy_latent(
+            &duration,
+            self.sample_rate,
+            self.cfgs.ae.base_chunk_size,
+            self.cfgs.ttl.chunk_compress_factor,
+            self.cfgs.ttl.latent_dim,
+        );
+
+        let total_step_array = Array::from_elem(bsz, total_step as f32);
+
+        for step in 0..total_step {
+            let current_step_array = Array::from_elem(bsz, step as f32);
+
+            let xt_value = Value::from_array(xt.clone())?;
+            let text_emb_value = Value::from_array(text_emb.clone())?;
+            let latent_mask_value = Value::from_array(latent_mask.clone())?;
+            let text_mask_value2 = Value::from_array(text_mask.clone())?;
+            let current_step_value = Value::from_array(current_step_array)?;
+            let total_step_value = Value::from_array(total_step_array.clone())?;
+
+            let vector_est_outputs = self.vector_est_ort.run(ort::inputs! {
+                "noisy_latent" => &xt_value,
+                "text_emb" => &text_emb_value,
+                "style_ttl" => &style_ttl_value,
+                "latent_mask" => &latent_mask_value,
+                "text_mask" => &text_mask_value2,
+                "current_step" => &current_step_value,
+                "total_step" => &total_step_value
+            })?;
+
+            let (denoised_shape, denoised_data) =
+                vector_est_outputs["denoised_latent"].try_extract_tensor::<f32>()?;
+            xt = Array3::from_shape_vec(
+                (
+                    denoised_shape[0] as usize,
+                    denoised_shape[1] as usize,
+                    denoised_shape[2] as usize,
+                ),
+                denoised_data.to_vec(),
+            )?;
+        }
+
+        let final_latent_value = Value::from_array(xt)?;
+        let vocoder_outputs = self.vocoder_ort.run(ort::inputs! {
+            "latent" => &final_latent_value
+        })?;
+
+        let (_, wav_data) = vocoder_outputs["wav_tts"].try_extract_tensor::<f32>()?;
+        let wav: Vec<f32> = wav_data.to_vec();
+
+        Ok((wav, duration))
+    }
+
+    pub fn synthesize(
+        &mut self,
+        text: &str,
+        lang: &str,
+        style: &Style,
+        total_step: usize,
+        speed: f32,
+        silence_duration: f32,
+    ) -> Result<(Vec<f32>, f32)> {
+        let max_len = if lang == "ko" || lang == "ja" {
+            120
+        } else {
+            300
+        };
+        let chunks = chunk_text(text, Some(max_len));
+
+        let mut wav_cat: Vec<f32> = Vec::new();
+        let mut dur_cat: f32 = 0.0;
+
+        for (i, chunk) in chunks.iter().enumerate() {
+            let (wav, duration) = self.infer(
+                &[chunk.clone()],
+                &[lang.to_string()],
+                style,
+                total_step,
+                speed,
+            )?;
+
+            let dur = duration[0];
+            let wav_len = (self.sample_rate as f32 * dur) as usize;
+            let wav_chunk = &wav[..wav_len.min(wav.len())];
+
+            if i == 0 {
+                wav_cat.extend_from_slice(wav_chunk);
+                dur_cat = dur;
+            } else {
+                let silence_len = (silence_duration * self.sample_rate as f32) as usize;
+                let silence = vec![0.0f32; silence_len];
+
+                wav_cat.extend_from_slice(&silence);
+                wav_cat.extend_from_slice(wav_chunk);
+                dur_cat += silence_duration + dur;
+            }
+        }
+
+        Ok((wav_cat, dur_cat))
+    }
+}
+
+fn sample_noisy_latent(
+    duration: &[f32],
+    sample_rate: i32,
+    base_chunk_size: i32,
+    chunk_compress: i32,
+    latent_dim: i32,
+) -> (Array3<f32>, Array3<f32>) {
+    let bsz = duration.len();
+    let max_dur = duration.iter().fold(0.0f32, |a, &b| a.max(b));
+
+    let wav_len_max = (max_dur * sample_rate as f32) as usize;
+    let wav_lengths: Vec<usize> = duration
+        .iter()
+        .map(|&d| (d * sample_rate as f32) as usize)
+        .collect();
+
+    let chunk_size = (base_chunk_size * chunk_compress) as usize;
+    let latent_len = wav_len_max.div_ceil(chunk_size);
+    let latent_dim_val = (latent_dim * chunk_compress) as usize;
+
+    let mut noisy_latent = Array3::<f32>::zeros((bsz, latent_dim_val, latent_len));
+
+    let normal = Normal::new(0.0f32, 1.0).unwrap();
+    let mut rng = rand::rng();
+
+    for b in 0..bsz {
+        for d in 0..latent_dim_val {
+            for t in 0..latent_len {
+                noisy_latent[[b, d, t]] = normal.sample(&mut rng);
+            }
+        }
+    }
+
+    let latent_lengths: Vec<usize> = wav_lengths
+        .iter()
+        .map(|&len| len.div_ceil(chunk_size))
+        .collect();
+
+    let latent_mask = length_to_mask(&latent_lengths, Some(latent_len));
+
+    for b in 0..bsz {
+        for d in 0..latent_dim_val {
+            for t in 0..latent_len {
+                noisy_latent[[b, d, t]] *= latent_mask[[b, 0, t]];
+            }
+        }
+    }
+
+    (noisy_latent, latent_mask)
+}
+
+pub fn load_voice_style(voice_style_paths: &[String], verbose: bool) -> Result<Style> {
+    let bsz = voice_style_paths.len();
+
+    let first_file =
+        File::open(&voice_style_paths[0]).context("Failed to open voice style file")?;
+    let first_reader = BufReader::new(first_file);
+    let first_data: VoiceStyleData = serde_json::from_reader(first_reader)?;
+
+    let ttl_dims = &first_data.style_ttl.dims;
+    let dp_dims = &first_data.style_dp.dims;
+
+    let ttl_dim1 = ttl_dims[1];
+    let ttl_dim2 = ttl_dims[2];
+    let dp_dim1 = dp_dims[1];
+    let dp_dim2 = dp_dims[2];
+
+    let ttl_size = bsz * ttl_dim1 * ttl_dim2;
+    let dp_size = bsz * dp_dim1 * dp_dim2;
+    let mut ttl_flat = vec![0.0f32; ttl_size];
+    let mut dp_flat = vec![0.0f32; dp_size];
+
+    for (i, path) in voice_style_paths.iter().enumerate() {
+        let file = File::open(path).context("Failed to open voice style file")?;
+        let reader = BufReader::new(file);
+        let data: VoiceStyleData = serde_json::from_reader(reader)?;
+
+        let ttl_offset = i * ttl_dim1 * ttl_dim2;
+        let mut idx = 0;
+        for batch in &data.style_ttl.data {
+            for row in batch {
+                for &val in row {
+                    ttl_flat[ttl_offset + idx] = val;
+                    idx += 1;
+                }
+            }
+        }
+
+        let dp_offset = i * dp_dim1 * dp_dim2;
+        idx = 0;
+        for batch in &data.style_dp.data {
+            for row in batch {
+                for &val in row {
+                    dp_flat[dp_offset + idx] = val;
+                    idx += 1;
+                }
+            }
+        }
+    }
+
+    let ttl_style = Array3::from_shape_vec((bsz, ttl_dim1, ttl_dim2), ttl_flat)?;
+    let dp_style = Array3::from_shape_vec((bsz, dp_dim1, dp_dim2), dp_flat)?;
+
+    if verbose {
+        tracing::info!("Loaded {} voice styles", bsz);
+    }
+
+    Ok(Style {
+        ttl: ttl_style,
+        dp: dp_style,
+    })
+}
+
+#[cfg(feature = "webgpu")]
+fn load_backend_webgpu(config: &HashMap<String, String>) -> Result<ExecutionProviderDispatch> {
+    let webgpu_device_id = config
+        .get("WEBGPU_DEVICE_ID")
+        .cloned()
+        .unwrap_or_else(|| "0".to_string())
+        .parse::<i32>()
+        .inspect_err(|e| tracing::error!("{e}"))?;
+
+    let webgpu = ort::ep::WebGPU::default().with_device_id(webgpu_device_id);
+
+    Ok(webgpu.build())
+}
+
+#[cfg(feature = "cuda")]
+fn load_backend_cuda(config: &HashMap<String, String>) -> Result<ExecutionProviderDispatch> {
+    let cuda_device_id = config
+        .get("CUDA_DEVICE_ID")
+        .cloned()
+        .unwrap_or_else(|| "0".to_string())
+        .parse::<i32>()
+        .inspect_err(|e| tracing::error!("{e}"))?;
+
+    let cuda = ort::ep::CUDA::default().with_device_id(cuda_device_id);
+
+    Ok(cuda.build())
+}
+
+fn load_backends(config: &HashMap<String, String>) -> Vec<ExecutionProviderDispatch> {
+    let enabled_backends = config
+        .get("ENABLED_BACKENDS")
+        .map(|v| v.as_str())
+        .unwrap_or("")
+        .split(",")
+        .map(Into::into)
+        .collect::<Vec<String>>();
+
+    enabled_backends.iter().filter_map(|name| {
+        #[cfg(feature = "cuda")]
+        if name == "cuda" {
+            return load_backend_cuda(config)
+                .inspect_err(|err| {
+                    tracing::error!("Failed to load backend *{}*: {:?}", name, err);
+                })
+                .ok();
+        }
+
+        #[cfg(feature = "webgpu")]
+        if name == "webgpu" {
+            return load_backend_webgpu(config)
+                .inspect_err(|err| {
+                    tracing::error!("Failed to load backend *{}*: {:?}", name, err);
+                })
+                .ok();
+        }
+
+        tracing::error!(
+            "ENABLED_BACKENDS contains {}, but the binary is not compiled with {} backend support.",
+            name,
+            name
+        );
+        None
+    }).collect()
+}
+
+pub fn load_text_to_speech(onnx_dir: &str) -> Result<TextToSpeech> {
+    let cfgs = load_cfgs(onnx_dir)?;
+
+    let dp_path = format!("{}/duration_predictor.onnx", onnx_dir);
+    let text_enc_path = format!("{}/text_encoder.onnx", onnx_dir);
+    let vector_est_path = format!("{}/vector_estimator.onnx", onnx_dir);
+    let vocoder_path = format!("{}/vocoder.onnx", onnx_dir);
+
+    tracing::info!("Session successfully loaded with Vulkan GPU acceleration!");
+
+    let providers = load_backends(&std::env::vars().collect());
+
+    let dp_ort = Session::builder()?
+        .with_intra_threads(8)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .with_execution_providers(&providers)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .commit_from_file(&dp_path)?;
+    let text_enc_ort = Session::builder()?
+        .with_intra_threads(8)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .with_execution_providers(&providers)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .commit_from_file(&text_enc_path)?;
+    let vector_est_ort = Session::builder()?
+        .with_intra_threads(8)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .with_execution_providers(&providers)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .commit_from_file(&vector_est_path)?;
+    let vocoder_ort = Session::builder()?
+        .with_intra_threads(8)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .with_execution_providers(&providers)
+        .map_err(|e| anyhow!(e.message().to_string()))?
+        .commit_from_file(&vocoder_path)?;
+
+    let unicode_indexer_path = format!("{}/unicode_indexer.json", onnx_dir);
+    let text_processor = UnicodeProcessor::new(&unicode_indexer_path)?;
+
+    Ok(TextToSpeech::new(
+        cfgs,
+        text_processor,
+        dp_ort,
+        text_enc_ort,
+        vector_est_ort,
+        vocoder_ort,
+    ))
+}
--- a/crates/yaejuyang-supertonic/src/tts/mod.rs
+++ b/crates/yaejuyang-supertonic/src/tts/mod.rs
@ -0,0 +1,8 @@
+pub mod assets;
+pub mod engine;
+pub mod pool;
+pub mod text;
+pub mod wav;
+
+pub use engine::{Style, TextToSpeech, load_text_to_speech, load_voice_style};
+pub use pool::{TtsOpts, TtsPool};
--- a/crates/yaejuyang-supertonic/src/tts/pool.rs
+++ b/crates/yaejuyang-supertonic/src/tts/pool.rs
@ -0,0 +1,98 @@
+use anyhow::{Context, Result};
+use crossbeam_channel::{Sender, bounded};
+use std::sync::Arc;
+use std::thread;
+use std::time::Instant;
+use tokio::sync::oneshot;
+
+use super::engine::{Style, TextToSpeech};
+use super::wav::wav_bytes;
+
+#[derive(Clone, Copy, Debug)]
+pub struct TtsOpts {
+    pub total_step: usize,
+    pub speed: f32,
+    pub silence_duration: f32,
+}
+
+pub struct TtsJob {
+    pub text: String,
+    pub lang: String,
+    pub reply: oneshot::Sender<Result<Vec<u8>, String>>,
+}
+
+pub struct TtsPool {
+    tx: Sender<TtsJob>,
+}
+
+impl TtsPool {
+    pub fn spawn<I>(workers: usize, init: I, opts: TtsOpts) -> Result<Self>
+    where
+        I: Fn(u32) -> Result<(TextToSpeech, Style)> + Send + Sync + 'static,
+    {
+        let workers = workers.max(1);
+        let (tx, rx) = bounded::<TtsJob>(workers * 4);
+        let init = Arc::new(init);
+
+        for worker_id in 0..workers {
+            let rx = rx.clone();
+            let init = init.clone();
+            thread::Builder::new()
+                .name(format!("supertonic-tts-{worker_id}"))
+                .spawn(move || {
+                    let (mut tts, style) = match init(worker_id as u32) {
+                        Ok(pair) => pair,
+                        Err(e) => {
+                            tracing::error!("worker {worker_id} init failed: {e:?}");
+                            return;
+                        }
+                    };
+                    tracing::info!("supertonic worker {worker_id} ready");
+
+                    while let Ok(job) = rx.recv() {
+                        let start_at = Instant::now();
+
+                        let result = (|| -> Result<Vec<u8>, String> {
+                            let (wav, _dur) = tts
+                                .synthesize(
+                                    &job.text,
+                                    &job.lang,
+                                    &style,
+                                    opts.total_step,
+                                    opts.speed,
+                                    opts.silence_duration,
+                                )
+                                .map_err(|e| e.to_string())?;
+                            wav_bytes(&wav, tts.sample_rate).map_err(|e| e.to_string())
+                        })();
+                        tracing::info!("synthesize taken {}ms", start_at.elapsed().as_millis());
+
+                        let _ = job.reply.send(result);
+                    }
+                    tracing::info!("supertonic worker {worker_id} exiting");
+                })
+                .context("failed to spawn TTS worker thread")?;
+        }
+
+        Ok(TtsPool { tx })
+    }
+
+    pub async fn synthesize(&self, text: String, lang: String) -> Result<Vec<u8>, String> {
+        let (reply_tx, reply_rx) = oneshot::channel();
+        let job = TtsJob {
+            text,
+            lang,
+            reply: reply_tx,
+        };
+
+        let tx = self.tx.clone();
+        tokio::task::spawn_blocking(move || tx.send(job))
+            .await
+            .map_err(|e| format!("dispatch task join failed: {e}"))?
+            .map_err(|_| "TTS pool channel closed".to_string())?;
+
+        reply_rx
+            .await
+            .map_err(|_| "TTS worker dropped reply channel".to_string())?
+    }
+}
--- a/crates/yaejuyang-supertonic/src/tts/text.rs
+++ b/crates/yaejuyang-supertonic/src/tts/text.rs
@ -0,0 +1,375 @@
+use anyhow::{Result, bail};
+use ndarray::Array3;
+use regex::Regex;
+use serde::Deserialize;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::Path;
+use unicode_normalization::UnicodeNormalization;
+
+pub const AVAILABLE_LANGS: &[&str] = &[
+    "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi", "fr", "hi", "hr", "hu",
+    "id", "it", "lt", "lv", "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr", "uk", "vi", "na",
+];
+
+pub fn is_valid_lang(lang: &str) -> bool {
+    AVAILABLE_LANGS.contains(&lang)
+}
+
+pub struct UnicodeProcessor {
+    indexer: Vec<i64>,
+}
+
+impl UnicodeProcessor {
+    pub fn new<P: AsRef<Path>>(unicode_indexer_json_path: P) -> Result<Self> {
+        let file = File::open(unicode_indexer_json_path)?;
+        let reader = BufReader::new(file);
+        let indexer: Vec<i64> = serde_json::from_reader(reader)?;
+        Ok(UnicodeProcessor { indexer })
+    }
+
+    pub fn call(
+        &self,
+        text_list: &[String],
+        lang_list: &[String],
+    ) -> Result<(Vec<Vec<i64>>, Array3<f32>)> {
+        let mut processed_texts: Vec<String> = Vec::new();
+        for (text, lang) in text_list.iter().zip(lang_list.iter()) {
+            processed_texts.push(preprocess_text(text, lang)?);
+        }
+
+        let text_ids_lengths: Vec<usize> =
+            processed_texts.iter().map(|t| t.chars().count()).collect();
+
+        let max_len = *text_ids_lengths.iter().max().unwrap_or(&0);
+
+        let mut text_ids = Vec::new();
+        for text in &processed_texts {
+            let mut row = vec![0i64; max_len];
+            let unicode_vals = text_to_unicode_values(text);
+            for (j, &val) in unicode_vals.iter().enumerate() {
+                if val < self.indexer.len() {
+                    row[j] = self.indexer[val];
+                } else {
+                    row[j] = -1;
+                }
+            }
+            text_ids.push(row);
+        }
+
+        let text_mask = get_text_mask(&text_ids_lengths);
+
+        Ok((text_ids, text_mask))
+    }
+}
+
+pub fn preprocess_text(text: &str, lang: &str) -> Result<String> {
+    let mut text: String = text.nfkd().collect();
+
+    let emoji_pattern = Regex::new(r"[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+").unwrap();
+    text = emoji_pattern.replace_all(&text, "").to_string();
+
+    let replacements = [
+        ("\u{2013}", "-"),
+        ("\u{2011}", "-"),
+        ("\u{2014}", "-"),
+        ("_", " "),
+        ("\u{201C}", "\""),
+        ("\u{201D}", "\""),
+        ("\u{2018}", "'"),
+        ("\u{2019}", "'"),
+        ("\u{00B4}", "'"),
+        ("`", "'"),
+        ("[", " "),
+        ("]", " "),
+        ("|", " "),
+        ("/", " "),
+        ("#", " "),
+        ("\u{2192}", " "),
+        ("\u{2190}", " "),
+    ];
+
+    for (from, to) in &replacements {
+        text = text.replace(from, to);
+    }
+
+    let special_symbols = ["\u{2665}", "\u{2606}", "\u{2661}", "\u{00A9}", "\\"];
+    for symbol in &special_symbols {
+        text = text.replace(symbol, "");
+    }
+
+    let expr_replacements = [
+        ("@", " at "),
+        ("e.g.,", "for example, "),
+        ("i.e.,", "that is, "),
+    ];
+
+    for (from, to) in &expr_replacements {
+        text = text.replace(from, to);
+    }
+
+    text = Regex::new(r" ,")
+        .unwrap()
+        .replace_all(&text, ",")
+        .to_string();
+    text = Regex::new(r" \.")
+        .unwrap()
+        .replace_all(&text, ".")
+        .to_string();
+    text = Regex::new(r" !")
+        .unwrap()
+        .replace_all(&text, "!")
+        .to_string();
+    text = Regex::new(r" \?")
+        .unwrap()
+        .replace_all(&text, "?")
+        .to_string();
+    text = Regex::new(r" ;")
+        .unwrap()
+        .replace_all(&text, ";")
+        .to_string();
+    text = Regex::new(r" :")
+        .unwrap()
+        .replace_all(&text, ":")
+        .to_string();
+    text = Regex::new(r" '")
+        .unwrap()
+        .replace_all(&text, "'")
+        .to_string();
+
+    while text.contains("\"\"") {
+        text = text.replace("\"\"", "\"");
+    }
+    while text.contains("''") {
+        text = text.replace("''", "'");
+    }
+    while text.contains("``") {
+        text = text.replace("``", "`");
+    }
+
+    text = Regex::new(r"\s+")
+        .unwrap()
+        .replace_all(&text, " ")
+        .to_string();
+    text = text.trim().to_string();
+
+    if !text.is_empty() {
+        let ends_with_punct =
+            Regex::new(r#"[.!?;:,'"\u{201C}\u{201D}\u{2018}\u{2019})\]}\u{2026}\u{3002}\u{300D}\u{300F}\u{3011}\u{3009}\u{300B}\u{203A}\u{00BB}]$"#).unwrap();
+        if !ends_with_punct.is_match(&text) {
+            text.push('.');
+        }
+    }
+
+    if !is_valid_lang(lang) {
+        bail!(
+            "Invalid language: {}. Available: {:?}",
+            lang,
+            AVAILABLE_LANGS
+        );
+    }
+
+    text = format!("<{}>{}</{}>", lang, text, lang);
+
+    Ok(text)
+}
+
+pub fn text_to_unicode_values(text: &str) -> Vec<usize> {
+    text.chars().map(|c| c as usize).collect()
+}
+
+pub fn length_to_mask(lengths: &[usize], max_len: Option<usize>) -> Array3<f32> {
+    let bsz = lengths.len();
+    let max_len = max_len.unwrap_or_else(|| *lengths.iter().max().unwrap_or(&0));
+
+    let mut mask = Array3::<f32>::zeros((bsz, 1, max_len));
+    for (i, &len) in lengths.iter().enumerate() {
+        for j in 0..len.min(max_len) {
+            mask[[i, 0, j]] = 1.0;
+        }
+    }
+    mask
+}
+
+pub fn get_text_mask(text_ids_lengths: &[usize]) -> Array3<f32> {
+    let max_len = *text_ids_lengths.iter().max().unwrap_or(&0);
+    length_to_mask(text_ids_lengths, Some(max_len))
+}
+
+const MAX_CHUNK_LENGTH: usize = 300;
+
+const ABBREVIATIONS: &[&str] = &[
+    "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.", "St.", "Ave.", "Rd.", "Blvd.", "Dept.",
+    "Inc.", "Ltd.", "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.",
+];
+
+pub fn chunk_text(text: &str, max_len: Option<usize>) -> Vec<String> {
+    let max_len = max_len.unwrap_or(MAX_CHUNK_LENGTH);
+    let text = text.trim();
+
+    if text.is_empty() {
+        return vec![String::new()];
+    }
+
+    let para_re = Regex::new(r"\n\s*\n").unwrap();
+    let paragraphs: Vec<&str> = para_re.split(text).collect();
+    let mut chunks = Vec::new();
+
+    for para in paragraphs {
+        let para = para.trim();
+        if para.is_empty() {
+            continue;
+        }
+
+        if para.len() <= max_len {
+            chunks.push(para.to_string());
+            continue;
+        }
+
+        let sentences = split_sentences(para);
+        let mut current = String::new();
+        let mut current_len = 0;
+
+        for sentence in sentences {
+            let sentence = sentence.trim();
+            if sentence.is_empty() {
+                continue;
+            }
+
+            let sentence_len = sentence.len();
+            if sentence_len > max_len {
+                if !current.is_empty() {
+                    chunks.push(current.trim().to_string());
+                    current.clear();
+                    current_len = 0;
+                }
+
+                let parts: Vec<&str> = sentence.split(',').collect();
+                for part in parts {
+                    let part = part.trim();
+                    if part.is_empty() {
+                        continue;
+                    }
+
+                    let part_len = part.len();
+                    if part_len > max_len {
+                        let words: Vec<&str> = part.split_whitespace().collect();
+                        let mut word_chunk = String::new();
+                        let mut word_chunk_len = 0;
+
+                        for word in words {
+                            let word_len = word.len();
+                            if word_chunk_len + word_len + 1 > max_len && !word_chunk.is_empty() {
+                                chunks.push(word_chunk.trim().to_string());
+                                word_chunk.clear();
+                                word_chunk_len = 0;
+                            }
+
+                            if !word_chunk.is_empty() {
+                                word_chunk.push(' ');
+                                word_chunk_len += 1;
+                            }
+                            word_chunk.push_str(word);
+                            word_chunk_len += word_len;
+                        }
+
+                        if !word_chunk.is_empty() {
+                            chunks.push(word_chunk.trim().to_string());
+                        }
+                    } else {
+                        if current_len + part_len + 1 > max_len && !current.is_empty() {
+                            chunks.push(current.trim().to_string());
+                            current.clear();
+                            current_len = 0;
+                        }
+
+                        if !current.is_empty() {
+                            current.push_str(", ");
+                            current_len += 2;
+                        }
+                        current.push_str(part);
+                        current_len += part_len;
+                    }
+                }
+                continue;
+            }
+
+            if current_len + sentence_len + 1 > max_len && !current.is_empty() {
+                chunks.push(current.trim().to_string());
+                current.clear();
+                current_len = 0;
+            }
+
+            if !current.is_empty() {
+                current.push(' ');
+                current_len += 1;
+            }
+            current.push_str(sentence);
+            current_len += sentence_len;
+        }
+
+        if !current.is_empty() {
+            chunks.push(current.trim().to_string());
+        }
+    }
+
+    if chunks.is_empty() {
+        vec![String::new()]
+    } else {
+        chunks
+    }
+}
+
+fn split_sentences(text: &str) -> Vec<String> {
+    let re = Regex::new(r"([.!?])\s+").unwrap();
+
+    let matches: Vec<_> = re.find_iter(text).collect();
+    if matches.is_empty() {
+        return vec![text.to_string()];
+    }
+
+    let mut sentences = Vec::new();
+    let mut last_end = 0;
+
+    for m in matches {
+        let before_punc = &text[last_end..m.start()];
+
+        let mut is_abbrev = false;
+        for abbrev in ABBREVIATIONS {
+            let combined = format!("{}{}", before_punc.trim(), &text[m.start()..m.start() + 1]);
+            if combined.ends_with(abbrev) {
+                is_abbrev = true;
+                break;
+            }
+        }
+
+        if !is_abbrev {
+            sentences.push(text[last_end..m.end()].to_string());
+            last_end = m.end();
+        }
+    }
+
+    if last_end < text.len() {
+        sentences.push(text[last_end..].to_string());
+    }
+
+    if sentences.is_empty() {
+        vec![text.to_string()]
+    } else {
+        sentences
+    }
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct VoiceStyleData {
+    pub style_ttl: StyleComponent,
+    pub style_dp: StyleComponent,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct StyleComponent {
+    pub data: Vec<Vec<Vec<f32>>>,
+    pub dims: Vec<usize>,
+    #[serde(rename = "type")]
+    pub _dtype: String,
+}
--- a/crates/yaejuyang-supertonic/src/tts/wav.rs
+++ b/crates/yaejuyang-supertonic/src/tts/wav.rs
@ -0,0 +1,25 @@
+use anyhow::Result;
+use hound::{SampleFormat, WavSpec, WavWriter};
+use std::io::Cursor;
+
+pub fn wav_bytes(audio: &[f32], sample_rate: i32) -> Result<Vec<u8>> {
+    let spec = WavSpec {
+        channels: 1,
+        sample_rate: sample_rate as u32,
+        bits_per_sample: 16,
+        sample_format: SampleFormat::Int,
+    };
+
+    let mut buf = Cursor::new(Vec::<u8>::new());
+    {
+        let mut writer = WavWriter::new(&mut buf, spec)?;
+        for &sample in audio {
+            let clamped = sample.clamp(-1.0, 1.0);
+            let val = (clamped * 32767.0) as i16;
+            writer.write_sample(val)?;
+        }
+        writer.finalize()?;
+    }
+
+    Ok(buf.into_inner())
+}