From f5194f55a11485b1f4d156e92656d2d4edbe04ed Mon Sep 17 00:00:00 2001
From: claude-bot
Date: Sun, 3 May 2026 18:24:39 +0900
Subject: [PATCH] Tune MeloTTS speed and prosody defaults

---
 .env.example                   |  5 ++++-
 README.md                      | 13 +++++++++++--
 docker/melotts/Dockerfile      |  2 +-
 docker/melotts/melo_tts_cli.py |  6 ++++++
 src/config.ts                  |  5 ++++-
 src/services/melo-tts.ts       | 28 +++++++++++++++++++++++++++-
 6 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/.env.example b/.env.example
index 3635da9..2fed58f 100644
--- a/.env.example
+++ b/.env.example
@@ -13,7 +13,10 @@ TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2
 TTS_LANGUAGE=KR
 TTS_SPEAKER=KR
 TTS_DEVICE=cpu
-TTS_SPEED=1
+TTS_SPEED=1.18
+TTS_SDP_RATIO=0.22
+TTS_NOISE_SCALE=0.55
+TTS_NOISE_SCALE_W=0.75
 TTS_CACHE_DIR=.local-ai/tts-cache
 TTS_OUTPUT_DIR=.local-ai/tts-output
 OLLAMA_BASE_URL=http://127.0.0.1:11434
diff --git a/README.md b/README.md
index c185732..5c94fd3 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,13 @@ bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다."
   - 기본값 `cpu`
   - Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다
 - `TTS_SPEED`
-  - 기본값 `1`
+  - 기본값 `1.18`
+- `TTS_SDP_RATIO`
+  - 기본값 `0.22`
+- `TTS_NOISE_SCALE`
+  - 기본값 `0.55`
+- `TTS_NOISE_SCALE_W`
+  - 기본값 `0.75`
 - `OLLAMA_KEEP_ALIVE`
   - 기본값 `5m`
 - `MAX_CONVERSATION_TURNS`
@@ -180,7 +186,10 @@ TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2
 TTS_LANGUAGE=KR
 TTS_SPEAKER=KR
 TTS_DEVICE=cpu
-TTS_SPEED=1
+TTS_SPEED=1.18
+TTS_SDP_RATIO=0.22
+TTS_NOISE_SCALE=0.55
+TTS_NOISE_SCALE_W=0.75
 TTS_CACHE_DIR=.local-ai/tts-cache
 TTS_OUTPUT_DIR=.local-ai/tts-output
 OLLAMA_BASE_URL=http://127.0.0.1:11434
diff --git a/docker/melotts/Dockerfile b/docker/melotts/Dockerfile
index 449ab0a..c9f93fa 100644
--- a/docker/melotts/Dockerfile
+++ b/docker/melotts/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ENV PYTHONUNBUFFERED=1
 
diff --git a/docker/melotts/melo_tts_cli.py b/docker/melotts/melo_tts_cli.py
index 240c648..dc58972 100644
--- a/docker/melotts/melo_tts_cli.py
+++ b/docker/melotts/melo_tts_cli.py
@@ -11,6 +11,9 @@ def main() -> None:
     parser.add_argument("--language", default="KR")
     parser.add_argument("--speaker", default="KR")
     parser.add_argument("--speed", type=float, default=1.0)
+    parser.add_argument("--sdp-ratio", type=float, default=0.2)
+    parser.add_argument("--noise-scale", type=float, default=0.6)
+    parser.add_argument("--noise-scale-w", type=float, default=0.8)
     parser.add_argument("--device", default="cpu")
     args = parser.parse_args()
 
@@ -29,6 +32,9 @@ def main() -> None:
         speaker_ids[args.speaker],
         str(output_path),
         speed=args.speed,
+        sdp_ratio=args.sdp_ratio,
+        noise_scale=args.noise_scale,
+        noise_scale_w=args.noise_scale_w,
     )
 
 
diff --git a/src/config.ts b/src/config.ts
index edc1e9b..a8fe226 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -24,7 +24,10 @@ const envSchema = z.object({
   TTS_LANGUAGE: z.string().min(1).default("KR"),
   TTS_SPEAKER: z.string().min(1).default("KR"),
   TTS_DEVICE: z.string().min(1).default("cpu"),
-  TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1),
+  TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1.18),
+  TTS_SDP_RATIO: z.coerce.number().min(0).max(1).default(0.22),
+  TTS_NOISE_SCALE: z.coerce.number().min(0).max(2).default(0.55),
+  TTS_NOISE_SCALE_W: z.coerce.number().min(0).max(2).default(0.75),
   TTS_CACHE_DIR: z.string().min(1).default(".local-ai/tts-cache"),
   TTS_OUTPUT_DIR: z.string().min(1).default(".local-ai/tts-output"),
   DEBUG: z
diff --git a/src/services/melo-tts.ts b/src/services/melo-tts.ts
index 4ccd992..7c2808a 100644
--- a/src/services/melo-tts.ts
+++ b/src/services/melo-tts.ts
@@ -57,7 +57,7 @@ export class MeloTtsService {
   }
 
   async speak(text: string): Promise<void> {
-    const trimmed = text.trim();
+    const trimmed = this.normalizeText(text);
     if (!trimmed) {
       return;
     }
@@ -113,6 +113,12 @@ export class MeloTtsService {
       this.config.TTS_SPEAKER,
       "--speed",
       String(this.config.TTS_SPEED),
+      "--sdp-ratio",
+      String(this.config.TTS_SDP_RATIO),
+      "--noise-scale",
+      String(this.config.TTS_NOISE_SCALE),
+      "--noise-scale-w",
+      String(this.config.TTS_NOISE_SCALE_W),
       "--device",
       this.config.TTS_DEVICE,
     );
@@ -122,10 +128,30 @@ export class MeloTtsService {
       language: this.config.TTS_LANGUAGE,
       speaker: this.config.TTS_SPEAKER,
       speed: this.config.TTS_SPEED,
+      sdp_ratio: this.config.TTS_SDP_RATIO,
+      noise_scale: this.config.TTS_NOISE_SCALE,
+      noise_scale_w: this.config.TTS_NOISE_SCALE_W,
       device: this.config.TTS_DEVICE,
     });
 
     const docker = await resolveDockerCommand(this.config);
     await run(docker, args, "inherit");
   }
+
+  private normalizeText(input: string): string {
+    const collapsed = input
+      .replace(/[`*_#>\[\]\(\)]/g, " ")
+      .replace(/\s+/g, " ")
+      .trim();
+
+    if (!collapsed) {
+      return "";
+    }
+
+    if (/[.!?…]$/.test(collapsed)) {
+      return collapsed;
+    }
+
+    return `${collapsed}.`;
+  }
 }