From ad357a6edec9c3980041ec2f2872c0f0a3856f99 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sun, 3 May 2026 01:56:09 +0900 Subject: [PATCH] Add local MeloTTS support --- .env.example | 8 +++ README.md | 42 +++++++++++- docker/melotts/Dockerfile | 18 +++++ docker/melotts/melo_tts_cli.py | 36 ++++++++++ package.json | 2 + src/audio/realtime-segmenter.ts | 9 +++ src/config.ts | 11 ++++ src/index.ts | 54 ++++++++++++++- src/services/audio-playback.ts | 42 ++++++++++++ src/services/melo-tts.ts | 113 ++++++++++++++++++++++++++++++++ src/services/ollama-llm.ts | 2 + src/setup-tts.ts | 60 +++++++++++++++++ src/setup.ts | 2 + 13 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 docker/melotts/Dockerfile create mode 100644 docker/melotts/melo_tts_cli.py create mode 100644 src/services/audio-playback.ts create mode 100644 src/services/melo-tts.ts create mode 100644 src/setup-tts.ts diff --git a/.env.example b/.env.example index b0e83fb..2dc385e 100644 --- a/.env.example +++ b/.env.example @@ -7,6 +7,14 @@ LOCAL_AI_PYTHON=python AUDIO_SOURCE= DEBUG=false +TTS_ENABLED=true +TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2 +TTS_LANGUAGE=KR +TTS_SPEAKER=KR +TTS_DEVICE=cpu +TTS_SPEED=1 +TTS_CACHE_DIR=.local-ai/tts-cache +TTS_OUTPUT_DIR=.local-ai/tts-output OLLAMA_BASE_URL=http://127.0.0.1:11434 OLLAMA_MODEL=qwen3:8b OLLAMA_KEEP_ALIVE=5m diff --git a/README.md b/README.md index 3c63a15..48e06b9 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ - STT 전용 테스트 - STT 결과에 대해 답변 가치 판단 후 필요할 때만 LLM 답변하는 통합 테스트 - 로컬 `Ollama` LLM 에이전트 CLI 테스트 +- 무료 로컬 `MeloTTS` 기반 음성 출력 테스트 ## 빠른 시작 @@ -47,6 +48,12 @@ LLM 단독 테스트: bun run test:llm ``` +TTS 단독 테스트: + +```bat +bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다." 
+``` + ## 환경 변수 - `AUDIO_SOURCE` @@ -61,6 +68,19 @@ bun run test:llm - 기본값 `http://127.0.0.1:11434` - `OLLAMA_MODEL` - 기본값 `qwen3:8b` +- `TTS_ENABLED` + - 기본값 `true` +- `TTS_IMAGE` + - 기본값 `realtime-voice-bot-melotts:v0.1.2` +- `TTS_LANGUAGE` + - 기본값 `KR` +- `TTS_SPEAKER` + - 기본값 `KR` +- `TTS_DEVICE` + - 기본값 `cpu` + - Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다 +- `TTS_SPEED` + - 기본값 `1` - `OLLAMA_KEEP_ALIVE` - 기본값 `5m` - `MAX_CONVERSATION_TURNS` @@ -84,12 +104,15 @@ bun run test:llm ## 메모 - 이 버전은 `STT`, `STT+LLM`, `LLM` 테스트를 따로 제공합니다. +- `test:sttllm`에서 TTS가 켜져 있으면 답변을 스피커로 읽어줍니다. +- `test:sttllm`에서는 자기 음성을 다시 전사하지 않도록 TTS 재생 중에는 캡처를 잠시 멈춥니다. - LLM 프롬프트는 `prompts/*.md` 에 분리되어 있습니다. - 최소 지연을 위해 파일 저장은 하지 않습니다. - VAD는 현재 모델 기반이 아니라 진폭 기반 단순 분리입니다. - Windows에서는 보통 출력 루프백이 가능한 장치나 `Stereo Mix`, 오디오 인터페이스 loopback 채널을 `AUDIO_SOURCE`로 잡아야 합니다. - 단순히 스피커 이름을 넣는 구조가 아니라, **루프백/캡처 가능한 입력 장치 이름**을 넣어야 합니다. - `ffmpeg`가 PATH에 잡혀 있어야 합니다. +- TTS는 Windows에서 Docker Desktop이 필요합니다. MeloTTS 공식 문서도 Windows/macOS에서는 Docker 실행을 권장합니다. - `cmd` 기준으로 `.env`는 `copy .env.example .env`로 만들면 됩니다. ## Windows 테스트 순서 @@ -105,8 +128,9 @@ bun run test:llm ## Windows STT+LLM 통합 테스트 순서 1. `bun run setup:llm` -2. `bun run test:sttllm` -3. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변 확인 +2. `bun run setup:tts` +3. `bun run test:sttllm` +4. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변, 음성 출력 확인 ## Windows LLM 테스트 순서 @@ -115,6 +139,12 @@ bun run test:llm 3. 콘솔에 직접 문장을 입력하고 답변 확인 4. `/reset` 으로 문맥 초기화, `/exit` 로 종료 +## Windows TTS 테스트 순서 + +1. Docker Desktop 실행 +2. `bun run setup:tts` +3. `bun run test:tts -- "안녕하세요. 
로컬 티티에스 테스트입니다."` + 현재 `test:llm` 에이전트 도구: - 현재 시간 조회 - 현재 런타임 설정 조회 @@ -141,6 +171,14 @@ bun run test:llm LOCAL_AI_PYTHON=python AUDIO_SOURCE= DEBUG=false +TTS_ENABLED=true +TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2 +TTS_LANGUAGE=KR +TTS_SPEAKER=KR +TTS_DEVICE=cpu +TTS_SPEED=1 +TTS_CACHE_DIR=.local-ai/tts-cache +TTS_OUTPUT_DIR=.local-ai/tts-output OLLAMA_BASE_URL=http://127.0.0.1:11434 OLLAMA_MODEL=qwen3:8b OLLAMA_KEEP_ALIVE=5m diff --git a/docker/melotts/Dockerfile b/docker/melotts/Dockerfile new file mode 100644 index 0000000..bc66544 --- /dev/null +++ b/docker/melotts/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.9-slim + +ENV PYTHONUNBUFFERED=1 + +WORKDIR /opt/realtime-voice-bot + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + git \ + libsndfile1 \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2 +RUN python -m unidic download + +COPY melo_tts_cli.py /opt/realtime-voice-bot/melo_tts_cli.py + +ENTRYPOINT ["python", "/opt/realtime-voice-bot/melo_tts_cli.py"] diff --git a/docker/melotts/melo_tts_cli.py b/docker/melotts/melo_tts_cli.py new file mode 100644 index 0000000..240c648 --- /dev/null +++ b/docker/melotts/melo_tts_cli.py @@ -0,0 +1,36 @@ +import argparse +from pathlib import Path + +from melo.api import TTS + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--text", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--language", default="KR") + parser.add_argument("--speaker", default="KR") + parser.add_argument("--speed", type=float, default=1.0) + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + model = TTS(language=args.language, device=args.device) + speaker_ids = model.hps.data.spk2id + + if args.speaker not in speaker_ids: + supported = ", 
".join(sorted(speaker_ids.keys())) + raise SystemExit(f"지원하지 않는 speaker 입니다: {args.speaker}. 사용 가능: {supported}") + + model.tts_to_file( + args.text, + speaker_ids[args.speaker], + str(output_path), + speed=args.speed, + ) + + +if __name__ == "__main__": + main() diff --git a/package.json b/package.json index 5c52456..131fad4 100644 --- a/package.json +++ b/package.json @@ -7,10 +7,12 @@ "setup": "bun src/setup.ts", "setup:stt": "bun src/setup-python.ts", "setup:llm": "bun src/setup-llm.ts", + "setup:tts": "bun src/setup-tts.ts", "setup:python": "bun run setup:stt", "test:stt": "bun src/index.ts test-stt", "test:sttllm": "bun src/index.ts test-sttllm", "test:llm": "bun src/index.ts test-llm", + "test:tts": "bun src/index.ts test-tts", "devices": "bun src/index.ts devices", "check": "tsc --noEmit", "build": "tsc -p tsconfig.json" diff --git a/src/audio/realtime-segmenter.ts b/src/audio/realtime-segmenter.ts index 3310200..6addcdd 100644 --- a/src/audio/realtime-segmenter.ts +++ b/src/audio/realtime-segmenter.ts @@ -55,6 +55,15 @@ export class RealtimeSegmenter { } } + reset(): void { + this.pendingSamples.splice(0, this.pendingSamples.length); + this.preRoll.splice(0, this.preRoll.length); + this.speech.splice(0, this.speech.length); + this.speechActive = false; + this.speechCandidateFrames = 0; + this.silenceFrames = 0; + } + private processFrame(frame: Int16Array): void { let peak = 0; for (const sample of frame) { diff --git a/src/config.ts b/src/config.ts index 785b54f..51002f5 100644 --- a/src/config.ts +++ b/src/config.ts @@ -15,6 +15,17 @@ const envSchema = z.object({ LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"), LOCAL_AI_PYTHON: emptyToUndefined, AUDIO_SOURCE: emptyToUndefined, + TTS_ENABLED: z + .string() + .optional() + .transform((value) => value?.trim().toLowerCase() !== "false"), + TTS_IMAGE: z.string().min(1).default("realtime-voice-bot-melotts:v0.1.2"), + TTS_LANGUAGE: z.string().min(1).default("KR"), + TTS_SPEAKER: 
z.string().min(1).default("KR"), + TTS_DEVICE: z.string().min(1).default("cpu"), + TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1), + TTS_CACHE_DIR: z.string().min(1).default(".local-ai/tts-cache"), + TTS_OUTPUT_DIR: z.string().min(1).default(".local-ai/tts-output"), DEBUG: z .string() .optional() diff --git a/src/index.ts b/src/index.ts index dbe2224..7428136 100644 --- a/src/index.ts +++ b/src/index.ts @@ -6,6 +6,7 @@ import { Logger } from "./logger.js"; import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js"; import { RealtimeSegmenter } from "./audio/realtime-segmenter.js"; import { FasterWhisperSttService } from "./services/faster-whisper-stt.js"; +import { MeloTtsService } from "./services/melo-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; const mode = process.argv[2] ?? "test-stt"; @@ -15,8 +16,10 @@ async function runSttTest(enableLlm: boolean): Promise { const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error"); const stt = new FasterWhisperSttService(config, logger); const llm = enableLlm ? new OllamaLlmService(config, logger) : null; + let tts = enableLlm && config.TTS_ENABLED ? 
new MeloTtsService(config, logger) : null; let capture = null as ReturnType | null; let shuttingDown: Promise | null = null; + let suppressCapture = false; let receivedChunks = 0; let receivedBytes = 0; let maxPeak = 0; @@ -79,6 +82,22 @@ async function runSttTest(enableLlm: boolean): Promise { logger.info("LLM warmup finished"); console.log("LLM 준비 완료"); } + if (tts) { + console.log("TTS 준비중..."); + try { + await tts.warmup(); + logger.info("TTS warmup finished", { + image: config.TTS_IMAGE, + language: config.TTS_LANGUAGE, + speaker: config.TTS_SPEAKER, + }); + console.log("TTS 준비 완료"); + } catch (error) { + logger.warn("TTS warmup failed", error); + console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요."); + tts = null; + } + } const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = []; let transcribing = false; @@ -155,6 +174,20 @@ async function runSttTest(enableLlm: boolean): Promise { } else { console.log(`답변> ${reply}`); } + + if (tts) { + suppressCapture = true; + segmenter.reset(); + try { + await tts.speak(reply); + } catch (error) { + logger.warn("TTS playback failed", error); + } finally { + suppressCapture = false; + sawSpeechStart = false; + maxPeak = 0; + } + } } } } catch (error) { @@ -227,6 +260,9 @@ async function runSttTest(enableLlm: boolean): Promise { receivedChunks += 1; receivedBytes += chunk.length; lastChunkAt = Date.now(); + if (suppressCapture) { + return; + } segmenter.pushChunk(chunk); }); capture.stderr.on("data", (chunk: Buffer) => { @@ -330,6 +366,19 @@ async function runLlmCli(): Promise { }); } +async function runTtsTest(): Promise { + const text = process.argv.slice(3).join(" ").trim() || "안녕하세요. 로컬 티티에스 테스트입니다."; + const config = loadConfig(); + const logger = new Logger(config.DEBUG ? 
config.LOG_LEVEL : "error");
+  const tts = new MeloTtsService(config, logger);
+
+  console.log("TTS 준비중...");
+  await tts.warmup();
+  console.log("TTS 준비 완료");
+  console.log(`재생 문장: ${text}`);
+  await tts.speak(text);
+}
+
 async function main(): Promise<void> {
   switch (mode) {
     case "devices":
@@ -344,8 +393,11 @@ async function main(): Promise<void> {
     case "test-llm":
       await runLlmCli();
       return;
+    case "test-tts":
+      await runTtsTest();
+      return;
     default:
-      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, devices`);
+      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, test-tts, devices`);
   }
 }
 
diff --git a/src/services/audio-playback.ts b/src/services/audio-playback.ts
new file mode 100644
index 0000000..1a75afa
--- /dev/null
+++ b/src/services/audio-playback.ts
@@ -0,0 +1,63 @@
+import { spawn } from "node:child_process";
+import process from "node:process";
+
+/** Runs `command` with `args`; resolves on exit code 0, rejects on a spawn error or any other exit code. */
+async function run(command: string, args: string[]): Promise<void> {
+  await new Promise<void>((resolve, reject) => {
+    const child = spawn(command, args, {
+      stdio: ["ignore", "inherit", "inherit"],
+      windowsHide: true,
+    });
+
+    child.on("error", reject);
+    child.on("exit", (code) => {
+      if (code === 0) {
+        resolve();
+        return;
+      }
+      reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`));
+    });
+  });
+}
+
+/**
+ * Quotes `value` as a PowerShell single-quoted string literal.
+ * Nothing is expanded inside such a literal; a quote character is escaped by doubling it.
+ * PowerShell also treats the typographic single quotes as delimiters, so they are doubled too.
+ */
+function toPowerShellLiteral(value: string): string {
+  return `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
+}
+
+/**
+ * Plays a WAV file and resolves once playback has finished (`PlaySync` blocks until the sound ends).
+ *
+ * Only Windows is supported: playback runs `System.Media.SoundPlayer` in a PowerShell child process.
+ *
+ * @param filePath Path of the WAV file to play.
+ * @throws Error on any platform other than win32, or when PowerShell fails to start or exits non-zero.
+ */
+export async function playWavFile(filePath: string): Promise<void> {
+  if (process.platform === "win32") {
+    // powershell.exe appends every argument that follows a -Command string to the command text,
+    // so $args is never populated; the path must be embedded in the script itself.
+    const script = [
+      `$path = ${toPowerShellLiteral(filePath)}`,
+      "$player = New-Object System.Media.SoundPlayer $path",
+      "$player.Load()",
+      "$player.PlaySync()",
+    ].join("; ");
+
+    await run("powershell.exe", [
+      "-NoProfile",
+      "-NonInteractive",
+      "-ExecutionPolicy",
+      "Bypass",
+      "-Command",
+      script,
+    ]);
+    return;
+  }
+
+  throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
+}
diff --git a/src/services/melo-tts.ts b/src/services/melo-tts.ts
new file mode 100644
index 0000000..ccf7daf
--- /dev/null
+++ b/src/services/melo-tts.ts
@@ -0,0 +1,113 @@
+import { spawn } from "node:child_process";
+import { randomUUID } from "node:crypto";
+import { mkdir, rm } from "node:fs/promises";
+import path from "node:path";
+
+import type { AppConfig } from "../config.js";
+import type { Logger } from "../logger.js";
+import { playWavFile } from "./audio-playback.js";
+
+async function run(command: string, args: string[], stdio: "ignore" | "inherit" = "ignore"): Promise<void> {
+  await new Promise<void>((resolve, reject) => {
+    const child = spawn(command, args, {
+      stdio: ["ignore", stdio, "inherit"],
+      windowsHide: true,
+    });
+
+    child.on("error", reject);
+    child.on("exit", (code) => {
+      if (code === 0) {
+        resolve();
+        return;
+      }
+      reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? 
"null"}`)); + }); + }); +} + +export class MeloTtsService { + constructor( + private readonly config: AppConfig, + private readonly logger: Logger, + ) {} + + async warmup(): Promise { + await mkdir(path.resolve(process.cwd(), this.config.TTS_CACHE_DIR), { recursive: true }); + await mkdir(path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR), { recursive: true }); + + await run("docker", ["--version"]); + await run("docker", ["image", "inspect", this.config.TTS_IMAGE]); + } + + async speak(text: string): Promise { + const trimmed = text.trim(); + if (!trimmed) { + return; + } + + const fileName = `tts-${Date.now()}-${randomUUID()}.wav`; + const targetPath = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR, fileName); + + try { + await this.synthesizeToFile(trimmed, targetPath); + await playWavFile(targetPath); + } finally { + await rm(targetPath, { force: true }).catch(() => undefined); + } + } + + async synthesizeToFile(text: string, targetPath: string): Promise { + await this.warmup(); + + const outputDir = path.dirname(targetPath); + const cacheDir = path.resolve(process.cwd(), this.config.TTS_CACHE_DIR); + const fileName = path.basename(targetPath); + + await mkdir(outputDir, { recursive: true }); + + const args = [ + "run", + "--rm", + "-v", + `${outputDir}:/work/output`, + "-v", + `${cacheDir}:/cache`, + "-e", + "HF_HOME=/cache/huggingface", + "-e", + "HF_HUB_CACHE=/cache/huggingface/hub", + "-e", + "TRANSFORMERS_CACHE=/cache/transformers", + ]; + + if (this.config.TTS_DEVICE !== "cpu") { + args.push("--gpus", "all"); + } + + args.push( + this.config.TTS_IMAGE, + "--text", + text, + "--output", + `/work/output/${fileName}`, + "--language", + this.config.TTS_LANGUAGE, + "--speaker", + this.config.TTS_SPEAKER, + "--speed", + String(this.config.TTS_SPEED), + "--device", + this.config.TTS_DEVICE, + ); + + this.logger.info("Starting MeloTTS synthesis", { + image: this.config.TTS_IMAGE, + language: this.config.TTS_LANGUAGE, + speaker: 
this.config.TTS_SPEAKER, + speed: this.config.TTS_SPEED, + device: this.config.TTS_DEVICE, + }); + + await run("docker", args, "inherit"); + } +} diff --git a/src/services/ollama-llm.ts b/src/services/ollama-llm.ts index bad8f6d..c7be41b 100644 --- a/src/services/ollama-llm.ts +++ b/src/services/ollama-llm.ts @@ -374,10 +374,12 @@ export class OllamaLlmService { "bun run setup", "bun run setup:stt", "bun run setup:llm", + "bun run setup:tts", "bun run devices", "bun run test:stt", "bun run test:sttllm", "bun run test:llm", + "bun run test:tts -- \"안녕하세요\"", ], }; } diff --git a/src/setup-tts.ts b/src/setup-tts.ts new file mode 100644 index 0000000..526ec08 --- /dev/null +++ b/src/setup-tts.ts @@ -0,0 +1,60 @@ +import process from "node:process"; +import { mkdir, rm } from "node:fs/promises"; +import path from "node:path"; +import { spawn } from "node:child_process"; + +import { loadConfig } from "./config.js"; +import { Logger } from "./logger.js"; +import { MeloTtsService } from "./services/melo-tts.js"; + +async function run(command: string, args: string[], cwd = process.cwd()): Promise { + await new Promise((resolve, reject) => { + const child = spawn(command, args, { + cwd, + stdio: "inherit", + windowsHide: true, + }); + + child.on("error", reject); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`)); + }); + }); +} + +export async function setupTts(): Promise { + const config = loadConfig(); + const logger = new Logger(config.DEBUG ? 
config.LOG_LEVEL : "error"); + const dockerContext = path.resolve(process.cwd(), "docker", "melotts"); + const cacheDir = path.resolve(process.cwd(), config.TTS_CACHE_DIR); + const outputDir = path.resolve(process.cwd(), config.TTS_OUTPUT_DIR); + + await mkdir(cacheDir, { recursive: true }); + await mkdir(outputDir, { recursive: true }); + + console.log(`MeloTTS Docker 이미지 빌드: ${config.TTS_IMAGE}`); + await run("docker", ["build", "-t", config.TTS_IMAGE, dockerContext]); + + const tts = new MeloTtsService(config, logger); + const warmupPath = path.join(outputDir, "warmup.wav"); + + console.log("MeloTTS 모델 워밍업..."); + try { + await tts.synthesizeToFile("안녕하세요. 로컬 티티에스 준비 테스트입니다.", warmupPath); + } finally { + await rm(warmupPath, { force: true }).catch(() => undefined); + } + + console.log("로컬 TTS 환경 준비 완료"); +} + +if (import.meta.main) { + void setupTts().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + }); +} diff --git a/src/setup.ts b/src/setup.ts index 5ec24b5..83bdca3 100644 --- a/src/setup.ts +++ b/src/setup.ts @@ -2,10 +2,12 @@ import process from "node:process"; import { setupLlm } from "./setup-llm.js"; import { setupSttPython } from "./setup-python.js"; +import { setupTts } from "./setup-tts.js"; async function main(): Promise { await setupSttPython(); await setupLlm(); + await setupTts(); } if (import.meta.main) {