From 99857cdaa82308c3178dbd95be6c30df9790f477 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sun, 3 May 2026 21:54:51 +0900 Subject: [PATCH] Add full STT LLM TTS test mode --- README.md | 20 +- docker/melotts/Dockerfile | 1 + docker/melotts/melo_tts_worker.py | 110 +++++++++++ package.json | 1 + src/index.ts | 36 +++- src/services/melo-tts.ts | 310 +++++++++++++++++++++++++----- src/services/ollama-llm.ts | 1 + src/setup-tts.ts | 7 +- 8 files changed, 419 insertions(+), 67 deletions(-) create mode 100644 docker/melotts/melo_tts_worker.py diff --git a/README.md b/README.md index 668e81b..b43b6c0 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,12 @@ STT + LLM 통합 테스트: bun run test:sttllm ``` +STT + LLM + TTS 전체 연결 테스트: + +```bat +bun run test:all +``` + LLM 단독 테스트: ```bat @@ -117,8 +123,9 @@ bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다." ## 메모 - 이 버전은 `STT`, `STT+LLM`, `LLM` 테스트를 따로 제공합니다. -- `test:sttllm`에서 TTS가 켜져 있으면 답변을 스피커로 읽어줍니다. -- `test:sttllm`에서는 자기 음성을 다시 전사하지 않도록 TTS 재생 중에는 캡처를 잠시 멈춥니다. +- `test:sttllm`은 STT와 LLM만 연결합니다. +- `test:all`은 STT, LLM, TTS를 모두 연결합니다. +- `test:all`에서는 자기 음성을 다시 전사하지 않도록 TTS 재생 중에는 캡처를 잠시 멈춥니다. - LLM 프롬프트는 `prompts/*.md` 에 분리되어 있습니다. - 최소 지연을 위해 파일 저장은 하지 않습니다. - VAD는 현재 모델 기반이 아니라 진폭 기반 단순 분리입니다. @@ -143,7 +150,14 @@ bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다." 1. `bun run setup:llm` 2. `bun run setup:tts` 3. `bun run test:sttllm` -4. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변, 음성 출력 확인 +4. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변 확인 + +## Windows 전체 연결 테스트 순서 + +1. `bun run setup:llm` +2. `bun run setup:tts` +3. `bun run test:all` +4. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사, 답변, 음성 출력 확인 ## Windows LLM 테스트 순서 diff --git a/docker/melotts/Dockerfile b/docker/melotts/Dockerfile index c9f93fa..af0a420 100644 --- a/docker/melotts/Dockerfile +++ b/docker/melotts/Dockerfile @@ -17,5 +17,6 @@ RUN python -m unidic download RUN python /opt/MeloTTS/melo/init_downloads.py COPY melo_tts_cli.py /opt/realtime-voice-bot/melo_tts_cli.py +COPY melo_tts_worker.py /opt/realtime-voice-bot/melo_tts_worker.py ENTRYPOINT ["python", "/opt/realtime-voice-bot/melo_tts_cli.py"] diff --git a/docker/melotts/melo_tts_worker.py b/docker/melotts/melo_tts_worker.py new file mode 100644 index 0000000..42dc754 --- /dev/null +++ b/docker/melotts/melo_tts_worker.py @@ -0,0 +1,110 @@ +import json +import os +import sys +from pathlib import Path + +from melo.api import TTS + + +LANGUAGE = os.getenv("TTS_LANGUAGE", "KR") +SPEAKER = os.getenv("TTS_SPEAKER", "KR") +DEVICE = os.getenv("TTS_DEVICE", "cpu") +SPEED = float(os.getenv("TTS_SPEED", "1.18")) +SDP_RATIO = float(os.getenv("TTS_SDP_RATIO", "0.22")) +NOISE_SCALE = float(os.getenv("TTS_NOISE_SCALE", "0.55")) +NOISE_SCALE_W = float(os.getenv("TTS_NOISE_SCALE_W", "0.75")) + +_MODEL = None +_SPEAKER_ID = None + + +def load_model(): + global _MODEL + global _SPEAKER_ID + + if _MODEL is not None and _SPEAKER_ID is not None: + return _MODEL, _SPEAKER_ID + + model = TTS(language=LANGUAGE, device=DEVICE) + speaker_ids = model.hps.data.spk2id + + if SPEAKER not in speaker_ids: + supported = ", ".join(sorted(speaker_ids.keys())) + raise RuntimeError(f"지원하지 않는 speaker 입니다: {SPEAKER}. 사용 가능: {supported}") + + _MODEL = model + _SPEAKER_ID = speaker_ids[SPEAKER] + return _MODEL, _SPEAKER_ID + + +def handle_ping(): + model, speaker_id = load_model() + return { + "language": LANGUAGE, + "speaker": SPEAKER, + "speaker_id": speaker_id, + "device": DEVICE, + "speed": SPEED, + "sdp_ratio": SDP_RATIO, + "noise_scale": NOISE_SCALE, + "noise_scale_w": NOISE_SCALE_W, + "speaker_count": len(model.hps.data.spk2id), + } + + +def handle_synthesize(params): + text = str(params["text"]).strip() + output_path = Path(str(params["output_path"])) + output_path.parent.mkdir(parents=True, exist_ok=True) + + model, speaker_id = load_model() + model.tts_to_file( + text, + speaker_id, + str(output_path), + speed=SPEED, + sdp_ratio=SDP_RATIO, + noise_scale=NOISE_SCALE, + noise_scale_w=NOISE_SCALE_W, + ) + + return { + "output_path": str(output_path), + "text_length": len(text), + } + + +def main(): + for raw_line in sys.stdin: + line = raw_line.strip() + if not line: + continue + + try: + payload = json.loads(line) + request_id = str(payload["id"]) + method = payload["method"] + params = payload.get("params", {}) + + if method == "ping": + result = handle_ping() + elif method == "synthesize": + result = handle_synthesize(params) + else: + raise RuntimeError(f"알 수 없는 method 입니다: {method}") + + sys.stdout.write(json.dumps({"id": request_id, "result": result}, ensure_ascii=False) + "\n") + sys.stdout.flush() + except Exception as error: + request_id = "unknown" + try: + request_id = str(payload.get("id", "unknown")) + except Exception: + pass + + sys.stdout.write(json.dumps({"id": request_id, "error": str(error)}, ensure_ascii=False) + "\n") + sys.stdout.flush() + + +if __name__ == "__main__": + main() diff --git a/package.json b/package.json index 131fad4..5d1a6e6 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "setup:python": "bun run setup:stt", "test:stt": "bun src/index.ts test-stt", "test:sttllm": "bun src/index.ts test-sttllm", + "test:all": "bun src/index.ts test-all", "test:llm": "bun src/index.ts test-llm", "test:tts": "bun src/index.ts test-tts", "devices": "bun src/index.ts devices", diff --git a/src/index.ts b/src/index.ts index 7428136..3e11732 100644 --- a/src/index.ts +++ b/src/index.ts @@ -11,12 +11,12 @@ import { OllamaLlmService } from "./services/ollama-llm.js"; const mode = process.argv[2] ?? "test-stt"; -async function runSttTest(enableLlm: boolean): Promise { +async function runSttTest(options: { enableLlm: boolean; enableTts: boolean }): Promise { const config = loadConfig(); const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error"); const stt = new FasterWhisperSttService(config, logger); - const llm = enableLlm ? new OllamaLlmService(config, logger) : null; - let tts = enableLlm && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null; + const llm = options.enableLlm ? new OllamaLlmService(config, logger) : null; + let tts = options.enableTts && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null; let capture = null as ReturnType | null; let shuttingDown: Promise | null = null; let suppressCapture = false; @@ -47,6 +47,11 @@ async function runSttTest(enableLlm: boolean): Promise { await stt.destroy().catch((destroyError) => { logger.warn("STT destroy failed", destroyError); }); + if (tts) { + await tts.destroy().catch((destroyError) => { + logger.warn("TTS destroy failed", destroyError); + }); + } })(); await shuttingDown; @@ -70,6 +75,9 @@ async function runSttTest(enableLlm: boolean): Promise { capture.kill("SIGKILL"); } void stt.destroy(); + if (tts) { + void tts.destroy(); + } }); console.log("STT 준비중..."); @@ -282,14 +290,23 @@ async function runSttTest(enableLlm: boolean): Promise { }); if (config.DEBUG) { - console.log(enableLlm ? "실시간 출력장치 STT+LLM 테스트를 시작합니다. Ctrl+C 로 종료합니다." : "실시간 출력장치 STT 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + if (options.enableLlm && options.enableTts) { + console.log("실시간 출력장치 STT+LLM+TTS 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + } else if (options.enableLlm) { + console.log("실시간 출력장치 STT+LLM 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + } else { + console.log("실시간 출력장치 STT 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + } console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`); console.log(`model: ${config.WHISPER_MODEL}`); console.log(`language: ${config.WHISPER_LANGUAGE}`); console.log(`beam: ${config.WHISPER_BEAM_SIZE}`); - if (enableLlm) { + if (options.enableLlm) { console.log(`llm: ${config.OLLAMA_MODEL}`); } + if (options.enableTts) { + console.log(`tts: ${config.TTS_IMAGE}`); + } } setInterval(() => { @@ -385,10 +402,13 @@ async function main(): Promise { await printAudioDevices(); return; case "test-stt": - await runSttTest(false); + await runSttTest({ enableLlm: false, enableTts: false }); return; case "test-sttllm": - await runSttTest(true); + await runSttTest({ enableLlm: true, enableTts: false }); + return; + case "test-all": + await runSttTest({ enableLlm: true, enableTts: true }); return; case "test-llm": await runLlmCli(); @@ -397,7 +417,7 @@ async function main(): Promise { await runTtsTest(); return; default: - throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, test-tts, devices`); + throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-all, test-llm, test-tts, devices`); } } diff --git a/src/services/melo-tts.ts b/src/services/melo-tts.ts index a01a70a..056a0ca 100644 --- a/src/services/melo-tts.ts +++ b/src/services/melo-tts.ts @@ -1,7 +1,9 @@ -import { spawn } from "node:child_process"; +import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process"; import { randomUUID } from "node:crypto"; import { mkdir, rm } from "node:fs/promises"; +import { once } from "node:events"; import path from "node:path"; +import { createInterface } from "node:readline"; import type { AppConfig } from "../config.js"; import { resolveDockerCommand } from "../docker-runtime.js"; @@ -41,19 +43,80 @@ async function run(command: string, args: string[], stdio: "ignore" | "inherit" }); } +interface RpcSuccess { + id: string; + result: T; +} + +interface RpcFailure { + id: string; + error: string; +} + +type RpcResponse = RpcSuccess | RpcFailure; + +function isFailure(value: RpcResponse): value is RpcFailure { + return "error" in value; +} + +interface TtsPingResult { + language: string; + speaker: string; + speaker_id: number; + device: string; + speed: number; + sdp_ratio: number; + noise_scale: number; + noise_scale_w: number; + speaker_count: number; +} + export class MeloTtsService { + private processRef: ChildProcessWithoutNullStreams | null = null; + private shuttingDown = false; + private warmedUp = false; + private readonly pending = new Map< + string, + { + resolve: (value: unknown) => void; + reject: (reason?: unknown) => void; + } + >(); + private nextId = 1; + constructor( private readonly config: AppConfig, private readonly logger: Logger, ) {} async warmup(): Promise { + if (this.warmedUp) { + return; + } + await mkdir(path.resolve(process.cwd(), this.config.TTS_CACHE_DIR), { recursive: true }); await mkdir(path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR), { recursive: true }); const docker = await resolveDockerCommand(this.config); await run(docker, ["--version"]); await run(docker, ["image", "inspect", this.config.TTS_IMAGE]); + + await this.start(); + const result = await this.request("ping", {}); + this.logger.info("TTS worker ready", result); + + const warmupFileName = `warmup-${randomUUID()}.wav`; + const warmupHostPath = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR, warmupFileName); + try { + await this.request("synthesize", { + text: "안녕하세요. 로컬 티티에스 준비 테스트입니다.", + output_path: `/work/output/${warmupFileName}`, + }); + } finally { + await rm(warmupHostPath, { force: true }).catch(() => undefined); + } + + this.warmedUp = true; } async speak(text: string): Promise { @@ -66,62 +129,16 @@ export class MeloTtsService { const targetPath = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR, fileName); try { - await this.synthesizeToFile(trimmed, targetPath); + await this.synthesizeToFile(trimmed, targetPath, fileName); await playWavFile(targetPath, this.config.TTS_PLAYBACK_RATE); } finally { await rm(targetPath, { force: true }).catch(() => undefined); } } - async synthesizeToFile(text: string, targetPath: string): Promise { + async synthesizeToFile(text: string, targetPath: string, fileName?: string): Promise { await this.warmup(); - - const outputDir = path.dirname(targetPath); - const cacheDir = path.resolve(process.cwd(), this.config.TTS_CACHE_DIR); - const fileName = path.basename(targetPath); - - await mkdir(outputDir, { recursive: true }); - - const args = [ - "run", - "--rm", - "-v", - `${outputDir}:/work/output`, - "-v", - `${cacheDir}:/cache`, - "-e", - "HF_HOME=/cache/huggingface", - "-e", - "HF_HUB_CACHE=/cache/huggingface/hub", - "-e", - "TRANSFORMERS_CACHE=/cache/transformers", - ]; - - if (this.config.TTS_DEVICE !== "cpu") { - args.push("--gpus", "all"); - } - - args.push( - this.config.TTS_IMAGE, - "--text", - text, - "--output", - `/work/output/${fileName}`, - "--language", - this.config.TTS_LANGUAGE, - "--speaker", - this.config.TTS_SPEAKER, - "--speed", - String(this.config.TTS_SPEED), - "--sdp-ratio", - String(this.config.TTS_SDP_RATIO), - "--noise-scale", - String(this.config.TTS_NOISE_SCALE), - "--noise-scale-w", - String(this.config.TTS_NOISE_SCALE_W), - "--device", - this.config.TTS_DEVICE, - ); + const resolvedFileName = fileName ?? path.basename(targetPath); this.logger.info("Starting MeloTTS synthesis", { image: this.config.TTS_IMAGE, @@ -135,8 +152,45 @@ export class MeloTtsService { device: this.config.TTS_DEVICE, }); - const docker = await resolveDockerCommand(this.config); - await run(docker, args, "inherit"); + await this.request("synthesize", { + text, + output_path: `/work/output/${resolvedFileName}`, + }); + } + + async destroy(): Promise { + if (!this.processRef) { + return; + } + + const child = this.processRef; + this.shuttingDown = true; + + try { + child.stdin.end(); + } catch { + // ignore + } + + if (!child.killed && child.exitCode === null) { + child.kill("SIGTERM"); + } + + const timedWait = Promise.race([ + once(child, "exit"), + new Promise((resolve) => setTimeout(() => resolve(null), 1500)), + ]); + + await timedWait; + + if (child.exitCode === null && !child.killed) { + child.kill("SIGKILL"); + await once(child, "exit").catch(() => null); + } + + this.processRef = null; + this.shuttingDown = false; + this.warmedUp = false; } private normalizeText(input: string): string { @@ -155,4 +209,156 @@ export class MeloTtsService { return `${collapsed}.`; } + + private async start(): Promise { + if (this.processRef) { + return; + } + if (this.shuttingDown) { + throw new Error("tts worker is shutting down"); + } + + const docker = await resolveDockerCommand(this.config); + const outputDir = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR); + const cacheDir = path.resolve(process.cwd(), this.config.TTS_CACHE_DIR); + + await mkdir(outputDir, { recursive: true }); + await mkdir(cacheDir, { recursive: true }); + + const args = [ + "run", + "--rm", + "-i", + "-v", + `${outputDir}:/work/output`, + "-v", + `${cacheDir}:/cache`, + "-e", + "HF_HOME=/cache/huggingface", + "-e", + "HF_HUB_CACHE=/cache/huggingface/hub", + "-e", + "TRANSFORMERS_CACHE=/cache/transformers", + "-e", + `TTS_LANGUAGE=${this.config.TTS_LANGUAGE}`, + "-e", + `TTS_SPEAKER=${this.config.TTS_SPEAKER}`, + "-e", + `TTS_DEVICE=${this.config.TTS_DEVICE}`, + "-e", + `TTS_SPEED=${this.config.TTS_SPEED}`, + "-e", + `TTS_SDP_RATIO=${this.config.TTS_SDP_RATIO}`, + "-e", + `TTS_NOISE_SCALE=${this.config.TTS_NOISE_SCALE}`, + "-e", + `TTS_NOISE_SCALE_W=${this.config.TTS_NOISE_SCALE_W}`, + "--entrypoint", + "python", + ]; + + if (this.config.TTS_DEVICE !== "cpu") { + args.push("--gpus", "all"); + } + + args.push( + this.config.TTS_IMAGE, + "/opt/realtime-voice-bot/melo_tts_worker.py", + ); + + const env = { ...process.env }; + if (path.isAbsolute(docker)) { + const dockerBinDir = path.dirname(docker); + const currentPath = env.PATH ?? env.Path ?? ""; + env.PATH = `${dockerBinDir}${path.delimiter}${currentPath}`; + } + + this.processRef = spawn(docker, args, { + stdio: ["pipe", "pipe", "pipe"], + windowsHide: true, + shell: process.platform === "win32" && !path.isAbsolute(docker), + env, + }); + + const rl = createInterface({ + input: this.processRef.stdout, + crlfDelay: Infinity, + }); + + rl.on("line", (line) => { + this.handleStdoutLine(line); + }); + + this.processRef.stderr.on("data", (chunk: Buffer) => { + const text = chunk.toString().trim(); + if (text.length > 0) { + this.logger.warn(`[melotts] ${text}`); + } + }); + + this.processRef.stdin.on("error", (error) => { + this.logger.debug("melotts stdin error", error); + }); + + this.processRef.on("exit", (code, signal) => { + const error = new Error(`melotts worker exited code=${code ?? "null"} signal=${signal ?? "null"}`); + for (const entry of this.pending.values()) { + entry.reject(error); + } + this.pending.clear(); + this.processRef = null; + }); + } + + private async request(method: string, params: Record): Promise { + await this.start(); + + if (!this.processRef) { + throw new Error("melotts worker is not running"); + } + + const id = String(this.nextId++); + const payload = JSON.stringify({ + id, + method, + params, + }); + + const promise = new Promise((resolve, reject) => { + this.pending.set(id, { + resolve: (value) => resolve(value as T), + reject, + }); + }); + + this.processRef.stdin.write(`${payload}\n`); + return await promise; + } + + private handleStdoutLine(line: string): void { + const trimmed = line.trim(); + if (!trimmed) { + return; + } + + let message: RpcResponse; + try { + message = JSON.parse(trimmed) as RpcResponse; + } catch (error) { + this.logger.warn("melotts stdout parse failed", error); + return; + } + + const pending = this.pending.get(message.id); + if (!pending) { + return; + } + + this.pending.delete(message.id); + if (isFailure(message)) { + pending.reject(new Error(message.error)); + return; + } + pending.resolve(message.result); + } } diff --git a/src/services/ollama-llm.ts b/src/services/ollama-llm.ts index c7be41b..f5a2f1a 100644 --- a/src/services/ollama-llm.ts +++ b/src/services/ollama-llm.ts @@ -378,6 +378,7 @@ export class OllamaLlmService { "bun run devices", "bun run test:stt", "bun run test:sttllm", + "bun run test:all", "bun run test:llm", "bun run test:tts -- \"안녕하세요\"", ], diff --git a/src/setup-tts.ts b/src/setup-tts.ts index 86f133e..461d981 100644 --- a/src/setup-tts.ts +++ b/src/setup-tts.ts @@ -1,5 +1,5 @@ import process from "node:process"; -import { mkdir, rm } from "node:fs/promises"; +import { mkdir } from "node:fs/promises"; import path from "node:path"; import { spawn } from "node:child_process"; @@ -57,13 +57,12 @@ export async function setupTts(): Promise { await run(docker, ["build", "-t", config.TTS_IMAGE, dockerContext]); const tts = new MeloTtsService(config, logger); - const warmupPath = path.join(outputDir, "warmup.wav"); console.log("MeloTTS 모델 워밍업..."); try { - await tts.synthesizeToFile("안녕하세요. 로컬 티티에스 준비 테스트입니다.", warmupPath); + await tts.warmup(); } finally { - await rm(warmupPath, { force: true }).catch(() => undefined); + await tts.destroy().catch(() => undefined); } console.log("로컬 TTS 환경 준비 완료");