Add local MeloTTS support

This commit is contained in:
2026-05-03 01:56:09 +09:00
parent 3360015179
commit ad357a6ede
13 changed files with 396 additions and 3 deletions

View File

@@ -7,6 +7,14 @@ LOCAL_AI_PYTHON=python
AUDIO_SOURCE=
DEBUG=false
TTS_ENABLED=true
TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2
TTS_LANGUAGE=KR
TTS_SPEAKER=KR
TTS_DEVICE=cpu
TTS_SPEED=1
TTS_CACHE_DIR=.local-ai/tts-cache
TTS_OUTPUT_DIR=.local-ai/tts-output
OLLAMA_BASE_URL=http://127.0.0.1:11434
OLLAMA_MODEL=qwen3:8b
OLLAMA_KEEP_ALIVE=5m

View File

@@ -14,6 +14,7 @@
- STT 전용 테스트
- STT 결과에 대해 답변 가치 판단 후 필요할 때만 LLM 답변하는 통합 테스트
- 로컬 `Ollama` LLM 에이전트 CLI 테스트
- 무료 로컬 `MeloTTS` 기반 음성 출력 테스트
## 빠른 시작
@@ -47,6 +48,12 @@ LLM 단독 테스트:
bun run test:llm
```
TTS 단독 테스트:
```bat
bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다."
```
## 환경 변수
- `AUDIO_SOURCE`
@@ -61,6 +68,19 @@ bun run test:llm
- 기본값 `http://127.0.0.1:11434`
- `OLLAMA_MODEL`
- 기본값 `qwen3:8b`
- `TTS_ENABLED`
- 기본값 `true`
- `TTS_IMAGE`
- 기본값 `realtime-voice-bot-melotts:v0.1.2`
- `TTS_LANGUAGE`
- 기본값 `KR`
- `TTS_SPEAKER`
- 기본값 `KR`
- `TTS_DEVICE`
- 기본값 `cpu`
- Docker GPU passthrough를 쓸 때만 `cuda`로 바꿉니다
- `TTS_SPEED`
- 기본값 `1`
- `OLLAMA_KEEP_ALIVE`
- 기본값 `5m`
- `MAX_CONVERSATION_TURNS`
@@ -84,12 +104,15 @@ bun run test:llm
## 메모
- 이 버전은 `STT`, `STT+LLM`, `LLM` 테스트를 따로 제공합니다.
- `test:sttllm`에서 TTS가 켜져 있으면 답변을 스피커로 읽어줍니다.
- `test:sttllm`에서는 자기 음성을 다시 전사하지 않도록 TTS 재생 중에는 캡처를 잠시 멈춥니다.
- LLM 프롬프트는 `prompts/*.md` 에 분리되어 있습니다.
- 최소 지연을 위해 캡처한 오디오는 파일로 저장하지 않습니다. 단, TTS는 재생용 임시 WAV 파일을 `TTS_OUTPUT_DIR`에 만들고 재생이 끝나면 삭제합니다.
- VAD는 현재 모델 기반이 아니라 진폭 기반 단순 분리입니다.
- Windows에서는 보통 출력 루프백이 가능한 장치나 `Stereo Mix`, 오디오 인터페이스 loopback 채널을 `AUDIO_SOURCE`로 잡아야 합니다.
- 단순히 스피커 이름을 넣는 구조가 아니라, **루프백/캡처 가능한 입력 장치 이름**을 넣어야 합니다.
- `ffmpeg`가 PATH에 잡혀 있어야 합니다.
- TTS는 Windows에서 Docker Desktop이 필요합니다. MeloTTS 공식 문서도 Windows/macOS에서는 Docker 실행을 권장합니다.
- `cmd` 기준으로 `.env`는 `copy .env.example .env`로 만들면 됩니다.
## Windows 테스트 순서
@@ -105,8 +128,9 @@ bun run test:llm
## Windows STT+LLM 통합 테스트 순서
1. `bun run setup:llm`
2. `bun run test:sttllm`
3. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변 확인
2. `bun run setup:tts`
3. `bun run test:sttllm`
4. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 답변, 음성 출력 확인
## Windows LLM 테스트 순서
@@ -115,6 +139,12 @@ bun run test:llm
3. 콘솔에 직접 문장을 입력하고 답변 확인
4. `/reset` 으로 문맥 초기화, `/exit` 로 종료
## Windows TTS 테스트 순서
1. Docker Desktop 실행
2. `bun run setup:tts`
3. `bun run test:tts -- "안녕하세요. 로컬 티티에스 테스트입니다."`
현재 `test:llm` 에이전트 도구:
- 현재 시간 조회
- 현재 런타임 설정 조회
@@ -141,6 +171,14 @@ bun run test:llm
LOCAL_AI_PYTHON=python
AUDIO_SOURCE=
DEBUG=false
TTS_ENABLED=true
TTS_IMAGE=realtime-voice-bot-melotts:v0.1.2
TTS_LANGUAGE=KR
TTS_SPEAKER=KR
TTS_DEVICE=cpu
TTS_SPEED=1
TTS_CACHE_DIR=.local-ai/tts-cache
TTS_OUTPUT_DIR=.local-ai/tts-output
OLLAMA_BASE_URL=http://127.0.0.1:11434
OLLAMA_MODEL=qwen3:8b
OLLAMA_KEEP_ALIVE=5m

18
docker/melotts/Dockerfile Normal file
View File

@@ -0,0 +1,18 @@
# Image that wraps the MeloTTS command-line synthesizer (melo_tts_cli.py).
FROM python:3.9-slim
# Disable Python output buffering so messages appear immediately in `docker run` output.
ENV PYTHONUNBUFFERED=1
WORKDIR /opt/realtime-voice-bot
# System packages: git is required by the `pip install git+https://...` step below.
# NOTE(review): build-essential is presumably for compiling native wheels and
# libsndfile1 for audio file I/O — confirm before trimming either.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
git \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
# Install MeloTTS pinned to the v0.1.2 tag.
RUN pip install --no-cache-dir git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2
# Fetch the unidic dictionary at build time rather than at container start.
RUN python -m unidic download
# The CLI script is copied last so editing it does not invalidate the dependency layers above.
COPY melo_tts_cli.py /opt/realtime-voice-bot/melo_tts_cli.py
# Arguments given to `docker run <image> ...` are passed straight to the CLI.
ENTRYPOINT ["python", "/opt/realtime-voice-bot/melo_tts_cli.py"]

View File

@@ -0,0 +1,36 @@
import argparse
from pathlib import Path
from melo.api import TTS
def main() -> None:
    """Synthesize the text given on the command line into an audio file.

    Command-line options:
        --text      Text to speak (required).
        --output    Destination file path (required); missing parent
                    directories are created.
        --language  Language passed to the MeloTTS model (default "KR").
        --speaker   Speaker name looked up in the model's speaker table
                    (default "KR").
        --speed     Speed factor forwarded to the synthesizer (default 1.0).
        --device    Device passed to the MeloTTS model (default "cpu").

    Raises:
        SystemExit: If the requested speaker is not in the model's speaker
            table; the message lists the available speakers.
    """
    cli = argparse.ArgumentParser()
    for required_flag in ("--text", "--output"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--language", default="KR")
    cli.add_argument("--speaker", default="KR")
    cli.add_argument("--speed", type=float, default=1.0)
    cli.add_argument("--device", default="cpu")
    options = cli.parse_args()

    # Prepare the destination directory before the model is loaded.
    destination = Path(options.output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    engine = TTS(language=options.language, device=options.device)
    known_speakers = engine.hps.data.spk2id
    if options.speaker not in known_speakers:
        supported = ", ".join(sorted(known_speakers.keys()))
        raise SystemExit(f"지원하지 않는 speaker 입니다: {options.speaker}. 사용 가능: {supported}")

    engine.tts_to_file(
        options.text,
        known_speakers[options.speaker],
        str(destination),
        speed=options.speed,
    )


if __name__ == "__main__":
    main()

View File

@@ -7,10 +7,12 @@
"setup": "bun src/setup.ts",
"setup:stt": "bun src/setup-python.ts",
"setup:llm": "bun src/setup-llm.ts",
"setup:tts": "bun src/setup-tts.ts",
"setup:python": "bun run setup:stt",
"test:stt": "bun src/index.ts test-stt",
"test:sttllm": "bun src/index.ts test-sttllm",
"test:llm": "bun src/index.ts test-llm",
"test:tts": "bun src/index.ts test-tts",
"devices": "bun src/index.ts devices",
"check": "tsc --noEmit",
"build": "tsc -p tsconfig.json"

View File

@@ -55,6 +55,15 @@ export class RealtimeSegmenter {
}
}
/**
 * Discards all buffered audio and returns the segmenter to its idle state.
 *
 * The three sample buffers are emptied in place and the speech/silence
 * tracking fields are zeroed.
 */
reset(): void {
  // Truncate each buffer in place so the same array instances stay in use.
  for (const buffer of [this.pendingSamples, this.preRoll, this.speech]) {
    buffer.length = 0;
  }

  this.silenceFrames = 0;
  this.speechCandidateFrames = 0;
  this.speechActive = false;
}
private processFrame(frame: Int16Array): void {
let peak = 0;
for (const sample of frame) {

View File

@@ -15,6 +15,17 @@ const envSchema = z.object({
LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
LOCAL_AI_PYTHON: emptyToUndefined,
AUDIO_SOURCE: emptyToUndefined,
TTS_ENABLED: z
.string()
.optional()
.transform((value) => value?.trim().toLowerCase() !== "false"),
TTS_IMAGE: z.string().min(1).default("realtime-voice-bot-melotts:v0.1.2"),
TTS_LANGUAGE: z.string().min(1).default("KR"),
TTS_SPEAKER: z.string().min(1).default("KR"),
TTS_DEVICE: z.string().min(1).default("cpu"),
TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1),
TTS_CACHE_DIR: z.string().min(1).default(".local-ai/tts-cache"),
TTS_OUTPUT_DIR: z.string().min(1).default(".local-ai/tts-output"),
DEBUG: z
.string()
.optional()

View File

@@ -6,6 +6,7 @@ import { Logger } from "./logger.js";
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
import { MeloTtsService } from "./services/melo-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js";
const mode = process.argv[2] ?? "test-stt";
@@ -15,8 +16,10 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
const stt = new FasterWhisperSttService(config, logger);
const llm = enableLlm ? new OllamaLlmService(config, logger) : null;
let tts = enableLlm && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
let shuttingDown: Promise<void> | null = null;
let suppressCapture = false;
let receivedChunks = 0;
let receivedBytes = 0;
let maxPeak = 0;
@@ -79,6 +82,22 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
logger.info("LLM warmup finished");
console.log("LLM 준비 완료");
}
if (tts) {
console.log("TTS 준비중...");
try {
await tts.warmup();
logger.info("TTS warmup finished", {
image: config.TTS_IMAGE,
language: config.TTS_LANGUAGE,
speaker: config.TTS_SPEAKER,
});
console.log("TTS 준비 완료");
} catch (error) {
logger.warn("TTS warmup failed", error);
console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
tts = null;
}
}
const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
let transcribing = false;
@@ -155,6 +174,20 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
} else {
console.log(`답변> ${reply}`);
}
if (tts) {
suppressCapture = true;
segmenter.reset();
try {
await tts.speak(reply);
} catch (error) {
logger.warn("TTS playback failed", error);
} finally {
suppressCapture = false;
sawSpeechStart = false;
maxPeak = 0;
}
}
}
}
} catch (error) {
@@ -227,6 +260,9 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
receivedChunks += 1;
receivedBytes += chunk.length;
lastChunkAt = Date.now();
if (suppressCapture) {
return;
}
segmenter.pushChunk(chunk);
});
capture.stderr.on("data", (chunk: Buffer) => {
@@ -330,6 +366,19 @@ async function runLlmCli(): Promise<void> {
});
}
/**
 * `test-tts` mode: speaks a single sentence through MeloTTS.
 *
 * The sentence is every CLI argument after the mode name joined with spaces;
 * when that is empty a built-in Korean test sentence is used instead.
 */
async function runTtsTest(): Promise<void> {
  const cliText = process.argv.slice(3).join(" ").trim();
  const sentence = cliText.length > 0 ? cliText : "안녕하세요. 로컬 티티에스 테스트입니다.";

  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const service = new MeloTtsService(config, logger);

  console.log("TTS 준비중...");
  await service.warmup();
  console.log("TTS 준비 완료");

  console.log(`재생 문장: ${sentence}`);
  await service.speak(sentence);
}
async function main(): Promise<void> {
switch (mode) {
case "devices":
@@ -344,8 +393,11 @@ async function main(): Promise<void> {
case "test-llm":
await runLlmCli();
return;
case "test-tts":
await runTtsTest();
return;
default:
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, devices`);
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, test-tts, devices`);
}
}

View File

@@ -0,0 +1,42 @@
import { spawn } from "node:child_process";
import process from "node:process";
/**
 * Runs `command` with `args` and settles once the child process exits.
 *
 * The child receives no stdin; its stdout and stderr are shared with this
 * process.
 *
 * @param command Executable to start.
 * @param args Arguments passed to the executable.
 * @throws The spawn error if the process cannot be started, or an `Error`
 *   naming the command line when the exit code is anything other than 0.
 */
async function run(command: string, args: string[]): Promise<void> {
  const exitCode = await new Promise<number | null>((resolve, reject) => {
    const proc = spawn(command, args, {
      windowsHide: true,
      stdio: ["ignore", "inherit", "inherit"],
    });
    proc.on("error", reject);
    proc.on("exit", (code) => resolve(code));
  });

  if (exitCode !== 0) {
    throw new Error(`${command} ${args.join(" ")} exited with code ${exitCode ?? "null"}`);
  }
}
/**
 * Plays a WAV file and resolves when playback has finished.
 *
 * Only Windows is supported: playback is delegated to
 * `System.Media.SoundPlayer` through `powershell.exe`.
 *
 * @param filePath Path of the WAV file to play.
 * @throws If the platform is not `win32`, or if PowerShell cannot be started
 *   or exits with a non-zero code.
 */
export async function playWavFile(filePath: string): Promise<void> {
  if (process.platform !== "win32") {
    throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
  }

  // `powershell.exe -Command <string> <extra...>` appends the extra arguments to
  // the command text instead of exposing them through `$args`, so the path has
  // to be part of the script itself. Inside a single-quoted PowerShell string
  // only quote characters are special (including the typographic variants
  // PowerShell also accepts as quotes), and they are escaped by doubling.
  const quotedPath = `'${filePath.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
  const script = [
    `$player = New-Object System.Media.SoundPlayer ${quotedPath}`,
    "$player.Load()",
    "$player.PlaySync()",
  ].join("; ");

  await run("powershell.exe", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    script,
  ]);
}

113
src/services/melo-tts.ts Normal file
View File

@@ -0,0 +1,113 @@
import { spawn } from "node:child_process";
import { randomUUID } from "node:crypto";
import { mkdir, rm } from "node:fs/promises";
import path from "node:path";
import type { AppConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { playWavFile } from "./audio-playback.js";
/**
 * Starts `command` with `args` and waits for it to exit.
 *
 * The child receives no stdin and always shares this process's stderr.
 *
 * @param command Executable to start.
 * @param args Arguments passed to the executable.
 * @param stdio What to do with the child's stdout: discard it (`"ignore"`,
 *   the default) or share this process's stdout (`"inherit"`).
 * @throws The spawn error if the process cannot be started, or an `Error`
 *   naming the command line when the exit code is anything other than 0.
 */
async function run(command: string, args: string[], stdio: "ignore" | "inherit" = "ignore"): Promise<void> {
  const commandLine = `${command} ${args.join(" ")}`;

  await new Promise<void>((resolve, reject) => {
    const proc = spawn(command, args, {
      windowsHide: true,
      stdio: ["ignore", stdio, "inherit"],
    });

    proc.on("error", reject);
    proc.on("exit", (exitCode) => {
      if (exitCode !== 0) {
        reject(new Error(`${commandLine} exited with code ${exitCode ?? "null"}`));
      } else {
        resolve();
      }
    });
  });
}
/**
 * Text-to-speech service that synthesizes audio by running the MeloTTS Docker
 * image and plays the resulting WAV file locally.
 */
export class MeloTtsService {
  /**
   * Pending or successful environment check. It is cleared when the check
   * fails so that a later call tries again.
   */
  private environmentCheck: Promise<void> | null = null;

  constructor(
    private readonly config: AppConfig,
    private readonly logger: Logger,
  ) {}

  /**
   * Ensures the cache/output directories exist, the `docker` CLI can be
   * started, and the configured image is present locally.
   *
   * A successful check is remembered, so repeated calls (including the one
   * made by every synthesis) do not spawn the docker CLI again.
   *
   * @throws If a directory cannot be created or a docker command fails.
   */
  async warmup(): Promise<void> {
    if (this.environmentCheck === null) {
      this.environmentCheck = this.checkEnvironment();
    }
    try {
      await this.environmentCheck;
    } catch (error) {
      // Forget the failure so the next call re-checks instead of replaying it.
      this.environmentCheck = null;
      throw error;
    }
  }

  /**
   * Synthesizes `text` and plays it, resolving after playback has finished.
   * Text that is empty after trimming is ignored. The temporary WAV file is
   * removed whether synthesis/playback succeeds or fails.
   *
   * @param text Text to speak.
   * @throws If synthesis or playback fails.
   */
  async speak(text: string): Promise<void> {
    const trimmed = text.trim();
    if (!trimmed) {
      return;
    }

    const fileName = `tts-${Date.now()}-${randomUUID()}.wav`;
    const targetPath = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR, fileName);
    try {
      await this.synthesizeToFile(trimmed, targetPath);
      await playWavFile(targetPath);
    } finally {
      // Best-effort cleanup; a leftover file must not mask the real error.
      await rm(targetPath, { force: true }).catch(() => undefined);
    }
  }

  /**
   * Runs the MeloTTS container once to write speech for `text` to `targetPath`.
   *
   * The directory containing `targetPath` is mounted at `/work/output` and the
   * configured cache directory at `/cache` inside the container.
   *
   * @param text Text to synthesize.
   * @param targetPath Host path of the file to create.
   * @throws If the environment check or the `docker run` command fails.
   */
  async synthesizeToFile(text: string, targetPath: string): Promise<void> {
    await this.warmup();

    const outputDir = path.dirname(targetPath);
    const cacheDir = path.resolve(process.cwd(), this.config.TTS_CACHE_DIR);
    const fileName = path.basename(targetPath);
    await mkdir(outputDir, { recursive: true });

    const args = [
      "run",
      "--rm",
      "-v",
      `${outputDir}:/work/output`,
      "-v",
      `${cacheDir}:/cache`,
      "-e",
      "HF_HOME=/cache/huggingface",
      "-e",
      "HF_HUB_CACHE=/cache/huggingface/hub",
      "-e",
      "TRANSFORMERS_CACHE=/cache/transformers",
    ];

    // Any device other than "cpu" is treated as needing GPU passthrough.
    if (this.config.TTS_DEVICE !== "cpu") {
      args.push("--gpus", "all");
    }

    args.push(
      this.config.TTS_IMAGE,
      // `--text=<value>` rather than `--text <value>`: argparse rejects a
      // separate value that starts with "-" and contains no space, because it
      // looks like an option. Replies such as "-네" would otherwise abort the CLI.
      `--text=${text}`,
      "--output",
      `/work/output/${fileName}`,
      "--language",
      this.config.TTS_LANGUAGE,
      "--speaker",
      this.config.TTS_SPEAKER,
      "--speed",
      String(this.config.TTS_SPEED),
      "--device",
      this.config.TTS_DEVICE,
    );

    this.logger.info("Starting MeloTTS synthesis", {
      image: this.config.TTS_IMAGE,
      language: this.config.TTS_LANGUAGE,
      speaker: this.config.TTS_SPEAKER,
      speed: this.config.TTS_SPEED,
      device: this.config.TTS_DEVICE,
    });

    await run("docker", args, "inherit");
  }

  /** Performs the actual directory creation and docker availability checks. */
  private async checkEnvironment(): Promise<void> {
    await mkdir(path.resolve(process.cwd(), this.config.TTS_CACHE_DIR), { recursive: true });
    await mkdir(path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR), { recursive: true });
    await run("docker", ["--version"]);
    await run("docker", ["image", "inspect", this.config.TTS_IMAGE]);
  }
}

View File

@@ -374,10 +374,12 @@ export class OllamaLlmService {
"bun run setup",
"bun run setup:stt",
"bun run setup:llm",
"bun run setup:tts",
"bun run devices",
"bun run test:stt",
"bun run test:sttllm",
"bun run test:llm",
"bun run test:tts -- \"안녕하세요\"",
],
};
}

60
src/setup-tts.ts Normal file
View File

@@ -0,0 +1,60 @@
import process from "node:process";
import { mkdir, rm } from "node:fs/promises";
import path from "node:path";
import { spawn } from "node:child_process";
import { loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { MeloTtsService } from "./services/melo-tts.js";
/**
 * Runs `command` with `args` in `cwd`, sharing this process's stdin, stdout
 * and stderr, and waits for it to exit.
 *
 * @param command Executable to start.
 * @param args Arguments passed to the executable.
 * @param cwd Working directory for the child; defaults to the current one.
 * @throws The spawn error if the process cannot be started, or an `Error`
 *   naming the command line when the exit code is anything other than 0.
 */
async function run(command: string, args: string[], cwd = process.cwd()): Promise<void> {
  const spawnOptions = { cwd, stdio: "inherit", windowsHide: true } as const;

  await new Promise<void>((resolve, reject) => {
    const proc = spawn(command, args, spawnOptions);

    const onExit = (exitCode: number | null): void => {
      if (exitCode === 0) {
        resolve();
      } else {
        reject(new Error(`${command} ${args.join(" ")} exited with code ${exitCode ?? "null"}`));
      }
    };

    proc.on("error", reject);
    proc.on("exit", onExit);
  });
}
/**
 * Prepares the local TTS environment: creates the cache and output
 * directories, builds the MeloTTS Docker image from `docker/melotts`, and
 * runs one throwaway synthesis whose output file is deleted afterwards.
 *
 * @throws If a directory cannot be created, the image build fails, or the
 *   warm-up synthesis fails.
 */
export async function setupTts(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");

  const fromCwd = (...segments: string[]): string => path.resolve(process.cwd(), ...segments);
  const buildContext = fromCwd("docker", "melotts");
  const cacheDir = fromCwd(config.TTS_CACHE_DIR);
  const outputDir = fromCwd(config.TTS_OUTPUT_DIR);

  for (const dir of [cacheDir, outputDir]) {
    await mkdir(dir, { recursive: true });
  }

  console.log(`MeloTTS Docker 이미지 빌드: ${config.TTS_IMAGE}`);
  await run("docker", ["build", "-t", config.TTS_IMAGE, buildContext]);

  const service = new MeloTtsService(config, logger);
  const scratchFile = path.join(outputDir, "warmup.wav");

  console.log("MeloTTS 모델 워밍업...");
  try {
    await service.synthesizeToFile("안녕하세요. 로컬 티티에스 준비 테스트입니다.", scratchFile);
  } finally {
    // The warm-up audio is never played; remove it whether synthesis worked or not.
    await rm(scratchFile, { force: true }).catch(() => undefined);
  }

  console.log("로컬 TTS 환경 준비 완료");
}
if (import.meta.main) {
void setupTts().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
});
}

View File

@@ -2,10 +2,12 @@ import process from "node:process";
import { setupLlm } from "./setup-llm.js";
import { setupSttPython } from "./setup-python.js";
import { setupTts } from "./setup-tts.js";
/**
 * Runs every setup step in order — STT Python environment, LLM, then TTS —
 * waiting for each to finish before starting the next. The first failure
 * aborts the remaining steps.
 */
async function main(): Promise<void> {
  const steps = [setupSttPython, setupLlm, setupTts];
  for (const step of steps) {
    await step();
  }
}
if (import.meta.main) {