Add local MeloTTS support

2026-05-03 01:56:09 +09:00
parent 3360015179
commit ad357a6ede
13 changed files with 396 additions and 3 deletions
--- a/src/audio/realtime-segmenter.ts
+++ b/src/audio/realtime-segmenter.ts
@@ -55,6 +55,15 @@ export class RealtimeSegmenter {
    }
  }

+  /**
+   * Discards all buffered audio and puts the speech-detection state back to idle.
+   */
+  reset(): void {
+    // Truncate in place so the same array instances stay attached to the fields
+    // (assumes plain arrays, as the previous splice-based clearing implies).
+    this.pendingSamples.length = 0;
+    this.preRoll.length = 0;
+    this.speech.length = 0;
+
+    // Detector flags and counters return to their "no speech" values.
+    this.silenceFrames = 0;
+    this.speechCandidateFrames = 0;
+    this.speechActive = false;
+  }
+
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
--- a/src/config.ts
+++ b/src/config.ts
@@ -15,6 +15,17 @@ const envSchema = z.object({
  LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
  LOCAL_AI_PYTHON: emptyToUndefined,
  AUDIO_SOURCE: emptyToUndefined,
+  TTS_ENABLED: z
+    .string()
+    .optional()
+    .transform((value) => value?.trim().toLowerCase() !== "false"),
+  TTS_IMAGE: z.string().min(1).default("realtime-voice-bot-melotts:v0.1.2"),
+  TTS_LANGUAGE: z.string().min(1).default("KR"),
+  TTS_SPEAKER: z.string().min(1).default("KR"),
+  TTS_DEVICE: z.string().min(1).default("cpu"),
+  TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1),
+  TTS_CACHE_DIR: z.string().min(1).default(".local-ai/tts-cache"),
+  TTS_OUTPUT_DIR: z.string().min(1).default(".local-ai/tts-output"),
  DEBUG: z
    .string()
    .optional()
--- a/src/index.ts
+++ b/src/index.ts
@@ -6,6 +6,7 @@ import { Logger } from "./logger.js";
 import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
 import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
 import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
+import { MeloTtsService } from "./services/melo-tts.js";
 import { OllamaLlmService } from "./services/ollama-llm.js";

 const mode = process.argv[2] ?? "test-stt";
@@ -15,8 +16,10 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const stt = new FasterWhisperSttService(config, logger);
  const llm = enableLlm ? new OllamaLlmService(config, logger) : null;
+  let tts = enableLlm && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
  let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
  let shuttingDown: Promise<void> | null = null;
+  let suppressCapture = false;
  let receivedChunks = 0;
  let receivedBytes = 0;
  let maxPeak = 0;
@@ -79,6 +82,22 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
    logger.info("LLM warmup finished");
    console.log("LLM 준비 완료");
  }
+  if (tts) {
+    console.log("TTS 준비중...");
+    try {
+      await tts.warmup();
+      logger.info("TTS warmup finished", {
+        image: config.TTS_IMAGE,
+        language: config.TTS_LANGUAGE,
+        speaker: config.TTS_SPEAKER,
+      });
+      console.log("TTS 준비 완료");
+    } catch (error) {
+      logger.warn("TTS warmup failed", error);
+      console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
+      tts = null;
+    }
+  }

  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
@@ -155,6 +174,20 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
          } else {
            console.log(`답변> ${reply}`);
          }
+
+          if (tts) {
+            suppressCapture = true;
+            segmenter.reset();
+            try {
+              await tts.speak(reply);
+            } catch (error) {
+              logger.warn("TTS playback failed", error);
+            } finally {
+              suppressCapture = false;
+              sawSpeechStart = false;
+              maxPeak = 0;
+            }
+          }
        }
      }
    } catch (error) {
@@ -227,6 +260,9 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
    receivedChunks += 1;
    receivedBytes += chunk.length;
    lastChunkAt = Date.now();
+    if (suppressCapture) {
+      return;
+    }
    segmenter.pushChunk(chunk);
  });
  capture.stderr.on("data", (chunk: Buffer) => {
@@ -330,6 +366,19 @@ async function runLlmCli(): Promise<void> {
  });
 }

+/**
+ * Handler for the `test-tts` mode: warms up the TTS service and speaks one sentence.
+ * The sentence is taken from the command line, falling back to a built-in sample
+ * when nothing (or only whitespace) is supplied.
+ */
+async function runTtsTest(): Promise<void> {
+  // Everything after the mode argument is joined into a single sentence.
+  const cliSentence = process.argv.slice(3).join(" ").trim();
+  const text = cliSentence || "안녕하세요. 로컬 티티에스 테스트입니다.";
+
+  const config = loadConfig();
+  const logLevel = config.DEBUG ? config.LOG_LEVEL : "error";
+  const tts = new MeloTtsService(config, new Logger(logLevel));
+
+  console.log("TTS 준비중...");
+  await tts.warmup();
+  console.log("TTS 준비 완료");
+
+  console.log(`재생 문장: ${text}`);
+  await tts.speak(text);
+}
+
 async function main(): Promise<void> {
  switch (mode) {
    case "devices":
@@ -344,8 +393,11 @@ async function main(): Promise<void> {
    case "test-llm":
      await runLlmCli();
      return;
+    case "test-tts":
+      await runTtsTest();
+      return;
    default:
-      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, devices`);
+      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, test-tts, devices`);
  }
 }

--- a/src/services/audio-playback.ts
+++ b/src/services/audio-playback.ts
@@ -0,0 +1,42 @@
+import { spawn } from "node:child_process";
+import process from "node:process";
+
+/**
+ * Launches `command` with `args` and settles once the child process exits.
+ * stdin is detached; stdout and stderr are shared with the current process.
+ *
+ * @param command Executable to start.
+ * @param args Arguments handed to the executable.
+ * @throws Error when the process cannot be spawned or exits with anything other than code 0.
+ */
+async function run(command: string, args: string[]): Promise<void> {
+  const child = spawn(command, args, {
+    stdio: ["ignore", "inherit", "inherit"],
+    windowsHide: true,
+  });
+
+  await new Promise<void>((resolve, reject) => {
+    child.on("error", reject);
+    child.on("exit", (code) => {
+      if (code !== 0) {
+        reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`));
+        return;
+      }
+      resolve();
+    });
+  });
+}
+
+/**
+ * Plays a WAV file and resolves once playback has finished.
+ * Only Windows is supported; playback is delegated to `System.Media.SoundPlayer`
+ * through a PowerShell child process.
+ *
+ * @param filePath Path of the WAV file to play.
+ * @throws Error on non-Windows platforms, or when PowerShell cannot be started
+ *   or exits with a non-zero code.
+ */
+export async function playWavFile(filePath: string): Promise<void> {
+  if (process.platform === "win32") {
+    // powershell.exe treats every argument after a -Command string as more
+    // command text rather than exposing it through $args, so a trailing path
+    // argument never reaches the script. Embed the path as a single-quoted
+    // literal instead; quotes (including the typographic variants PowerShell
+    // also accepts as delimiters) are escaped by doubling them.
+    const quotedPath = `'${filePath.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
+
+    await run("powershell.exe", [
+      "-NoProfile",
+      "-NonInteractive",
+      "-ExecutionPolicy",
+      "Bypass",
+      "-Command",
+      [
+        `$player = New-Object System.Media.SoundPlayer ${quotedPath}`,
+        "$player.Load()",
+        // PlaySync blocks until the sound has finished, so the process exit marks the end of playback.
+        "$player.PlaySync()",
+      ].join("; "),
+    ]);
+    return;
+  }
+
+  throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
+}
--- a/src/services/melo-tts.ts
+++ b/src/services/melo-tts.ts
@@ -0,0 +1,113 @@
+import { spawn } from "node:child_process";
+import { randomUUID } from "node:crypto";
+import { mkdir, rm } from "node:fs/promises";
+import path from "node:path";
+
+import type { AppConfig } from "../config.js";
+import type { Logger } from "../logger.js";
+import { playWavFile } from "./audio-playback.js";
+
+/**
+ * Runs `command` with `args` and settles when the child process exits.
+ * stdin is detached and stderr is always shared with the current process.
+ *
+ * @param command Executable to start.
+ * @param args Arguments handed to the executable.
+ * @param stdio What to do with the child's stdout: discard it ("ignore", the default)
+ *   or share the current process's stdout ("inherit").
+ * @throws Error when the process cannot be spawned or exits with anything other than code 0.
+ */
+async function run(command: string, args: string[], stdio: "ignore" | "inherit" = "ignore"): Promise<void> {
+  const child = spawn(command, args, {
+    stdio: ["ignore", stdio, "inherit"],
+    windowsHide: true,
+  });
+
+  await new Promise<void>((resolve, reject) => {
+    child.on("error", reject);
+    child.on("exit", (exitCode) => {
+      const succeeded = exitCode === 0;
+      if (succeeded) {
+        resolve();
+      } else {
+        reject(new Error(`${command} ${args.join(" ")} exited with code ${exitCode ?? "null"}`));
+      }
+    });
+  });
+}
+
+/**
+ * Text-to-speech service backed by a MeloTTS Docker image: each utterance is
+ * synthesized to a WAV file by a one-shot container and then played on the host.
+ */
+export class MeloTtsService {
+  constructor(
+    private readonly config: AppConfig,
+    private readonly logger: Logger,
+  ) {}
+
+  /**
+   * Creates the cache and output directories, then runs `docker --version` and
+   * `docker image inspect` against the configured image.
+   *
+   * @throws Error when a directory cannot be created or either docker command fails.
+   */
+  async warmup(): Promise<void> {
+    for (const dir of [this.config.TTS_CACHE_DIR, this.config.TTS_OUTPUT_DIR]) {
+      await mkdir(path.resolve(process.cwd(), dir), { recursive: true });
+    }
+
+    for (const probeArgs of [["--version"], ["image", "inspect", this.config.TTS_IMAGE]]) {
+      await run("docker", probeArgs);
+    }
+  }
+
+  /**
+   * Synthesizes `text` into a temporary WAV file, plays it, and removes the file.
+   * Text that is empty after trimming is ignored.
+   *
+   * @param text Sentence to speak.
+   * @throws Error when synthesis or playback fails (the temporary file is still removed).
+   */
+  async speak(text: string): Promise<void> {
+    const sentence = text.trim();
+    if (sentence.length === 0) {
+      return;
+    }
+
+    // Timestamp plus UUID gives each utterance its own file name.
+    const wavName = `tts-${Date.now()}-${randomUUID()}.wav`;
+    const wavPath = path.resolve(process.cwd(), this.config.TTS_OUTPUT_DIR, wavName);
+
+    try {
+      await this.synthesizeToFile(sentence, wavPath);
+      await playWavFile(wavPath);
+    } finally {
+      try {
+        await rm(wavPath, { force: true });
+      } catch {
+        // Cleanup is best-effort; a leftover file must not mask the real outcome.
+      }
+    }
+  }
+
+  /**
+   * Runs the MeloTTS container once to render `text` into `targetPath`.
+   * The directory of `targetPath` is mounted at `/work/output` and the cache
+   * directory at `/cache` inside the container.
+   *
+   * @param text Sentence to synthesize.
+   * @param targetPath Host path of the WAV file to produce.
+   * @throws Error when warmup fails or the container exits with a non-zero code.
+   */
+  async synthesizeToFile(text: string, targetPath: string): Promise<void> {
+    await this.warmup();
+
+    const hostOutputDir = path.dirname(targetPath);
+    const hostCacheDir = path.resolve(process.cwd(), this.config.TTS_CACHE_DIR);
+    const wavName = path.basename(targetPath);
+
+    await mkdir(hostOutputDir, { recursive: true });
+
+    const {
+      TTS_IMAGE: image,
+      TTS_LANGUAGE: language,
+      TTS_SPEAKER: speaker,
+      TTS_SPEED: speed,
+      TTS_DEVICE: device,
+    } = this.config;
+
+    // Model caches are redirected into the mounted /cache volume.
+    const containerEnv = [
+      "HF_HOME=/cache/huggingface",
+      "HF_HUB_CACHE=/cache/huggingface/hub",
+      "TRANSFORMERS_CACHE=/cache/transformers",
+    ];
+
+    const dockerArgs = [
+      "run",
+      "--rm",
+      "-v",
+      `${hostOutputDir}:/work/output`,
+      "-v",
+      `${hostCacheDir}:/cache`,
+      ...containerEnv.flatMap((entry) => ["-e", entry]),
+      // GPU access is requested for every device setting other than "cpu".
+      ...(device === "cpu" ? [] : ["--gpus", "all"]),
+      image,
+      "--text",
+      text,
+      "--output",
+      `/work/output/${wavName}`,
+      "--language",
+      language,
+      "--speaker",
+      speaker,
+      "--speed",
+      String(speed),
+      "--device",
+      device,
+    ];
+
+    this.logger.info("Starting MeloTTS synthesis", {
+      image,
+      language,
+      speaker,
+      speed,
+      device,
+    });
+
+    await run("docker", dockerArgs, "inherit");
+  }
+}
--- a/src/services/ollama-llm.ts
+++ b/src/services/ollama-llm.ts
@@ -374,10 +374,12 @@ export class OllamaLlmService {
        "bun run setup",
        "bun run setup:stt",
        "bun run setup:llm",
+        "bun run setup:tts",
        "bun run devices",
        "bun run test:stt",
        "bun run test:sttllm",
        "bun run test:llm",
+        "bun run test:tts -- \"안녕하세요\"",
      ],
    };
  }
--- a/src/setup-tts.ts
+++ b/src/setup-tts.ts
@@ -0,0 +1,60 @@
+import process from "node:process";
+import { mkdir, rm } from "node:fs/promises";
+import path from "node:path";
+import { spawn } from "node:child_process";
+
+import { loadConfig } from "./config.js";
+import { Logger } from "./logger.js";
+import { MeloTtsService } from "./services/melo-tts.js";
+
+/**
+ * Runs `command` with `args` in `cwd`, sharing all standard streams with the
+ * current process, and settles when the child exits.
+ *
+ * @param command Executable to start.
+ * @param args Arguments handed to the executable.
+ * @param cwd Working directory for the child; defaults to the current working directory.
+ * @throws Error when the process cannot be spawned or exits with anything other than code 0.
+ */
+async function run(command: string, args: string[], cwd = process.cwd()): Promise<void> {
+  const child = spawn(command, args, { cwd, stdio: "inherit", windowsHide: true });
+
+  await new Promise<void>((resolve, reject) => {
+    child.on("error", reject);
+    child.on("exit", (exitCode) => {
+      if (exitCode !== 0) {
+        reject(new Error(`${command} ${args.join(" ")} exited with code ${exitCode ?? "null"}`));
+        return;
+      }
+      resolve();
+    });
+  });
+}
+
+/**
+ * Prepares the local TTS environment: creates the cache and output directories,
+ * builds the MeloTTS Docker image from `docker/melotts`, and runs one throwaway
+ * synthesis whose output file is deleted afterwards.
+ *
+ * @throws Error when a directory cannot be created, the image build fails, or
+ *   the warmup synthesis fails.
+ */
+export async function setupTts(): Promise<void> {
+  const config = loadConfig();
+  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
+
+  const projectRoot = process.cwd();
+  const dockerContext = path.resolve(projectRoot, "docker", "melotts");
+  const cacheDir = path.resolve(projectRoot, config.TTS_CACHE_DIR);
+  const outputDir = path.resolve(projectRoot, config.TTS_OUTPUT_DIR);
+
+  for (const dir of [cacheDir, outputDir]) {
+    await mkdir(dir, { recursive: true });
+  }
+
+  console.log(`MeloTTS Docker 이미지 빌드: ${config.TTS_IMAGE}`);
+  await run("docker", ["build", "-t", config.TTS_IMAGE, dockerContext]);
+
+  const warmupPath = path.join(outputDir, "warmup.wav");
+  const tts = new MeloTtsService(config, logger);
+
+  console.log("MeloTTS 모델 워밍업...");
+  try {
+    await tts.synthesizeToFile("안녕하세요. 로컬 티티에스 준비 테스트입니다.", warmupPath);
+  } finally {
+    try {
+      await rm(warmupPath, { force: true });
+    } catch {
+      // Removing the warmup file is best-effort only.
+    }
+  }
+
+  console.log("로컬 TTS 환경 준비 완료");
+}
+
+// Run the setup only when this module is executed directly, not when imported.
+if (import.meta.main) {
+  void setupTts().catch((failure: unknown) => {
+    // Print just the message for Error instances; stringify anything else.
+    const message = failure instanceof Error ? failure.message : String(failure);
+    console.error(message);
+    process.exit(1);
+  });
+}
--- a/src/setup.ts
+++ b/src/setup.ts
@@ -2,10 +2,12 @@ import process from "node:process";

 import { setupLlm } from "./setup-llm.js";
 import { setupSttPython } from "./setup-python.js";
+import { setupTts } from "./setup-tts.js";

 /**
  * Runs the setup steps one after another: STT Python environment, LLM, then TTS.
  * Each step is awaited before the next starts, so a rejection stops the sequence.
  */
 async function main(): Promise<void> {
  await setupSttPython();
  await setupLlm();
+  await setupTts();
 }

 if (import.meta.main) {