Add Windows TTS wave dump mode

2026-05-01 03:34:43 +09:00
parent ac88b8c50a
commit 0a88e8dab1
5 changed files with 90 additions and 48 deletions
--- a/README.md
+++ b/README.md
@@ -71,6 +71,12 @@ TTS만 단독으로 확인:
 bun run tts:test -- "안녕하세요. 출력 장치 테스트입니다."
 ```

+TTS WAV 파일만 생성해서 확인:
+
+```bash
+bun run tts:dump -- "안녕하세요. WAV 파일 테스트입니다."
+```
+
 Discord 모드:

 ```bash
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
    "start:discord": "bun src/index.ts discord",
    "start:local": "bun src/index.ts local",
    "tts:test": "bun src/index.ts local-say",
+    "tts:dump": "bun src/index.ts local-say-dump",
    "setup:local-ai": "bun src/setup-local-ai.ts",
    "devices": "bun src/index.ts local-devices",
    "audio:devices": "bun src/index.ts local-devices",
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,7 +3,7 @@ import process from "node:process";
 import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
 import { runDiscordBot } from "./discord-main.js";
 import { Logger } from "./logger.js";
-import { printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js";
+import { dumpLocalTtsWave, printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js";

 const mode = process.argv[2] ?? "discord";
 const config = loadConfig();
@@ -25,8 +25,13 @@ async function main(): Promise<void> {
      await runLocalTtsSmokeTest(requireAssistantRuntimeConfig(config), logger, text);
      return;
    }
+    case "local-say-dump": {
+      const text = process.argv.slice(3).join(" ").trim() || "안녕하세요. TTS WAV 파일 테스트입니다.";
+      await dumpLocalTtsWave(requireAssistantRuntimeConfig(config), logger, text);
+      return;
+    }
    default:
-      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say`);
+      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump`);
  }
 }

--- a/src/local-main.ts
+++ b/src/local-main.ts
@@ -1,4 +1,6 @@
 import { spawn } from "node:child_process";
+import { mkdir } from "node:fs/promises";
+import path from "node:path";
 import process from "node:process";

 import type { AssistantRuntimeConfig } from "./config.js";
@@ -10,7 +12,7 @@ import { LocalFasterWhisperSttService } from "./services/local-stt.js";
 import { LocalKokoroTtsService } from "./services/local-tts.js";
 import { OllamaLlmService } from "./services/ollama-llm.js";
 import type { SttService } from "./services/stt.js";
-import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
+import { synthesizeWindowsSpeechToWaveFile, WindowsSystemTtsService } from "./services/windows-system-tts.js";

 export async function printLocalAudioDevices(): Promise<void> {
  if (process.platform === "win32") {
@@ -166,3 +168,22 @@ export async function runLocalTtsSmokeTest(
    await Promise.allSettled([session.destroy(), tts.destroy?.()]);
  }
 }
+
+export async function dumpLocalTtsWave( // Synthesizes `text` into a WAV file on disk and prints the resulting path; throws on non-Windows platforms.
+  config: AssistantRuntimeConfig, // only LOCAL_TTS_SPEED is read in this function
+  _logger: Logger, // not used in this function; status output goes through console.log
+  text: string, // text to synthesize
+  outputPath?: string, // optional destination; omitted or blank falls back to "tts-test.wav"
+): Promise<void> {
+  if (process.platform !== "win32") { // the synthesis helper below shells out to PowerShell + System.Speech
+    throw new Error("현재 TTS WAV 덤프 모드는 Windows에서만 구현되어 있습니다.");
+  }
+
+  const resolvedPath = path.resolve(outputPath?.trim() || "tts-test.wav"); // path.resolve turns a relative path into an absolute one
+  await mkdir(path.dirname(resolvedPath), { recursive: true }); // create the parent directory (and ancestors) before synthesis writes the file
+  await synthesizeWindowsSpeechToWaveFile(text, config.LOCAL_TTS_SPEED, resolvedPath); // no AbortSignal is passed, so this call is not cancellable via signal
+
+  console.log("TTS WAV 파일 생성 완료");
+  console.log(`출력 파일: ${resolvedPath}`); // report the absolute path that was written
+  console.log("이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다.");
+}
--- a/src/services/windows-system-tts.ts
+++ b/src/services/windows-system-tts.ts
@@ -18,6 +18,59 @@ function toSpeechRate(speed: number): number {
  return Math.max(-10, Math.min(10, mapped));
 }

+export async function synthesizeWindowsSpeechToWaveFile( // Spawns PowerShell with a System.Speech script that writes `text` as speech into a WAV file at `outputPath`.
+  text: string, // text to speak; escaped for a PowerShell single-quoted string before being embedded
+  speed: number, // converted to a SpeechSynthesizer rate via toSpeechRate
+  outputPath: string, // WAV destination handed to SetOutputToWaveFile (also escaped)
+  signal?: AbortSignal, // optional; an abort event kills the PowerShell child
+): Promise<void> { // resolves on exit code 0; rejects on abort, non-zero/null exit code, or a child "error" event
+  const rate = toSpeechRate(speed); // toSpeechRate clamps its result to the range [-10, 10]
+  const script = [ // individual statements; joined with spaces into one -Command string below
+    "Add-Type -AssemblyName System.Speech;", // load the System.Speech assembly
+    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
+    "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;", // first installed voice whose culture name starts with "ko"
+    "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }", // when no such voice exists, no voice is selected explicitly
+    `$synth.Rate = ${rate};`,
+    `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`, // direct synthesizer output to the WAV file
+    `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
+    "$synth.Dispose();", // only reached if the preceding statements did not terminate the script
+  ].join(" ");
+
+  await new Promise<void>((resolve, reject) => {
+    const child = spawn("powershell", ["-NoProfile", "-Command", script], {
+      stdio: ["ignore", "ignore", "pipe"], // stdin and stdout are ignored; stderr is piped so it can be captured
+    });
+
+    let stderr = ""; // accumulated stderr text, used for the rejection message
+    child.stderr.on("data", (chunk: Buffer) => {
+      stderr += chunk.toString();
+    });
+
+    signal?.addEventListener( // NOTE(review): a signal that is already aborted here will not fire "abort", so the child would run until it exits on its own — confirm whether callers can pass such a signal
+      "abort",
+      () => {
+        if (!child.killed) { // skip if a kill signal was already sent to the child
+          child.kill("SIGKILL");
+        }
+      },
+      { once: true }, // the listener runs at most once
+    );
+
+    child.on("exit", (code) => {
+      if (signal?.aborted) { // abort takes precedence over the exit code, even when the code is 0
+        reject(new Error("tts aborted"));
+        return;
+      }
+      if (code === 0) {
+        resolve();
+        return;
+      }
+      reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`)); // prefer captured stderr; otherwise report the exit code
+    });
+    child.on("error", reject); // reject with whatever error the child process emits
+  });
+}
+
 export class WindowsSystemTtsService implements TtsService {
  constructor(private readonly speed: number) {
    const resolvedFfmpegPath = resolveFfmpegPath();
@@ -32,51 +85,7 @@ export class WindowsSystemTtsService implements TtsService {

  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${Date.now()}.wav`);
-    const rate = toSpeechRate(this.speed);
-    const script = [
-      "Add-Type -AssemblyName System.Speech;",
-      "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
-      "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
-      "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
-      `$synth.Rate = ${rate};`,
-      `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(tempPath)}');`,
-      `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
-      "$synth.Dispose();",
-    ].join(" ");
-
-    await new Promise<void>((resolve, reject) => {
-      const child = spawn("powershell", ["-NoProfile", "-Command", script], {
-        stdio: ["ignore", "ignore", "pipe"],
-      });
-
-      let stderr = "";
-      child.stderr.on("data", (chunk: Buffer) => {
-        stderr += chunk.toString();
-      });
-
-      signal?.addEventListener(
-        "abort",
-        () => {
-          if (!child.killed) {
-            child.kill("SIGKILL");
-          }
-        },
-        { once: true },
-      );
-
-      child.on("exit", (code) => {
-        if (signal?.aborted) {
-          reject(new Error("tts aborted"));
-          return;
-        }
-        if (code === 0) {
-          resolve();
-          return;
-        }
-        reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
-      });
-      child.on("error", reject);
-    }).catch(async (error) => {
+    await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, signal).catch(async (error) => {
      await unlink(tempPath).catch(() => null);
      throw error;
    });