Add Windows Media TTS engine selection

2026-05-01 04:01:33 +09:00
parent 1a8e8d0a8f
commit 52d7f74049
12 changed files with 475 additions and 77 deletions
--- a/.env.example
+++ b/.env.example
@@ -16,6 +16,8 @@ LOCAL_STT_MODEL=small
 LOCAL_STT_DEVICE=auto
 LOCAL_STT_COMPUTE_TYPE=auto
 LOCAL_STT_BEAM_SIZE=3
+LOCAL_TTS_ENGINE=auto
+LOCAL_TTS_VOICE_NAME=
 LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
 LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
 LOCAL_TTS_LANGUAGE=ko
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 - STT: `faster-whisper` + Whisper multilingual
 - LLM: `Ollama` + `qwen3:0.6b`
 - TTS:
- Windows: 시스템 기본 음성 엔진
+- Windows: `Windows.Media.SpeechSynthesis` 우선, 실패 시 시스템 기본 음성 엔진 fallback
 - Linux/macOS: `kokoro-onnx` + `misaki[ko]`
 - VAD: `avr-vad`

@@ -71,6 +71,12 @@ TTS만 단독으로 확인:
 bun run tts:test -- "안녕하세요. 출력 장치 테스트입니다."
 ```

+Windows 설치 음성 목록 확인:
+
+```bash
+bun run tts:voices
+```
+
 TTS WAV 파일만 생성해서 확인:

 ```bash
@@ -104,6 +110,8 @@ Discord 모드에서만 필수:
 - `LOCAL_STT_DEVICE`
 - `LOCAL_STT_COMPUTE_TYPE`
 - `LOCAL_STT_BEAM_SIZE`
+- `LOCAL_TTS_ENGINE`
+- `LOCAL_TTS_VOICE_NAME`
 - `LOCAL_TTS_MODEL_PATH`
 - `LOCAL_TTS_VOICES_PATH`
 - `LOCAL_TTS_LANGUAGE`
@@ -138,7 +146,8 @@ Windows에서 GPU STT를 쓰려면 `LOCAL_STT_DEVICE=auto` 그대로 두고 `bun

 - STT 기본 권장 모델은 `small`
 - LLM 기본 모델은 `qwen3:0.6b`
- TTS 기본 보이스는 `af_heart`
+- Windows TTS 기본 보이스는 설치된 `windows-media` 음성 중 `LOCAL_TTS_LANGUAGE` 에 맞는 첫 번째 항목 (이름에 `Natural` 이 들어간 음성 우선)
+- Linux/macOS TTS 기본 보이스는 `af_heart`
 - TTS 기본 속도는 `1.12`

 더 빠르게 돌리고 싶으면:
@@ -169,11 +178,12 @@ OLLAMA_MODEL=qwen3:1.7b
 ## Windows 메모

 - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다.
+- Windows는 기본적으로 `windows-media` 엔진을 우선 쓰고, 실패하면 `system` 엔진으로 자동 fallback 합니다.
 - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
 - Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
 - Windows의 `setup:local-ai`는 STT와 CUDA 런타임 wheel을 함께 설치합니다.
 - Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
+- 더 자연스러운 음성을 고르려면 `bun run tts:voices` 로 설치된 음성 이름을 확인한 뒤 `LOCAL_TTS_VOICE_NAME` 에 넣으면 됩니다.

 ## 설계 메모

--- a/package.json
+++ b/package.json
@@ -10,6 +10,7 @@
    "start:local": "bun src/index.ts local",
    "tts:test": "bun src/index.ts local-say",
    "tts:dump": "bun src/index.ts local-say-dump",
+    "tts:voices": "bun src/index.ts local-tts-voices",
    "setup:local-ai": "bun src/setup-local-ai.ts",
    "devices": "bun src/index.ts local-devices",
    "audio:devices": "bun src/index.ts local-devices",
--- a/src/config.ts
+++ b/src/config.ts
@@ -28,6 +28,8 @@ const envSchema = z.object({
  LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3),
  LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
  LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
+  LOCAL_TTS_ENGINE: z.enum(["auto", "windows-media", "system", "kokoro"]).default("auto"),
+  LOCAL_TTS_VOICE_NAME: emptyToUndefined,
  LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
  LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
  LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
--- a/src/discord-main.ts
+++ b/src/discord-main.ts
@@ -16,9 +16,8 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js";
 import { type DiscordRuntimeConfig } from "./config.js";
 import { Logger } from "./logger.js";
 import { LocalFasterWhisperSttService } from "./services/local-stt.js";
-import { LocalKokoroTtsService } from "./services/local-tts.js";
 import { OllamaLlmService } from "./services/ollama-llm.js";
-import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
+import { createTtsService } from "./services/create-tts-service.js";

 export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
@@ -39,15 +38,12 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
  });

  const stt = new LocalFasterWhisperSttService(config, logger);
-  const tts =
-    process.platform === "win32"
-      ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
-      : new LocalKokoroTtsService(config, logger);
+  const tts = createTtsService(config, logger);
  const llm = new OllamaLlmService(config);
  const sessions = new Map<string, GuildVoiceSession>();

  await stt.warmup();
-  await tts.warmup();
+  await tts.warmup?.();

  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    const member = interaction.member as GuildMember | null;
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,7 +3,13 @@ import process from "node:process";
 import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
 import { runDiscordBot } from "./discord-main.js";
 import { Logger } from "./logger.js";
-import { dumpLocalTtsWave, printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js";
+import {
+  dumpLocalTtsWave,
+  printLocalAudioDevices,
+  printLocalTtsVoices,
+  runLocalAssistant,
+  runLocalTtsSmokeTest,
+} from "./local-main.js";

 const mode = process.argv[2] ?? "discord";
 const config = loadConfig();
@@ -30,8 +36,13 @@ async function main(): Promise<void> {
      await dumpLocalTtsWave(requireAssistantRuntimeConfig(config), logger, text);
      return;
    }
+    case "local-tts-voices":
+      await printLocalTtsVoices(requireAssistantRuntimeConfig(config));
+      return;
    default:
-      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump`);
+      throw new Error(
+        `알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump, local-tts-voices`,
+      );
  }
 }

--- a/src/local-main.ts
+++ b/src/local-main.ts
@@ -1,5 +1,5 @@
 import { spawn } from "node:child_process";
-import { mkdir } from "node:fs/promises";
+import { copyFile, mkdir } from "node:fs/promises";
 import path from "node:path";
 import process from "node:process";

@@ -9,10 +9,11 @@ import { LocalVoiceSession } from "./audio/local-voice-session.js";
 import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
 import type { LlmService } from "./services/llm.js";
 import { LocalFasterWhisperSttService } from "./services/local-stt.js";
-import { LocalKokoroTtsService } from "./services/local-tts.js";
 import { OllamaLlmService } from "./services/ollama-llm.js";
 import type { SttService } from "./services/stt.js";
-import { synthesizeWindowsSpeechToWaveFile, WindowsSystemTtsService } from "./services/windows-system-tts.js";
+import { createTtsService } from "./services/create-tts-service.js";
+import { listWindowsMediaVoices } from "./services/windows-media-tts.js";
+import { listWindowsSystemVoices } from "./services/windows-system-tts.js";

 export async function printLocalAudioDevices(): Promise<void> {
  if (process.platform === "win32") {
@@ -73,14 +74,11 @@ export async function printLocalAudioDevices(): Promise<void> {

 export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  const stt = new LocalFasterWhisperSttService(config, logger);
-  const tts =
-    process.platform === "win32"
-      ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
-      : new LocalKokoroTtsService(config, logger);
+  const tts = createTtsService(config, logger);
  const llm = new OllamaLlmService(config);

  await stt.warmup();
-  await tts.warmup();
+  await tts.warmup?.();
  await llm.warmup?.();

  if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") {
@@ -130,10 +128,7 @@ export async function runLocalTtsSmokeTest(
  logger: Logger,
  text: string,
 ): Promise<void> {
-  const tts =
-    process.platform === "win32"
-      ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
-      : new LocalKokoroTtsService(config, logger);
+  const tts = createTtsService(config, logger);

  const noOpStt: SttService = {
    async transcribePcm16() {
@@ -146,7 +141,7 @@ export async function runLocalTtsSmokeTest(
    },
  };

-  await tts.warmup();
+  await tts.warmup?.();

  const session = new LocalVoiceSession({
    config,
@@ -171,7 +166,7 @@ export async function runLocalTtsSmokeTest(

 export async function dumpLocalTtsWave(
  config: AssistantRuntimeConfig,
-  _logger: Logger,
+  logger: Logger,
  text: string,
  outputPath?: string,
 ): Promise<void> {
@@ -181,9 +176,57 @@ export async function dumpLocalTtsWave(

  const resolvedPath = path.resolve(outputPath?.trim() || "tts-test.wav");
  await mkdir(path.dirname(resolvedPath), { recursive: true });
-  await synthesizeWindowsSpeechToWaveFile(text, config.LOCAL_TTS_SPEED, resolvedPath);
+  const tts = createTtsService(config, logger);
+  await tts.warmup?.();
+  const playback = await tts.preparePlayback(text);
+
+  try {
+    if (!playback.sourceFilePath) {
+      throw new Error("현재 선택된 TTS 엔진은 직접 WAV 덤프를 지원하지 않습니다.");
+    }
+    await copyFile(playback.sourceFilePath, resolvedPath);
+  } finally {
+    playback.dispose();
+    await tts.destroy?.();
+  }

  console.log("TTS WAV 파일 생성 완료");
  console.log(`출력 파일: ${resolvedPath}`);
  console.log("이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다.");
 }
+/** Prints the installed Windows.Media and System.Speech voices plus a sample config; on non-Windows platforms prints only the configured engine and speaker. */
+export async function printLocalTtsVoices(config: AssistantRuntimeConfig): Promise<void> {
+  if (process.platform !== "win32") {
+    console.log("현재 플랫폼은 Windows가 아니므로 설치된 시스템 TTS 목록 대신 Kokoro 설정만 사용합니다.");
+    console.log(`LOCAL_TTS_ENGINE=${config.LOCAL_TTS_ENGINE}`);
+    console.log(`LOCAL_TTS_SPEAKER=${config.LOCAL_TTS_SPEAKER}`);
+    return;
+  }
+
+  const [windowsMediaVoices, windowsSystemVoices] = await Promise.all([
+    listWindowsMediaVoices(),
+    listWindowsSystemVoices(),
+  ]);
+
+  console.log("\n=== Windows.Media.SpeechSynthesis voices (권장) ===");
+  if (windowsMediaVoices.length === 0) {
+    console.log("설치된 Windows Media 음성이 없습니다.");
+  } else {
+    for (const voice of windowsMediaVoices) {
+      console.log(`- ${voice.description} | name=${voice.displayName} | lang=${voice.language}`);
+    }
+  }
+
+  console.log("\n=== System.Speech voices (fallback) ===");
+  if (windowsSystemVoices.length === 0) {
+    console.log("설치된 System.Speech 음성이 없습니다.");
+  } else {
+    for (const voice of windowsSystemVoices) {
+      console.log(`- ${voice.description} | name=${voice.name} | lang=${voice.culture}`);
+    }
+  }
+
+  console.log("\n설정 예시");
+  console.log("LOCAL_TTS_ENGINE=windows-media");
+  console.log("LOCAL_TTS_VOICE_NAME=위 목록의 description 또는 name");
+}
--- a/src/services/create-tts-service.ts
+++ b/src/services/create-tts-service.ts
@@ -0,0 +1,112 @@
+import process from "node:process";
+
+import type { AssistantRuntimeConfig } from "../config.js";
+import type { Logger } from "../logger.js";
+import { LocalKokoroTtsService } from "./local-tts.js";
+import type { PreparedSpeechAudio, TtsService } from "./tts.js";
+import { WindowsMediaTtsService } from "./windows-media-tts.js";
+import { WindowsSystemTtsService } from "./windows-system-tts.js";
+
+interface NamedTtsService {
+  name: string; // label written to the log when this engine is selected or fails
+  service: TtsService; // the engine itself
+}
+
+/** Wraps an ordered list of TTS engines: the first one that warms up is used, and a synthesis failure moves on to the next. */
+class FallbackTtsService implements TtsService {
+  private activeIndex: number | null = null; // index into `services` of the engine in use; null until one has warmed up
+
+  constructor(
+    private readonly logger: Logger,
+    private readonly services: NamedTtsService[],
+  ) {}
+
+  async warmup(): Promise<void> {
+    await this.ensureActive();
+  }
+
+  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
+    const active = await this.ensureActive();
+    try {
+      return await active.service.preparePlayback(text, signal);
+    } catch (error) {
+      // A cancelled request says nothing about engine health, and the last engine has no successor: rethrow as-is.
+      if (signal?.aborted || this.activeIndex === null || this.activeIndex >= this.services.length - 1) {
+        throw error;
+      }
+      const failedName = active.name;
+      this.activeIndex += 1;
+      const fallback = await this.activate(this.activeIndex);
+      this.logger.warn(`TTS 엔진 ${failedName} 이 실패해 ${fallback.name} 로 전환합니다.`, error);
+      return await fallback.service.preparePlayback(text, signal);
+    }
+  }
+
+  async destroy(): Promise<void> {
+    await Promise.allSettled(this.services.map((entry) => entry.service.destroy?.()));
+  }
+
+  /** Returns the engine in use, warming engines up in list order on first call; throws when none of them starts. */
+  private async ensureActive(): Promise<NamedTtsService> {
+    if (this.activeIndex !== null) {
+      return this.services[this.activeIndex]!;
+    }
+    let lastError: unknown = null;
+    for (let index = 0; index < this.services.length; index += 1) {
+      try {
+        return await this.activate(index);
+      } catch (error) {
+        lastError = error;
+        this.logger.warn(`TTS 엔진 ${this.services[index]!.name} 초기화 실패`, error);
+      }
+    }
+    throw lastError instanceof Error ? lastError : new Error("사용 가능한 TTS 엔진을 찾지 못했습니다.");
+  }
+
+  /** Warms up the engine at `index` and marks it active only after the warmup succeeded. */
+  private async activate(index: number): Promise<NamedTtsService> {
+    const selected = this.services[index]!;
+    await selected.service.warmup?.();
+    this.activeIndex = index;
+    this.logger.info("Selected TTS engine", selected.name);
+    return selected;
+  }
+}
+/** Picks the TTS engine: Kokoro on every non-Windows platform; on Windows LOCAL_TTS_ENGINE selects system, windows-media or kokoro, and "auto" (or any other value) tries windows-media first with system as the fallback. */
+export function createTtsService(config: AssistantRuntimeConfig, logger: Logger): TtsService {
+  if (process.platform !== "win32") {
+    return new LocalKokoroTtsService(config, logger);
+  }
+
+  const systemTts = new WindowsSystemTtsService(
+    config.LOCAL_TTS_SPEED,
+    config.LOCAL_TTS_VOICE_NAME,
+    config.LOCAL_TTS_LANGUAGE,
+  );
+  const windowsMediaTts = new WindowsMediaTtsService(
+    config.LOCAL_TTS_SPEED,
+    config.LOCAL_TTS_VOICE_NAME,
+    config.LOCAL_TTS_LANGUAGE,
+  );
+
+  switch (config.LOCAL_TTS_ENGINE) {
+    case "system":
+      return systemTts;
+    case "windows-media":
+      return windowsMediaTts;
+    case "kokoro":
+      return new LocalKokoroTtsService(config, logger);
+    case "auto":
+    default:
+      return new FallbackTtsService(logger, [
+        {
+          name: "windows-media",
+          service: windowsMediaTts,
+        },
+        {
+          name: "system",
+          service: systemTts,
+        },
+      ]);
+  }
+}
--- a/src/services/tts.ts
+++ b/src/services/tts.ts
@@ -7,6 +7,7 @@ export interface PreparedSpeechAudio {
 }

 export interface TtsService {
+  warmup?(): Promise<void>;
  preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
  destroy?(): Promise<void>;
 }
--- a/src/services/windows-media-tts.ts
+++ b/src/services/windows-media-tts.ts
@@ -0,0 +1,141 @@
+import { createReadStream } from "node:fs";
+import { unlink } from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import type { PreparedSpeechAudio, TtsService } from "./tts.js";
+import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
+
+export interface WindowsMediaVoiceInfo {
+  displayName: string; // DisplayName of the voice as reported by SpeechSynthesizer.AllVoices
+  description: string; // Description of the voice
+  language: string; // Language of the voice
+  gender: string; // Gender of the voice, converted to a string by the listing script
+  id: string; // Id of the voice
+}
+/** Escapes text for a PowerShell single-quoted literal: newlines become spaces and every character PowerShell accepts as a single quote (' and U+2018, U+2019, U+201A, U+201B) is doubled. */
+function escapePowerShellSingleQuoted(text: string): string {
+  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
+}
+/** Returns the PowerShell prelude shared by the Windows.Media scripts: stop-on-error, WinRT type loading, and an Await-WinRt helper that blocks on a WinRT async operation through AsTask. */
+function windowsMediaPreamble(): string {
+  return [
+    "$ErrorActionPreference = 'Stop';",
+    "$ProgressPreference = 'SilentlyContinue';",
+    "Add-Type -AssemblyName System.Runtime.WindowsRuntime;",
+    "$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime];",
+    "$null = [Windows.Storage.Streams.DataReader, Windows.Storage.Streams, ContentType=WindowsRuntime];",
+    "function Await-WinRt($operation) {",
+    "  $method = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.IsGenericMethod -and $_.GetParameters().Count -eq 1 } | Select-Object -First 1;",
+    "  if (-not $method) { throw 'System.WindowsRuntimeSystemExtensions.AsTask 를 찾지 못했습니다.' }",
+    "  $resultType = $operation.GetType().GenericTypeArguments[0];",
+    "  $task = $method.MakeGenericMethod($resultType).Invoke($null, @($operation));",
+    "  return $task.GetAwaiter().GetResult();",
+    "}",
+  ].join(" ");
+}
+/** Lists the voices exposed by SpeechSynthesizer.AllVoices by running a PowerShell script and parsing the JSON it prints; `signal` aborts the PowerShell process. */
+export async function listWindowsMediaVoices(signal?: AbortSignal): Promise<WindowsMediaVoiceInfo[]> {
+  const script = [
+    windowsMediaPreamble(),
+    "$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices | ForEach-Object {",
+    "  [PSCustomObject]@{",
+    "    displayName = $_.DisplayName;",
+    "    description = $_.Description;",
+    "    language = $_.Language;",
+    "    gender = [string]$_.Gender;",
+    "    id = $_.Id;",
+    "  }",
+    "});",
+    "ConvertTo-Json -InputObject $voices -Compress;",
+  ].join(" ");
+
+  const { stdout } = await runPowerShell(script, signal);
+  return parsePowerShellJsonArray<WindowsMediaVoiceInfo>(stdout);
+}
+/** Synthesizes `text` via PowerShell and writes the audio stream to `outputPath`. Voice order: a voice matching `voiceName` (DisplayName/Description exact or substring, or exact Id), else one whose Language starts with `language` ("Natural" voices first), else the default voice; a failure to set the speaking rate is ignored. */
+export async function synthesizeWindowsMediaSpeechToWaveFile(
+  text: string,
+  speed: number,
+  outputPath: string,
+  voiceName?: string,
+  language = "ko",
+  signal?: AbortSignal,
+): Promise<void> {
+  const script = [
+    windowsMediaPreamble(),
+    `$text = '${escapePowerShellSingleQuoted(text)}';`,
+    `$outputPath = '${escapePowerShellSingleQuoted(outputPath)}';`,
+    `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
+    `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
+    `$speakingRate = ${speed.toFixed(2)};`,
+    "$synth = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new();",
+    "try {",
+    "  $voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;",
+    "  $selected = $null;",
+    "  if ($preferredVoice) {",
+    "    $selected = $voices | Where-Object {",
+    "      $_.DisplayName -eq $preferredVoice -or $_.Description -eq $preferredVoice -or $_.Id -eq $preferredVoice -or $_.DisplayName -like ('*' + $preferredVoice + '*') -or $_.Description -like ('*' + $preferredVoice + '*')",
+    "    } | Select-Object -First 1;",
+    "  }",
+    "  if (-not $selected -and $preferredLanguage) {",
+    "    $selected = $voices | Where-Object { $_.Language -like ($preferredLanguage + '*') } | Sort-Object @{Expression={ if ($_.DisplayName -match 'Natural' -or $_.Description -match 'Natural') { 0 } else { 1 } }}, Description | Select-Object -First 1;",
+    "  }",
+    "  if (-not $selected) { $selected = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice }",
+    "  if ($selected) { $synth.Voice = $selected }",
+    "  try { $synth.Options.SpeakingRate = $speakingRate } catch {}",
+    "  $stream = Await-WinRt ($synth.SynthesizeTextToStreamAsync($text));",
+    "  try {",
+    "    $size = [uint32]$stream.Size;",
+    "    $reader = [Windows.Storage.Streams.DataReader]::new($stream.GetInputStreamAt(0));",
+    "    try {",
+    "      $null = Await-WinRt ($reader.LoadAsync($size));",
+    "      $bytes = New-Object byte[] ([int]$size);",
+    "      $reader.ReadBytes($bytes);",
+    "      [System.IO.File]::WriteAllBytes($outputPath, $bytes);",
+    "    } finally { $reader.Dispose() }",
+    "  } finally { $stream.Dispose() }",
+    "} finally { $synth.Dispose() }",
+  ].join(" ");
+
+  await runPowerShell(script, signal);
+}
+/** TtsService backed by Windows.Media.SpeechSynthesis: each request is synthesized into a temp file by a PowerShell process and streamed from there. */
+export class WindowsMediaTtsService implements TtsService {
+  constructor(
+    private readonly speed: number,
+    private readonly voiceName?: string,
+    private readonly language = "ko",
+  ) {}
+
+  async warmup(): Promise<void> {
+    await listWindowsMediaVoices();
+  }
+
+  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
+    // Date.now() alone repeats for requests started in the same millisecond; the random suffix keeps their temp files apart.
+    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-wmtts-${Date.now()}-${Math.random().toString(36).slice(2)}.wav`);
+    await synthesizeWindowsMediaSpeechToWaveFile(
+      text,
+      this.speed,
+      tempPath,
+      this.voiceName,
+      this.language,
+      signal,
+    ).catch(async (error) => {
+      await unlink(tempPath).catch(() => null);
+      throw error;
+    });
+    return {
+      stream: createReadStream(tempPath),
+      sourceFilePath: tempPath,
+      dispose: () => {
+        void unlink(tempPath).catch(() => null);
+      },
+    };
+  }
+
+  async destroy(): Promise<void> {
+    return;
+  }
+}
--- a/src/services/windows-powershell.ts
+++ b/src/services/windows-powershell.ts
@@ -0,0 +1,63 @@
+import { spawn } from "node:child_process";
+
+export interface PowerShellRunResult {
+  stdout: string; // everything the process wrote to stdout, decoded chunk by chunk
+  stderr: string; // everything the process wrote to stderr, decoded chunk by chunk
+}
+
+/** Runs `script` in PowerShell (passed as UTF-16LE base64 via -EncodedCommand) and resolves with its output; rejects on a non-zero exit code, a spawn error, or when `signal` is aborted. */
+export async function runPowerShell(script: string, signal?: AbortSignal): Promise<PowerShellRunResult> {
+  const encodedCommand = Buffer.from(script, "utf16le").toString("base64");
+
+  return await new Promise<PowerShellRunResult>((resolve, reject) => {
+    const child = spawn("powershell", ["-NoProfile", "-EncodedCommand", encodedCommand], {
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    let stdout = "";
+    let stderr = "";
+    child.stdout.on("data", (chunk: Buffer) => {
+      stdout += chunk.toString();
+    });
+    child.stderr.on("data", (chunk: Buffer) => {
+      stderr += chunk.toString();
+    });
+
+    const killChild = (): void => {
+      if (!child.killed) {
+        child.kill("SIGKILL");
+      }
+    };
+    // An already-aborted signal never fires "abort" again, so that case is handled up front.
+    if (signal?.aborted) {
+      killChild();
+    } else {
+      signal?.addEventListener("abort", killChild, { once: true });
+    }
+
+    // "close" (unlike "exit") fires only after stdout/stderr have ended, so the collected output is complete.
+    child.on("close", (code) => {
+      signal?.removeEventListener("abort", killChild);
+      if (signal?.aborted) {
+        reject(new Error("powershell aborted"));
+        return;
+      }
+      if (code === 0) {
+        resolve({ stdout, stderr });
+        return;
+      }
+      reject(new Error(stderr.trim() || stdout.trim() || `powershell exited with code ${code ?? "null"}`));
+    });
+    child.on("error", reject);
+  });
+}
+/** Parses JSON text printed by PowerShell into an array: blank output gives [], a non-array value is wrapped in a one-element array, invalid JSON throws. The cast to T is unchecked. */
+export function parsePowerShellJsonArray<T>(stdout: string): T[] {
+  const payload = stdout.trim();
+  if (payload.length === 0) {
+    return [];
+  }
+
+  const decoded: unknown = JSON.parse(payload);
+  return (Array.isArray(decoded) ? decoded : [decoded]) as T[];
+}
--- a/src/services/windows-system-tts.ts
+++ b/src/services/windows-system-tts.ts
@@ -1,14 +1,21 @@
-import { spawn } from "node:child_process";
 import { createReadStream } from "node:fs";
 import { unlink } from "node:fs/promises";
 import os from "node:os";
 import path from "node:path";

-import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
 import type { PreparedSpeechAudio, TtsService } from "./tts.js";
+import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
+
+export interface WindowsSystemVoiceInfo {
+  name: string;
+  culture: string;
+  description: string;
+  gender: string;
+  enabled: boolean;
+}

 function escapePowerShellSingleQuoted(text: string): string {
-  return text.replace(/'/g, "''");
+  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&"); // PowerShell also accepts U+2018/U+2019/U+201A/U+201B as single quotes, so every variant is doubled
 }

 function toSpeechRate(speed: number): number {
@@ -16,77 +23,86 @@ function toSpeechRate(speed: number): number {
  return Math.max(-10, Math.min(10, mapped));
 }

+export async function listWindowsSystemVoices(signal?: AbortSignal): Promise<WindowsSystemVoiceInfo[]> {
+  const script = [
+    "$ErrorActionPreference = 'Stop';",
+    "$ProgressPreference = 'SilentlyContinue';",
+    "Add-Type -AssemblyName System.Speech;",
+    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
+    "try {",
+    "  $voices = @($synth.GetInstalledVoices() | ForEach-Object {",
+    "    [PSCustomObject]@{",
+    "      name = $_.VoiceInfo.Name;",
+    "      culture = $_.VoiceInfo.Culture.Name;",
+    "      description = $_.VoiceInfo.Description;",
+    "      gender = [string]$_.VoiceInfo.Gender;",
+    "      enabled = [bool]$_.Enabled;",
+    "    }",
+    "  });",
+    "  ConvertTo-Json -InputObject $voices -Compress;",
+    "} finally { $synth.Dispose() }",
+  ].join(" ");
+
+  const { stdout } = await runPowerShell(script, signal);
+  return parsePowerShellJsonArray<WindowsSystemVoiceInfo>(stdout);
+}
+
 export async function synthesizeWindowsSpeechToWaveFile(
  text: string,
  speed: number,
  outputPath: string,
+  voiceName?: string,
+  language = "ko",
  signal?: AbortSignal,
 ): Promise<void> {
  const rate = toSpeechRate(speed);
  const script = [
+    "$ErrorActionPreference = 'Stop';",
+    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Speech;",
    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
-    "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
-    "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
+    `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
+    `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
+    "try {",
+    "  $voices = $synth.GetInstalledVoices();",
+    "  $selected = $null;",
+    "  if ($preferredVoice) {",
+    "    $selected = $voices | Where-Object {",
+    "      $_.VoiceInfo.Name -eq $preferredVoice -or $_.VoiceInfo.Description -eq $preferredVoice -or $_.VoiceInfo.Name -like ('*' + $preferredVoice + '*') -or $_.VoiceInfo.Description -like ('*' + $preferredVoice + '*')",
+    "    } | Select-Object -First 1;",
+    "  }",
+    "  if (-not $selected -and $preferredLanguage) {",
+    "    $selected = $voices | Where-Object { $_.VoiceInfo.Culture.Name -like ($preferredLanguage + '*') } | Select-Object -First 1;",
+    "  }",
+    "  if ($selected) { $synth.SelectVoice($selected.VoiceInfo.Name) }",
    `$synth.Rate = ${rate};`,
    `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`,
    `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
-    "$synth.Dispose();",
+    "} finally { $synth.Dispose() }",
  ].join(" ");

-  await new Promise<void>((resolve, reject) => {
-    const child = spawn("powershell", ["-NoProfile", "-Command", script], {
-      stdio: ["ignore", "ignore", "pipe"],
-    });
-
-    let stderr = "";
-    child.stderr.on("data", (chunk: Buffer) => {
-      stderr += chunk.toString();
-    });
-
-    signal?.addEventListener(
-      "abort",
-      () => {
-        if (!child.killed) {
-          child.kill("SIGKILL");
-        }
-      },
-      { once: true },
-    );
-
-    child.on("exit", (code) => {
-      if (signal?.aborted) {
-        reject(new Error("tts aborted"));
-        return;
-      }
-      if (code === 0) {
-        resolve();
-        return;
-      }
-      reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
-    });
-    child.on("error", reject);
-  });
+  await runPowerShell(script, signal);
 }

 export class WindowsSystemTtsService implements TtsService {
-  constructor(private readonly speed: number) {
-    const resolvedFfmpegPath = resolveFfmpegPath();
-    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
-      process.env.FFMPEG_PATH = resolvedFfmpegPath;
-    }
-  }
+  constructor(
+    private readonly speed: number,
+    private readonly voiceName?: string,
+    private readonly language = "ko",
+  ) {}

  async warmup(): Promise<void> {
-    return;
+    await listWindowsSystemVoices();
  }

  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${Date.now()}.wav`);
-    await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, signal).catch(async (error) => {
+    await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, this.voiceName, this.language, signal).catch(
+      async (error) => {
        await unlink(tempPath).catch(() => null);
        throw error;
-    });
+      },
+    );

    return {
      stream: createReadStream(tempPath),