Add Windows Media TTS engine selection

This commit is contained in:
2026-05-01 04:01:33 +09:00
parent 1a8e8d0a8f
commit 52d7f74049
12 changed files with 475 additions and 77 deletions

View File

@@ -16,6 +16,8 @@ LOCAL_STT_MODEL=small
LOCAL_STT_DEVICE=auto
LOCAL_STT_COMPUTE_TYPE=auto
LOCAL_STT_BEAM_SIZE=3
LOCAL_TTS_ENGINE=auto
LOCAL_TTS_VOICE_NAME=
LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
LOCAL_TTS_LANGUAGE=ko

View File

@@ -7,7 +7,7 @@
- STT: `faster-whisper` + Whisper multilingual
- LLM: `Ollama` + `qwen3:0.6b`
- TTS:
- Windows: 시스템 기본 음성 엔진
- Windows: `Windows.Media.SpeechSynthesis` 우선, 실패 시 시스템 기본 음성 엔진 fallback
- Linux/macOS: `kokoro-onnx` + `misaki[ko]`
- VAD: `avr-vad`
@@ -71,6 +71,12 @@ TTS만 단독으로 확인:
bun run tts:test -- "안녕하세요. 출력 장치 테스트입니다."
```
Windows 설치 음성 목록 확인:
```bash
bun run tts:voices
```
TTS WAV 파일만 생성해서 확인:
```bash
@@ -104,6 +110,8 @@ Discord 모드에서만 필수:
- `LOCAL_STT_DEVICE`
- `LOCAL_STT_COMPUTE_TYPE`
- `LOCAL_STT_BEAM_SIZE`
- `LOCAL_TTS_ENGINE`
- `LOCAL_TTS_VOICE_NAME`
- `LOCAL_TTS_MODEL_PATH`
- `LOCAL_TTS_VOICES_PATH`
- `LOCAL_TTS_LANGUAGE`
@@ -138,7 +146,8 @@ Windows에서 GPU STT를 쓰려면 `LOCAL_STT_DEVICE=auto` 그대로 두고 `bun
- STT 기본 권장 모델은 `small`
- LLM 기본 모델은 `qwen3:0.6b`
- TTS 기본 보이스는 `af_heart`
- Windows TTS 기본 보이스는 설치된 `windows-media` 음성 중 현재 언어에 맞는 첫 번째 항목
- Linux/macOS TTS 기본 보이스는 `af_heart`
- TTS 기본 속도는 `1.12`
더 빠르게 돌리고 싶으면:
@@ -169,11 +178,12 @@ OLLAMA_MODEL=qwen3:1.7b
## Windows 메모
- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다.
- Windows는 기본적으로 `windows-media` 엔진을 우선 쓰고, 실패하면 `system` 엔진으로 자동 fallback 합니다.
- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
- Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
- Windows의 `setup:local-ai`는 STT와 CUDA 런타임 wheel을 함께 설치합니다.
- Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
- 더 자연스러운 음성을 고르려면 `bun run tts:voices` 로 설치된 음성 이름을 확인한 뒤 `LOCAL_TTS_VOICE_NAME` 에 넣으면 됩니다.
## 설계 메모

View File

@@ -10,6 +10,7 @@
"start:local": "bun src/index.ts local",
"tts:test": "bun src/index.ts local-say",
"tts:dump": "bun src/index.ts local-say-dump",
"tts:voices": "bun src/index.ts local-tts-voices",
"setup:local-ai": "bun src/setup-local-ai.ts",
"devices": "bun src/index.ts local-devices",
"audio:devices": "bun src/index.ts local-devices",

View File

@@ -28,6 +28,8 @@ const envSchema = z.object({
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3),
LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
LOCAL_TTS_ENGINE: z.enum(["auto", "windows-media", "system", "kokoro"]).default("auto"),
LOCAL_TTS_VOICE_NAME: emptyToUndefined,
LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),

View File

@@ -16,9 +16,8 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { type DiscordRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js";
import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
import { createTtsService } from "./services/create-tts-service.js";
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
const commands = [
@@ -39,15 +38,12 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
});
const stt = new LocalFasterWhisperSttService(config, logger);
const tts =
process.platform === "win32"
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
: new LocalKokoroTtsService(config, logger);
const tts = createTtsService(config, logger);
const llm = new OllamaLlmService(config);
const sessions = new Map<string, GuildVoiceSession>();
await stt.warmup();
await tts.warmup();
await tts.warmup?.();
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
const member = interaction.member as GuildMember | null;

View File

@@ -3,7 +3,13 @@ import process from "node:process";
import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
import { runDiscordBot } from "./discord-main.js";
import { Logger } from "./logger.js";
import { dumpLocalTtsWave, printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js";
import {
dumpLocalTtsWave,
printLocalAudioDevices,
printLocalTtsVoices,
runLocalAssistant,
runLocalTtsSmokeTest,
} from "./local-main.js";
const mode = process.argv[2] ?? "discord";
const config = loadConfig();
@@ -30,8 +36,13 @@ async function main(): Promise<void> {
await dumpLocalTtsWave(requireAssistantRuntimeConfig(config), logger, text);
return;
}
case "local-tts-voices":
await printLocalTtsVoices(requireAssistantRuntimeConfig(config));
return;
default:
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump`);
throw new Error(
`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump, local-tts-voices`,
);
}
}

View File

@@ -1,5 +1,5 @@
import { spawn } from "node:child_process";
import { mkdir } from "node:fs/promises";
import { copyFile, mkdir } from "node:fs/promises";
import path from "node:path";
import process from "node:process";
@@ -9,10 +9,11 @@ import { LocalVoiceSession } from "./audio/local-voice-session.js";
import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
import type { LlmService } from "./services/llm.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js";
import type { SttService } from "./services/stt.js";
import { synthesizeWindowsSpeechToWaveFile, WindowsSystemTtsService } from "./services/windows-system-tts.js";
import { createTtsService } from "./services/create-tts-service.js";
import { listWindowsMediaVoices } from "./services/windows-media-tts.js";
import { listWindowsSystemVoices } from "./services/windows-system-tts.js";
export async function printLocalAudioDevices(): Promise<void> {
if (process.platform === "win32") {
@@ -73,14 +74,11 @@ export async function printLocalAudioDevices(): Promise<void> {
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
const stt = new LocalFasterWhisperSttService(config, logger);
const tts =
process.platform === "win32"
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
: new LocalKokoroTtsService(config, logger);
const tts = createTtsService(config, logger);
const llm = new OllamaLlmService(config);
await stt.warmup();
await tts.warmup();
await tts.warmup?.();
await llm.warmup?.();
if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") {
@@ -130,10 +128,7 @@ export async function runLocalTtsSmokeTest(
logger: Logger,
text: string,
): Promise<void> {
const tts =
process.platform === "win32"
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
: new LocalKokoroTtsService(config, logger);
const tts = createTtsService(config, logger);
const noOpStt: SttService = {
async transcribePcm16() {
@@ -146,7 +141,7 @@ export async function runLocalTtsSmokeTest(
},
};
await tts.warmup();
await tts.warmup?.();
const session = new LocalVoiceSession({
config,
@@ -171,7 +166,7 @@ export async function runLocalTtsSmokeTest(
export async function dumpLocalTtsWave(
config: AssistantRuntimeConfig,
_logger: Logger,
logger: Logger,
text: string,
outputPath?: string,
): Promise<void> {
@@ -181,9 +176,57 @@ export async function dumpLocalTtsWave(
const resolvedPath = path.resolve(outputPath?.trim() || "tts-test.wav");
await mkdir(path.dirname(resolvedPath), { recursive: true });
await synthesizeWindowsSpeechToWaveFile(text, config.LOCAL_TTS_SPEED, resolvedPath);
const tts = createTtsService(config, logger);
await tts.warmup?.();
const playback = await tts.preparePlayback(text);
try {
if (!playback.sourceFilePath) {
throw new Error("현재 선택된 TTS 엔진은 직접 WAV 덤프를 지원하지 않습니다.");
}
await copyFile(playback.sourceFilePath, resolvedPath);
} finally {
playback.dispose();
await tts.destroy?.();
}
console.log("TTS WAV 파일 생성 완료");
console.log(`출력 파일: ${resolvedPath}`);
console.log("이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다.");
}
/**
 * Prints the TTS voices available to each Windows engine, followed by a configuration example.
 *
 * On non-Windows platforms only the Kokoro-related settings are echoed, because the installed-voice
 * queries are Windows specific.
 *
 * The two engine queries are settled independently: when one of them fails, its section shows the
 * failure message and the other section is still printed. This command is what users run to
 * diagnose a broken engine, so one failing query must not hide the other list.
 *
 * @param config Runtime configuration; only `LOCAL_TTS_ENGINE` and `LOCAL_TTS_SPEAKER` are read.
 * @throws The Windows.Media query error when both engine queries fail.
 */
export async function printLocalTtsVoices(config: AssistantRuntimeConfig): Promise<void> {
  if (process.platform !== "win32") {
    console.log("현재 플랫폼은 Windows가 아니므로 설치된 시스템 TTS 목록 대신 Kokoro 설정만 사용합니다.");
    console.log(`LOCAL_TTS_ENGINE=${config.LOCAL_TTS_ENGINE}`);
    console.log(`LOCAL_TTS_SPEAKER=${config.LOCAL_TTS_SPEAKER}`);
    return;
  }

  const describeFailure = (reason: unknown): string => (reason instanceof Error ? reason.message : String(reason));

  const [mediaResult, systemResult] = await Promise.allSettled([
    listWindowsMediaVoices(),
    listWindowsSystemVoices(),
  ]);

  // Nothing useful can be shown when both queries failed; keep the command failing in that case.
  if (mediaResult.status === "rejected" && systemResult.status === "rejected") {
    throw mediaResult.reason;
  }

  console.log("\n=== Windows.Media.SpeechSynthesis voices (권장) ===");
  if (mediaResult.status === "rejected") {
    console.log(`Windows Media 음성 목록을 가져오지 못했습니다: ${describeFailure(mediaResult.reason)}`);
  } else if (mediaResult.value.length === 0) {
    console.log("설치된 Windows Media 음성이 없습니다.");
  } else {
    for (const voice of mediaResult.value) {
      console.log(`- ${voice.description} | name=${voice.displayName} | lang=${voice.language}`);
    }
  }

  console.log("\n=== System.Speech voices (fallback) ===");
  if (systemResult.status === "rejected") {
    console.log(`System.Speech 음성 목록을 가져오지 못했습니다: ${describeFailure(systemResult.reason)}`);
  } else if (systemResult.value.length === 0) {
    console.log("설치된 System.Speech 음성이 없습니다.");
  } else {
    for (const voice of systemResult.value) {
      console.log(`- ${voice.description} | name=${voice.name} | lang=${voice.culture}`);
    }
  }

  console.log("\n설정 예시");
  console.log("LOCAL_TTS_ENGINE=windows-media");
  console.log("LOCAL_TTS_VOICE_NAME=위 목록의 description 또는 name");
}

View File

@@ -0,0 +1,112 @@
import process from "node:process";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { LocalKokoroTtsService } from "./local-tts.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { WindowsMediaTtsService } from "./windows-media-tts.js";
import { WindowsSystemTtsService } from "./windows-system-tts.js";
/** Pairs a TTS engine with the label used for it in log messages. */
interface NamedTtsService {
  /** Engine label; it is interpolated into the engine selection and failure log lines. */
  name: string;
  /** The engine implementation that `name` refers to. */
  service: TtsService;
}
/**
 * TTS service that walks a prioritized engine list and sticks with the first engine that works.
 *
 * An engine becomes active once its `warmup` succeeds. When the active engine later fails to
 * synthesize, the remaining engines are tried in order and the first one that warms up replaces it
 * for all subsequent requests.
 */
class FallbackTtsService implements TtsService {
  /** Index into `services` of the engine in use, or `null` until an engine has been activated. */
  private activeIndex: number | null = null;

  constructor(
    private readonly logger: Logger,
    private readonly services: NamedTtsService[],
  ) {}

  /** Activates the first engine whose warmup succeeds; rejects when none does. */
  async warmup(): Promise<void> {
    await this.ensureActive();
  }

  /**
   * Synthesizes `text` with the active engine, switching to a later engine when it fails.
   *
   * @param text Text to synthesize.
   * @param signal Optional cancellation signal, forwarded to the engine.
   * @returns The audio prepared by the active (or newly selected fallback) engine.
   * @throws The active engine's error when the request was cancelled or no later engine can be
   *   activated; the fallback engine's error when the fallback synthesis itself fails.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const active = await this.ensureActive();

    try {
      return await active.service.preparePlayback(text, signal);
    } catch (error) {
      // A cancelled request says nothing about engine health: do not demote the engine and do not
      // synthesize again for a caller that has already given up.
      if (signal?.aborted) {
        throw error;
      }

      const failedIndex = this.services.indexOf(active);
      for (let index = failedIndex + 1; index < this.services.length; index += 1) {
        let fallback: NamedTtsService;
        try {
          // `activate` only records the new index after warmup succeeded, so a broken fallback
          // never becomes the active engine.
          fallback = await this.activate(index);
        } catch (activationError) {
          this.logger.warn(`TTS 엔진 ${this.services[index]?.name ?? index} 초기화 실패`, activationError);
          continue;
        }

        this.logger.warn(`TTS 엔진 ${active.name} 이 실패해 ${fallback.name} 로 전환합니다.`, error);
        return await fallback.service.preparePlayback(text, signal);
      }

      throw error;
    }
  }

  /** Destroys every configured engine; individual failures are ignored. */
  async destroy(): Promise<void> {
    await Promise.allSettled(this.services.map((entry) => entry.service.destroy?.()));
  }

  /** Returns the active engine, activating the first engine that warms up when none is active yet. */
  private async ensureActive(): Promise<NamedTtsService> {
    if (this.activeIndex !== null) {
      const current = this.services[this.activeIndex];
      if (current) {
        return current;
      }
    }

    let lastError: unknown = null;
    for (const [index, entry] of this.services.entries()) {
      try {
        return await this.activate(index);
      } catch (error) {
        lastError = error;
        this.logger.warn(`TTS 엔진 ${entry.name} 초기화 실패`, error);
      }
    }

    throw lastError instanceof Error ? lastError : new Error("사용 가능한 TTS 엔진을 찾지 못했습니다.");
  }

  /** Warms up the engine at `index` and, only on success, records it as the active engine. */
  private async activate(index: number): Promise<NamedTtsService> {
    const selected = this.services[index];
    if (!selected) {
      throw new RangeError(`TTS engine index out of range: ${index}`);
    }

    await selected.service.warmup?.();
    this.activeIndex = index;
    this.logger.info("Selected TTS engine", selected.name);
    return selected;
  }
}
/**
 * Picks the TTS implementation for the current platform and `LOCAL_TTS_ENGINE` setting.
 *
 * Outside Windows the Kokoro service is always returned. On Windows, `system`, `windows-media` and
 * `kokoro` map to their single engine; any other value (including `auto`) yields a fallback chain
 * that tries `windows-media` first and `system` second.
 *
 * @param config Runtime configuration supplying the engine choice, speed, voice name and language.
 * @param logger Logger handed to the Kokoro service and to the fallback chain.
 * @returns The selected TTS service.
 */
export function createTtsService(config: AssistantRuntimeConfig, logger: Logger): TtsService {
  const buildKokoro = (): TtsService => new LocalKokoroTtsService(config, logger);

  if (process.platform !== "win32") {
    return buildKokoro();
  }

  const buildSystem = (): TtsService =>
    new WindowsSystemTtsService(config.LOCAL_TTS_SPEED, config.LOCAL_TTS_VOICE_NAME, config.LOCAL_TTS_LANGUAGE);
  const buildWindowsMedia = (): TtsService =>
    new WindowsMediaTtsService(config.LOCAL_TTS_SPEED, config.LOCAL_TTS_VOICE_NAME, config.LOCAL_TTS_LANGUAGE);

  const requestedEngine = config.LOCAL_TTS_ENGINE;
  if (requestedEngine === "system") {
    return buildSystem();
  }
  if (requestedEngine === "windows-media") {
    return buildWindowsMedia();
  }
  if (requestedEngine === "kokoro") {
    return buildKokoro();
  }

  // "auto" and anything unrecognized: prefer Windows.Media, fall back to System.Speech.
  return new FallbackTtsService(logger, [
    { name: "windows-media", service: buildWindowsMedia() },
    { name: "system", service: buildSystem() },
  ]);
}

View File

@@ -7,6 +7,7 @@ export interface PreparedSpeechAudio {
}
/** Contract implemented by the text-to-speech engines used in this project. */
export interface TtsService {
  /** Optional preparation step; callers invoke it as `warmup?.()` before the first synthesis. */
  warmup?(): Promise<void>;
  /**
   * Synthesizes `text` and resolves with the prepared audio.
   * `signal`, when provided, lets the caller request cancellation.
   */
  preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
  /** Optional cleanup hook; callers invoke it as `destroy?.()` when they are done with the service. */
  destroy?(): Promise<void>;
}

View File

@@ -0,0 +1,141 @@
import { createReadStream } from "node:fs";
import { unlink } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
/**
 * One voice reported by `Windows.Media.SpeechSynthesis.SpeechSynthesizer.AllVoices`,
 * as serialized to JSON by `listWindowsMediaVoices`.
 */
export interface WindowsMediaVoiceInfo {
  /** The voice's `DisplayName` property. */
  displayName: string;
  /** The voice's `Description` property. */
  description: string;
  /** The voice's `Language` property. */
  language: string;
  /** The voice's `Gender` property, converted to a string by the script. */
  gender: string;
  /** The voice's `Id` property. */
  id: string;
}
/**
 * Escapes `text` for embedding between single quotes in a PowerShell script.
 *
 * - Every line break (`\r\n`, `\r` or `\n`) becomes one space, so the value stays on one line.
 * - Every character PowerShell accepts as a single-quote delimiter is doubled. Besides the ASCII
 *   apostrophe this covers the typographic quotes U+2018, U+2019, U+201A and U+201B, which would
 *   otherwise terminate the literal early and let the rest of the text run as script.
 *
 * @param text Raw text, possibly containing quotes and line breaks.
 * @returns Text that is safe to place inside `'...'` in a PowerShell script.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/\r\n|\r|\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
/**
 * Builds the PowerShell prologue shared by the Windows.Media scripts in this module.
 *
 * The returned single-line snippet makes errors terminating, silences progress output, loads
 * `System.Runtime.WindowsRuntime`, references the WinRT `SpeechSynthesizer` and `DataReader` types,
 * and defines `Await-WinRt`, which blocks until a WinRT async operation completes and returns its result.
 *
 * `Await-WinRt` binds specifically to the `AsTask(IAsyncOperation<TResult>)` overload. Several
 * generic one-parameter `AsTask` overloads exist, and picking "the first one" would depend on
 * reflection order. It also fails with an explicit message when the operation's result type
 * cannot be read from the object.
 *
 * NOTE(review): the result type is inferred from `$operation.GetType()`; confirm that this yields
 * generic arguments for the WinRT objects PowerShell hands back on the target Windows versions.
 *
 * @returns PowerShell statements joined with single spaces.
 */
function windowsMediaPreamble(): string {
  return [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Runtime.WindowsRuntime;",
    "$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime];",
    "$null = [Windows.Storage.Streams.DataReader, Windows.Storage.Streams, ContentType=WindowsRuntime];",
    "function Await-WinRt($operation) {",
    " $method = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.IsGenericMethod -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' } | Select-Object -First 1;",
    " if (-not $method) { throw 'System.WindowsRuntimeSystemExtensions.AsTask 를 찾지 못했습니다.' }",
    " $resultType = $operation.GetType().GenericTypeArguments[0];",
    " if (-not $resultType) { throw 'WinRT 비동기 작업의 결과 형식을 확인하지 못했습니다.' }",
    " $task = $method.MakeGenericMethod($resultType).Invoke($null, @($operation));",
    " return $task.GetAwaiter().GetResult();",
    "}",
  ].join(" ");
}
/**
 * Queries the voices exposed by `Windows.Media.SpeechSynthesis.SpeechSynthesizer.AllVoices`.
 *
 * The query runs in PowerShell: every voice is projected to a plain object, the list is emitted
 * as compressed JSON, and that JSON is parsed into `WindowsMediaVoiceInfo` records.
 *
 * @param signal Optional cancellation signal passed through to `runPowerShell`.
 * @returns The voices reported by the script.
 */
export async function listWindowsMediaVoices(signal?: AbortSignal): Promise<WindowsMediaVoiceInfo[]> {
  const voiceQuery: string[] = [
    "$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices | ForEach-Object {",
    " [PSCustomObject]@{",
    " displayName = $_.DisplayName;",
    " description = $_.Description;",
    " language = $_.Language;",
    " gender = [string]$_.Gender;",
    " id = $_.Id;",
    " }",
    "});",
    "ConvertTo-Json -InputObject $voices -Compress;",
  ];

  // Prologue first, then the query, all on a single line separated by spaces.
  const command = `${windowsMediaPreamble()} ${voiceQuery.join(" ")}`;
  const result = await runPowerShell(command, signal);
  return parsePowerShellJsonArray<WindowsMediaVoiceInfo>(result.stdout);
}
/**
 * Synthesizes `text` with `Windows.Media.SpeechSynthesis` and writes the produced audio stream to
 * `outputPath`.
 *
 * Voice selection inside the generated script, in order:
 * 1. a voice whose display name, description or id equals `voiceName`, or whose display name or
 *    description contains it;
 * 2. the first voice whose language starts with `language`, preferring entries whose name or
 *    description matches "Natural";
 * 3. the synthesizer's default voice.
 *
 * WinRT async results are awaited through `Await-WinRtAs`, which receives the result type
 * explicitly instead of reflecting it from the operation object.
 * NOTE(review): Windows PowerShell usually surfaces WinRT async operations as COM wrappers whose
 * .NET type has no generic arguments, which would make reflection-based inference fail; the
 * explicit types below are correct either way.
 *
 * @param text Text to speak; quotes and line breaks are escaped before embedding.
 * @param speed Assigned to `Options.SpeakingRate` with two decimals; the script ignores a rejected rate.
 * @param outputPath File that receives the synthesized bytes.
 * @param voiceName Optional preferred voice (see selection order above).
 * @param language Language prefix used when no preferred voice matches.
 * @param signal Optional cancellation signal passed through to `runPowerShell`.
 * @throws Whatever `runPowerShell` rejects with (script failure or cancellation).
 */
export async function synthesizeWindowsMediaSpeechToWaveFile(
  text: string,
  speed: number,
  outputPath: string,
  voiceName?: string,
  language = "ko",
  signal?: AbortSignal,
): Promise<void> {
  const script = [
    windowsMediaPreamble(),
    // Typed await helper, bound to the AsTask(IAsyncOperation<TResult>) overload only.
    "function Await-WinRtAs($operation, $resultType) {",
    " $asTask = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.IsGenericMethod -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' } | Select-Object -First 1;",
    " if (-not $asTask) { throw 'System.WindowsRuntimeSystemExtensions.AsTask 를 찾지 못했습니다.' }",
    " $task = $asTask.MakeGenericMethod($resultType).Invoke($null, @($operation));",
    " return $task.GetAwaiter().GetResult();",
    "}",
    `$text = '${escapePowerShellSingleQuoted(text)}';`,
    `$outputPath = '${escapePowerShellSingleQuoted(outputPath)}';`,
    `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
    `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
    `$speakingRate = ${speed.toFixed(2)};`,
    "$synth = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new();",
    "try {",
    " $voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;",
    " $selected = $null;",
    " if ($preferredVoice) {",
    " $selected = $voices | Where-Object {",
    " $_.DisplayName -eq $preferredVoice -or $_.Description -eq $preferredVoice -or $_.Id -eq $preferredVoice -or $_.DisplayName -like ('*' + $preferredVoice + '*') -or $_.Description -like ('*' + $preferredVoice + '*')",
    " } | Select-Object -First 1;",
    " }",
    " if (-not $selected -and $preferredLanguage) {",
    " $selected = $voices | Where-Object { $_.Language -like ($preferredLanguage + '*') } | Sort-Object @{Expression={ if ($_.DisplayName -match 'Natural' -or $_.Description -match 'Natural') { 0 } else { 1 } }}, Description | Select-Object -First 1;",
    " }",
    " if (-not $selected) { $selected = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice }",
    " if ($selected) { $synth.Voice = $selected }",
    " try { $synth.Options.SpeakingRate = $speakingRate } catch {}",
    " $stream = Await-WinRtAs ($synth.SynthesizeTextToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]);",
    " try {",
    " $size = [uint32]$stream.Size;",
    " $reader = [Windows.Storage.Streams.DataReader]::new($stream.GetInputStreamAt(0));",
    " try {",
    " $null = Await-WinRtAs ($reader.LoadAsync($size)) ([uint32]);",
    " $bytes = New-Object byte[] ([int]$size);",
    " $reader.ReadBytes($bytes);",
    " [System.IO.File]::WriteAllBytes($outputPath, $bytes);",
    " } finally { $reader.Dispose() }",
    " } finally { $stream.Dispose() }",
    "} finally { $synth.Dispose() }",
  ].join(" ");

  await runPowerShell(script, signal);
}
/**
 * `TtsService` backed by `Windows.Media.SpeechSynthesis`.
 *
 * Each request is synthesized into its own temporary WAV file, which is then exposed as a read
 * stream together with its path.
 */
export class WindowsMediaTtsService implements TtsService {
  /** Per-process counter that keeps temp file names distinct within the same millisecond. */
  private static tempFileSequence = 0;

  /**
   * @param speed Speaking rate forwarded to the synthesis script.
   * @param voiceName Optional preferred voice name.
   * @param language Language prefix used when no preferred voice matches.
   */
  constructor(
    private readonly speed: number,
    private readonly voiceName?: string,
    private readonly language = "ko",
  ) {}

  /** Lists the installed voices once; rejects when the Windows.Media query cannot run. */
  async warmup(): Promise<void> {
    await listWindowsMediaVoices();
  }

  /**
   * Synthesizes `text` into a temporary WAV file and returns a stream over it.
   *
   * @param text Text to synthesize.
   * @param signal Optional cancellation signal forwarded to the synthesis call.
   * @returns The audio stream, the temp file path, and a `dispose` callback that deletes the file.
   * @throws The synthesis error; the temp file is removed (best effort) before rethrowing.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // A timestamp alone is not unique: two requests within the same millisecond would share one
    // file, and disposing either would delete the other's audio. Add the pid and a sequence number.
    const sequence = WindowsMediaTtsService.tempFileSequence;
    WindowsMediaTtsService.tempFileSequence += 1;
    const uniqueId = `${process.pid}-${Date.now()}-${sequence}`;
    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-wmtts-${uniqueId}.wav`);

    try {
      await synthesizeWindowsMediaSpeechToWaveFile(
        text,
        this.speed,
        tempPath,
        this.voiceName,
        this.language,
        signal,
      );
    } catch (error) {
      // Best-effort cleanup of a partially written file; the synthesis error is what matters.
      await unlink(tempPath).catch(() => null);
      throw error;
    }

    return {
      stream: createReadStream(tempPath),
      sourceFilePath: tempPath,
      dispose: () => {
        // Fire-and-forget deletion; a file that is already gone is not an error.
        void unlink(tempPath).catch(() => null);
      },
    };
  }

  /** Nothing to release: the service keeps no long-lived resources. */
  async destroy(): Promise<void> {
    return;
  }
}

View File

@@ -0,0 +1,63 @@
import { spawn } from "node:child_process";
/** Captured output of a PowerShell invocation made through `runPowerShell`. */
export interface PowerShellRunResult {
  /** Everything the process wrote to standard output. */
  stdout: string;
  /** Everything the process wrote to standard error. */
  stderr: string;
}
/**
 * Runs `script` in `powershell -NoProfile` and collects its output.
 *
 * The script is passed via `-EncodedCommand` (base64 of its UTF-16LE bytes), so it needs no shell
 * quoting.
 *
 * @param script PowerShell source to execute.
 * @param signal Optional cancellation signal. If it is already aborted the process is never
 *   started; if it aborts later the process is killed.
 * @returns The captured stdout and stderr when the process exits with code 0.
 * @throws `Error("powershell aborted")` on cancellation; otherwise an `Error` carrying stderr
 *   (or stdout, or the exit code) for a non-zero exit, or the spawn error itself.
 */
export async function runPowerShell(script: string, signal?: AbortSignal): Promise<PowerShellRunResult> {
  // Do not start a process for a request that was cancelled before it began.
  if (signal?.aborted) {
    throw new Error("powershell aborted");
  }

  const encodedCommand = Buffer.from(script, "utf16le").toString("base64");

  return await new Promise<PowerShellRunResult>((resolve, reject) => {
    const child = spawn("powershell", ["-NoProfile", "-EncodedCommand", encodedCommand], {
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Keep raw chunks and decode once at the end: decoding chunk by chunk corrupts any multi-byte
    // character that happens to straddle a chunk boundary.
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    child.stdout.on("data", (chunk: Buffer) => {
      stdoutChunks.push(chunk);
    });
    child.stderr.on("data", (chunk: Buffer) => {
      stderrChunks.push(chunk);
    });

    const onAbort = (): void => {
      if (!child.killed) {
        child.kill("SIGKILL");
      }
    };
    signal?.addEventListener("abort", onAbort, { once: true });
    // Long-lived signals must not accumulate one listener per finished invocation.
    const detachAbortListener = (): void => {
      signal?.removeEventListener("abort", onAbort);
    };

    // "close" fires after the stdio streams have ended; "exit" can fire while output is still buffered.
    child.on("close", (code) => {
      detachAbortListener();

      if (signal?.aborted) {
        reject(new Error("powershell aborted"));
        return;
      }

      const stdout = Buffer.concat(stdoutChunks).toString("utf8");
      const stderr = Buffer.concat(stderrChunks).toString("utf8");

      if (code === 0) {
        resolve({ stdout, stderr });
        return;
      }

      reject(new Error(stderr.trim() || stdout.trim() || `powershell exited with code ${code ?? "null"}`));
    });

    child.on("error", (error) => {
      detachAbortListener();
      reject(error);
    });
  });
}
/**
 * Parses the JSON that a script printed with `ConvertTo-Json` and always returns an array.
 *
 * - Blank output and a JSON `null` both yield an empty array.
 * - A JSON array is returned as is.
 * - Any other single value is wrapped in a one-element array.
 *
 * The elements are not validated against `T`; the caller vouches for the shape.
 *
 * @param stdout Raw standard output of the PowerShell process.
 * @returns The parsed items.
 * @throws SyntaxError when the non-blank output is not valid JSON.
 */
export function parsePowerShellJsonArray<T>(stdout: string): T[] {
  const trimmed = stdout.trim();
  if (!trimmed) {
    return [];
  }

  const parsed: unknown = JSON.parse(trimmed);
  // "null" means "no items"; wrapping it would hand callers a `[null]` they then dereference.
  if (parsed === null) {
    return [];
  }

  return Array.isArray(parsed) ? (parsed as T[]) : [parsed as T];
}

View File

@@ -1,14 +1,21 @@
import { spawn } from "node:child_process";
import { createReadStream } from "node:fs";
import { unlink } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
/**
 * One installed `System.Speech` voice, as serialized to JSON by `listWindowsSystemVoices`.
 */
export interface WindowsSystemVoiceInfo {
  /** `VoiceInfo.Name` of the installed voice. */
  name: string;
  /** `VoiceInfo.Culture.Name` of the installed voice. */
  culture: string;
  /** `VoiceInfo.Description` of the installed voice. */
  description: string;
  /** `VoiceInfo.Gender`, converted to a string by the script. */
  gender: string;
  /** The installed voice's `Enabled` flag. */
  enabled: boolean;
}
function escapePowerShellSingleQuoted(text: string): string {
return text.replace(/'/g, "''");
return text.replace(/\r?\n/g, " ").replace(/'/g, "''");
}
function toSpeechRate(speed: number): number {
@@ -16,77 +23,86 @@ function toSpeechRate(speed: number): number {
return Math.max(-10, Math.min(10, mapped));
}
/**
 * Queries the voices installed for `System.Speech.Synthesis.SpeechSynthesizer`.
 *
 * The query runs in PowerShell: each installed voice is projected to a plain object, the list is
 * emitted as compressed JSON, and that JSON is parsed into `WindowsSystemVoiceInfo` records.
 *
 * @param signal Optional cancellation signal passed through to `runPowerShell`.
 * @returns The voices reported by the script.
 */
export async function listWindowsSystemVoices(signal?: AbortSignal): Promise<WindowsSystemVoiceInfo[]> {
  const scriptSetup: string[] = [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Speech;",
    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
  ];
  // The synthesizer is disposed by the script's own `finally`, whether or not the listing succeeds.
  const voiceQuery: string[] = [
    "try {",
    " $voices = @($synth.GetInstalledVoices() | ForEach-Object {",
    " [PSCustomObject]@{",
    " name = $_.VoiceInfo.Name;",
    " culture = $_.VoiceInfo.Culture.Name;",
    " description = $_.VoiceInfo.Description;",
    " gender = [string]$_.VoiceInfo.Gender;",
    " enabled = [bool]$_.Enabled;",
    " }",
    " });",
    " ConvertTo-Json -InputObject $voices -Compress;",
    "} finally { $synth.Dispose() }",
  ];

  const command = [...scriptSetup, ...voiceQuery].join(" ");
  const result = await runPowerShell(command, signal);
  return parsePowerShellJsonArray<WindowsSystemVoiceInfo>(result.stdout);
}
export async function synthesizeWindowsSpeechToWaveFile(
text: string,
speed: number,
outputPath: string,
voiceName?: string,
language = "ko",
signal?: AbortSignal,
): Promise<void> {
const rate = toSpeechRate(speed);
const script = [
"$ErrorActionPreference = 'Stop';",
"$ProgressPreference = 'SilentlyContinue';",
"Add-Type -AssemblyName System.Speech;",
"$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
"$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
"if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
`$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
`$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
"try {",
" $voices = $synth.GetInstalledVoices();",
" $selected = $null;",
" if ($preferredVoice) {",
" $selected = $voices | Where-Object {",
" $_.VoiceInfo.Name -eq $preferredVoice -or $_.VoiceInfo.Description -eq $preferredVoice -or $_.VoiceInfo.Name -like ('*' + $preferredVoice + '*') -or $_.VoiceInfo.Description -like ('*' + $preferredVoice + '*')",
" } | Select-Object -First 1;",
" }",
" if (-not $selected -and $preferredLanguage) {",
" $selected = $voices | Where-Object { $_.VoiceInfo.Culture.Name -like ($preferredLanguage + '*') } | Select-Object -First 1;",
" }",
" if ($selected) { $synth.SelectVoice($selected.VoiceInfo.Name) }",
`$synth.Rate = ${rate};`,
`$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`,
`$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
"$synth.Dispose();",
"} finally { $synth.Dispose() }",
].join(" ");
await new Promise<void>((resolve, reject) => {
const child = spawn("powershell", ["-NoProfile", "-Command", script], {
stdio: ["ignore", "ignore", "pipe"],
});
let stderr = "";
child.stderr.on("data", (chunk: Buffer) => {
stderr += chunk.toString();
});
signal?.addEventListener(
"abort",
() => {
if (!child.killed) {
child.kill("SIGKILL");
}
},
{ once: true },
);
child.on("exit", (code) => {
if (signal?.aborted) {
reject(new Error("tts aborted"));
return;
}
if (code === 0) {
resolve();
return;
}
reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
});
child.on("error", reject);
});
await runPowerShell(script, signal);
}
export class WindowsSystemTtsService implements TtsService {
constructor(private readonly speed: number) {
const resolvedFfmpegPath = resolveFfmpegPath();
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
process.env.FFMPEG_PATH = resolvedFfmpegPath;
}
}
constructor(
private readonly speed: number,
private readonly voiceName?: string,
private readonly language = "ko",
) {}
async warmup(): Promise<void> {
return;
await listWindowsSystemVoices();
}
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${Date.now()}.wav`);
await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, signal).catch(async (error) => {
await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, this.voiceName, this.language, signal).catch(
async (error) => {
await unlink(tempPath).catch(() => null);
throw error;
});
},
);
return {
stream: createReadStream(tempPath),