diff --git a/.env.example b/.env.example index 7fc2eeb..4dd4333 100644 --- a/.env.example +++ b/.env.example @@ -16,6 +16,8 @@ LOCAL_STT_MODEL=small LOCAL_STT_DEVICE=auto LOCAL_STT_COMPUTE_TYPE=auto LOCAL_STT_BEAM_SIZE=3 +LOCAL_TTS_ENGINE=auto +LOCAL_TTS_VOICE_NAME= LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin LOCAL_TTS_LANGUAGE=ko diff --git a/README.md b/README.md index a33440f..171c945 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - STT: `faster-whisper` + Whisper multilingual - LLM: `Ollama` + `qwen3:0.6b` - TTS: -- Windows: 시스템 기본 음성 엔진 +- Windows: `Windows.Media.SpeechSynthesis` 우선, 실패 시 시스템 기본 음성 엔진 fallback - Linux/macOS: `kokoro-onnx` + `misaki[ko]` - VAD: `avr-vad` @@ -71,6 +71,12 @@ TTS만 단독으로 확인: bun run tts:test -- "안녕하세요. 출력 장치 테스트입니다." ``` +Windows 설치 음성 목록 확인: + +```bash +bun run tts:voices +``` + TTS WAV 파일만 생성해서 확인: ```bash @@ -104,6 +110,8 @@ Discord 모드에서만 필수: - `LOCAL_STT_DEVICE` - `LOCAL_STT_COMPUTE_TYPE` - `LOCAL_STT_BEAM_SIZE` +- `LOCAL_TTS_ENGINE` +- `LOCAL_TTS_VOICE_NAME` - `LOCAL_TTS_MODEL_PATH` - `LOCAL_TTS_VOICES_PATH` - `LOCAL_TTS_LANGUAGE` @@ -138,7 +146,8 @@ Windows에서 GPU STT를 쓰려면 `LOCAL_STT_DEVICE=auto` 그대로 두고 `bun - STT 기본 권장 모델은 `small` - LLM 기본 모델은 `qwen3:0.6b` -- TTS 기본 보이스는 `af_heart` +- Windows TTS 기본 보이스는 설치된 `windows-media` 음성 중 현재 언어에 맞는 첫 번째 항목 +- Linux/macOS TTS 기본 보이스는 `af_heart` - TTS 기본 속도는 `1.12` 더 빠르게 돌리고 싶으면: @@ -169,11 +178,12 @@ OLLAMA_MODEL=qwen3:1.7b ## Windows 메모 - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. -- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다. +- Windows는 기본적으로 `windows-media` 엔진을 우선 쓰고, 실패하면 `system` 엔진으로 자동 fallback 합니다. - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. - Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다. - Windows의 `setup:local-ai`는 STT와 CUDA 런타임 wheel을 함께 설치합니다. - Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다. +- 더 자연스러운 음성을 고르려면 `bun run tts:voices` 로 설치된 음성 이름을 확인한 뒤 `LOCAL_TTS_VOICE_NAME` 에 넣으면 됩니다. ## 설계 메모 diff --git a/package.json b/package.json index a23a844..df7d284 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,7 @@ "start:local": "bun src/index.ts local", "tts:test": "bun src/index.ts local-say", "tts:dump": "bun src/index.ts local-say-dump", + "tts:voices": "bun src/index.ts local-tts-voices", "setup:local-ai": "bun src/setup-local-ai.ts", "devices": "bun src/index.ts local-devices", "audio:devices": "bun src/index.ts local-devices", diff --git a/src/config.ts b/src/config.ts index 83115d9..f4d56e3 100644 --- a/src/config.ts +++ b/src/config.ts @@ -28,6 +28,8 @@ const envSchema = z.object({ LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3), LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"), LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"), + LOCAL_TTS_ENGINE: z.enum(["auto", "windows-media", "system", "kokoro"]).default("auto"), + LOCAL_TTS_VOICE_NAME: emptyToUndefined, LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"), LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"), LOCAL_TTS_DEVICE: z.string().min(1).default("auto"), diff --git a/src/discord-main.ts b/src/discord-main.ts index 0f42e14..126b1d4 100644 --- a/src/discord-main.ts +++ b/src/discord-main.ts @@ -16,9 +16,8 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js"; import { type DiscordRuntimeConfig } from "./config.js"; import { Logger } from "./logger.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; -import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; -import { WindowsSystemTtsService } from "./services/windows-system-tts.js"; +import { createTtsService } from "./services/create-tts-service.js"; export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise { const commands = [ @@ -39,15 +38,12 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger }); const stt = new LocalFasterWhisperSttService(config, logger); - const tts = - process.platform === "win32" - ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED) - : new LocalKokoroTtsService(config, logger); + const tts = createTtsService(config, logger); const llm = new OllamaLlmService(config); const sessions = new Map(); await stt.warmup(); - await tts.warmup(); + await tts.warmup?.(); function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null { const member = interaction.member as GuildMember | null; diff --git a/src/index.ts b/src/index.ts index 5dd789f..00c8830 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,13 @@ import process from "node:process"; import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js"; import { runDiscordBot } from "./discord-main.js"; import { Logger } from "./logger.js"; -import { dumpLocalTtsWave, printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js"; +import { + dumpLocalTtsWave, + printLocalAudioDevices, + printLocalTtsVoices, + runLocalAssistant, + runLocalTtsSmokeTest, +} from "./local-main.js"; const mode = process.argv[2] ?? "discord"; const config = loadConfig(); @@ -30,8 +36,13 @@ async function main(): Promise { await dumpLocalTtsWave(requireAssistantRuntimeConfig(config), logger, text); return; } + case "local-tts-voices": + await printLocalTtsVoices(requireAssistantRuntimeConfig(config)); + return; default: - throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump`); + throw new Error( + `알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump, local-tts-voices`, + ); } } diff --git a/src/local-main.ts b/src/local-main.ts index dd92b8a..f67cd48 100644 --- a/src/local-main.ts +++ b/src/local-main.ts @@ -1,5 +1,5 @@ import { spawn } from "node:child_process"; -import { mkdir } from "node:fs/promises"; +import { copyFile, mkdir } from "node:fs/promises"; import path from "node:path"; import process from "node:process"; @@ -9,10 +9,11 @@ import { LocalVoiceSession } from "./audio/local-voice-session.js"; import { requireFfmpegPath } from "./audio/ffmpeg-path.js"; import type { LlmService } from "./services/llm.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; -import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; import type { SttService } from "./services/stt.js"; -import { synthesizeWindowsSpeechToWaveFile, WindowsSystemTtsService } from "./services/windows-system-tts.js"; +import { createTtsService } from "./services/create-tts-service.js"; +import { listWindowsMediaVoices } from "./services/windows-media-tts.js"; +import { listWindowsSystemVoices } from "./services/windows-system-tts.js"; export async function printLocalAudioDevices(): Promise { if (process.platform === "win32") { @@ -73,14 +74,11 @@ export async function printLocalAudioDevices(): Promise { export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise { const stt = new LocalFasterWhisperSttService(config, logger); - const tts = - process.platform === "win32" - ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED) - : new LocalKokoroTtsService(config, logger); + const tts = createTtsService(config, logger); const llm = new OllamaLlmService(config); await stt.warmup(); - await tts.warmup(); + await tts.warmup?.(); await llm.warmup?.(); if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") { @@ -130,10 +128,7 @@ export async function runLocalTtsSmokeTest( logger: Logger, text: string, ): Promise { - const tts = - process.platform === "win32" - ? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED) - : new LocalKokoroTtsService(config, logger); + const tts = createTtsService(config, logger); const noOpStt: SttService = { async transcribePcm16() { @@ -146,7 +141,7 @@ export async function runLocalTtsSmokeTest( }, }; - await tts.warmup(); + await tts.warmup?.(); const session = new LocalVoiceSession({ config, @@ -171,7 +166,7 @@ export async function runLocalTtsSmokeTest( export async function dumpLocalTtsWave( config: AssistantRuntimeConfig, - _logger: Logger, + logger: Logger, text: string, outputPath?: string, ): Promise { @@ -181,9 +176,57 @@ export async function dumpLocalTtsWave( const resolvedPath = path.resolve(outputPath?.trim() || "tts-test.wav"); await mkdir(path.dirname(resolvedPath), { recursive: true }); - await synthesizeWindowsSpeechToWaveFile(text, config.LOCAL_TTS_SPEED, resolvedPath); + const tts = createTtsService(config, logger); + await tts.warmup?.(); + const playback = await tts.preparePlayback(text); + + try { + if (!playback.sourceFilePath) { + throw new Error("현재 선택된 TTS 엔진은 직접 WAV 덤프를 지원하지 않습니다."); + } + await copyFile(playback.sourceFilePath, resolvedPath); + } finally { + playback.dispose(); + await tts.destroy?.(); + } console.log("TTS WAV 파일 생성 완료"); console.log(`출력 파일: ${resolvedPath}`); console.log("이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다."); } + +export async function printLocalTtsVoices(config: AssistantRuntimeConfig): Promise { + if (process.platform !== "win32") { + console.log("현재 플랫폼은 Windows가 아니므로 설치된 시스템 TTS 목록 대신 Kokoro 설정만 사용합니다."); + console.log(`LOCAL_TTS_ENGINE=${config.LOCAL_TTS_ENGINE}`); + console.log(`LOCAL_TTS_SPEAKER=${config.LOCAL_TTS_SPEAKER}`); + return; + } + + const [windowsMediaVoices, windowsSystemVoices] = await Promise.all([ + listWindowsMediaVoices(), + listWindowsSystemVoices(), + ]); + + console.log("\n=== Windows.Media.SpeechSynthesis voices (권장) ==="); + if (windowsMediaVoices.length === 0) { + console.log("설치된 Windows Media 음성이 없습니다."); + } else { + for (const voice of windowsMediaVoices) { + console.log(`- ${voice.description} | name=${voice.displayName} | lang=${voice.language}`); + } + } + + console.log("\n=== System.Speech voices (fallback) ==="); + if (windowsSystemVoices.length === 0) { + console.log("설치된 System.Speech 음성이 없습니다."); + } else { + for (const voice of windowsSystemVoices) { + console.log(`- ${voice.description} | name=${voice.name} | lang=${voice.culture}`); + } + } + + console.log("\n설정 예시"); + console.log("LOCAL_TTS_ENGINE=windows-media"); + console.log("LOCAL_TTS_VOICE_NAME=위 목록의 description 또는 name"); +} diff --git a/src/services/create-tts-service.ts b/src/services/create-tts-service.ts new file mode 100644 index 0000000..25ef392 --- /dev/null +++ b/src/services/create-tts-service.ts @@ -0,0 +1,112 @@ +import process from "node:process"; + +import type { AssistantRuntimeConfig } from "../config.js"; +import type { Logger } from "../logger.js"; +import { LocalKokoroTtsService } from "./local-tts.js"; +import type { PreparedSpeechAudio, TtsService } from "./tts.js"; +import { WindowsMediaTtsService } from "./windows-media-tts.js"; +import { WindowsSystemTtsService } from "./windows-system-tts.js"; + +interface NamedTtsService { + name: string; + service: TtsService; +} + +class FallbackTtsService implements TtsService { + private activeIndex: number | null = null; + + constructor( + private readonly logger: Logger, + private readonly services: NamedTtsService[], + ) {} + + async warmup(): Promise { + await this.ensureActive(); + } + + async preparePlayback(text: string, signal?: AbortSignal): Promise { + const active = await this.ensureActive(); + + try { + return await active.service.preparePlayback(text, signal); + } catch (error) { + if (this.activeIndex === null || this.activeIndex >= this.services.length - 1) { + throw error; + } + + const failedName = active.name; + this.activeIndex += 1; + const fallback = await this.activate(this.activeIndex); + this.logger.warn(`TTS 엔진 ${failedName} 이 실패해 ${fallback.name} 로 전환합니다.`, error); + return await fallback.service.preparePlayback(text, signal); + } + } + + async destroy(): Promise { + await Promise.allSettled(this.services.map((entry) => entry.service.destroy?.())); + } + + private async ensureActive(): Promise { + if (this.activeIndex !== null) { + return this.services[this.activeIndex]!; + } + + let lastError: unknown = null; + for (let index = 0; index < this.services.length; index += 1) { + try { + return await this.activate(index); + } catch (error) { + lastError = error; + this.logger.warn(`TTS 엔진 ${this.services[index]!.name} 초기화 실패`, error); + } + } + + throw lastError instanceof Error ? lastError : new Error("사용 가능한 TTS 엔진을 찾지 못했습니다."); + } + + private async activate(index: number): Promise { + const selected = this.services[index]!; + await selected.service.warmup?.(); + this.activeIndex = index; + this.logger.info("Selected TTS engine", selected.name); + return selected; + } +} + +export function createTtsService(config: AssistantRuntimeConfig, logger: Logger): TtsService { + if (process.platform !== "win32") { + return new LocalKokoroTtsService(config, logger); + } + + const systemTts = new WindowsSystemTtsService( + config.LOCAL_TTS_SPEED, + config.LOCAL_TTS_VOICE_NAME, + config.LOCAL_TTS_LANGUAGE, + ); + const windowsMediaTts = new WindowsMediaTtsService( + config.LOCAL_TTS_SPEED, + config.LOCAL_TTS_VOICE_NAME, + config.LOCAL_TTS_LANGUAGE, + ); + + switch (config.LOCAL_TTS_ENGINE) { + case "system": + return systemTts; + case "windows-media": + return windowsMediaTts; + case "kokoro": + return new LocalKokoroTtsService(config, logger); + case "auto": + default: + return new FallbackTtsService(logger, [ + { + name: "windows-media", + service: windowsMediaTts, + }, + { + name: "system", + service: systemTts, + }, + ]); + } +} diff --git a/src/services/tts.ts b/src/services/tts.ts index 5ee5981..10facdf 100644 --- a/src/services/tts.ts +++ b/src/services/tts.ts @@ -7,6 +7,7 @@ export interface PreparedSpeechAudio { } export interface TtsService { + warmup?(): Promise; preparePlayback(text: string, signal?: AbortSignal): Promise; destroy?(): Promise; } diff --git a/src/services/windows-media-tts.ts b/src/services/windows-media-tts.ts new file mode 100644 index 0000000..85fc010 --- /dev/null +++ b/src/services/windows-media-tts.ts @@ -0,0 +1,141 @@ +import { createReadStream } from "node:fs"; +import { unlink } from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import type { PreparedSpeechAudio, TtsService } from "./tts.js"; +import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js"; + +export interface WindowsMediaVoiceInfo { + displayName: string; + description: string; + language: string; + gender: string; + id: string; +} + +function escapePowerShellSingleQuoted(text: string): string { + return text.replace(/\r?\n/g, " ").replace(/'/g, "''"); +} + +function windowsMediaPreamble(): string { + return [ + "$ErrorActionPreference = 'Stop';", + "$ProgressPreference = 'SilentlyContinue';", + "Add-Type -AssemblyName System.Runtime.WindowsRuntime;", + "$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime];", + "$null = [Windows.Storage.Streams.DataReader, Windows.Storage.Streams, ContentType=WindowsRuntime];", + "function Await-WinRt($operation) {", + " $method = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.IsGenericMethod -and $_.GetParameters().Count -eq 1 } | Select-Object -First 1;", + " if (-not $method) { throw 'System.WindowsRuntimeSystemExtensions.AsTask 를 찾지 못했습니다.' }", + " $resultType = $operation.GetType().GenericTypeArguments[0];", + " $task = $method.MakeGenericMethod($resultType).Invoke($null, @($operation));", + " return $task.GetAwaiter().GetResult();", + "}", + ].join(" "); +} + +export async function listWindowsMediaVoices(signal?: AbortSignal): Promise { + const script = [ + windowsMediaPreamble(), + "$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices | ForEach-Object {", + " [PSCustomObject]@{", + " displayName = $_.DisplayName;", + " description = $_.Description;", + " language = $_.Language;", + " gender = [string]$_.Gender;", + " id = $_.Id;", + " }", + "});", + "ConvertTo-Json -InputObject $voices -Compress;", + ].join(" "); + + const { stdout } = await runPowerShell(script, signal); + return parsePowerShellJsonArray(stdout); +} + +export async function synthesizeWindowsMediaSpeechToWaveFile( + text: string, + speed: number, + outputPath: string, + voiceName?: string, + language = "ko", + signal?: AbortSignal, +): Promise { + const script = [ + windowsMediaPreamble(), + `$text = '${escapePowerShellSingleQuoted(text)}';`, + `$outputPath = '${escapePowerShellSingleQuoted(outputPath)}';`, + `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`, + `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`, + `$speakingRate = ${speed.toFixed(2)};`, + "$synth = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new();", + "try {", + " $voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;", + " $selected = $null;", + " if ($preferredVoice) {", + " $selected = $voices | Where-Object {", + " $_.DisplayName -eq $preferredVoice -or $_.Description -eq $preferredVoice -or $_.Id -eq $preferredVoice -or $_.DisplayName -like ('*' + $preferredVoice + '*') -or $_.Description -like ('*' + $preferredVoice + '*')", + " } | Select-Object -First 1;", + " }", + " if (-not $selected -and $preferredLanguage) {", + " $selected = $voices | Where-Object { $_.Language -like ($preferredLanguage + '*') } | Sort-Object @{Expression={ if ($_.DisplayName -match 'Natural' -or $_.Description -match 'Natural') { 0 } else { 1 } }}, Description | Select-Object -First 1;", + " }", + " if (-not $selected) { $selected = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice }", + " if ($selected) { $synth.Voice = $selected }", + " try { $synth.Options.SpeakingRate = $speakingRate } catch {}", + " $stream = Await-WinRt ($synth.SynthesizeTextToStreamAsync($text));", + " try {", + " $size = [uint32]$stream.Size;", + " $reader = [Windows.Storage.Streams.DataReader]::new($stream.GetInputStreamAt(0));", + " try {", + " $null = Await-WinRt ($reader.LoadAsync($size));", + " $bytes = New-Object byte[] ([int]$size);", + " $reader.ReadBytes($bytes);", + " [System.IO.File]::WriteAllBytes($outputPath, $bytes);", + " } finally { $reader.Dispose() }", + " } finally { $stream.Dispose() }", + "} finally { $synth.Dispose() }", + ].join(" "); + + await runPowerShell(script, signal); +} + +export class WindowsMediaTtsService implements TtsService { + constructor( + private readonly speed: number, + private readonly voiceName?: string, + private readonly language = "ko", + ) {} + + async warmup(): Promise { + await listWindowsMediaVoices(); + } + + async preparePlayback(text: string, signal?: AbortSignal): Promise { + const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-wmtts-${Date.now()}.wav`); + await synthesizeWindowsMediaSpeechToWaveFile( + text, + this.speed, + tempPath, + this.voiceName, + this.language, + signal, + ).catch(async (error) => { + await unlink(tempPath).catch(() => null); + throw error; + }); + + return { + stream: createReadStream(tempPath), + sourceFilePath: tempPath, + dispose: () => { + void unlink(tempPath).catch(() => null); + }, + }; + } + + async destroy(): Promise { + return; + } +} diff --git a/src/services/windows-powershell.ts b/src/services/windows-powershell.ts new file mode 100644 index 0000000..392b404 --- /dev/null +++ b/src/services/windows-powershell.ts @@ -0,0 +1,63 @@ +import { spawn } from "node:child_process"; + +export interface PowerShellRunResult { + stdout: string; + stderr: string; +} + +export async function runPowerShell(script: string, signal?: AbortSignal): Promise { + const encodedCommand = Buffer.from(script, "utf16le").toString("base64"); + + return await new Promise((resolve, reject) => { + const child = spawn("powershell", ["-NoProfile", "-EncodedCommand", encodedCommand], { + stdio: ["ignore", "pipe", "pipe"], + }); + + let stdout = ""; + let stderr = ""; + + child.stdout.on("data", (chunk: Buffer) => { + stdout += chunk.toString(); + }); + + child.stderr.on("data", (chunk: Buffer) => { + stderr += chunk.toString(); + }); + + signal?.addEventListener( + "abort", + () => { + if (!child.killed) { + child.kill("SIGKILL"); + } + }, + { once: true }, + ); + + child.on("exit", (code) => { + if (signal?.aborted) { + reject(new Error("powershell aborted")); + return; + } + + if (code === 0) { + resolve({ stdout, stderr }); + return; + } + + reject(new Error(stderr.trim() || stdout.trim() || `powershell exited with code ${code ?? "null"}`)); + }); + + child.on("error", reject); + }); +} + +export function parsePowerShellJsonArray(stdout: string): T[] { + const trimmed = stdout.trim(); + if (!trimmed) { + return []; + } + + const parsed: unknown = JSON.parse(trimmed); + return Array.isArray(parsed) ? (parsed as T[]) : ([parsed] as T[]); +} diff --git a/src/services/windows-system-tts.ts b/src/services/windows-system-tts.ts index de08952..1369483 100644 --- a/src/services/windows-system-tts.ts +++ b/src/services/windows-system-tts.ts @@ -1,14 +1,21 @@ -import { spawn } from "node:child_process"; import { createReadStream } from "node:fs"; import { unlink } from "node:fs/promises"; import os from "node:os"; import path from "node:path"; -import { resolveFfmpegPath } from "../audio/ffmpeg-path.js"; import type { PreparedSpeechAudio, TtsService } from "./tts.js"; +import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js"; + +export interface WindowsSystemVoiceInfo { + name: string; + culture: string; + description: string; + gender: string; + enabled: boolean; +} function escapePowerShellSingleQuoted(text: string): string { - return text.replace(/'/g, "''"); + return text.replace(/\r?\n/g, " ").replace(/'/g, "''"); } function toSpeechRate(speed: number): number { @@ -16,77 +23,86 @@ function toSpeechRate(speed: number): number { return Math.max(-10, Math.min(10, mapped)); } +export async function listWindowsSystemVoices(signal?: AbortSignal): Promise { + const script = [ + "$ErrorActionPreference = 'Stop';", + "$ProgressPreference = 'SilentlyContinue';", + "Add-Type -AssemblyName System.Speech;", + "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;", + "try {", + " $voices = @($synth.GetInstalledVoices() | ForEach-Object {", + " [PSCustomObject]@{", + " name = $_.VoiceInfo.Name;", + " culture = $_.VoiceInfo.Culture.Name;", + " description = $_.VoiceInfo.Description;", + " gender = [string]$_.VoiceInfo.Gender;", + " enabled = [bool]$_.Enabled;", + " }", + " });", + " ConvertTo-Json -InputObject $voices -Compress;", + "} finally { $synth.Dispose() }", + ].join(" "); + + const { stdout } = await runPowerShell(script, signal); + return parsePowerShellJsonArray(stdout); +} + export async function synthesizeWindowsSpeechToWaveFile( text: string, speed: number, outputPath: string, + voiceName?: string, + language = "ko", signal?: AbortSignal, ): Promise { const rate = toSpeechRate(speed); const script = [ + "$ErrorActionPreference = 'Stop';", + "$ProgressPreference = 'SilentlyContinue';", "Add-Type -AssemblyName System.Speech;", "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;", - "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;", - "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }", + `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`, + `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`, + "try {", + " $voices = $synth.GetInstalledVoices();", + " $selected = $null;", + " if ($preferredVoice) {", + " $selected = $voices | Where-Object {", + " $_.VoiceInfo.Name -eq $preferredVoice -or $_.VoiceInfo.Description -eq $preferredVoice -or $_.VoiceInfo.Name -like ('*' + $preferredVoice + '*') -or $_.VoiceInfo.Description -like ('*' + $preferredVoice + '*')", + " } | Select-Object -First 1;", + " }", + " if (-not $selected -and $preferredLanguage) {", + " $selected = $voices | Where-Object { $_.VoiceInfo.Culture.Name -like ($preferredLanguage + '*') } | Select-Object -First 1;", + " }", + " if ($selected) { $synth.SelectVoice($selected.VoiceInfo.Name) }", `$synth.Rate = ${rate};`, `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`, `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`, - "$synth.Dispose();", + "} finally { $synth.Dispose() }", ].join(" "); - await new Promise((resolve, reject) => { - const child = spawn("powershell", ["-NoProfile", "-Command", script], { - stdio: ["ignore", "ignore", "pipe"], - }); - - let stderr = ""; - child.stderr.on("data", (chunk: Buffer) => { - stderr += chunk.toString(); - }); - - signal?.addEventListener( - "abort", - () => { - if (!child.killed) { - child.kill("SIGKILL"); - } - }, - { once: true }, - ); - - child.on("exit", (code) => { - if (signal?.aborted) { - reject(new Error("tts aborted")); - return; - } - if (code === 0) { - resolve(); - return; - } - reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`)); - }); - child.on("error", reject); - }); + await runPowerShell(script, signal); } export class WindowsSystemTtsService implements TtsService { - constructor(private readonly speed: number) { - const resolvedFfmpegPath = resolveFfmpegPath(); - if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { - process.env.FFMPEG_PATH = resolvedFfmpegPath; - } - } + constructor( + private readonly speed: number, + private readonly voiceName?: string, + private readonly language = "ko", + ) {} async warmup(): Promise { - return; + await listWindowsSystemVoices(); } async preparePlayback(text: string, signal?: AbortSignal): Promise { const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${Date.now()}.wav`); - await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, signal).catch(async (error) => { - await unlink(tempPath).catch(() => null); - throw error; - }); + await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, this.voiceName, this.language, signal).catch( + async (error) => { + await unlink(tempPath).catch(() => null); + throw error; + }, + ); return { stream: createReadStream(tempPath),