Add Windows Media TTS engine selection
This commit is contained in:
@@ -16,6 +16,8 @@ LOCAL_STT_MODEL=small
|
||||
LOCAL_STT_DEVICE=auto
|
||||
LOCAL_STT_COMPUTE_TYPE=auto
|
||||
LOCAL_STT_BEAM_SIZE=3
|
||||
LOCAL_TTS_ENGINE=auto
|
||||
LOCAL_TTS_VOICE_NAME=
|
||||
LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
|
||||
LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
|
||||
LOCAL_TTS_LANGUAGE=ko
|
||||
|
||||
16
README.md
16
README.md
@@ -7,7 +7,7 @@
|
||||
- STT: `faster-whisper` + Whisper multilingual
|
||||
- LLM: `Ollama` + `qwen3:0.6b`
|
||||
- TTS:
|
||||
- Windows: 시스템 기본 음성 엔진
|
||||
- Windows: `Windows.Media.SpeechSynthesis` 우선, 실패 시 시스템 기본 음성 엔진 fallback
|
||||
- Linux/macOS: `kokoro-onnx` + `misaki[ko]`
|
||||
- VAD: `avr-vad`
|
||||
|
||||
@@ -71,6 +71,12 @@ TTS만 단독으로 확인:
|
||||
bun run tts:test -- "안녕하세요. 출력 장치 테스트입니다."
|
||||
```
|
||||
|
||||
Windows 설치 음성 목록 확인:
|
||||
|
||||
```bash
|
||||
bun run tts:voices
|
||||
```
|
||||
|
||||
TTS WAV 파일만 생성해서 확인:
|
||||
|
||||
```bash
|
||||
@@ -104,6 +110,8 @@ Discord 모드에서만 필수:
|
||||
- `LOCAL_STT_DEVICE`
|
||||
- `LOCAL_STT_COMPUTE_TYPE`
|
||||
- `LOCAL_STT_BEAM_SIZE`
|
||||
- `LOCAL_TTS_ENGINE`
|
||||
- `LOCAL_TTS_VOICE_NAME`
|
||||
- `LOCAL_TTS_MODEL_PATH`
|
||||
- `LOCAL_TTS_VOICES_PATH`
|
||||
- `LOCAL_TTS_LANGUAGE`
|
||||
@@ -138,7 +146,8 @@ Windows에서 GPU STT를 쓰려면 `LOCAL_STT_DEVICE=auto` 그대로 두고 `bun
|
||||
|
||||
- STT 기본 권장 모델은 `small`
|
||||
- LLM 기본 모델은 `qwen3:0.6b`
|
||||
- TTS 기본 보이스는 `af_heart`
|
||||
- Windows TTS 기본 보이스는 설치된 `windows-media` 음성 중 현재 언어에 맞는 첫 번째 항목
|
||||
- Linux/macOS TTS 기본 보이스는 `af_heart`
|
||||
- TTS 기본 속도는 `1.12`
|
||||
|
||||
더 빠르게 돌리고 싶으면:
|
||||
@@ -169,11 +178,12 @@ OLLAMA_MODEL=qwen3:1.7b
|
||||
## Windows 메모
|
||||
|
||||
- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
|
||||
- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다.
|
||||
- Windows는 기본적으로 `windows-media` 엔진을 우선 쓰고, 실패하면 `system` 엔진으로 자동 fallback 합니다.
|
||||
- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
|
||||
- Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
|
||||
- Windows의 `setup:local-ai`는 STT와 CUDA 런타임 wheel을 함께 설치합니다.
|
||||
- Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
|
||||
- 더 자연스러운 음성을 고르려면 `bun run tts:voices` 로 설치된 음성 이름을 확인한 뒤 `LOCAL_TTS_VOICE_NAME` 에 넣으면 됩니다.
|
||||
|
||||
## 설계 메모
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
"start:local": "bun src/index.ts local",
|
||||
"tts:test": "bun src/index.ts local-say",
|
||||
"tts:dump": "bun src/index.ts local-say-dump",
|
||||
"tts:voices": "bun src/index.ts local-tts-voices",
|
||||
"setup:local-ai": "bun src/setup-local-ai.ts",
|
||||
"devices": "bun src/index.ts local-devices",
|
||||
"audio:devices": "bun src/index.ts local-devices",
|
||||
|
||||
@@ -28,6 +28,8 @@ const envSchema = z.object({
|
||||
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3),
|
||||
LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
|
||||
LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
|
||||
LOCAL_TTS_ENGINE: z.enum(["auto", "windows-media", "system", "kokoro"]).default("auto"),
|
||||
LOCAL_TTS_VOICE_NAME: emptyToUndefined,
|
||||
LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
|
||||
LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
|
||||
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
|
||||
|
||||
@@ -16,9 +16,8 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { type DiscordRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalKokoroTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
|
||||
import { createTtsService } from "./services/create-tts-service.js";
|
||||
|
||||
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
|
||||
const commands = [
|
||||
@@ -39,15 +38,12 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
|
||||
});
|
||||
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts =
|
||||
process.platform === "win32"
|
||||
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
|
||||
: new LocalKokoroTtsService(config, logger);
|
||||
const tts = createTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
await stt.warmup();
|
||||
await tts.warmup();
|
||||
await tts.warmup?.();
|
||||
|
||||
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
|
||||
const member = interaction.member as GuildMember | null;
|
||||
|
||||
15
src/index.ts
15
src/index.ts
@@ -3,7 +3,13 @@ import process from "node:process";
|
||||
import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
|
||||
import { runDiscordBot } from "./discord-main.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { dumpLocalTtsWave, printLocalAudioDevices, runLocalAssistant, runLocalTtsSmokeTest } from "./local-main.js";
|
||||
import {
|
||||
dumpLocalTtsWave,
|
||||
printLocalAudioDevices,
|
||||
printLocalTtsVoices,
|
||||
runLocalAssistant,
|
||||
runLocalTtsSmokeTest,
|
||||
} from "./local-main.js";
|
||||
|
||||
const mode = process.argv[2] ?? "discord";
|
||||
const config = loadConfig();
|
||||
@@ -30,8 +36,13 @@ async function main(): Promise<void> {
|
||||
await dumpLocalTtsWave(requireAssistantRuntimeConfig(config), logger, text);
|
||||
return;
|
||||
}
|
||||
case "local-tts-voices":
|
||||
await printLocalTtsVoices(requireAssistantRuntimeConfig(config));
|
||||
return;
|
||||
default:
|
||||
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump`);
|
||||
throw new Error(
|
||||
`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump, local-tts-voices`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import { mkdir } from "node:fs/promises";
|
||||
import { copyFile, mkdir } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import process from "node:process";
|
||||
|
||||
@@ -9,10 +9,11 @@ import { LocalVoiceSession } from "./audio/local-voice-session.js";
|
||||
import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
|
||||
import type { LlmService } from "./services/llm.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalKokoroTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
import type { SttService } from "./services/stt.js";
|
||||
import { synthesizeWindowsSpeechToWaveFile, WindowsSystemTtsService } from "./services/windows-system-tts.js";
|
||||
import { createTtsService } from "./services/create-tts-service.js";
|
||||
import { listWindowsMediaVoices } from "./services/windows-media-tts.js";
|
||||
import { listWindowsSystemVoices } from "./services/windows-system-tts.js";
|
||||
|
||||
export async function printLocalAudioDevices(): Promise<void> {
|
||||
if (process.platform === "win32") {
|
||||
@@ -73,14 +74,11 @@ export async function printLocalAudioDevices(): Promise<void> {
|
||||
|
||||
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts =
|
||||
process.platform === "win32"
|
||||
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
|
||||
: new LocalKokoroTtsService(config, logger);
|
||||
const tts = createTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
|
||||
await stt.warmup();
|
||||
await tts.warmup();
|
||||
await tts.warmup?.();
|
||||
await llm.warmup?.();
|
||||
|
||||
if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") {
|
||||
@@ -130,10 +128,7 @@ export async function runLocalTtsSmokeTest(
|
||||
logger: Logger,
|
||||
text: string,
|
||||
): Promise<void> {
|
||||
const tts =
|
||||
process.platform === "win32"
|
||||
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
|
||||
: new LocalKokoroTtsService(config, logger);
|
||||
const tts = createTtsService(config, logger);
|
||||
|
||||
const noOpStt: SttService = {
|
||||
async transcribePcm16() {
|
||||
@@ -146,7 +141,7 @@ export async function runLocalTtsSmokeTest(
|
||||
},
|
||||
};
|
||||
|
||||
await tts.warmup();
|
||||
await tts.warmup?.();
|
||||
|
||||
const session = new LocalVoiceSession({
|
||||
config,
|
||||
@@ -171,7 +166,7 @@ export async function runLocalTtsSmokeTest(
|
||||
|
||||
export async function dumpLocalTtsWave(
|
||||
config: AssistantRuntimeConfig,
|
||||
_logger: Logger,
|
||||
logger: Logger,
|
||||
text: string,
|
||||
outputPath?: string,
|
||||
): Promise<void> {
|
||||
@@ -181,9 +176,57 @@ export async function dumpLocalTtsWave(
|
||||
|
||||
const resolvedPath = path.resolve(outputPath?.trim() || "tts-test.wav");
|
||||
await mkdir(path.dirname(resolvedPath), { recursive: true });
|
||||
await synthesizeWindowsSpeechToWaveFile(text, config.LOCAL_TTS_SPEED, resolvedPath);
|
||||
const tts = createTtsService(config, logger);
|
||||
await tts.warmup?.();
|
||||
const playback = await tts.preparePlayback(text);
|
||||
|
||||
try {
|
||||
if (!playback.sourceFilePath) {
|
||||
throw new Error("현재 선택된 TTS 엔진은 직접 WAV 덤프를 지원하지 않습니다.");
|
||||
}
|
||||
await copyFile(playback.sourceFilePath, resolvedPath);
|
||||
} finally {
|
||||
playback.dispose();
|
||||
await tts.destroy?.();
|
||||
}
|
||||
|
||||
console.log("TTS WAV 파일 생성 완료");
|
||||
console.log(`출력 파일: ${resolvedPath}`);
|
||||
console.log("이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다.");
|
||||
}
|
||||
|
||||
/**
 * Prints the voices that can be referenced from `LOCAL_TTS_VOICE_NAME`.
 *
 * On non-Windows platforms no voice enumeration is performed; only the
 * configured engine and speaker values are echoed.
 *
 * On Windows both voice APIs are queried in parallel. The two lookups are
 * reported independently so that a failure of one API (the very situation the
 * `auto` engine falls back from) does not hide the voices of the other.
 *
 * @param config Runtime configuration; only `LOCAL_TTS_ENGINE` and
 *   `LOCAL_TTS_SPEAKER` are read, and only on non-Windows platforms.
 * @throws The first lookup error when BOTH voice lists could not be read.
 */
export async function printLocalTtsVoices(config: AssistantRuntimeConfig): Promise<void> {
  if (process.platform !== "win32") {
    console.log("현재 플랫폼은 Windows가 아니므로 설치된 시스템 TTS 목록 대신 Kokoro 설정만 사용합니다.");
    console.log(`LOCAL_TTS_ENGINE=${config.LOCAL_TTS_ENGINE}`);
    console.log(`LOCAL_TTS_SPEAKER=${config.LOCAL_TTS_SPEAKER}`);
    return;
  }

  // allSettled instead of all: one broken API must not suppress the other list.
  const [mediaResult, systemResult] = await Promise.allSettled([
    listWindowsMediaVoices(),
    listWindowsSystemVoices(),
  ]);

  const describeFailure = (reason: unknown): string =>
    reason instanceof Error ? reason.message : String(reason);

  console.log("\n=== Windows.Media.SpeechSynthesis voices (권장) ===");
  if (mediaResult.status === "rejected") {
    console.log(`Windows Media 음성 목록을 가져오지 못했습니다: ${describeFailure(mediaResult.reason)}`);
  } else if (mediaResult.value.length === 0) {
    console.log("설치된 Windows Media 음성이 없습니다.");
  } else {
    for (const voice of mediaResult.value) {
      console.log(`- ${voice.description} | name=${voice.displayName} | lang=${voice.language}`);
    }
  }

  console.log("\n=== System.Speech voices (fallback) ===");
  if (systemResult.status === "rejected") {
    console.log(`System.Speech 음성 목록을 가져오지 못했습니다: ${describeFailure(systemResult.reason)}`);
  } else if (systemResult.value.length === 0) {
    console.log("설치된 System.Speech 음성이 없습니다.");
  } else {
    for (const voice of systemResult.value) {
      console.log(`- ${voice.description} | name=${voice.name} | lang=${voice.culture}`);
    }
  }

  // Keep a failing exit path when nothing at all could be listed.
  if (mediaResult.status === "rejected" && systemResult.status === "rejected") {
    throw mediaResult.reason instanceof Error
      ? mediaResult.reason
      : new Error(describeFailure(mediaResult.reason));
  }

  console.log("\n설정 예시");
  console.log("LOCAL_TTS_ENGINE=windows-media");
  console.log("LOCAL_TTS_VOICE_NAME=위 목록의 description 또는 name");
}
|
||||
|
||||
112
src/services/create-tts-service.ts
Normal file
112
src/services/create-tts-service.ts
Normal file
@@ -0,0 +1,112 @@
|
||||
import process from "node:process";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import type { Logger } from "../logger.js";
|
||||
import { LocalKokoroTtsService } from "./local-tts.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
||||
import { WindowsMediaTtsService } from "./windows-media-tts.js";
|
||||
import { WindowsSystemTtsService } from "./windows-system-tts.js";
|
||||
|
||||
/** A TTS backend paired with the label used for it in log messages. */
interface NamedTtsService {
  name: string;
  service: TtsService;
}

/**
 * TTS service that delegates to the first usable engine of an ordered list.
 *
 * The active engine is chosen lazily: engines are warmed up in order and the
 * first one whose warmup succeeds becomes active. When the active engine later
 * fails to synthesize, the service switches to the next engine that warms up
 * successfully and retries the same text. The switch is permanent for the
 * lifetime of this instance.
 */
class FallbackTtsService implements TtsService {
  private activeIndex: number | null = null;

  constructor(
    private readonly logger: Logger,
    private readonly services: NamedTtsService[],
  ) {}

  /** Selects and warms up the first usable engine. */
  async warmup(): Promise<void> {
    await this.ensureActive();
  }

  /**
   * Synthesizes `text` with the active engine, falling back to later engines
   * when synthesis fails.
   *
   * @param text Text to synthesize.
   * @param signal Optional cancellation signal forwarded to the engine.
   * @returns The prepared audio from the first engine that succeeds.
   * @throws The engine error when the request was cancelled, when the failing
   *   engine is the last one, or when no later engine can be activated.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    let current = await this.ensureActive();

    for (;;) {
      try {
        return await current.service.preparePlayback(text, signal);
      } catch (error) {
        // A cancelled request says nothing about engine health: do not demote
        // the engine and do not retry with an already-aborted signal.
        if (signal?.aborted) {
          throw error;
        }

        const nextIndex = this.services.indexOf(current) + 1;
        if (nextIndex >= this.services.length) {
          throw error;
        }

        const failedName = current.name;
        // activeIndex only moves once a fallback has actually warmed up.
        current = await this.activateFirstUsable(nextIndex);
        this.logger.warn(`TTS 엔진 ${failedName} 이 실패해 ${current.name} 로 전환합니다.`, error);
      }
    }
  }

  /** Destroys every configured engine, ignoring individual failures. */
  async destroy(): Promise<void> {
    await Promise.allSettled(this.services.map((entry) => entry.service.destroy?.()));
  }

  /** Returns the active engine, selecting one first if none is active yet. */
  private async ensureActive(): Promise<NamedTtsService> {
    if (this.activeIndex !== null) {
      const active = this.services[this.activeIndex];
      if (active) {
        return active;
      }
    }

    return await this.activateFirstUsable(0);
  }

  /**
   * Activates the first engine at or after `startIndex` whose warmup succeeds.
   *
   * @throws The last warmup error, or a generic error when there was no
   *   candidate engine at all.
   */
  private async activateFirstUsable(startIndex: number): Promise<NamedTtsService> {
    let lastError: unknown = null;

    for (let index = startIndex; index < this.services.length; index += 1) {
      try {
        return await this.activate(index);
      } catch (error) {
        lastError = error;
        this.logger.warn(`TTS 엔진 ${this.services[index]!.name} 초기화 실패`, error);
      }
    }

    throw lastError instanceof Error ? lastError : new Error("사용 가능한 TTS 엔진을 찾지 못했습니다.");
  }

  /** Warms up the engine at `index` and records it as the active one. */
  private async activate(index: number): Promise<NamedTtsService> {
    const selected = this.services[index]!;
    await selected.service.warmup?.();
    this.activeIndex = index;
    this.logger.info("Selected TTS engine", selected.name);
    return selected;
  }
}
|
||||
|
||||
/**
 * Builds the TTS service for the current platform and configuration.
 *
 * Outside Windows the Kokoro service is always returned. On Windows the
 * `LOCAL_TTS_ENGINE` setting picks a single engine, and any other value
 * (including `auto`) yields a fallback chain of `windows-media` then `system`.
 *
 * @param config Runtime configuration providing the engine choice, speed,
 *   voice name and language.
 * @param logger Logger handed to the services that accept one.
 * @returns The TTS service to use for synthesis.
 */
export function createTtsService(config: AssistantRuntimeConfig, logger: Logger): TtsService {
  const onWindows = process.platform === "win32";
  if (!onWindows) {
    return new LocalKokoroTtsService(config, logger);
  }

  const {
    LOCAL_TTS_SPEED: speed,
    LOCAL_TTS_VOICE_NAME: voiceName,
    LOCAL_TTS_LANGUAGE: language,
  } = config;

  const systemTts = new WindowsSystemTtsService(speed, voiceName, language);
  const windowsMediaTts = new WindowsMediaTtsService(speed, voiceName, language);

  const engine = config.LOCAL_TTS_ENGINE;
  if (engine === "system") {
    return systemTts;
  }
  if (engine === "windows-media") {
    return windowsMediaTts;
  }
  if (engine === "kokoro") {
    return new LocalKokoroTtsService(config, logger);
  }

  // "auto" and anything unrecognised: prefer windows-media, fall back to system.
  const fallbackOrder = [
    { name: "windows-media", service: windowsMediaTts },
    { name: "system", service: systemTts },
  ];
  return new FallbackTtsService(logger, fallbackOrder);
}
|
||||
@@ -7,6 +7,7 @@ export interface PreparedSpeechAudio {
|
||||
}
|
||||
|
||||
export interface TtsService {
|
||||
warmup?(): Promise<void>;
|
||||
preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
|
||||
destroy?(): Promise<void>;
|
||||
}
|
||||
|
||||
141
src/services/windows-media-tts.ts
Normal file
141
src/services/windows-media-tts.ts
Normal file
@@ -0,0 +1,141 @@
|
||||
import { createReadStream } from "node:fs";
|
||||
import { unlink } from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
||||
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
|
||||
|
||||
/**
 * One voice reported by `Windows.Media.SpeechSynthesis.SpeechSynthesizer.AllVoices`,
 * as serialized to JSON by the PowerShell script in `listWindowsMediaVoices`.
 */
export interface WindowsMediaVoiceInfo {
  /** Value of the voice's `DisplayName` property. */
  displayName: string;
  /** Value of the voice's `Description` property. */
  description: string;
  /** Value of the voice's `Language` property. */
  language: string;
  /** The voice's `Gender` property converted to a string. */
  gender: string;
  /** Value of the voice's `Id` property. */
  id: string;
}
|
||||
|
||||
/**
 * Escapes text for embedding between single quotes in a PowerShell script.
 *
 * Line breaks are flattened to a single space. Every character PowerShell
 * accepts as a single-quote delimiter is doubled: the ASCII apostrophe and
 * the typographic quotes U+2018, U+2019, U+201A and U+201B. Without doubling
 * the typographic variants, text such as "it’s" would terminate the literal
 * early and the remainder would be parsed as script.
 *
 * @param text Raw text to embed.
 * @returns Text that is safe to place inside `'...'`.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
|
||||
|
||||
/**
 * Returns the PowerShell prologue shared by every Windows.Media script.
 *
 * It makes errors terminating, silences progress output, loads the WinRT
 * interop assembly and the two WinRT types used later, and defines
 * `Await-WinRt`, which blocks on a WinRT `IAsyncOperation<T>`.
 *
 * `Await-WinRt` takes the result type as an optional second argument. The
 * result type is only inferred from the operation object when it is omitted;
 * NOTE(review): inference is expected to fail for operations whose runtime
 * type is not itself generic (e.g. `DataReaderLoadOperation`) — confirm on
 * Windows. Callers in this file therefore always pass the type explicitly.
 */
function windowsMediaPreamble(): string {
  return [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Runtime.WindowsRuntime;",
    "$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime];",
    "$null = [Windows.Storage.Streams.DataReader, Windows.Storage.Streams, ContentType=WindowsRuntime];",
    "function Await-WinRt($operation, $resultType) {",
    // Several generic one-parameter AsTask overloads exist; pin the IAsyncOperation<T> one.
    " $method = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.IsGenericMethod -and $_.GetParameters().Count -eq 1 -and $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation`1' } | Select-Object -First 1;",
    " if (-not $method) { throw 'System.WindowsRuntimeSystemExtensions.AsTask 를 찾지 못했습니다.' };",
    " if (-not $resultType) { $resultType = $operation.GetType().GenericTypeArguments | Select-Object -First 1 };",
    " if (-not $resultType) { throw 'Await-WinRt: 비동기 작업의 결과 타입을 확인할 수 없습니다.' };",
    " $task = $method.MakeGenericMethod($resultType).Invoke($null, @($operation));",
    " return $task.GetAwaiter().GetResult();",
    "}",
  ].join(" ");
}

/**
 * Lists the voices exposed by `Windows.Media.SpeechSynthesis`.
 *
 * @param signal Optional cancellation signal forwarded to the PowerShell run.
 * @returns One entry per voice; an empty array when PowerShell prints nothing.
 */
export async function listWindowsMediaVoices(signal?: AbortSignal): Promise<WindowsMediaVoiceInfo[]> {
  const script = [
    windowsMediaPreamble(),
    "$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices | ForEach-Object {",
    " [PSCustomObject]@{",
    " displayName = $_.DisplayName;",
    " description = $_.Description;",
    " language = $_.Language;",
    " gender = [string]$_.Gender;",
    " id = $_.Id;",
    " }",
    "});",
    "ConvertTo-Json -InputObject $voices -Compress;",
  ].join(" ");

  const { stdout } = await runPowerShell(script, signal);
  return parsePowerShellJsonArray<WindowsMediaVoiceInfo>(stdout);
}

/**
 * Synthesizes `text` with `Windows.Media.SpeechSynthesis` and writes the
 * resulting audio stream to `outputPath`.
 *
 * Voice selection order: a voice matching `voiceName` (exact or substring
 * match on display name / description, or exact id), then the first voice
 * whose language starts with `language` (voices with "Natural" in their name
 * sorted first), then the synthesizer's default voice.
 *
 * @param text Text to speak; line breaks are flattened by the escaping helper.
 * @param speed Speaking rate, emitted with two decimals. Setting it is
 *   best-effort: a rejected value is ignored by the script.
 * @param outputPath Destination file for the synthesized bytes.
 * @param voiceName Optional preferred voice.
 * @param language Language prefix used when no preferred voice matches.
 * @param signal Optional cancellation signal forwarded to the PowerShell run.
 */
export async function synthesizeWindowsMediaSpeechToWaveFile(
  text: string,
  speed: number,
  outputPath: string,
  voiceName?: string,
  language = "ko",
  signal?: AbortSignal,
): Promise<void> {
  const script = [
    windowsMediaPreamble(),
    `$text = '${escapePowerShellSingleQuoted(text)}';`,
    `$outputPath = '${escapePowerShellSingleQuoted(outputPath)}';`,
    `$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
    `$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
    `$speakingRate = ${speed.toFixed(2)};`,
    "$synth = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new();",
    "try {",
    " $voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;",
    " $selected = $null;",
    " if ($preferredVoice) {",
    " $selected = $voices | Where-Object {",
    " $_.DisplayName -eq $preferredVoice -or $_.Description -eq $preferredVoice -or $_.Id -eq $preferredVoice -or $_.DisplayName -like ('*' + $preferredVoice + '*') -or $_.Description -like ('*' + $preferredVoice + '*')",
    " } | Select-Object -First 1;",
    " }",
    " if (-not $selected -and $preferredLanguage) {",
    " $selected = $voices | Where-Object { $_.Language -like ($preferredLanguage + '*') } | Sort-Object @{Expression={ if ($_.DisplayName -match 'Natural' -or $_.Description -match 'Natural') { 0 } else { 1 } }}, Description | Select-Object -First 1;",
    " }",
    " if (-not $selected) { $selected = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice }",
    " if ($selected) { $synth.Voice = $selected }",
    " try { $synth.Options.SpeakingRate = $speakingRate } catch {}",
    // Result types are passed explicitly; see windowsMediaPreamble.
    " $stream = Await-WinRt ($synth.SynthesizeTextToStreamAsync($text)) ([Windows.Media.SpeechSynthesis.SpeechSynthesisStream, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime]);",
    " try {",
    " $size = [uint32]$stream.Size;",
    " $reader = [Windows.Storage.Streams.DataReader]::new($stream.GetInputStreamAt(0));",
    " try {",
    " $null = Await-WinRt ($reader.LoadAsync($size)) ([uint32]);",
    " $bytes = New-Object byte[] ([int]$size);",
    " $reader.ReadBytes($bytes);",
    " [System.IO.File]::WriteAllBytes($outputPath, $bytes);",
    " } finally { $reader.Dispose() }",
    " } finally { $stream.Dispose() }",
    "} finally { $synth.Dispose() }",
  ].join(" ");

  await runPowerShell(script, signal);
}
|
||||
|
||||
/**
 * TTS service backed by `Windows.Media.SpeechSynthesis`, driven through
 * PowerShell. Each request is synthesized into a temporary WAV file that is
 * exposed as a read stream.
 */
export class WindowsMediaTtsService implements TtsService {
  /**
   * @param speed Speaking rate passed to the synthesizer.
   * @param voiceName Optional preferred voice.
   * @param language Language prefix used when no preferred voice matches.
   */
  constructor(
    private readonly speed: number,
    private readonly voiceName?: string,
    private readonly language = "ko",
  ) {}

  /** Probes the engine by listing its voices; rejects when the listing fails. */
  async warmup(): Promise<void> {
    await listWindowsMediaVoices();
  }

  /**
   * Synthesizes `text` into a temporary WAV file.
   *
   * @param text Text to speak.
   * @param signal Optional cancellation signal forwarded to the synthesis.
   * @returns A read stream over the file, the file path, and a `dispose`
   *   callback that deletes the file (best effort).
   * @throws The synthesis error, after attempting to remove the partial file.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Date.now() alone collides when two requests start in the same
    // millisecond; the random suffix keeps concurrent outputs apart.
    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-wmtts-${uniqueSuffix}.wav`);

    try {
      await synthesizeWindowsMediaSpeechToWaveFile(
        text,
        this.speed,
        tempPath,
        this.voiceName,
        this.language,
        signal,
      );
    } catch (error) {
      await unlink(tempPath).catch(() => null);
      throw error;
    }

    return {
      stream: createReadStream(tempPath),
      sourceFilePath: tempPath,
      dispose: () => {
        void unlink(tempPath).catch(() => null);
      },
    };
  }

  /** Nothing to release: every request runs in its own PowerShell process. */
  async destroy(): Promise<void> {
    return;
  }
}
|
||||
63
src/services/windows-powershell.ts
Normal file
63
src/services/windows-powershell.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
/** Captured output of a PowerShell invocation that exited with code 0. */
export interface PowerShellRunResult {
  /** Everything the process wrote to standard output. */
  stdout: string;
  /** Everything the process wrote to standard error. */
  stderr: string;
}
|
||||
|
||||
/**
 * Runs a PowerShell script and collects its output.
 *
 * The script is passed base64-encoded as UTF-16LE through `-EncodedCommand`,
 * so no shell quoting of the script text is involved.
 *
 * @param script PowerShell source to execute.
 * @param signal Optional cancellation signal. When it fires the process is
 *   killed; a signal that is already aborted prevents the process from being
 *   started at all.
 * @returns The captured stdout and stderr when the process exits with code 0.
 * @throws Error("powershell aborted") when cancelled; otherwise an error
 *   carrying stderr, stdout, or the exit code for a non-zero exit; or the
 *   spawn error when the process could not be started.
 */
export async function runPowerShell(script: string, signal?: AbortSignal): Promise<PowerShellRunResult> {
  // An "abort" listener never fires for a signal that is already aborted, so
  // without this guard the script would run to completion before rejecting.
  if (signal?.aborted) {
    throw new Error("powershell aborted");
  }

  const encodedCommand = Buffer.from(script, "utf16le").toString("base64");

  return await new Promise<PowerShellRunResult>((resolve, reject) => {
    const child = spawn("powershell", ["-NoProfile", "-EncodedCommand", encodedCommand], {
      stdio: ["ignore", "pipe", "pipe"],
    });

    let stdout = "";
    let stderr = "";

    // Decode at stream level so a multi-byte character split across two
    // chunks is not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });

    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    const onAbort = (): void => {
      if (!child.killed) {
        child.kill("SIGKILL");
      }
    };
    signal?.addEventListener("abort", onAbort, { once: true });

    // "error" and "close" can both fire; settle once and always detach the
    // abort listener so long-lived signals do not accumulate handlers.
    let settled = false;
    const settle = (finish: () => void): void => {
      if (settled) {
        return;
      }
      settled = true;
      signal?.removeEventListener("abort", onAbort);
      finish();
    };

    // "close" (not "exit") guarantees the stdio streams have been drained.
    child.on("close", (code) => {
      settle(() => {
        if (signal?.aborted) {
          reject(new Error("powershell aborted"));
          return;
        }

        if (code === 0) {
          resolve({ stdout, stderr });
          return;
        }

        reject(new Error(stderr.trim() || stdout.trim() || `powershell exited with code ${code ?? "null"}`));
      });
    });

    child.on("error", (error) => {
      settle(() => reject(error));
    });
  });
}
|
||||
|
||||
/**
 * Parses `ConvertTo-Json` output into an array.
 *
 * The output is normalized to an array: blank output and a JSON `null` become
 * an empty array, a single JSON value is wrapped, and a JSON array is
 * returned as is.
 *
 * The elements are NOT validated against `T`; the type parameter is a
 * caller-side assertion about the script's output shape.
 *
 * @param stdout Raw standard output of the PowerShell process.
 * @returns The parsed elements.
 * @throws SyntaxError when the output is not valid JSON.
 */
export function parsePowerShellJsonArray<T>(stdout: string): T[] {
  const trimmed = stdout.trim();
  if (trimmed === "") {
    return [];
  }

  const parsed: unknown = JSON.parse(trimmed);
  // A serialized $null means "no items", not "one null item".
  if (parsed === null) {
    return [];
  }

  return Array.isArray(parsed) ? (parsed as T[]) : [parsed as T];
}
|
||||
@@ -1,14 +1,21 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import { createReadStream } from "node:fs";
|
||||
import { unlink } from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
||||
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
|
||||
|
||||
/**
 * One installed `System.Speech` voice, as serialized to JSON by the
 * PowerShell script in `listWindowsSystemVoices`.
 */
export interface WindowsSystemVoiceInfo {
  /** Value of `VoiceInfo.Name`. */
  name: string;
  /** Value of `VoiceInfo.Culture.Name`. */
  culture: string;
  /** Value of `VoiceInfo.Description`. */
  description: string;
  /** `VoiceInfo.Gender` converted to a string. */
  gender: string;
  /** The installed voice's `Enabled` flag converted to a boolean. */
  enabled: boolean;
}
|
||||
|
||||
/**
 * Escapes text for embedding between single quotes in a PowerShell script.
 *
 * Line breaks are flattened to a single space. Every character PowerShell
 * accepts as a single-quote delimiter is doubled: the ASCII apostrophe and
 * the typographic quotes U+2018, U+2019, U+201A and U+201B. Without doubling
 * the typographic variants, text such as "it’s" would terminate the literal
 * early and the remainder would be parsed as script.
 *
 * @param text Raw text to embed.
 * @returns Text that is safe to place inside `'...'`.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
|
||||
|
||||
function toSpeechRate(speed: number): number {
|
||||
@@ -16,77 +23,86 @@ function toSpeechRate(speed: number): number {
|
||||
return Math.max(-10, Math.min(10, mapped));
|
||||
}
|
||||
|
||||
/**
 * Lists the voices installed for the `System.Speech` synthesizer.
 *
 * A PowerShell script creates a synthesizer, projects every installed voice
 * to a plain object, prints the list as compact JSON and disposes the
 * synthesizer.
 *
 * @param signal Optional cancellation signal forwarded to the PowerShell run.
 * @returns One entry per installed voice.
 */
export async function listWindowsSystemVoices(signal?: AbortSignal): Promise<WindowsSystemVoiceInfo[]> {
  const prologue = [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Speech;",
    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
  ];

  const voiceProjection = [
    " $voices = @($synth.GetInstalledVoices() | ForEach-Object {",
    " [PSCustomObject]@{",
    " name = $_.VoiceInfo.Name;",
    " culture = $_.VoiceInfo.Culture.Name;",
    " description = $_.VoiceInfo.Description;",
    " gender = [string]$_.VoiceInfo.Gender;",
    " enabled = [bool]$_.Enabled;",
    " }",
    " });",
    " ConvertTo-Json -InputObject $voices -Compress;",
  ];

  const scriptText = [...prologue, "try {", ...voiceProjection, "} finally { $synth.Dispose() }"].join(" ");

  const result = await runPowerShell(scriptText, signal);
  return parsePowerShellJsonArray<WindowsSystemVoiceInfo>(result.stdout);
}
|
||||
|
||||
export async function synthesizeWindowsSpeechToWaveFile(
|
||||
text: string,
|
||||
speed: number,
|
||||
outputPath: string,
|
||||
voiceName?: string,
|
||||
language = "ko",
|
||||
signal?: AbortSignal,
|
||||
): Promise<void> {
|
||||
const rate = toSpeechRate(speed);
|
||||
const script = [
|
||||
"$ErrorActionPreference = 'Stop';",
|
||||
"$ProgressPreference = 'SilentlyContinue';",
|
||||
"Add-Type -AssemblyName System.Speech;",
|
||||
"$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
|
||||
"$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
|
||||
"if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
|
||||
`$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
|
||||
`$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
|
||||
"try {",
|
||||
" $voices = $synth.GetInstalledVoices();",
|
||||
" $selected = $null;",
|
||||
" if ($preferredVoice) {",
|
||||
" $selected = $voices | Where-Object {",
|
||||
" $_.VoiceInfo.Name -eq $preferredVoice -or $_.VoiceInfo.Description -eq $preferredVoice -or $_.VoiceInfo.Name -like ('*' + $preferredVoice + '*') -or $_.VoiceInfo.Description -like ('*' + $preferredVoice + '*')",
|
||||
" } | Select-Object -First 1;",
|
||||
" }",
|
||||
" if (-not $selected -and $preferredLanguage) {",
|
||||
" $selected = $voices | Where-Object { $_.VoiceInfo.Culture.Name -like ($preferredLanguage + '*') } | Select-Object -First 1;",
|
||||
" }",
|
||||
" if ($selected) { $synth.SelectVoice($selected.VoiceInfo.Name) }",
|
||||
`$synth.Rate = ${rate};`,
|
||||
`$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`,
|
||||
`$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
|
||||
"$synth.Dispose();",
|
||||
"} finally { $synth.Dispose() }",
|
||||
].join(" ");
|
||||
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const child = spawn("powershell", ["-NoProfile", "-Command", script], {
|
||||
stdio: ["ignore", "ignore", "pipe"],
|
||||
});
|
||||
|
||||
let stderr = "";
|
||||
child.stderr.on("data", (chunk: Buffer) => {
|
||||
stderr += chunk.toString();
|
||||
});
|
||||
|
||||
signal?.addEventListener(
|
||||
"abort",
|
||||
() => {
|
||||
if (!child.killed) {
|
||||
child.kill("SIGKILL");
|
||||
}
|
||||
},
|
||||
{ once: true },
|
||||
);
|
||||
|
||||
child.on("exit", (code) => {
|
||||
if (signal?.aborted) {
|
||||
reject(new Error("tts aborted"));
|
||||
return;
|
||||
}
|
||||
if (code === 0) {
|
||||
resolve();
|
||||
return;
|
||||
}
|
||||
reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
|
||||
});
|
||||
child.on("error", reject);
|
||||
});
|
||||
await runPowerShell(script, signal);
|
||||
}
|
||||
|
||||
export class WindowsSystemTtsService implements TtsService {
|
||||
constructor(private readonly speed: number) {
|
||||
const resolvedFfmpegPath = resolveFfmpegPath();
|
||||
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
|
||||
process.env.FFMPEG_PATH = resolvedFfmpegPath;
|
||||
}
|
||||
}
|
||||
constructor(
|
||||
private readonly speed: number,
|
||||
private readonly voiceName?: string,
|
||||
private readonly language = "ko",
|
||||
) {}
|
||||
|
||||
async warmup(): Promise<void> {
|
||||
return;
|
||||
await listWindowsSystemVoices();
|
||||
}
|
||||
|
||||
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
|
||||
const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-tts-${Date.now()}.wav`);
|
||||
await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, signal).catch(async (error) => {
|
||||
await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, this.voiceName, this.language, signal).catch(
|
||||
async (error) => {
|
||||
await unlink(tempPath).catch(() => null);
|
||||
throw error;
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
return {
|
||||
stream: createReadStream(tempPath),
|
||||
|
||||
Reference in New Issue
Block a user