Use Windows system TTS for local mode

This commit is contained in:
2026-04-30 04:01:24 +09:00
parent dc39998241
commit 7ba392c0e7
7 changed files with 158 additions and 8 deletions

View File

@@ -6,7 +6,9 @@
- STT: `faster-whisper` + Whisper multilingual - STT: `faster-whisper` + Whisper multilingual
- LLM: `Ollama` + `qwen3:0.6b` - LLM: `Ollama` + `qwen3:0.6b`
- TTS: `kokoro-onnx` + `misaki[ko]` - TTS:
- Windows: 시스템 기본 음성 엔진
- Linux/macOS: `kokoro-onnx` + `misaki[ko]`
- VAD: `avr-vad` - VAD: `avr-vad`
외부 유료 API나 무료 한도형 API는 쓰지 않습니다. 외부 유료 API나 무료 한도형 API는 쓰지 않습니다.
@@ -143,9 +145,11 @@ OLLAMA_MODEL=qwen3:1.7b
## Windows 메모 ## Windows 메모
- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다.
- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
- Python 탐지가 안 되면 `.env`에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다. - Python 탐지가 안 되면 `.env`에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
- `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다. - Windows의 `setup:local-ai`는 STT만 설치합니다.
- Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
## 설계 메모 ## 설계 메모

View File

@@ -0,0 +1 @@
faster-whisper==1.2.1

View File

@@ -18,6 +18,7 @@ import { Logger } from "./logger.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalKokoroTtsService } from "./services/local-tts.js"; import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js"; import { OllamaLlmService } from "./services/ollama-llm.js";
import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> { export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
const commands = [ const commands = [
@@ -38,7 +39,10 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
}); });
const stt = new LocalFasterWhisperSttService(config, logger); const stt = new LocalFasterWhisperSttService(config, logger);
const tts = new LocalKokoroTtsService(config, logger); const tts =
process.platform === "win32"
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
: new LocalKokoroTtsService(config, logger);
const llm = new OllamaLlmService(config); const llm = new OllamaLlmService(config);
const sessions = new Map<string, GuildVoiceSession>(); const sessions = new Map<string, GuildVoiceSession>();

View File

@@ -8,6 +8,7 @@ import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalKokoroTtsService } from "./services/local-tts.js"; import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js"; import { OllamaLlmService } from "./services/ollama-llm.js";
import { WindowsSystemTtsService } from "./services/windows-system-tts.js";
export async function printLocalAudioDevices(): Promise<void> { export async function printLocalAudioDevices(): Promise<void> {
if (process.platform === "win32") { if (process.platform === "win32") {
@@ -68,7 +69,10 @@ export async function printLocalAudioDevices(): Promise<void> {
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> { export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
const stt = new LocalFasterWhisperSttService(config, logger); const stt = new LocalFasterWhisperSttService(config, logger);
const tts = new LocalKokoroTtsService(config, logger); const tts =
process.platform === "win32"
? new WindowsSystemTtsService(config.LOCAL_TTS_SPEED)
: new LocalKokoroTtsService(config, logger);
const llm = new OllamaLlmService(config); const llm = new OllamaLlmService(config);
await stt.warmup(); await stt.warmup();

View File

@@ -117,6 +117,7 @@ export class PythonJsonWorker {
HF_HOME: cachePath, HF_HOME: cachePath,
TRANSFORMERS_CACHE: cachePath, TRANSFORMERS_CACHE: cachePath,
PYTHONIOENCODING: "utf-8", PYTHONIOENCODING: "utf-8",
HF_HUB_DISABLE_SYMLINKS_WARNING: "1",
BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE, BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE,
...this.workerEnv, ...this.workerEnv,
}, },
@@ -132,6 +133,9 @@ export class PythonJsonWorker {
let payload: WorkerResponse; let payload: WorkerResponse;
try { try {
if (!line.startsWith("{")) {
return;
}
payload = JSON.parse(line) as WorkerResponse; payload = JSON.parse(line) as WorkerResponse;
} catch (error) { } catch (error) {
this.logger.warn(`${this.label} stdout parse failed`, error); this.logger.warn(`${this.label} stdout parse failed`, error);

View File

@@ -0,0 +1,126 @@
import { spawn } from "node:child_process";
import { createReadStream } from "node:fs";
import { unlink } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import prism from "prism-media";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
/**
 * Escapes text for embedding inside a PowerShell single-quoted string literal.
 *
 * PowerShell accepts not only the ASCII apostrophe but also the typographic
 * quotes U+2018, U+2019, U+201A and U+201B as single-quote delimiters, so each
 * of them has to be doubled; otherwise a curly apostrophe in the text would
 * terminate the literal early and the remainder would be parsed as script.
 *
 * @param text Raw text to place between single quotes.
 * @returns The text with every single-quote variant doubled.
 */
function escapePowerShellSingleQuoted(text: string): string {
  // "$&" is the matched character, so "$&$&" doubles whichever variant was found.
  return text.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
/**
 * Converts a playback speed multiplier into an integer speech rate in the
 * range -10..10, where a multiplier of 1 maps to 0.
 *
 * @param speed Speed multiplier (1 = neutral).
 * @returns The rounded rate, clamped to [-10, 10]; 0 when `speed` is NaN.
 */
function toSpeechRate(speed: number): number {
  // NaN would survive rounding and clamping and end up in the generated
  // script as "$synth.Rate = NaN;", so fall back to the neutral rate instead.
  if (Number.isNaN(speed)) {
    return 0;
  }
  const scaled = Math.round((speed - 1) * 8);
  if (scaled > 10) {
    return 10;
  }
  if (scaled < -10) {
    return -10;
  }
  return scaled;
}
/**
 * TTS backend that synthesizes speech with the Windows `System.Speech`
 * synthesizer (driven through PowerShell) and decodes the resulting WAV file
 * to raw PCM with ffmpeg.
 */
export class WindowsSystemTtsService implements TtsService {
  /** Per-process counter that keeps temp file names of concurrent requests distinct. */
  private static tempFileSequence = 0;

  /**
   * @param speed Speed multiplier that is converted to a synthesizer rate via `toSpeechRate`.
   */
  constructor(private readonly speed: number) {
    // Publish the resolved ffmpeg location through FFMPEG_PATH unless the user
    // already set it — presumably so the ffmpeg wrapper can locate the binary;
    // NOTE(review): confirm which consumer reads this variable.
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /** No-op: nothing is loaded ahead of time for this backend. */
  async warmup(): Promise<void> {
    return;
  }

  /**
   * Synthesizes `text` into a temporary WAV file and returns an ffmpeg stream
   * that decodes it to 48 kHz, 2-channel, signed 16-bit little-endian PCM.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal; aborting kills the PowerShell process
   *   or, once playback is prepared, tears down ffmpeg and removes the temp file.
   * @returns The PCM stream plus a `dispose` callback that releases ffmpeg and
   *   deletes the temp file. `dispose` may be called more than once.
   * @throws Error "tts aborted" when the signal is (or becomes) aborted during
   *   synthesis; an Error carrying PowerShell's stderr (or its exit code) when
   *   synthesis fails; whatever spawning PowerShell or ffmpeg throws.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Do not start a PowerShell process for a request that is already cancelled.
    if (signal?.aborted) {
      throw new Error("tts aborted");
    }

    // Date.now() alone can repeat for requests issued within the same
    // millisecond, so the pid and a sequence number are part of the name too.
    WindowsSystemTtsService.tempFileSequence += 1;
    const tempPath = path.join(
      os.tmpdir(),
      `realtime-voice-bot-tts-${process.pid}-${Date.now()}-${WindowsSystemTtsService.tempFileSequence}.wav`,
    );
    const rate = toSpeechRate(this.speed);
    const script = [
      "Add-Type -AssemblyName System.Speech;",
      "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
      "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
      "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
      `$synth.Rate = ${rate};`,
      `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(tempPath)}');`,
      `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
      "$synth.Dispose();",
    ].join(" ");

    try {
      await new Promise<void>((resolve, reject) => {
        const child = spawn("powershell", ["-NoProfile", "-Command", script], {
          stdio: ["ignore", "ignore", "pipe"],
        });

        let stderr = "";
        child.stderr.on("data", (chunk: Buffer) => {
          stderr += chunk.toString();
        });

        const killOnAbort = (): void => {
          if (!child.killed) {
            child.kill("SIGKILL");
          }
        };
        signal?.addEventListener("abort", killOnAbort, { once: true });

        // "close" fires after the stdio streams have ended, so stderr is
        // complete by the time it is turned into an error message.
        child.on("close", (code) => {
          signal?.removeEventListener("abort", killOnAbort);
          if (signal?.aborted) {
            reject(new Error("tts aborted"));
            return;
          }
          if (code === 0) {
            resolve();
            return;
          }
          reject(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
        });
        child.on("error", (error) => {
          signal?.removeEventListener("abort", killOnAbort);
          reject(error);
        });
      });

      // ffmpeg opens the WAV file itself through "-i", so no Node-side read
      // stream is needed.
      const ffmpeg = new prism.FFmpeg({
        args: [
          "-analyzeduration",
          "0",
          "-loglevel",
          "0",
          "-i",
          tempPath,
          "-f",
          "s16le",
          "-ar",
          "48000",
          "-ac",
          "2",
          "pipe:1",
        ],
      });

      let released = false;
      const release = (): void => {
        if (released) {
          return;
        }
        released = true;
        signal?.removeEventListener("abort", release);
        ffmpeg.destroy();
        // Best-effort cleanup: a leftover temp file must not fail playback.
        void unlink(tempPath).catch(() => null);
      };
      signal?.addEventListener("abort", release, { once: true });

      return {
        stream: ffmpeg,
        dispose: release,
      };
    } catch (error) {
      // Synthesis or ffmpeg start-up failed: remove whatever was written.
      await unlink(tempPath).catch(() => null);
      throw error;
    }
  }

  /** No-op: every resource is owned by the object returned from `preparePlayback`. */
  async destroy(): Promise<void> {
    return;
  }
}

View File

@@ -82,11 +82,16 @@ async function main(): Promise<void> {
const cachePath = resolveLocalAiCachePath(config); const cachePath = resolveLocalAiCachePath(config);
const ttsModelPath = resolveLocalAiTtsModelPath(config); const ttsModelPath = resolveLocalAiTtsModelPath(config);
const ttsVoicesPath = resolveLocalAiTtsVoicesPath(config); const ttsVoicesPath = resolveLocalAiTtsVoicesPath(config);
const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt"); const requirementsPath = path.resolve(
process.cwd(),
"python",
process.platform === "win32" ? "requirements-windows.txt" : "requirements.txt",
);
const baseEnv = { const baseEnv = {
HF_HOME: cachePath, HF_HOME: cachePath,
TRANSFORMERS_CACHE: cachePath, TRANSFORMERS_CACHE: cachePath,
PYTHONIOENCODING: "utf-8", PYTHONIOENCODING: "utf-8",
HF_HUB_DISABLE_SYMLINKS_WARNING: "1",
}; };
await mkdir(cachePath, { recursive: true }); await mkdir(cachePath, { recursive: true });
@@ -106,9 +111,11 @@ async function main(): Promise<void> {
console.log("로컬 AI 의존성 설치를 시작합니다."); console.log("로컬 AI 의존성 설치를 시작합니다.");
await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv); await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv);
await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv); await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv);
if (process.platform !== "win32") {
console.log("로컬 TTS 모델 파일을 확인합니다."); console.log("로컬 TTS 모델 파일을 확인합니다.");
await ensureDownload(KOKORO_MODEL_URL, ttsModelPath); await ensureDownload(KOKORO_MODEL_URL, ttsModelPath);
await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath); await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath);
}
console.log("설치가 끝났습니다."); console.log("설치가 끝났습니다.");
console.log("다음 순서:"); console.log("다음 순서:");