diff --git a/README.md b/README.md index 064fae9..776d6b0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ - STT: `faster-whisper` + Whisper multilingual - LLM: `Ollama` + `qwen3:0.6b` -- TTS: `kokoro-onnx` + `misaki[ko]` +- TTS: +- Windows: 시스템 기본 음성 엔진 +- Linux/macOS: `kokoro-onnx` + `misaki[ko]` - VAD: `avr-vad` 외부 유료 API나 무료 한도형 API는 쓰지 않습니다. @@ -143,9 +145,11 @@ OLLAMA_MODEL=qwen3:1.7b ## Windows 메모 - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. +- Windows는 TTS를 Python 모델 대신 시스템 기본 음성 엔진으로 처리합니다. - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. - Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다. -- `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다. +- Windows의 `setup:local-ai`는 STT만 설치합니다. +- Linux/macOS의 `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다. ## 설계 메모 diff --git a/python/requirements-windows.txt b/python/requirements-windows.txt new file mode 100644 index 0000000..222236a --- /dev/null +++ b/python/requirements-windows.txt @@ -0,0 +1 @@ +faster-whisper==1.2.1 diff --git a/src/discord-main.ts b/src/discord-main.ts index 0a11791..0f42e14 100644 --- a/src/discord-main.ts +++ b/src/discord-main.ts @@ -18,6 +18,7 @@ import { Logger } from "./logger.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; +import { WindowsSystemTtsService } from "./services/windows-system-tts.js"; export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise { const commands = [ @@ -38,7 +39,10 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger }); const stt = new LocalFasterWhisperSttService(config, logger); - const tts = new LocalKokoroTtsService(config, logger); + const tts = + process.platform === "win32" + ? 
new WindowsSystemTtsService(config.LOCAL_TTS_SPEED) + : new LocalKokoroTtsService(config, logger); const llm = new OllamaLlmService(config); const sessions = new Map(); diff --git a/src/local-main.ts b/src/local-main.ts index 5a573ca..c35a9c6 100644 --- a/src/local-main.ts +++ b/src/local-main.ts @@ -8,6 +8,7 @@ import { requireFfmpegPath } from "./audio/ffmpeg-path.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; +import { WindowsSystemTtsService } from "./services/windows-system-tts.js"; export async function printLocalAudioDevices(): Promise { if (process.platform === "win32") { @@ -68,7 +69,10 @@ export async function printLocalAudioDevices(): Promise { export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise { const stt = new LocalFasterWhisperSttService(config, logger); - const tts = new LocalKokoroTtsService(config, logger); + const tts = + process.platform === "win32" + ? 
/**
 * Escapes `text` so it can be embedded between single quotes in a PowerShell script.
 *
 * PowerShell accepts not only the ASCII apostrophe but also the typographic
 * quotes U+2018, U+2019, U+201A and U+201B as single-quote delimiters, so each
 * of them has to be doubled. Escaping only `'` would let text containing a
 * "smart" apostrophe terminate the literal and run as script.
 *
 * @param text Raw text (for example an LLM reply or a file path).
 * @returns The text with every single-quote-like character doubled.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}

/**
 * Maps a playback speed multiplier (1 = normal) onto the integer
 * `SpeechSynthesizer.Rate` scale, clamped to the range -10..10.
 *
 * @param speed Speed multiplier taken from configuration.
 * @returns An integer rate; `NaN` input falls back to 0 (normal speed) so the
 *   generated script never contains `$synth.Rate = NaN;`.
 */
function toSpeechRate(speed: number): number {
  if (Number.isNaN(speed)) {
    return 0;
  }
  const mapped = Math.round((speed - 1) * 8);
  return Math.max(-10, Math.min(10, mapped));
}

/** Per-process sequence number that keeps temp file names distinct within one millisecond. */
let tempWavSequence = 0;

/** Builds a temp WAV path that is unique across concurrent calls and across processes. */
function createTempWavPath(): string {
  tempWavSequence += 1;
  const unique = `${Date.now()}-${process.pid}-${tempWavSequence}`;
  return path.join(os.tmpdir(), `realtime-voice-bot-tts-${unique}.wav`);
}

/**
 * Runs a PowerShell script to completion.
 *
 * @param script Script text passed to `powershell -NoProfile -Command`.
 * @param signal Optional abort signal; aborting kills the PowerShell process.
 * @throws Error("tts aborted") when the signal is aborted, otherwise an Error
 *   carrying the collected stderr (or the exit code) on a non-zero exit.
 */
function runPowerShellScript(script: string, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const child = spawn("powershell", ["-NoProfile", "-Command", script], {
      stdio: ["ignore", "ignore", "pipe"],
    });

    let stderr = "";
    child.stderr.on("data", (chunk: Buffer) => {
      stderr += chunk.toString();
    });

    const onAbort = (): void => {
      if (!child.killed) {
        child.kill("SIGKILL");
      }
    };
    signal?.addEventListener("abort", onAbort, { once: true });

    // Always detach the abort listener so a long-lived signal does not
    // accumulate one closure per synthesized utterance.
    const settle = (error?: Error): void => {
      signal?.removeEventListener("abort", onAbort);
      if (error) {
        reject(error);
      } else {
        resolve();
      }
    };

    child.on("error", (error) => settle(error));
    // "close" (not "exit") so stderr has been fully drained before the error
    // message is built.
    child.on("close", (code) => {
      if (signal?.aborted) {
        settle(new Error("tts aborted"));
        return;
      }
      if (code === 0) {
        settle();
        return;
      }
      settle(new Error(stderr.trim() || `powershell tts exited with code ${code ?? "null"}`));
    });
  });
}

/**
 * Text-to-speech backend for Windows that synthesizes speech with the
 * `System.Speech` synthesizer through PowerShell and transcodes the resulting
 * WAV file to 48 kHz stereo s16le PCM with ffmpeg.
 */
export class WindowsSystemTtsService implements TtsService {
  /**
   * @param speed Speed multiplier (1 = normal) that is converted with `toSpeechRate`.
   */
  constructor(private readonly speed: number) {
    // Publish the resolved ffmpeg path through FFMPEG_PATH unless the user
    // already set one.
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /** No-op: this backend has nothing to preload. */
  async warmup(): Promise<void> {
    return;
  }

  /**
   * Synthesizes `text` into a temporary WAV file and returns a PCM stream for it.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal. Aborting during synthesis rejects with
   *   "tts aborted"; aborting afterwards destroys the stream and removes the
   *   temporary file.
   * @returns The ffmpeg output stream plus a `dispose` callback that destroys
   *   the stream and deletes the temporary file. `dispose` is idempotent.
   * @throws Error when the signal is aborted or PowerShell exits with a failure.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Do not start a synthesis run that is already cancelled.
    if (signal?.aborted) {
      throw new Error("tts aborted");
    }

    const tempPath = createTempWavPath();
    const rate = toSpeechRate(this.speed);
    const script = [
      "Add-Type -AssemblyName System.Speech;",
      "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
      "$koVoice = $synth.GetInstalledVoices() | Where-Object { $_.VoiceInfo.Culture.Name -like 'ko*' } | Select-Object -First 1;",
      "if ($koVoice) { $synth.SelectVoice($koVoice.VoiceInfo.Name) }",
      `$synth.Rate = ${rate};`,
      `$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(tempPath)}');`,
      `$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
      "$synth.Dispose();",
    ].join(" ");

    try {
      await runPowerShellScript(script, signal);
    } catch (error) {
      // Best effort: the file may not exist if PowerShell failed early.
      await unlink(tempPath).catch(() => null);
      throw error;
    }

    // ffmpeg reads the WAV directly from disk via "-i", so no Node read stream
    // on the file is needed.
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        tempPath,
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    let released = false;
    const release = (): void => {
      if (released) {
        return;
      }
      released = true;
      signal?.removeEventListener("abort", release);
      ffmpeg.destroy();
      void unlink(tempPath).catch(() => null);
    };
    signal?.addEventListener("abort", release, { once: true });

    return {
      stream: ffmpeg,
      dispose: release,
    };
  }

  /** No-op: there is no long-lived resource to release. */
  async destroy(): Promise<void> {
    return;
  }
}
"requirements-windows.txt" : "requirements.txt", + ); const baseEnv = { HF_HOME: cachePath, TRANSFORMERS_CACHE: cachePath, PYTHONIOENCODING: "utf-8", + HF_HUB_DISABLE_SYMLINKS_WARNING: "1", }; await mkdir(cachePath, { recursive: true }); @@ -106,9 +111,11 @@ async function main(): Promise { console.log("로컬 AI 의존성 설치를 시작합니다."); await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv); await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv); - console.log("로컬 TTS 모델 파일을 확인합니다."); - await ensureDownload(KOKORO_MODEL_URL, ttsModelPath); - await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath); + if (process.platform !== "win32") { + console.log("로컬 TTS 모델 파일을 확인합니다."); + await ensureDownload(KOKORO_MODEL_URL, ttsModelPath); + await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath); + } console.log("설치가 끝났습니다."); console.log("다음 순서:");