From 18369ea7cb3ea258c9f77cf95e26ebe6f135a8f2 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Thu, 30 Apr 2026 03:51:08 +0900 Subject: [PATCH] Switch local TTS to Kokoro ONNX --- .env.example | 6 ++- README.md | 6 ++- python/local_tts_worker.py | 83 +++++++++++++++++++++----------------- python/requirements.txt | 3 +- src/config.ts | 6 ++- src/discord-main.ts | 4 +- src/local-main.ts | 4 +- src/python-runtime.ts | 8 ++++ src/services/local-tts.ts | 5 ++- src/setup-local-ai.ts | 36 ++++++++++++++++- 10 files changed, 112 insertions(+), 49 deletions(-) diff --git a/.env.example b/.env.example index 95d481f..4f98eb8 100644 --- a/.env.example +++ b/.env.example @@ -15,8 +15,10 @@ LOCAL_STT_MODEL=tiny LOCAL_STT_DEVICE=auto LOCAL_STT_COMPUTE_TYPE=auto LOCAL_STT_BEAM_SIZE=1 -LOCAL_TTS_LANGUAGE=KR -LOCAL_TTS_SPEAKER=KR +LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx +LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin +LOCAL_TTS_LANGUAGE=ko +LOCAL_TTS_SPEAKER=af_heart LOCAL_TTS_DEVICE=auto LOCAL_TTS_SPEED=1.12 diff --git a/README.md b/README.md index e60b5c2..064fae9 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - STT: `faster-whisper` + Whisper multilingual - LLM: `Ollama` + `qwen3:0.6b` -- TTS: `MeloTTS` Korean +- TTS: `kokoro-onnx` + `misaki[ko]` - VAD: `avr-vad` 외부 유료 API나 무료 한도형 API는 쓰지 않습니다. @@ -88,6 +88,8 @@ Discord 모드에서만 필수: - `LOCAL_STT_DEVICE` - `LOCAL_STT_COMPUTE_TYPE` - `LOCAL_STT_BEAM_SIZE` +- `LOCAL_TTS_MODEL_PATH` +- `LOCAL_TTS_VOICES_PATH` - `LOCAL_TTS_LANGUAGE` - `LOCAL_TTS_SPEAKER` - `LOCAL_TTS_DEVICE` @@ -118,6 +120,7 @@ Discord 모드에서만 필수: - STT 기본 모델은 `tiny` - LLM 기본 모델은 `qwen3:0.6b` +- TTS 기본 보이스는 `af_heart` - TTS 기본 속도는 `1.12` 정확도가 아쉬우면: @@ -142,6 +145,7 @@ OLLAMA_MODEL=qwen3:1.7b - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. - Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다. +- `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다. ## 설계 메모 diff --git a/python/local_tts_worker.py b/python/local_tts_worker.py index 10a25e9..94d0179 100644 --- a/python/local_tts_worker.py +++ b/python/local_tts_worker.py @@ -1,9 +1,12 @@ import base64 +import io import json import os import sys -import tempfile import traceback +import wave + +import numpy as np os.environ.setdefault("PYTHONIOENCODING", "utf-8") @@ -27,53 +30,61 @@ def write_response(request_id: int, ok: bool, result=None, error: str | None = N sys.stdout.flush() +def normalize_lang(raw: str) -> str: + lowered = raw.strip().lower() + if lowered in {"kr", "ko-kr"}: + return "ko" + return lowered or "ko" + + +def normalize_voice(raw: str) -> str: + value = raw.strip() + if value.upper() in {"KR", "KO"} or not value: + return "af_heart" + return value + + class TtsWorker: def __init__(self) -> None: - from melo.api import TTS + from kokoro_onnx import Kokoro + from misaki import ko - self.language = os.environ.get("LOCAL_TTS_LANGUAGE", "KR").strip() or "KR" - self.speaker_key = os.environ.get("LOCAL_TTS_SPEAKER", "KR").strip() or "KR" - self.device = os.environ.get("LOCAL_TTS_DEVICE", "auto").strip() or "auto" + self.model_path = os.environ["LOCAL_TTS_MODEL_PATH"] + self.voices_path = os.environ["LOCAL_TTS_VOICES_PATH"] + self.language = normalize_lang(os.environ.get("LOCAL_TTS_LANGUAGE", "ko")) + self.voice = normalize_voice(os.environ.get("LOCAL_TTS_SPEAKER", "af_heart")) self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12")) - - self.model = TTS(language=self.language, device=self.device) - speaker_ids = self.model.hps.data.spk2id - self.speaker_id = speaker_ids.get(self.speaker_key) - - if self.speaker_id is None: - normalized = self.speaker_key.upper() - self.speaker_id = speaker_ids.get(normalized) - - if self.speaker_id is None: - self.speaker_id = next(iter(speaker_ids.values())) + self.g2p = ko.KOG2P() + self.model = Kokoro(self.model_path, self.voices_path) log( - f"local-tts ready language={self.language} speaker={self.speaker_key} device={self.device} speed={self.speed}" + f"local-tts ready model={os.path.basename(self.model_path)} voice={self.voice} language={self.language} speed={self.speed}" ) def synthesize(self, text: str) -> bytes: - temp_path = "" + phonemes, _tokens = self.g2p(text) + samples, sample_rate = self.model.create( + phonemes, + voice=self.voice, + speed=self.speed, + lang="en-us", + is_phonemes=True, + ) + return build_wav_bytes(samples, sample_rate) - try: - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle: - temp_path = handle.name - self.model.tts_to_file( - text, - self.speaker_id, - temp_path, - speed=self.speed, - quiet=True, - ) +def build_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes: + clipped = np.clip(samples, -1.0, 1.0) + pcm = (clipped * 32767.0).astype(np.int16) + buffer = io.BytesIO() - with open(temp_path, "rb") as handle: - return handle.read() - finally: - if temp_path: - try: - os.unlink(temp_path) - except OSError: - pass + with wave.open(buffer, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(pcm.tobytes()) + + return buffer.getvalue() def main() -> int: diff --git a/python/requirements.txt b/python/requirements.txt index ccdfa62..35c1a15 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,2 +1,3 @@ faster-whisper==1.2.1 -git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2 +kokoro-onnx==0.5.0 +misaki[ko]==0.9.4 diff --git a/src/config.ts b/src/config.ts index c814ac1..6259d4c 100644 --- a/src/config.ts +++ b/src/config.ts @@ -26,8 +26,10 @@ const envSchema = z.object({ LOCAL_STT_DEVICE: z.string().min(1).default("auto"), LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"), LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1), - LOCAL_TTS_LANGUAGE: z.string().min(1).default("KR"), - LOCAL_TTS_SPEAKER: z.string().min(1).default("KR"), + LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"), + LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"), + LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"), + LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"), LOCAL_TTS_DEVICE: z.string().min(1).default("auto"), LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), diff --git a/src/discord-main.ts b/src/discord-main.ts index 88cde72..0a11791 100644 --- a/src/discord-main.ts +++ b/src/discord-main.ts @@ -16,7 +16,7 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js"; import { type DiscordRuntimeConfig } from "./config.js"; import { Logger } from "./logger.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; -import { LocalMeloTtsService } from "./services/local-tts.js"; +import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise { @@ -38,7 +38,7 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger }); const stt = new LocalFasterWhisperSttService(config, logger); - const tts = new LocalMeloTtsService(config, logger); + const tts = new LocalKokoroTtsService(config, logger); const llm = new OllamaLlmService(config); const sessions = new Map(); diff --git a/src/local-main.ts b/src/local-main.ts index ebd2c5a..5a573ca 100644 --- a/src/local-main.ts +++ b/src/local-main.ts @@ -6,7 +6,7 @@ import { Logger } from "./logger.js"; import { LocalVoiceSession } from "./audio/local-voice-session.js"; import { requireFfmpegPath } from "./audio/ffmpeg-path.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js"; -import { LocalMeloTtsService } from "./services/local-tts.js"; +import { LocalKokoroTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; export async function printLocalAudioDevices(): Promise { @@ -68,7 +68,7 @@ export async function printLocalAudioDevices(): Promise { export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise { const stt = new LocalFasterWhisperSttService(config, logger); - const tts = new LocalMeloTtsService(config, logger); + const tts = new LocalKokoroTtsService(config, logger); const llm = new OllamaLlmService(config); await stt.warmup(); diff --git a/src/python-runtime.ts b/src/python-runtime.ts index ab40499..264aa00 100644 --- a/src/python-runtime.ts +++ b/src/python-runtime.ts @@ -30,6 +30,14 @@ export function resolveLocalAiCachePath(config: AppConfig): string { return path.resolve(process.cwd(), config.LOCAL_AI_CACHE_DIR); } +export function resolveLocalAiTtsModelPath(config: AppConfig): string { + return path.resolve(process.cwd(), config.LOCAL_TTS_MODEL_PATH); +} + +export function resolveLocalAiTtsVoicesPath(config: AppConfig): string { + return path.resolve(process.cwd(), config.LOCAL_TTS_VOICES_PATH); +} + export function resolveVenvPythonPath(config: AppConfig): string { const venvPath = resolveLocalAiVenvPath(config); return process.platform === "win32" diff --git a/src/services/local-tts.ts b/src/services/local-tts.ts index 3191c09..36a837d 100644 --- a/src/services/local-tts.ts +++ b/src/services/local-tts.ts @@ -7,12 +7,13 @@ import type { Logger } from "../logger.js"; import { resolveFfmpegPath } from "../audio/ffmpeg-path.js"; import { PythonJsonWorker } from "./python-json-worker.js"; import type { PreparedSpeechAudio, TtsService } from "./tts.js"; +import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js"; interface SynthesizeResult { wav_base64?: string; } -export class LocalMeloTtsService implements TtsService { +export class LocalKokoroTtsService implements TtsService { private readonly worker: PythonJsonWorker; constructor(config: AssistantRuntimeConfig, logger: Logger) { @@ -22,6 +23,8 @@ export class LocalMeloTtsService implements TtsService { } this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", { + LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config), + LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config), LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE, LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER, LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE, diff --git a/src/setup-local-ai.ts b/src/setup-local-ai.ts index 69e1e56..1b80195 100644 --- a/src/setup-local-ai.ts +++ b/src/setup-local-ai.ts @@ -1,10 +1,22 @@ import { existsSync } from "node:fs"; -import { mkdir } from "node:fs/promises"; +import { mkdir, writeFile } from "node:fs/promises"; import { spawn } from "node:child_process"; import path from "node:path"; import { loadConfig } from "./config.js"; -import { resolveLocalAiCachePath, resolveLocalAiVenvPath, resolvePythonLaunch, resolveVenvPythonPath } from "./python-runtime.js"; +import { + resolveLocalAiCachePath, + resolveLocalAiTtsModelPath, + resolveLocalAiTtsVoicesPath, + resolveLocalAiVenvPath, + resolvePythonLaunch, + resolveVenvPythonPath, +} from "./python-runtime.js"; + +const KOKORO_MODEL_URL = + "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"; +const KOKORO_VOICES_URL = + "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"; async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise { await new Promise((resolve, reject) => { @@ -48,11 +60,28 @@ async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise { + if (existsSync(filePath)) { + return; + } + + await mkdir(path.dirname(filePath), { recursive: true }); + const response = await fetch(url); + if (!response.ok) { + throw new Error(`다운로드 실패: ${url} (${response.status})`); + } + + const bytes = Buffer.from(await response.arrayBuffer()); + await writeFile(filePath, bytes); +} + async function main(): Promise { const config = loadConfig(); const venvPath = resolveLocalAiVenvPath(config); const venvPython = resolveVenvPythonPath(config); const cachePath = resolveLocalAiCachePath(config); + const ttsModelPath = resolveLocalAiTtsModelPath(config); + const ttsVoicesPath = resolveLocalAiTtsVoicesPath(config); const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt"); const baseEnv = { HF_HOME: cachePath, @@ -77,6 +106,9 @@ async function main(): Promise { console.log("로컬 AI 의존성 설치를 시작합니다."); await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv); await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv); + console.log("로컬 TTS 모델 파일을 확인합니다."); + await ensureDownload(KOKORO_MODEL_URL, ttsModelPath); + await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath); console.log("설치가 끝났습니다."); console.log("다음 순서:");