Switch local TTS to Kokoro ONNX

This commit is contained in:
2026-04-30 03:51:08 +09:00
parent 178283be61
commit 18369ea7cb
10 changed files with 112 additions and 49 deletions

View File

@@ -15,8 +15,10 @@ LOCAL_STT_MODEL=tiny
LOCAL_STT_DEVICE=auto LOCAL_STT_DEVICE=auto
LOCAL_STT_COMPUTE_TYPE=auto LOCAL_STT_COMPUTE_TYPE=auto
LOCAL_STT_BEAM_SIZE=1 LOCAL_STT_BEAM_SIZE=1
LOCAL_TTS_LANGUAGE=KR LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
LOCAL_TTS_SPEAKER=KR LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
LOCAL_TTS_LANGUAGE=ko
LOCAL_TTS_SPEAKER=af_heart
LOCAL_TTS_DEVICE=auto LOCAL_TTS_DEVICE=auto
LOCAL_TTS_SPEED=1.12 LOCAL_TTS_SPEED=1.12

View File

@@ -6,7 +6,7 @@
- STT: `faster-whisper` + Whisper multilingual - STT: `faster-whisper` + Whisper multilingual
- LLM: `Ollama` + `qwen3:0.6b` - LLM: `Ollama` + `qwen3:0.6b`
- TTS: `MeloTTS` Korean - TTS: `kokoro-onnx` + `misaki[ko]`
- VAD: `avr-vad` - VAD: `avr-vad`
외부 유료 API나 무료 한도형 API는 쓰지 않습니다. 외부 유료 API나 무료 한도형 API는 쓰지 않습니다.
@@ -88,6 +88,8 @@ Discord 모드에서만 필수:
- `LOCAL_STT_DEVICE` - `LOCAL_STT_DEVICE`
- `LOCAL_STT_COMPUTE_TYPE` - `LOCAL_STT_COMPUTE_TYPE`
- `LOCAL_STT_BEAM_SIZE` - `LOCAL_STT_BEAM_SIZE`
- `LOCAL_TTS_MODEL_PATH`
- `LOCAL_TTS_VOICES_PATH`
- `LOCAL_TTS_LANGUAGE` - `LOCAL_TTS_LANGUAGE`
- `LOCAL_TTS_SPEAKER` - `LOCAL_TTS_SPEAKER`
- `LOCAL_TTS_DEVICE` - `LOCAL_TTS_DEVICE`
@@ -118,6 +120,7 @@ Discord 모드에서만 필수:
- STT 기본 모델은 `tiny` - STT 기본 모델은 `tiny`
- LLM 기본 모델은 `qwen3:0.6b` - LLM 기본 모델은 `qwen3:0.6b`
- TTS 기본 보이스는 `af_heart`
- TTS 기본 속도는 `1.12` - TTS 기본 속도는 `1.12`
정확도가 아쉬우면: 정확도가 아쉬우면:
@@ -142,6 +145,7 @@ OLLAMA_MODEL=qwen3:1.7b
- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. - `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. - 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
- Python 탐지가 안 되면 `.env`에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다. - Python 탐지가 안 되면 `.env`에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
- `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
## 설계 메모 ## 설계 메모

View File

@@ -1,9 +1,12 @@
import base64 import base64
import io
import json import json
import os import os
import sys import sys
import tempfile
import traceback import traceback
import wave
import numpy as np
os.environ.setdefault("PYTHONIOENCODING", "utf-8") os.environ.setdefault("PYTHONIOENCODING", "utf-8")
@@ -27,53 +30,61 @@ def write_response(request_id: int, ok: bool, result=None, error: str | None = N
sys.stdout.flush() sys.stdout.flush()
def normalize_lang(raw: str) -> str:
    """Return a canonical lowercase language code for the TTS worker.

    Legacy MeloTTS-era values ("KR", "ko-KR") and the locale-style spelling
    "ko_KR" all collapse to "ko". Empty, whitespace-only, or missing input
    falls back to "ko" as well. Any other value is returned stripped and
    lowercased, otherwise unchanged.
    """
    lowered = (raw or "").strip().lower()
    # Compare with "_" folded to "-" so "ko_KR" matches the same alias as
    # "ko-KR"; non-alias values are still returned exactly as lowered.
    if lowered.replace("_", "-") in {"kr", "ko-kr"}:
        return "ko"
    return lowered or "ko"
def normalize_voice(raw: str) -> str:
    """Return the voice name to use, defaulting to "af_heart".

    Empty, whitespace-only, or missing input yields the default voice.
    Language-like values carried over from the previous speaker setting
    ("KR", "KO", "ko-KR", "ko_KR", any casing) are also mapped to the
    default. Any other value is returned stripped, with its casing preserved.
    """
    value = (raw or "").strip()
    # Fold "_" to "-" so the same Korean aliases accepted for the language
    # setting are recognised here too.
    legacy_key = value.upper().replace("_", "-")
    if not value or legacy_key in {"KR", "KO", "KO-KR"}:
        return "af_heart"
    return value
class TtsWorker: class TtsWorker:
def __init__(self) -> None: def __init__(self) -> None:
from melo.api import TTS from kokoro_onnx import Kokoro
from misaki import ko
self.language = os.environ.get("LOCAL_TTS_LANGUAGE", "KR").strip() or "KR" self.model_path = os.environ["LOCAL_TTS_MODEL_PATH"]
self.speaker_key = os.environ.get("LOCAL_TTS_SPEAKER", "KR").strip() or "KR" self.voices_path = os.environ["LOCAL_TTS_VOICES_PATH"]
self.device = os.environ.get("LOCAL_TTS_DEVICE", "auto").strip() or "auto" self.language = normalize_lang(os.environ.get("LOCAL_TTS_LANGUAGE", "ko"))
self.voice = normalize_voice(os.environ.get("LOCAL_TTS_SPEAKER", "af_heart"))
self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12")) self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12"))
self.g2p = ko.KOG2P()
self.model = TTS(language=self.language, device=self.device) self.model = Kokoro(self.model_path, self.voices_path)
speaker_ids = self.model.hps.data.spk2id
self.speaker_id = speaker_ids.get(self.speaker_key)
if self.speaker_id is None:
normalized = self.speaker_key.upper()
self.speaker_id = speaker_ids.get(normalized)
if self.speaker_id is None:
self.speaker_id = next(iter(speaker_ids.values()))
log( log(
f"local-tts ready language={self.language} speaker={self.speaker_key} device={self.device} speed={self.speed}" f"local-tts ready model={os.path.basename(self.model_path)} voice={self.voice} language={self.language} speed={self.speed}"
) )
def synthesize(self, text: str) -> bytes: def synthesize(self, text: str) -> bytes:
temp_path = "" phonemes, _tokens = self.g2p(text)
samples, sample_rate = self.model.create(
phonemes,
voice=self.voice,
speed=self.speed,
lang="en-us",
is_phonemes=True,
)
return build_wav_bytes(samples, sample_rate)
try:
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
temp_path = handle.name
self.model.tts_to_file( def build_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes:
text, clipped = np.clip(samples, -1.0, 1.0)
self.speaker_id, pcm = (clipped * 32767.0).astype(np.int16)
temp_path, buffer = io.BytesIO()
speed=self.speed,
quiet=True,
)
with open(temp_path, "rb") as handle: with wave.open(buffer, "wb") as wav_file:
return handle.read() wav_file.setnchannels(1)
finally: wav_file.setsampwidth(2)
if temp_path: wav_file.setframerate(sample_rate)
try: wav_file.writeframes(pcm.tobytes())
os.unlink(temp_path)
except OSError: return buffer.getvalue()
pass
def main() -> int: def main() -> int:

View File

@@ -1,2 +1,3 @@
faster-whisper==1.2.1 faster-whisper==1.2.1
git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2 kokoro-onnx==0.5.0
misaki[ko]==0.9.4

View File

@@ -26,8 +26,10 @@ const envSchema = z.object({
LOCAL_STT_DEVICE: z.string().min(1).default("auto"), LOCAL_STT_DEVICE: z.string().min(1).default("auto"),
LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"), LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"),
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1), LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
LOCAL_TTS_LANGUAGE: z.string().min(1).default("KR"), LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
LOCAL_TTS_SPEAKER: z.string().min(1).default("KR"), LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"), LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12), LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12),
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),

View File

@@ -16,7 +16,7 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { type DiscordRuntimeConfig } from "./config.js"; import { type DiscordRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js"; import { Logger } from "./logger.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalMeloTtsService } from "./services/local-tts.js"; import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js"; import { OllamaLlmService } from "./services/ollama-llm.js";
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> { export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
@@ -38,7 +38,7 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
}); });
const stt = new LocalFasterWhisperSttService(config, logger); const stt = new LocalFasterWhisperSttService(config, logger);
const tts = new LocalMeloTtsService(config, logger); const tts = new LocalKokoroTtsService(config, logger);
const llm = new OllamaLlmService(config); const llm = new OllamaLlmService(config);
const sessions = new Map<string, GuildVoiceSession>(); const sessions = new Map<string, GuildVoiceSession>();

View File

@@ -6,7 +6,7 @@ import { Logger } from "./logger.js";
import { LocalVoiceSession } from "./audio/local-voice-session.js"; import { LocalVoiceSession } from "./audio/local-voice-session.js";
import { requireFfmpegPath } from "./audio/ffmpeg-path.js"; import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js"; import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { LocalMeloTtsService } from "./services/local-tts.js"; import { LocalKokoroTtsService } from "./services/local-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js"; import { OllamaLlmService } from "./services/ollama-llm.js";
export async function printLocalAudioDevices(): Promise<void> { export async function printLocalAudioDevices(): Promise<void> {
@@ -68,7 +68,7 @@ export async function printLocalAudioDevices(): Promise<void> {
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> { export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
const stt = new LocalFasterWhisperSttService(config, logger); const stt = new LocalFasterWhisperSttService(config, logger);
const tts = new LocalMeloTtsService(config, logger); const tts = new LocalKokoroTtsService(config, logger);
const llm = new OllamaLlmService(config); const llm = new OllamaLlmService(config);
await stt.warmup(); await stt.warmup();

View File

@@ -30,6 +30,14 @@ export function resolveLocalAiCachePath(config: AppConfig): string {
return path.resolve(process.cwd(), config.LOCAL_AI_CACHE_DIR); return path.resolve(process.cwd(), config.LOCAL_AI_CACHE_DIR);
} }
/**
 * Resolve the configured TTS model path (`LOCAL_TTS_MODEL_PATH`) to an
 * absolute path, relative to the current working directory.
 */
export function resolveLocalAiTtsModelPath(config: AppConfig): string {
  const { LOCAL_TTS_MODEL_PATH: modelPath } = config;
  const baseDir = process.cwd();
  return path.resolve(baseDir, modelPath);
}
/**
 * Resolve the configured TTS voices path (`LOCAL_TTS_VOICES_PATH`) to an
 * absolute path, relative to the current working directory.
 */
export function resolveLocalAiTtsVoicesPath(config: AppConfig): string {
  const { LOCAL_TTS_VOICES_PATH: voicesPath } = config;
  const baseDir = process.cwd();
  return path.resolve(baseDir, voicesPath);
}
export function resolveVenvPythonPath(config: AppConfig): string { export function resolveVenvPythonPath(config: AppConfig): string {
const venvPath = resolveLocalAiVenvPath(config); const venvPath = resolveLocalAiVenvPath(config);
return process.platform === "win32" return process.platform === "win32"

View File

@@ -7,12 +7,13 @@ import type { Logger } from "../logger.js";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js"; import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import { PythonJsonWorker } from "./python-json-worker.js"; import { PythonJsonWorker } from "./python-json-worker.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js"; import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js";
interface SynthesizeResult { interface SynthesizeResult {
wav_base64?: string; wav_base64?: string;
} }
export class LocalMeloTtsService implements TtsService { export class LocalKokoroTtsService implements TtsService {
private readonly worker: PythonJsonWorker; private readonly worker: PythonJsonWorker;
constructor(config: AssistantRuntimeConfig, logger: Logger) { constructor(config: AssistantRuntimeConfig, logger: Logger) {
@@ -22,6 +23,8 @@ export class LocalMeloTtsService implements TtsService {
} }
this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", { this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config),
LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config),
LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE, LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER, LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE, LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,

View File

@@ -1,10 +1,22 @@
import { existsSync } from "node:fs"; import { existsSync } from "node:fs";
import { mkdir } from "node:fs/promises"; import { mkdir, writeFile } from "node:fs/promises";
import { spawn } from "node:child_process"; import { spawn } from "node:child_process";
import path from "node:path"; import path from "node:path";
import { loadConfig } from "./config.js"; import { loadConfig } from "./config.js";
import { resolveLocalAiCachePath, resolveLocalAiVenvPath, resolvePythonLaunch, resolveVenvPythonPath } from "./python-runtime.js"; import {
resolveLocalAiCachePath,
resolveLocalAiTtsModelPath,
resolveLocalAiTtsVoicesPath,
resolveLocalAiVenvPath,
resolvePythonLaunch,
resolveVenvPythonPath,
} from "./python-runtime.js";
const KOKORO_MODEL_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx";
const KOKORO_VOICES_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise<void> { async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise<void> {
await new Promise<void>((resolve, reject) => { await new Promise<void>((resolve, reject) => {
@@ -48,11 +60,28 @@ async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise<voi
}); });
} }
/**
 * Download `url` to `filePath` unless a file already exists there.
 *
 * The payload is first written to `<filePath>.partial` and then renamed into
 * place, so an interrupted or failed write never leaves a truncated file at
 * `filePath` that a later run would mistake for a finished download.
 *
 * @param url - Remote file to fetch.
 * @param filePath - Destination path; parent directories are created as needed.
 * @throws Error when the HTTP response status is not OK, or when writing or
 *   renaming the file fails.
 */
async function ensureDownload(url: string, filePath: string): Promise<void> {
  if (existsSync(filePath)) {
    return;
  }

  // Loaded locally so this function does not depend on the file-level import list.
  const { rename, rm } = await import("node:fs/promises");

  await mkdir(path.dirname(filePath), { recursive: true });

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`다운로드 실패: ${url} (${response.status})`);
  }

  const bytes = Buffer.from(await response.arrayBuffer());
  const partialPath = `${filePath}.partial`;

  try {
    await writeFile(partialPath, bytes);
    // Only a fully written file is ever moved to the final location.
    await rename(partialPath, filePath);
  } catch (error) {
    // Best-effort cleanup of the temporary file; the original error is rethrown.
    await rm(partialPath, { force: true });
    throw error;
  }
}
async function main(): Promise<void> { async function main(): Promise<void> {
const config = loadConfig(); const config = loadConfig();
const venvPath = resolveLocalAiVenvPath(config); const venvPath = resolveLocalAiVenvPath(config);
const venvPython = resolveVenvPythonPath(config); const venvPython = resolveVenvPythonPath(config);
const cachePath = resolveLocalAiCachePath(config); const cachePath = resolveLocalAiCachePath(config);
const ttsModelPath = resolveLocalAiTtsModelPath(config);
const ttsVoicesPath = resolveLocalAiTtsVoicesPath(config);
const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt"); const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt");
const baseEnv = { const baseEnv = {
HF_HOME: cachePath, HF_HOME: cachePath,
@@ -77,6 +106,9 @@ async function main(): Promise<void> {
console.log("로컬 AI 의존성 설치를 시작합니다."); console.log("로컬 AI 의존성 설치를 시작합니다.");
await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv); await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv);
await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv); await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv);
console.log("로컬 TTS 모델 파일을 확인합니다.");
await ensureDownload(KOKORO_MODEL_URL, ttsModelPath);
await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath);
console.log("설치가 끝났습니다."); console.log("설치가 끝났습니다.");
console.log("다음 순서:"); console.log("다음 순서:");