Switch local TTS to Kokoro ONNX
This commit is contained in:
@@ -15,8 +15,10 @@ LOCAL_STT_MODEL=tiny
|
||||
LOCAL_STT_DEVICE=auto
|
||||
LOCAL_STT_COMPUTE_TYPE=auto
|
||||
LOCAL_STT_BEAM_SIZE=1
|
||||
LOCAL_TTS_LANGUAGE=KR
|
||||
LOCAL_TTS_SPEAKER=KR
|
||||
LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
|
||||
LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
|
||||
LOCAL_TTS_LANGUAGE=ko
|
||||
LOCAL_TTS_SPEAKER=af_heart
|
||||
LOCAL_TTS_DEVICE=auto
|
||||
LOCAL_TTS_SPEED=1.12
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
- STT: `faster-whisper` + Whisper multilingual
|
||||
- LLM: `Ollama` + `qwen3:0.6b`
|
||||
- TTS: `MeloTTS` Korean
|
||||
- TTS: `kokoro-onnx` + `misaki[ko]`
|
||||
- VAD: `avr-vad`
|
||||
|
||||
외부 유료 API나 무료 한도형 API는 쓰지 않습니다.
|
||||
@@ -88,6 +88,8 @@ Discord 모드에서만 필수:
|
||||
- `LOCAL_STT_DEVICE`
|
||||
- `LOCAL_STT_COMPUTE_TYPE`
|
||||
- `LOCAL_STT_BEAM_SIZE`
|
||||
- `LOCAL_TTS_MODEL_PATH`
|
||||
- `LOCAL_TTS_VOICES_PATH`
|
||||
- `LOCAL_TTS_LANGUAGE`
|
||||
- `LOCAL_TTS_SPEAKER`
|
||||
- `LOCAL_TTS_DEVICE`
|
||||
@@ -118,6 +120,7 @@ Discord 모드에서만 필수:
|
||||
|
||||
- STT 기본 모델은 `tiny`
|
||||
- LLM 기본 모델은 `qwen3:0.6b`
|
||||
- TTS 기본 보이스는 `af_heart`
|
||||
- TTS 기본 속도는 `1.12`
|
||||
|
||||
정확도가 아쉬우면:
|
||||
@@ -142,6 +145,7 @@ OLLAMA_MODEL=qwen3:1.7b
|
||||
- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다.
|
||||
- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다.
|
||||
- Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 를 넣으면 됩니다.
|
||||
- `setup:local-ai`는 Kokoro ONNX 모델 파일도 자동으로 내려받습니다.
|
||||
|
||||
## 설계 메모
|
||||
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import traceback
|
||||
import wave
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
os.environ.setdefault("PYTHONIOENCODING", "utf-8")
|
||||
@@ -27,53 +30,61 @@ def write_response(request_id: int, ok: bool, result=None, error: str | None = N
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
def normalize_lang(raw: str) -> str:
    """Return a lowercase language code for the TTS worker.

    Legacy Korean spellings are collapsed to ``"ko"``: ``KR``, ``ko-KR`` and
    the POSIX locale form ``ko_KR``. An empty or whitespace-only value also
    falls back to ``"ko"``. Any other value is returned stripped and
    lowercased, otherwise unchanged.

    Args:
        raw: Language setting as read from the environment.

    Returns:
        The normalized language code.
    """
    lowered = raw.strip().lower()
    # "ko_kr" is the underscore (POSIX locale) spelling of the same legacy value.
    if lowered in {"kr", "ko-kr", "ko_kr"}:
        return "ko"
    return lowered or "ko"
|
||||
|
||||
|
||||
def normalize_voice(raw: str) -> str:
    """Return the voice name to use, defaulting to ``"af_heart"``.

    An empty value, or one of the legacy speaker keys ``KR`` / ``KO``
    (compared case-insensitively), is replaced by ``"af_heart"``. Any other
    value is returned with surrounding whitespace removed.
    """
    voice = raw.strip()
    if not voice:
        return "af_heart"
    if voice.upper() in ("KR", "KO"):
        return "af_heart"
    return voice
|
||||
|
||||
|
||||
class TtsWorker:
    """Text-to-speech worker backed by a Kokoro ONNX model and misaki G2P.

    Configuration is read from the environment when the worker is created:

    * ``LOCAL_TTS_MODEL_PATH`` / ``LOCAL_TTS_VOICES_PATH`` -- required paths
      handed to ``Kokoro``.
    * ``LOCAL_TTS_LANGUAGE`` -- normalized with ``normalize_lang`` (default ``ko``).
    * ``LOCAL_TTS_SPEAKER`` -- normalized with ``normalize_voice`` (default ``af_heart``).
    * ``LOCAL_TTS_DEVICE`` -- stored as ``self.device`` (default ``auto``).
    * ``LOCAL_TTS_SPEED`` -- parsed as a float (default ``1.12``).
    """

    def __init__(self) -> None:
        """Load the G2P front end and the Kokoro model, then log readiness.

        Raises:
            KeyError: If ``LOCAL_TTS_MODEL_PATH`` or ``LOCAL_TTS_VOICES_PATH``
                is not set.
            ValueError: If ``LOCAL_TTS_SPEED`` is not a valid float.
        """
        # Imported lazily so the heavy TTS packages load only when a worker
        # is actually constructed.
        from kokoro_onnx import Kokoro
        from misaki import ko

        self.model_path = os.environ["LOCAL_TTS_MODEL_PATH"]
        self.voices_path = os.environ["LOCAL_TTS_VOICES_PATH"]
        self.language = normalize_lang(os.environ.get("LOCAL_TTS_LANGUAGE", "ko"))
        self.voice = normalize_voice(os.environ.get("LOCAL_TTS_SPEAKER", "af_heart"))
        # NOTE(review): nothing in the Kokoro code path shown here reads
        # self.device; it is kept because the launcher still exports
        # LOCAL_TTS_DEVICE -- confirm whether it can be dropped.
        self.device = os.environ.get("LOCAL_TTS_DEVICE", "auto").strip() or "auto"
        self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12"))

        self.g2p = ko.KOG2P()
        self.model = Kokoro(self.model_path, self.voices_path)

        log(
            f"local-tts ready model={os.path.basename(self.model_path)} voice={self.voice} language={self.language} speed={self.speed}"
        )

    def synthesize(self, text: str) -> bytes:
        """Synthesize ``text`` and return the audio as WAV bytes.

        The text is first passed through ``self.g2p``, which yields a
        ``(phonemes, tokens)`` pair; only the phonemes are used. They are
        handed to ``Kokoro.create`` with ``is_phonemes=True`` and the result
        is encoded by ``build_wav_bytes``.

        Args:
            text: Text to speak.

        Returns:
            The encoded WAV file contents.
        """
        phonemes, _tokens = self.g2p(text)
        samples, sample_rate = self.model.create(
            phonemes,
            voice=self.voice,
            speed=self.speed,
            # NOTE(review): presumably ignored because phonemes are supplied
            # directly -- confirm against the kokoro-onnx documentation.
            lang="en-us",
            is_phonemes=True,
        )
        return build_wav_bytes(samples, sample_rate)
|
||||
def build_wav_bytes(samples: np.ndarray, sample_rate: int) -> bytes:
    """Encode floating-point audio samples as a mono 16-bit PCM WAV file.

    Samples are flattened to one channel, clipped to ``[-1.0, 1.0]`` and
    scaled by 32767 before conversion to ``int16``. Non-finite values are
    sanitized first: NaN becomes silence and +/-inf saturates to +/-1.0,
    because casting NaN to an integer type has no defined result.

    Args:
        samples: Audio samples (any array-like of floats).
        sample_rate: Sampling rate in Hz written to the WAV header.

    Returns:
        The complete WAV file as bytes.
    """
    mono = np.asarray(samples).reshape(-1)
    finite = np.nan_to_num(mono, nan=0.0, posinf=1.0, neginf=-1.0)
    pcm = (np.clip(finite, -1.0, 1.0) * 32767.0).astype(np.int16)

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())

    return buffer.getvalue()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
faster-whisper==1.2.1
|
||||
git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2
|
||||
kokoro-onnx==0.5.0
|
||||
misaki[ko]==0.9.4
|
||||
|
||||
@@ -26,8 +26,10 @@ const envSchema = z.object({
|
||||
LOCAL_STT_DEVICE: z.string().min(1).default("auto"),
|
||||
LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"),
|
||||
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
|
||||
LOCAL_TTS_LANGUAGE: z.string().min(1).default("KR"),
|
||||
LOCAL_TTS_SPEAKER: z.string().min(1).default("KR"),
|
||||
LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
|
||||
LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
|
||||
LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
|
||||
LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
|
||||
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
|
||||
LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12),
|
||||
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
|
||||
|
||||
@@ -16,7 +16,7 @@ import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { type DiscordRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalMeloTtsService } from "./services/local-tts.js";
|
||||
import { LocalKokoroTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
|
||||
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
|
||||
@@ -38,7 +38,7 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
|
||||
});
|
||||
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts = new LocalMeloTtsService(config, logger);
|
||||
const tts = new LocalKokoroTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import { Logger } from "./logger.js";
|
||||
import { LocalVoiceSession } from "./audio/local-voice-session.js";
|
||||
import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalMeloTtsService } from "./services/local-tts.js";
|
||||
import { LocalKokoroTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
|
||||
export async function printLocalAudioDevices(): Promise<void> {
|
||||
@@ -68,7 +68,7 @@ export async function printLocalAudioDevices(): Promise<void> {
|
||||
|
||||
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts = new LocalMeloTtsService(config, logger);
|
||||
const tts = new LocalKokoroTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
|
||||
await stt.warmup();
|
||||
|
||||
@@ -30,6 +30,14 @@ export function resolveLocalAiCachePath(config: AppConfig): string {
|
||||
return path.resolve(process.cwd(), config.LOCAL_AI_CACHE_DIR);
|
||||
}
|
||||
|
||||
/**
 * Resolve the configured TTS model file (`LOCAL_TTS_MODEL_PATH`) to an
 * absolute path, relative to the current working directory.
 */
export function resolveLocalAiTtsModelPath(config: AppConfig): string {
  const { LOCAL_TTS_MODEL_PATH: modelPath } = config;
  return path.resolve(process.cwd(), modelPath);
}
|
||||
|
||||
/**
 * Resolve the configured TTS voices file (`LOCAL_TTS_VOICES_PATH`) to an
 * absolute path, relative to the current working directory.
 */
export function resolveLocalAiTtsVoicesPath(config: AppConfig): string {
  const { LOCAL_TTS_VOICES_PATH: voicesPath } = config;
  return path.resolve(process.cwd(), voicesPath);
}
|
||||
|
||||
export function resolveVenvPythonPath(config: AppConfig): string {
|
||||
const venvPath = resolveLocalAiVenvPath(config);
|
||||
return process.platform === "win32"
|
||||
|
||||
@@ -7,12 +7,13 @@ import type { Logger } from "../logger.js";
|
||||
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
|
||||
import { PythonJsonWorker } from "./python-json-worker.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
||||
import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js";
|
||||
|
||||
interface SynthesizeResult {
|
||||
wav_base64?: string;
|
||||
}
|
||||
|
||||
export class LocalMeloTtsService implements TtsService {
|
||||
export class LocalKokoroTtsService implements TtsService {
|
||||
private readonly worker: PythonJsonWorker;
|
||||
|
||||
constructor(config: AssistantRuntimeConfig, logger: Logger) {
|
||||
@@ -22,6 +23,8 @@ export class LocalMeloTtsService implements TtsService {
|
||||
}
|
||||
|
||||
this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
|
||||
LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config),
|
||||
LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config),
|
||||
LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
|
||||
LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
|
||||
LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
|
||||
|
||||
@@ -1,10 +1,22 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { mkdir } from "node:fs/promises";
|
||||
import { mkdir, rename, rm, writeFile } from "node:fs/promises";
|
||||
import { spawn } from "node:child_process";
|
||||
import path from "node:path";
|
||||
|
||||
import { loadConfig } from "./config.js";
|
||||
import { resolveLocalAiCachePath, resolveLocalAiVenvPath, resolvePythonLaunch, resolveVenvPythonPath } from "./python-runtime.js";
|
||||
import {
|
||||
resolveLocalAiCachePath,
|
||||
resolveLocalAiTtsModelPath,
|
||||
resolveLocalAiTtsVoicesPath,
|
||||
resolveLocalAiVenvPath,
|
||||
resolvePythonLaunch,
|
||||
resolveVenvPythonPath,
|
||||
} from "./python-runtime.js";
|
||||
|
||||
const KOKORO_MODEL_URL =
|
||||
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx";
|
||||
const KOKORO_VOICES_URL =
|
||||
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
|
||||
|
||||
async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise<void> {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
@@ -48,11 +60,28 @@ async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise<voi
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Download `url` to `filePath` unless a file already exists there.
 *
 * The payload is written to a sibling `.part` file and renamed into place
 * only after the write has finished. An interrupted or failed download
 * therefore never leaves a truncated file at `filePath`, which the
 * `existsSync` short-circuit would otherwise accept as complete on every
 * later run.
 *
 * @param url - Remote location of the asset.
 * @param filePath - Destination path; missing parent directories are created.
 * @throws Error when the server responds with a non-OK status, or when the
 *   file cannot be written or renamed.
 */
async function ensureDownload(url: string, filePath: string): Promise<void> {
  if (existsSync(filePath)) {
    return;
  }

  await mkdir(path.dirname(filePath), { recursive: true });

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`다운로드 실패: ${url} (${response.status})`);
  }

  const partialPath = `${filePath}.part`;
  try {
    const bytes = Buffer.from(await response.arrayBuffer());
    await writeFile(partialPath, bytes);
    await rename(partialPath, filePath);
  } catch (error) {
    // Best-effort cleanup; the original failure is the one worth reporting.
    await rm(partialPath, { force: true }).catch(() => undefined);
    throw error;
  }
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const config = loadConfig();
|
||||
const venvPath = resolveLocalAiVenvPath(config);
|
||||
const venvPython = resolveVenvPythonPath(config);
|
||||
const cachePath = resolveLocalAiCachePath(config);
|
||||
const ttsModelPath = resolveLocalAiTtsModelPath(config);
|
||||
const ttsVoicesPath = resolveLocalAiTtsVoicesPath(config);
|
||||
const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt");
|
||||
const baseEnv = {
|
||||
HF_HOME: cachePath,
|
||||
@@ -77,6 +106,9 @@ async function main(): Promise<void> {
|
||||
console.log("로컬 AI 의존성 설치를 시작합니다.");
|
||||
await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv);
|
||||
await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv);
|
||||
console.log("로컬 TTS 모델 파일을 확인합니다.");
|
||||
await ensureDownload(KOKORO_MODEL_URL, ttsModelPath);
|
||||
await ensureDownload(KOKORO_VOICES_URL, ttsVoicesPath);
|
||||
|
||||
console.log("설치가 끝났습니다.");
|
||||
console.log("다음 순서:");
|
||||
|
||||
Reference in New Issue
Block a user