import { Readable } from "node:stream";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import { PythonJsonWorker } from "./python-json-worker.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js";

/** Payload read from the worker's reply to a "synthesize" request. */
interface SynthesizeResult {
  /** Base64-encoded audio; treated as a synthesis failure when missing or empty. */
  wav_base64?: string;
}

/**
 * {@link TtsService} implementation that delegates synthesis to the
 * `local_tts_worker.py` Python worker and converts the returned audio to raw
 * PCM through an ffmpeg stream.
 */
export class LocalKokoroTtsService implements TtsService {
  private readonly worker: PythonJsonWorker;

  /**
   * Creates the Python worker wrapper, passing it the TTS settings as a
   * key/value map built from `config`.
   *
   * Side effect: when an ffmpeg path can be resolved and `FFMPEG_PATH` is not
   * already set, it is written to `process.env.FFMPEG_PATH`.
   * NOTE(review): presumably consumed by the ffmpeg lookup downstream — confirm.
   *
   * @param config Runtime configuration supplying the `LOCAL_TTS_*` settings.
   * @param logger Logger handed to the worker wrapper.
   */
  constructor(config: AssistantRuntimeConfig, logger: Logger) {
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }

    this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
      LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config),
      LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config),
      LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
      LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
      LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
      LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED),
    });
  }

  /** Sends a "ping" request to the worker and waits for it to complete. */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Synthesizes `text` and returns a stream of the audio converted by ffmpeg
   * to signed 16-bit little-endian PCM, 48 kHz, 2 channels.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal. It is forwarded to the worker request
   *   and, once audio is available, aborting it destroys both streams. If it
   *   is already aborted by the time synthesis returns, the streams are
   *   destroyed immediately and never piped.
   * @returns The PCM stream plus a `dispose` callback that destroys the
   *   streams and detaches the abort listener; `dispose` may be called more
   *   than once.
   * @throws Error when the worker reply carries no audio.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Annotated (rather than passing a type argument) so the reply is typed
    // regardless of how `request` declares its return type.
    const result: SynthesizeResult = await this.worker.request(
      "synthesize",
      {
        text,
      },
      signal,
    );

    const wavBase64 = result.wav_base64;
    if (!wavBase64) {
      throw new Error("로컬 TTS가 빈 오디오를 반환했습니다.");
    }

    const input = Readable.from([Buffer.from(wavBase64, "base64")]);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    // Single teardown path shared by abort and dispose. It deregisters itself
    // so a long-lived signal does not keep the streams reachable.
    const cleanup = (): void => {
      signal?.removeEventListener("abort", cleanup);
      input.destroy();
      ffmpeg.destroy();
    };

    if (signal?.aborted) {
      // The "abort" event has already fired and will not fire again, so a
      // listener registered now would never run; tear down right away.
      cleanup();
    } else {
      signal?.addEventListener("abort", cleanup, { once: true });
      input.pipe(ffmpeg);
    }

    return {
      stream: ffmpeg,
      dispose: cleanup,
    };
  }

  /** Shuts down the Python worker. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}