98 lines
2.5 KiB
TypeScript
98 lines
2.5 KiB
TypeScript
import { Readable } from "node:stream";
|
|
|
|
import prism from "prism-media";
|
|
|
|
import type { AssistantRuntimeConfig } from "../config.js";
|
|
import type { Logger } from "../logger.js";
|
|
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
|
|
import { PythonJsonWorker } from "./python-json-worker.js";
|
|
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
|
import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js";
|
|
|
|
/**
 * Reply payload of the worker's `synthesize` request.
 */
interface SynthesizeResult {
  /**
   * Base64-encoded audio produced by the worker (WAV, going by the field name).
   * Optional: a missing or empty value is treated by the consumer as "no audio".
   */
  wav_base64?: string;
}
|
|
|
|
/**
 * {@link TtsService} implementation that synthesizes speech through a local Python
 * worker (`local_tts_worker.py`) and transcodes the result with FFmpeg.
 */
export class LocalKokoroTtsService implements TtsService {
  private readonly worker: PythonJsonWorker;

  /**
   * Creates the service and its backing Python worker.
   *
   * @param config Runtime configuration; the `LOCAL_TTS_*` values are forwarded to the worker.
   * @param logger Logger handed to the worker wrapper.
   */
  constructor(config: AssistantRuntimeConfig, logger: Logger) {
    // Publish the resolved FFmpeg binary through FFMPEG_PATH without overriding a value
    // the user already set. NOTE(review): presumably read by whatever spawns FFmpeg — confirm.
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }

    // NOTE(review): the last argument looks like extra environment variables for the
    // worker process — confirm against PythonJsonWorker.
    this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
      LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config),
      LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config),
      LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
      LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
      LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
      // Stringified explicitly; the sibling values are passed through as-is.
      LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED),
    });
  }

  /**
   * Sends a `ping` request to the worker and waits for the reply.
   * NOTE(review): presumably used to start the worker ahead of the first synthesis — confirm.
   */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Synthesizes `text` and returns a playable stream.
   *
   * The worker's base64 audio is decoded and piped through FFmpeg, which emits raw
   * signed 16-bit little-endian PCM at 48 kHz with 2 channels on its output.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal. It is forwarded to the worker request, and once
   *   the streams exist an abort destroys them. If the signal is already aborted when
   *   synthesis finishes, the returned streams are destroyed immediately.
   * @returns The FFmpeg output stream plus a `dispose` callback that destroys both
   *   streams and detaches the abort listener.
   * @throws Error when the worker reply carries no audio.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const result = await this.worker.request<SynthesizeResult>("synthesize", { text }, signal);

    const wavBase64 = result.wav_base64;
    if (!wavBase64) {
      throw new Error("로컬 TTS가 빈 오디오를 반환했습니다.");
    }

    const input = Readable.from([Buffer.from(wavBase64, "base64")]);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    const teardown = (): void => {
      input.destroy();
      ffmpeg.destroy();
    };

    if (signal?.aborted) {
      // An already-aborted signal never dispatches another "abort" event, so a listener
      // registered now would never run; release the streams right away instead.
      teardown();
    } else {
      signal?.addEventListener("abort", teardown, { once: true });
      input.pipe(ffmpeg);
    }

    return {
      stream: ffmpeg,
      dispose: () => {
        // Detach first so a long-lived signal does not keep the streams reachable.
        signal?.removeEventListener("abort", teardown);
        teardown();
      },
    };
  }

  /** Shuts down the backing Python worker. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}
|