79 lines
2.0 KiB
TypeScript
79 lines
2.0 KiB
TypeScript
import { Readable } from "node:stream";
|
|
|
|
import ffmpegStatic from "ffmpeg-static";
|
|
import prism from "prism-media";
|
|
|
|
import type { AssistantRuntimeConfig } from "../config.js";
|
|
|
|
/**
 * Handle for a piece of synthesized speech that is ready to be played.
 *
 * The holder reads audio from `stream` and calls `dispose` when the audio is
 * no longer needed so the resources behind the stream can be released.
 */
export interface PreparedSpeechAudio {
  /** Readable stream that yields the audio data. */
  stream: Readable;

  /** Releases the resources backing `stream`. */
  dispose: () => void;
}
|
|
|
|
/**
 * Text-to-speech client for the ElevenLabs streaming endpoint.
 *
 * The MP3 audio returned by the API is piped through an FFmpeg transcoder so
 * callers receive raw PCM (signed 16-bit little-endian, 48 kHz, stereo).
 */
export class ElevenLabsTtsService {
  /** Upper bound on how much of an error response body is copied into the thrown message. */
  private static readonly MAX_ERROR_DETAIL_LENGTH = 500;

  /**
   * @param config Runtime configuration. This class reads `ELEVENLABS_VOICE_ID`,
   *   `ELEVENLABS_API_KEY`, `ELEVENLABS_TTS_MODEL` and `BOT_DEFAULT_LANGUAGE`.
   */
  constructor(private readonly config: AssistantRuntimeConfig) {
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;

    // Publish the bundled FFmpeg binary through FFMPEG_PATH, but never
    // override a path the environment already provides.
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests synthesized speech for `text` and starts transcoding it to PCM.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to the HTTP request.
   * @returns The PCM output stream plus a `dispose` callback that tears down
   *   both the HTTP download and the FFmpeg transcoder.
   * @throws Error when the API responds with a non-success status or without
   *   a body. Rejections from `fetch` itself (network failure, abort) and
   *   errors thrown while creating the transcoder propagate unchanged.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Escape the voice id so an unexpected character cannot alter the request path.
    const voiceId = encodeURIComponent(this.config.ELEVENLABS_VOICE_ID);
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Reading the body drains it (so it is not left dangling) and surfaces
      // the API's own explanation. A failure while reading is ignored because
      // the status code alone is still a usable error.
      const rawDetail = await response.text().catch(() => "");
      const detail = rawDetail.trim().slice(0, ElevenLabsTtsService.MAX_ERROR_DETAIL_LENGTH);
      const suffix = detail ? `: ${detail}` : "";
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}${suffix}`);
    }

    const input = Readable.fromWeb(response.body as never);

    try {
      const ffmpeg = new prism.FFmpeg({
        args: [
          // Skip input probing so decoding starts as soon as data arrives.
          "-analyzeduration",
          "0",
          // Silence FFmpeg's own logging.
          "-loglevel",
          "0",
          // Read the MP3 from stdin.
          "-i",
          "pipe:0",
          // Emit signed 16-bit little-endian PCM, 48 kHz, 2 channels, on stdout.
          "-f",
          "s16le",
          "-ar",
          "48000",
          "-ac",
          "2",
          "pipe:1",
        ],
      });

      // `pipe` does not forward source errors: without this listener a failed
      // or aborted download would raise an unhandled 'error' on `input` and
      // leave the transcoder output open forever.
      input.once("error", (error) => ffmpeg.destroy(error));

      // If the transcoder goes away first, stop downloading audio nobody reads.
      ffmpeg.once("close", () => input.destroy());

      input.pipe(ffmpeg);

      return {
        stream: ffmpeg,
        dispose: () => {
          input.unpipe(ffmpeg);
          input.destroy();
          ffmpeg.destroy();
        },
      };
    } catch (error) {
      // Creating the transcoder failed: release the open HTTP body before
      // reporting the failure.
      input.destroy();
      throw error;
    }
  }
}
|