import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";

/**
 * Handle to a prepared speech stream.
 *
 * `stream` yields the transcoded audio; `dispose` tears down both the HTTP
 * download and the ffmpeg transcoder and is safe to call more than once.
 */
export interface PreparedSpeechAudio {
  stream: Readable;
  dispose: () => void;
}

/**
 * Text-to-speech client that streams MP3 audio from the ElevenLabs HTTP API
 * and transcodes it through ffmpeg into raw PCM.
 */
export class ElevenLabsTtsService {
  /**
   * @param config Runtime configuration supplying the ElevenLabs API key,
   *   voice id, TTS model and default language code.
   *
   * As a side effect, sets `process.env.FFMPEG_PATH` to the binary resolved
   * by `ffmpeg-static` when that variable is not already set, so an explicit
   * override from the environment always wins.
   */
  constructor(private readonly config: AssistantRuntimeConfig) {
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests streamed speech for `text` and returns it as a PCM stream.
   *
   * The MP3 response body is piped into ffmpeg, which emits signed 16-bit
   * little-endian PCM at 48 kHz with 2 channels (`-f s16le -ar 48000 -ac 2`).
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal passed to `fetch`. If it fires while
   *   the body is still streaming, the transcoder is torn down without
   *   surfacing an error on the returned stream.
   * @returns The PCM stream plus a `dispose` callback releasing both the
   *   download and the transcoder.
   * @throws Error when the response is not OK or carries no body. `fetch`
   *   rejections (network failure, abort before the response) propagate as-is.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const url = new URL(
      `https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`,
    );
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Release the connection instead of leaving an unread body dangling.
      // Best-effort: a failure to cancel must not mask the status error below.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    // The fetch body type and node's web-stream type are declared separately;
    // the cast only bridges the two declarations.
    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration", "0",
        "-loglevel", "0",
        "-i", "pipe:0",
        "-f", "s16le",
        "-ar", "48000",
        "-ac", "2",
        "pipe:1",
      ],
    });

    // pipe() neither listens for source errors nor destroys the destination
    // when the source fails. Without this listener a mid-stream network error
    // or abort would surface as an unhandled 'error' event on `input` and
    // leave the ffmpeg child running.
    input.once("error", (error: Error) => {
      // A caller-initiated abort is not a failure of the stream itself, so
      // tear down quietly; anything else is forwarded to the consumer.
      ffmpeg.destroy(signal?.aborted ? undefined : error);
    });

    // If the transcoder goes away first, stop downloading audio nobody reads.
    ffmpeg.once("close", () => {
      input.destroy();
    });

    input.pipe(ffmpeg);

    return {
      stream: ffmpeg,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}