Files
realtime_voice_bot/src/services/elevenlabs-tts.ts

79 lines
2.0 KiB
TypeScript

import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";
/**
 * Result returned by `ElevenLabsTtsService.preparePlayback`: an audio stream
 * together with the hook that releases whatever backs it.
 */
export interface PreparedSpeechAudio {
/** Audio data to consume; in this file it is the FFmpeg transcoder's output. */
stream: Readable;
/**
 * Releases the resources behind `stream`. The implementation in this file
 * destroys both the HTTP body stream and the FFmpeg stream.
 */
dispose: () => void;
}
/**
 * Text-to-speech client that requests streamed speech from the ElevenLabs HTTP
 * API and pipes the MP3 response through FFmpeg to obtain raw PCM audio.
 */
export class ElevenLabsTtsService {
  /** Audio format requested from ElevenLabs (sent as the `output_format` query parameter). */
  private static readonly OUTPUT_FORMAT = "mp3_44100_128";

  /** Upper bound on how much of an error response body is copied into the thrown message. */
  private static readonly MAX_ERROR_DETAIL_LENGTH = 500;

  /** Voice tuning values sent with every synthesis request. */
  private static readonly VOICE_SETTINGS = {
    stability: 0.35,
    similarity_boost: 0.75,
    speed: 1.05,
  } as const;

  /**
   * FFmpeg invocation: read the encoded audio from stdin and write signed
   * 16-bit little-endian PCM, 48 kHz, 2 channels, to stdout with logging off.
   */
  private static readonly FFMPEG_ARGS: readonly string[] = [
    "-analyzeduration",
    "0",
    "-loglevel",
    "0",
    "-i",
    "pipe:0",
    "-f",
    "s16le",
    "-ar",
    "48000",
    "-ac",
    "2",
    "pipe:1",
  ];

  /**
   * @param config Runtime configuration; this class reads `ELEVENLABS_VOICE_ID`,
   *   `ELEVENLABS_API_KEY`, `ELEVENLABS_TTS_MODEL` and `BOT_DEFAULT_LANGUAGE`.
   *
   * Side effect: when `ffmpeg-static` resolved a binary and `FFMPEG_PATH` is not
   * already set, the environment variable is pointed at the bundled binary
   * (presumably so the FFmpeg wrapper can locate it — confirm against the
   * prism-media version in use).
   */
  constructor(private readonly config: AssistantRuntimeConfig) {
    // Narrow at runtime instead of asserting the import's type: the default
    // export is only usable as a path when it is actually a string.
    const bundledFfmpegPath: unknown = ffmpegStatic;
    if (
      typeof bundledFfmpegPath === "string" &&
      bundledFfmpegPath.length > 0 &&
      !process.env.FFMPEG_PATH
    ) {
      process.env.FFMPEG_PATH = bundledFfmpegPath;
    }
  }

  /**
   * Synthesizes `text` and returns a PCM stream for playback.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal passed to `fetch`.
   * @returns The transcoded audio stream and a `dispose` hook that destroys
   *   both the HTTP body stream and the FFmpeg stream.
   * @throws Error when the HTTP response is not OK or carries no body. The
   *   message starts with `ElevenLabs TTS request failed with status <code>`.
   *   Errors raised by `fetch` itself (including aborts) propagate unchanged.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // Escape the voice ID so a malformed value cannot alter the request path.
    const voiceId = encodeURIComponent(this.config.ELEVENLABS_VOICE_ID);
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`);
    url.searchParams.set("output_format", ElevenLabsTtsService.OUTPUT_FORMAT);
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: ElevenLabsTtsService.VOICE_SETTINGS,
      }),
      signal,
    });

    if (!response.ok) {
      // Reading the body both surfaces the API's explanation and drains the
      // response so it is not left pending.
      const detail = await ElevenLabsTtsService.readErrorDetail(response);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}${detail}`);
    }
    if (!response.body) {
      throw new Error(
        `ElevenLabs TTS request failed with status ${response.status}: response has no body`,
      );
    }

    // The cast bridges the fetch body's stream type and the one `fromWeb` declares.
    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [...ElevenLabsTtsService.FFMPEG_ARGS],
    });

    // `pipe()` does not forward source errors. Without a listener, a network
    // failure or an abort after the headers arrived would surface as an
    // unhandled 'error' event on `input`, and the returned stream would never
    // end. Forward the failure to the stream the caller actually holds.
    input.once("error", (error: Error) => {
      ffmpeg.destroy(error);
    });
    // Stop downloading as soon as the transcoder is gone.
    ffmpeg.once("close", () => {
      input.destroy();
    });

    input.pipe(ffmpeg);

    return {
      stream: ffmpeg,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }

  /**
   * Reads a failed response's body for inclusion in an error message.
   *
   * @returns `": <body>"` truncated to `MAX_ERROR_DETAIL_LENGTH` characters, or
   *   an empty string when the body is empty or cannot be read.
   */
  private static async readErrorDetail(response: Response): Promise<string> {
    try {
      const detail = (await response.text()).trim();
      if (detail.length === 0) {
        return "";
      }
      return `: ${detail.slice(0, ElevenLabsTtsService.MAX_ERROR_DETAIL_LENGTH)}`;
    } catch {
      // The status code alone is still a usable error; do not mask it with a
      // secondary failure while reading the body.
      return "";
    }
  }
}