Replace ElevenLabs with local STT and TTS

This commit is contained in:
2026-04-30 03:21:30 +09:00
parent 5d636e8619
commit 73546c15b9
24 changed files with 943 additions and 326 deletions

View File

@@ -1,124 +0,0 @@
import WebSocket from "ws";
import type { AssistantRuntimeConfig } from "../config.js";
/**
 * Shape of a JSON message received from the ElevenLabs realtime STT socket.
 * Every field is optional because the payload is produced by `JSON.parse`
 * and is not validated before use.
 */
interface ElevenLabsMessage {
/** Event discriminator, e.g. "session_started" or "committed_transcript". */
message_type?: string;
/** Transcript text read from committed-transcript messages. */
text?: string;
/** Error description; used as the rejection message when present. */
error?: string;
}
/**
 * Message types that end a transcription with a `null` result rather than a
 * rejected promise.
 */
const NON_FATAL_ERROR_TYPES = new Set<string>(["insufficient_audio_activity"]);
/**
 * Speech-to-text client for the ElevenLabs realtime WebSocket endpoint.
 *
 * Every call opens a fresh socket, uploads the whole utterance as a single
 * committed chunk once the session has started, and settles with the first
 * non-empty committed transcript.
 */
export class ElevenLabsSttService {
  constructor(private readonly config: AssistantRuntimeConfig) {}

  /**
   * Transcribes one utterance.
   *
   * @param pcm16MonoAudio Audio bytes, sent base64-encoded with `sample_rate: 16000`.
   * @returns The trimmed transcript, or `null` for empty input, a non-fatal
   *   server error, a closed socket, or when nothing arrives within 15 seconds.
   * @throws Error when the socket fails, a frame is not valid JSON, or the
   *   server reports any other unrecognised message type.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL);
    url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
    url.searchParams.set("audio_format", "pcm_16000");
    url.searchParams.set("commit_strategy", "manual");
    url.searchParams.set("include_timestamps", "false");
    url.searchParams.set("include_language_detection", "false");
    url.searchParams.set("enable_logging", "false");

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(url, {
        headers: {
          "xi-api-key": this.config.ELEVENLABS_API_KEY,
        },
      });
      let settled = false;

      // Give up with "no transcript" when the server never commits one.
      const timeout = setTimeout(() => {
        finish(null);
      }, 15_000);

      // Settles the promise exactly once and releases the timer and socket.
      const finish = (result: string | null, error?: Error): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeout);
        try {
          socket.close();
        } catch {
          // Ignore close race.
        }
        if (error) {
          reject(error);
          return;
        }
        resolve(result);
      };

      socket.on("message", (raw) => {
        let payload: unknown;
        try {
          payload = JSON.parse(raw.toString());
        } catch (error) {
          finish(null, error instanceof Error ? error : new Error(String(error)));
          return;
        }

        // The payload is untrusted: a JSON `null` or primitive has no fields,
        // and dereferencing it would throw inside this event handler.
        if (typeof payload !== "object" || payload === null) {
          return;
        }
        const message = payload as ElevenLabsMessage;
        const messageType = typeof message.message_type === "string" ? message.message_type : "";
        if (messageType.length === 0) {
          // Messages without a usable type carry nothing actionable.
          return;
        }

        switch (messageType) {
          case "session_started":
            socket.send(
              JSON.stringify({
                message_type: "input_audio_chunk",
                audio_base_64: pcm16MonoAudio.toString("base64"),
                commit: true,
                sample_rate: 16000,
              }),
            );
            return;
          case "partial_transcript":
            return;
          case "committed_transcript":
          case "committed_transcript_with_timestamps": {
            const transcript = typeof message.text === "string" ? message.text.trim() : "";
            if (transcript.length > 0) {
              finish(transcript);
            }
            // An empty commit keeps waiting; the timeout or close resolves null.
            return;
          }
          default:
            if (NON_FATAL_ERROR_TYPES.has(messageType)) {
              finish(null);
              return;
            }
            // Every other typed message is treated as a fatal server error.
            finish(
              null,
              new Error(
                typeof message.error === "string" ? message.error : `ElevenLabs STT error: ${messageType}`,
              ),
            );
        }
      });

      socket.on("error", (error) => {
        finish(null, error as Error);
      });

      socket.on("close", () => {
        finish(null);
      });
    });
  }
}

View File

@@ -1,78 +0,0 @@
import { Readable } from "node:stream";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
/**
 * Playable audio returned by `preparePlayback`, paired with the cleanup hook
 * that releases the resources behind it.
 */
export interface PreparedSpeechAudio {
/** Readable audio stream; in this file it is the ffmpeg transcoder's output. */
stream: Readable;
/** Destroys the underlying streams; in this file, the HTTP body reader and the ffmpeg transcoder. */
dispose: () => void;
}
/**
 * Text-to-speech client for the ElevenLabs streaming HTTP endpoint. The MP3
 * response is piped through ffmpeg and exposed as raw PCM.
 */
export class ElevenLabsTtsService {
  /**
   * Stores the runtime configuration and, when `FFMPEG_PATH` is not already
   * set, exports the resolved ffmpeg binary path through that variable.
   */
  constructor(private readonly config: AssistantRuntimeConfig) {
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests speech for `text` and returns it as a transcoded stream.
   *
   * @param text Text to synthesise.
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns The ffmpeg output stream (s16le, 48000 Hz, 2 channels) plus a
   *   `dispose` hook that destroys both the HTTP reader and the transcoder.
   * @throws Error when the response is not OK or carries no body.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // An unread error body keeps the connection busy; cancel it before
      // failing. Cancellation is best effort and must not mask the real error.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });
    input.pipe(ffmpeg);

    return {
      stream: ffmpeg,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}

43
src/services/local-stt.ts Normal file
View File

@@ -0,0 +1,43 @@
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { PythonJsonWorker } from "./python-json-worker.js";
import type { SttService } from "./stt.js";
/** Result payload returned by the worker's "transcribe" method. */
interface TranscribeResult {
/** Recognised text; a missing or whitespace-only value is reported to callers as `null`. */
text?: string;
}
/**
 * Speech-to-text service that delegates to the `local_stt_worker.py` script
 * through a {@link PythonJsonWorker}.
 */
export class LocalFasterWhisperSttService implements SttService {
  private readonly worker: PythonJsonWorker;

  /**
   * Builds the worker wrapper and passes the `LOCAL_STT_*` settings to it as
   * environment variables.
   */
  constructor(private readonly config: AssistantRuntimeConfig, logger: Logger) {
    const { LOCAL_STT_MODEL, LOCAL_STT_DEVICE, LOCAL_STT_COMPUTE_TYPE, LOCAL_STT_BEAM_SIZE } = config;
    const workerEnv = {
      LOCAL_STT_MODEL,
      LOCAL_STT_DEVICE,
      LOCAL_STT_COMPUTE_TYPE,
      LOCAL_STT_BEAM_SIZE: String(LOCAL_STT_BEAM_SIZE),
    };
    this.worker = new PythonJsonWorker(config, logger, "local_stt_worker.py", "local-stt", workerEnv);
  }

  /** Sends a "ping" request and waits for the worker to answer it. */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Transcribes one utterance through the worker.
   *
   * @param pcm16MonoAudio Audio bytes, forwarded base64-encoded.
   * @returns The trimmed transcript, or `null` for empty input or empty text.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const params = {
      audio_base64: pcm16MonoAudio.toString("base64"),
      language: this.config.BOT_DEFAULT_LANGUAGE,
    };
    const { text } = await this.worker.request<TranscribeResult>("transcribe", params);
    const cleaned = (text ?? "").trim();
    return cleaned.length === 0 ? null : cleaned;
  }

  /** Shuts the worker down. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}

94
src/services/local-tts.ts Normal file
View File

@@ -0,0 +1,94 @@
import { Readable } from "node:stream";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import { PythonJsonWorker } from "./python-json-worker.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
/** Result payload returned by the worker's "synthesize" method. */
interface SynthesizeResult {
/**
 * Base64-encoded audio that is decoded and piped into ffmpeg; presumably a
 * WAV file, judging by the field name. A missing or empty value is treated
 * as an error by the caller.
 */
wav_base64?: string;
}
/**
 * Text-to-speech service that delegates synthesis to the
 * `local_tts_worker.py` script and transcodes its output with ffmpeg.
 */
export class LocalMeloTtsService implements TtsService {
  private readonly worker: PythonJsonWorker;

  /**
   * Exports the resolved ffmpeg path through `FFMPEG_PATH` when that variable
   * is unset, then builds the worker wrapper with the `LOCAL_TTS_*` settings
   * as environment variables.
   */
  constructor(config: AssistantRuntimeConfig, logger: Logger) {
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
    this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
      LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
      LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
      LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
      LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED),
    });
  }

  /** Sends a "ping" request and waits for the worker to answer it. */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Synthesises `text` and returns it as a transcoded stream.
   *
   * @param text Text to synthesise.
   * @param signal Optional abort signal; it cancels the worker request and,
   *   once playback is prepared, destroys the streams.
   * @returns The ffmpeg output stream (s16le, 48000 Hz, 2 channels) plus a
   *   `dispose` hook that destroys the streams and detaches from `signal`.
   * @throws Error when the worker returns no audio or the request was aborted.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const result = await this.worker.request<SynthesizeResult>(
      "synthesize",
      {
        text,
      },
      signal,
    );

    // An "abort" listener added to an already-aborted signal never fires, so
    // bail out here instead of spawning ffmpeg for an abandoned request.
    if (signal?.aborted) {
      throw new Error("local-tts playback aborted");
    }

    const wavBase64 = result.wav_base64;
    if (!wavBase64) {
      throw new Error("로컬 TTS가 빈 오디오를 반환했습니다.");
    }

    const input = Readable.from([Buffer.from(wavBase64, "base64")]);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    const teardown = (): void => {
      input.destroy();
      ffmpeg.destroy();
    };
    signal?.addEventListener("abort", teardown, { once: true });
    input.pipe(ffmpeg);

    return {
      stream: ffmpeg,
      dispose: () => {
        // Detach first so a long-lived signal does not keep the streams alive.
        signal?.removeEventListener("abort", teardown);
        teardown();
      },
    };
  }

  /** Shuts the worker down. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}

View File

@@ -0,0 +1,189 @@
import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process";
import { createInterface } from "node:readline";
import path from "node:path";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveLocalAiCachePath, resolvePythonLaunch } from "../python-runtime.js";
/** Message serialised as a single JSON line and written to the worker's stdin. */
interface WorkerRequest {
/** Correlation id, taken from an incrementing counter, used to match the response. */
id: number;
/** Name of the worker method to invoke, e.g. "ping". */
method: string;
/** Arguments for the method. */
params: Record<string, unknown>;
}
/** Message parsed from a single JSON line on the worker's stdout. */
interface WorkerResponse {
/** Id of the request being answered; responses with an unknown id are ignored. */
id: number;
/** `true` resolves the pending request with `result`; otherwise it is rejected. */
ok: boolean;
/** Value handed to the caller on success. */
result?: unknown;
/** Rejection message on failure; a generic message is used when absent. */
error?: string;
}
/**
 * Lazily spawned Python child process driven by a line-delimited JSON
 * protocol: each request is one JSON line on stdin, each response one JSON
 * line on stdout, matched by `id`.
 */
export class PythonJsonWorker {
  private child: ChildProcessWithoutNullStreams | null = null;
  private nextId = 1;
  private readonly pending = new Map<
    number,
    {
      resolve: (value: unknown) => void;
      reject: (error: Error) => void;
    }
  >();

  /**
   * @param config Runtime configuration used to resolve the Python launcher and cache path.
   * @param logger Receives stderr output and protocol warnings.
   * @param scriptName File name of the script inside `<cwd>/python`.
   * @param label Prefix used in log and error messages.
   * @param workerEnv Extra environment variables for the child process.
   */
  constructor(
    private readonly config: AssistantRuntimeConfig,
    private readonly logger: Logger,
    private readonly scriptName: string,
    private readonly label: string,
    private readonly workerEnv: Record<string, string>,
  ) {}

  /**
   * Sends one request to the worker, starting it on first use.
   *
   * @param method Worker method name.
   * @param params Arguments for the method.
   * @param signal Optional abort signal. Aborting rejects the promise; the
   *   worker is not told, and a late response for that id is ignored.
   * @returns The `result` field of the matching response.
   * @throws Error when aborted, when the worker reports a failure, when the
   *   request cannot be written, or when the worker exits or is destroyed.
   */
  async request<T>(method: string, params: Record<string, unknown>, signal?: AbortSignal): Promise<T> {
    // Check before ensureStarted() so an abandoned request never spawns Python.
    if (signal?.aborted) {
      throw new Error(`${this.label} request aborted before start`);
    }

    const child = this.ensureStarted();
    const id = this.nextId++;

    return await new Promise<T>((resolve, reject) => {
      const abortHandler = (): void => {
        this.pending.delete(id);
        reject(new Error(`${this.label} request aborted`));
      };
      signal?.addEventListener("abort", abortHandler, { once: true });

      this.pending.set(id, {
        resolve: (value) => {
          signal?.removeEventListener("abort", abortHandler);
          resolve(value as T);
        },
        reject: (error) => {
          signal?.removeEventListener("abort", abortHandler);
          reject(error);
        },
      });

      const message: WorkerRequest = {
        id,
        method,
        params,
      };
      child.stdin.write(`${JSON.stringify(message)}\n`, (error) => {
        if (!error) {
          return;
        }
        // A request that never reached the worker can never be answered.
        const entry = this.pending.get(id);
        if (entry) {
          this.pending.delete(id);
          entry.reject(error);
        }
      });
    });
  }

  /**
   * Rejects every pending request, sends SIGTERM to the child, and waits for
   * it to exit, for at most 1.5 seconds.
   */
  async destroy(): Promise<void> {
    this.rejectAll(new Error(`${this.label} worker terminated`));
    const child = this.child;
    if (!child) {
      return;
    }
    this.child = null;

    await new Promise<void>((resolve) => {
      const timer = setTimeout(resolve, 1_500);
      child.once("exit", () => {
        // Clear the fallback timer so it does not keep the event loop alive.
        clearTimeout(timer);
        resolve();
      });
      child.kill("SIGTERM");
    });
  }

  /** Returns the running child, spawning and wiring it up on first use. */
  private ensureStarted(): ChildProcessWithoutNullStreams {
    if (this.child) {
      return this.child;
    }

    const launch = resolvePythonLaunch(this.config);
    const scriptPath = path.resolve(process.cwd(), "python", this.scriptName);
    const cachePath = resolveLocalAiCachePath(this.config);
    const recentStderr: string[] = [];

    const child = spawn(launch.command, [...launch.args, scriptPath], {
      stdio: ["pipe", "pipe", "pipe"],
      env: {
        ...process.env,
        HF_HOME: cachePath,
        TRANSFORMERS_CACHE: cachePath,
        PYTHONIOENCODING: "utf-8",
        BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE,
        ...this.workerEnv,
      },
    });

    // Forgets this child and fails its requests. Events from a child that has
    // already been replaced or destroyed are ignored, because the pending map
    // is shared and would otherwise lose requests that belong to a newer child.
    const retire = (reason: Error): void => {
      if (this.child !== child) {
        return;
      }
      this.child = null;
      this.rejectAll(reason);
    };

    createInterface({
      input: child.stdout,
      crlfDelay: Number.POSITIVE_INFINITY,
    }).on("line", (line) => {
      if (!line.trim()) {
        return;
      }
      let payload: unknown;
      try {
        payload = JSON.parse(line);
      } catch (error) {
        this.logger.warn(`${this.label} stdout parse failed`, error);
        return;
      }
      // Valid JSON that is not an object (e.g. `null`) has no id to match.
      if (typeof payload !== "object" || payload === null) {
        this.logger.warn(`${this.label} stdout ignored non-object payload`, line);
        return;
      }
      const response = payload as WorkerResponse;
      const entry = this.pending.get(response.id);
      if (!entry) {
        return;
      }
      this.pending.delete(response.id);
      if (response.ok) {
        entry.resolve(response.result);
        return;
      }
      entry.reject(new Error(response.error ?? `${this.label} worker error`));
    });

    child.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        // Keep the last 20 chunks so an exit error can show recent output.
        recentStderr.push(text);
        if (recentStderr.length > 20) {
          recentStderr.shift();
        }
        this.logger.warn(`[${this.label}]`, text);
      }
    });

    // Without a listener, a broken pipe on stdin is an unhandled stream error.
    // The affected request is rejected through the write callback in request().
    child.stdin.on("error", (error) => {
      this.logger.warn(`${this.label} stdin error`, error);
    });

    child.on("exit", (code, signal) => {
      const detail = recentStderr.length > 0 ? `\n${recentStderr.join("\n")}` : "";
      retire(new Error(`${this.label} worker exited code=${code ?? "null"} signal=${signal ?? "null"}${detail}`));
    });

    // "exit" is not guaranteed to follow "error" (e.g. a failed spawn), so the
    // child must be forgotten here as well; otherwise later requests would be
    // written to a process that never started and would never settle.
    child.on("error", (error) => {
      retire(error as Error);
    });

    this.child = child;
    return child;
  }

  /** Rejects and forgets every pending request. */
  private rejectAll(error: Error): void {
    const entries = [...this.pending.values()];
    this.pending.clear();
    for (const entry of entries) {
      entry.reject(error);
    }
  }
}

4
src/services/stt.ts Normal file
View File

@@ -0,0 +1,4 @@
/** Contract implemented by speech-to-text backends. */
export interface SttService {
/**
 * Transcribes one utterance and resolves to its text, or `null` when there
 * is nothing to report. The parameter name suggests 16-bit mono PCM input
 * (assumption; the expected sample rate is not stated here).
 */
transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null>;
/** Optional hook for releasing backend resources. */
destroy?(): Promise<void>;
}

11
src/services/tts.ts Normal file
View File

@@ -0,0 +1,11 @@
import type { Readable } from "node:stream";
/**
 * Playable audio returned by `TtsService.preparePlayback`, paired with the
 * cleanup hook that releases the resources behind it.
 */
export interface PreparedSpeechAudio {
/** Readable stream carrying the audio to play. */
stream: Readable;
/** Releases whatever backs `stream`. */
dispose: () => void;
}
/** Contract implemented by text-to-speech backends. */
export interface TtsService {
/**
 * Synthesises `text` and resolves to a playable stream with its cleanup hook.
 * `signal`, when given, lets the caller cancel the work.
 */
preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
/** Optional hook for releasing backend resources. */
destroy?(): Promise<void>;
}