feat: scaffold realtime Korean voice assistant bot

This commit is contained in:
2026-04-30 02:29:18 +09:00
commit 9dee708b64
15 changed files with 1574 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
/**
 * One entry in the conversation history kept by ConversationMemory.
 */
export interface ConversationTurn {
/** Who produced the turn. */
role: "user" | "assistant";
/** The text of the turn. */
text: string;
/** Speaker identifier; ConversationMemory sets it on user turns and omits it on assistant turns. */
speakerId?: string;
/** Speaker display name; ConversationMemory sets it on user turns and omits it on assistant turns. */
speakerName?: string;
/** Creation time; ConversationMemory fills it from Date.now(), i.e. milliseconds since the Unix epoch. */
createdAt: number;
}
/**
 * A single user utterance handed to ConversationMemory, either to be recorded
 * as a turn or to be rendered as the current utterance of a prompt.
 */
export interface UserUtterance {
/** Identifier of the speaker; rendered as `speaker_id=` in the prompt header. */
speakerId: string;
/** Name of the speaker; rendered as `speaker_name=` in the prompt header. */
speakerName: string;
/** What the speaker said. */
text: string;
}
/**
 * Bounded, in-memory conversation history plus the prompt renderer that
 * turns that history into the text sent to the language model.
 *
 * After every insertion the oldest turns are dropped until at most
 * `maxTurns` remain.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  /**
   * @param maxTurns Maximum number of turns retained and rendered.
   */
  constructor(private readonly maxTurns: number) {}

  /**
   * Makes a speaker id/name safe to embed in a single-line `[user ...]` header.
   *
   * The values are caller-supplied, so a name containing a line break or a
   * square bracket could otherwise terminate the header early and forge an
   * additional `[assistant]` / `[user ...]` line inside the prompt. Control
   * characters collapse to a space, brackets become parentheses, and an empty
   * result falls back to "unknown".
   */
  private static headerField(value: string | undefined): string {
    const cleaned = (value ?? "")
      .replace(/[\u0000-\u001f\u007f\u2028\u2029]+/g, " ")
      .replace(/\[/g, "(")
      .replace(/\]/g, ")")
      .trim();
    return cleaned.length > 0 ? cleaned : "unknown";
  }

  /** Renders the `[user speaker_id=... speaker_name=...]` header line. */
  private static userHeader(speakerId: string | undefined, speakerName: string | undefined): string {
    const id = ConversationMemory.headerField(speakerId);
    const name = ConversationMemory.headerField(speakerName);
    return `[user speaker_id=${id} speaker_name=${name}]`;
  }

  /** Records a user utterance, stamped with the current time, then trims the history. */
  addUserTurn(utterance: UserUtterance): void {
    this.turns.push({
      role: "user",
      text: utterance.text,
      speakerId: utterance.speakerId,
      speakerName: utterance.speakerName,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Records an assistant reply, stamped with the current time, then trims the history. */
  addAssistantTurn(text: string): void {
    this.turns.push({
      role: "assistant",
      text,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Removes every stored turn. */
  clear(): void {
    this.turns.splice(0, this.turns.length);
  }

  /**
   * @returns A new array holding the stored turns, oldest first. The array is
   *   a copy; the turn objects themselves are shared with the memory.
   */
  recentTurns(): ConversationTurn[] {
    return [...this.turns];
  }

  /**
   * Renders the stored history followed by the utterance being answered.
   *
   * The current utterance is rendered from the argument only; it is not added
   * to the history by this method.
   *
   * @param currentUtterance The utterance the model should respond to.
   * @returns The prompt text: a "최근 대화:" section (or a placeholder when the
   *   history is empty) and a "이번 발화:" section.
   */
  buildPrompt(currentUtterance: UserUtterance): string {
    const recent = this.turns
      .slice(-this.maxTurns)
      .map((turn) => {
        if (turn.role === "assistant") {
          return `[assistant]\n${turn.text}`;
        }
        return `${ConversationMemory.userHeader(turn.speakerId, turn.speakerName)}\n${turn.text}`;
      })
      .join("\n\n");
    const historyBlock = recent.length > 0 ? recent : "(이전 대화 없음)";

    return [
      "최근 대화:",
      historyBlock,
      "",
      "이번 발화:",
      ConversationMemory.userHeader(currentUtterance.speakerId, currentUtterance.speakerName),
      currentUtterance.text,
    ].join("\n");
  }

  /** Drops the oldest turns so that no more than `maxTurns` remain. */
  private trim(): void {
    const overflow = this.turns.length - this.maxTurns;
    if (overflow > 0) {
      this.turns.splice(0, overflow);
    }
  }
}

View File

@@ -0,0 +1,124 @@
import WebSocket from "ws";
import type { AppConfig } from "../config.js";
/**
 * Shape assumed for JSON messages read from the ElevenLabs realtime STT
 * socket. Only the fields this file reads are declared, and all are optional
 * because the payload is cast from JSON.parse without validation.
 */
interface ElevenLabsMessage {
/** Message discriminator the handler switches on (e.g. "session_started", "committed_transcript"). */
message_type?: string;
/** Transcript text; read for committed transcript messages. */
text?: string;
/** Error description; used as the Error message when the message is treated as a failure. */
error?: string;
}
/**
 * Server message types that end a transcription attempt with a `null` result
 * instead of rejecting the returned promise.
 */
const NON_FATAL_ERROR_TYPES = new Set<string>().add("insufficient_audio_activity");
/**
 * One-shot speech-to-text over the ElevenLabs realtime WebSocket endpoint.
 *
 * Every call opens its own socket, sends the whole buffer as a single
 * committed chunk once the session has started, and settles on the first
 * committed transcript.
 */
export class ElevenLabsSttService {
  constructor(private readonly config: AppConfig) {}

  /**
   * Transcribes one utterance.
   *
   * @param pcm16MonoAudio Audio to transcribe. It is declared to the server as
   *   `pcm_16000` and sent base64-encoded with `sample_rate: 16000`.
   * @returns The trimmed transcript, or `null` when the buffer is empty, the
   *   committed transcript is empty, the server reports a type listed in
   *   NON_FATAL_ERROR_TYPES, the socket closes first, or nothing settles the
   *   call within 15 seconds.
   * @throws Rejects on socket errors, on server messages that are not valid
   *   JSON, and on any server message type that is not handled explicitly.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL);
    url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
    url.searchParams.set("audio_format", "pcm_16000");
    url.searchParams.set("commit_strategy", "manual");
    url.searchParams.set("include_timestamps", "false");
    url.searchParams.set("include_language_detection", "false");
    url.searchParams.set("enable_logging", "false");

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(url, {
        headers: {
          "xi-api-key": this.config.ELEVENLABS_API_KEY,
        },
      });

      let settled = false;

      // Give up quietly (null) rather than hanging if the server never answers.
      const timeout = setTimeout(() => {
        finish(null);
      }, 15_000);

      // Single exit point: settles the promise exactly once and releases the
      // timer and the socket. Later calls are no-ops.
      const finish = (result: string | null, error?: Error) => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeout);
        try {
          socket.close();
        } catch {
          // Ignore close race.
        }
        if (error) {
          reject(error);
          return;
        }
        resolve(result);
      };

      socket.on("message", (raw) => {
        let message: ElevenLabsMessage;
        try {
          message = JSON.parse(raw.toString()) as ElevenLabsMessage;
        } catch (error) {
          finish(null, error instanceof Error ? error : new Error(String(error)));
          return;
        }

        // Valid JSON that is not an object (e.g. `null`) carries no message
        // type; ignore it instead of throwing inside the event handler.
        if (message === null || typeof message !== "object") {
          return;
        }

        switch (message.message_type) {
          case "session_started":
            // The whole utterance goes out as one chunk that is committed
            // immediately, so exactly one commit is answered per call.
            socket.send(
              JSON.stringify({
                message_type: "input_audio_chunk",
                audio_base_64: pcm16MonoAudio.toString("base64"),
                commit: true,
                sample_rate: 16000,
              }),
            );
            return;
          case "partial_transcript":
            return;
          case "committed_transcript":
          case "committed_transcript_with_timestamps": {
            // The commit has been answered: settle now. An empty transcript
            // resolves to null right away instead of keeping the socket open
            // until the timeout fires.
            const transcript = message.text?.trim() ?? "";
            finish(transcript.length > 0 ? transcript : null);
            return;
          }
          default: {
            const type = message.message_type;
            // Messages without a type are ignored.
            if (!type) {
              return;
            }
            if (NON_FATAL_ERROR_TYPES.has(type)) {
              finish(null);
              return;
            }
            // Every other type is treated as a failure, whether or not its
            // name ends in "error".
            finish(null, new Error(message.error ?? `ElevenLabs STT error: ${type}`));
          }
        }
      });

      socket.on("error", (error) => {
        finish(null, error as Error);
      });

      socket.on("close", () => {
        finish(null);
      });
    });
  }
}

View File

@@ -0,0 +1,83 @@
import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media";
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
import type { AppConfig } from "../config.js";
/**
 * A synthesized speech clip ready for playback, together with its cleanup
 * hook, as returned by ElevenLabsTtsService.preparePlayback.
 */
export interface PreparedSpeechPlayback {
/** Audio resource created with createAudioResource from the ffmpeg output stream. */
resource: AudioResource;
/** Destroys the HTTP response stream and the ffmpeg stream that back the resource. */
dispose: () => void;
}
/**
 * Text-to-speech through the ElevenLabs streaming HTTP endpoint, transcoded
 * by ffmpeg into raw PCM for @discordjs/voice.
 */
export class ElevenLabsTtsService {
  /**
   * Publishes the path resolved by `ffmpeg-static` through `FFMPEG_PATH`,
   * unless that environment variable is already set.
   */
  constructor(private readonly config: AppConfig) {
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests speech for `text` and wires the streamed MP3 response through
   * ffmpeg (48 kHz, 2-channel, signed 16-bit little-endian) into an audio
   * resource.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal passed to the HTTP request.
   * @returns The audio resource and a `dispose` hook that tears down both
   *   underlying streams.
   * @throws Error when the response status is not OK or the response has no
   *   body; whatever `fetch` throws (including on abort) propagates as well.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Best effort: discard the unread error body so it is not left pending.
      void response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    // pipe() neither handles errors on the source nor closes the destination
    // when the source fails. Without this listener an aborted or broken
    // download would surface as an unhandled 'error' event and leave ffmpeg
    // waiting for input that never arrives.
    input.on("error", () => {
      ffmpeg.destroy();
    });
    input.pipe(ffmpeg);

    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });

    return {
      resource,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}

View File

@@ -0,0 +1,64 @@
import OpenAI from "openai";
import type { AppConfig } from "../config.js";
import type { ConversationMemory, UserUtterance } from "./conversation.js";
/**
 * Instructions passed as `instructions` on every model request, written in
 * Korean as one space-separated paragraph. In order, they say: act as a Korean
 * voice assistant in a Discord voice channel; keep answers short and
 * practical; one sentence by default and never more than two; use natural
 * Korean; treat speaker_id / speaker_name as speaker labels and mention them
 * only when natural; ask again briefly when the input was unclear; do not use
 * lists, markdown, or code blocks.
 */
const ASSISTANT_INSTRUCTIONS =
  "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다. " +
  "답변은 짧고 실용적으로 한다. " +
  "기본은 한 문장, 길어도 두 문장을 넘기지 않는다. " +
  "말투는 자연스러운 한국어로 유지한다. " +
  "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다. " +
  "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다. " +
  "목록, 마크다운, 코드블록은 쓰지 않는다.";
/**
 * Squeezes a model reply into something short enough to speak.
 *
 * Whitespace runs collapse to single spaces. A reply of at most 180 UTF-16
 * code units is returned as is; a longer one is reduced to its first two
 * sentences and, if still too long, cut at 180 code units.
 *
 * Sentences are split on whitespace that follows `.`, `!` or `?`, so decimals
 * ("23.5") and punctuation runs ("?!", "...") stay intact and the kept
 * sentences are separated by exactly one space.
 *
 * @param text Raw reply text.
 * @returns The normalized reply, never longer than 180 code units.
 */
function normalizeReply(text: string): string {
  const maxLength = 180;

  // Hard cut that never leaves half of a surrogate pair at the end.
  const truncate = (value: string): string => {
    if (value.length <= maxLength) {
      return value;
    }
    const lastUnit = value.charCodeAt(maxLength - 1);
    const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return value.slice(0, endsOnHighSurrogate ? maxLength - 1 : maxLength).trim();
  };

  const compact = text.replace(/\s+/g, " ").trim();
  if (compact.length <= maxLength) {
    return compact;
  }

  const sentences = compact.split(/(?<=[.!?])\s+/);
  return truncate(sentences.slice(0, 2).join(" "));
}
/**
 * Produces assistant replies through the OpenAI Responses API.
 */
export class OpenAiLlmService {
  private readonly client: OpenAI;

  /** Creates the OpenAI client from the configured API key. */
  constructor(private readonly config: AppConfig) {
    const apiKey = this.config.OPENAI_API_KEY;
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Asks the model to answer `utterance` in the context of `memory`.
   *
   * The prompt comes from `memory.buildPrompt(utterance)` and is sent as a
   * single user message alongside ASSISTANT_INSTRUCTIONS, capped at 120
   * output tokens. This method does not write to `memory`.
   *
   * @returns The reply passed through normalizeReply, or a fixed Korean
   *   "please say that again" sentence when the model returned no text.
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const prompt = memory.buildPrompt(utterance);

    const response = await this.client.responses.create({
      model: this.config.OPENAI_MODEL,
      instructions: ASSISTANT_INSTRUCTIONS,
      input: [
        {
          role: "user",
          content: [{ type: "input_text", text: prompt }],
        },
      ],
      max_output_tokens: 120,
    });

    const reply = response.output_text?.trim();
    return reply ? normalizeReply(reply) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
  }
}