feat: scaffold realtime Korean voice assistant bot

This commit is contained in:
2026-04-30 02:29:18 +09:00
commit 9dee708b64
15 changed files with 1574 additions and 0 deletions

View File

@@ -0,0 +1,452 @@
import { EventEmitter } from "node:events";
import prism from "prism-media";
import { RealTimeVAD } from "avr-vad";
import {
AudioPlayerStatus,
EndBehaviorType,
NoSubscriberBehavior,
VoiceConnectionStatus,
createAudioPlayer,
entersState,
joinVoiceChannel,
type AudioPlayer,
type AudioReceiveStream,
type VoiceConnection,
} from "@discordjs/voice";
import type { Client, Guild, VoiceBasedChannel } from "discord.js";
import type { AppConfig } from "../config.js";
import { Logger } from "../logger.js";
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
import { OpenAiLlmService } from "../services/openai-llm.js";
/** Everything a GuildVoiceSession needs to join a voice channel and run the STT -> LLM -> TTS loop. */
interface GuildVoiceSessionOptions {
// Discord client; used to fetch users and channels and to recognise the bot's own user id.
client: Client;
// Parsed application configuration (conversation length, debug text events, ...).
config: AppConfig;
logger: Logger;
// Guild whose voice adapter is used to join the channel.
guild: Guild;
// Voice channel the session joins.
voiceChannel: VoiceBasedChannel;
// Optional text channel that receives status and debug announcements.
textChannelId?: string;
// Speech-to-text, text-to-speech and reply-generation services shared by the session.
stt: ElevenLabsSttService;
tts: ElevenLabsTtsService;
llm: OpenAiLlmService;
}
/** One queued piece of text waiting to be synthesised and played in the voice channel. */
interface SpeechJob {
// Text to synthesise.
text: string;
// "assistant" for generated replies, "manual" for text queued through speakText().
source: "assistant" | "manual";
}
/**
 * Per-speaker audio pipeline.
 *
 * Decoded stereo 48 kHz PCM from the Opus decoder is down-mixed to mono 16 kHz,
 * cut into fixed-size frames and fed sequentially into a voice-activity
 * detector. The detector's callbacks report speech start and speech end (with
 * the captured audio) to the owner.
 */
class UserAudioSession {
  /**
   * Number of 16 kHz samples handed to the VAD per frame. The same value is
   * used for the VAD configuration and for framing so they cannot drift apart.
   * NOTE(review): 1536 is combined with model "v5" here; confirm against the
   * avr-vad documentation that this frame size is valid for that model.
   */
  private static readonly FRAME_SAMPLES = 1536;

  private readonly downsampler = new Stereo48kToMono16kDownsampler();
  private readonly pendingSamples: number[] = [];
  /** Serialises VAD calls so frames are processed strictly in arrival order. */
  private processing: Promise<void> = Promise.resolve();
  /** Set by destroy(); stops late decoder chunks and queued frames from reaching the VAD. */
  private destroyed = false;

  private constructor(
    private readonly logger: Logger,
    private readonly speakerId: string,
    private readonly speakerName: string,
    private readonly receiveStream: AudioReceiveStream,
    private readonly decoder: NodeJS.ReadWriteStream & { destroy: () => void },
    private readonly vad: RealTimeVAD,
  ) {}

  /**
   * Builds the VAD, wires the decoder output into it and returns the session.
   *
   * @param options.onSpeechStart Invoked when the VAD reports the start of speech.
   * @param options.onSpeechEnd Invoked when the VAD reports the end of speech, with
   *   an utterance whose `text` is still empty and the captured audio samples.
   * @returns The ready session; rejects if the VAD cannot be created.
   */
  static async create(options: {
    logger: Logger;
    speakerId: string;
    speakerName: string;
    receiveStream: AudioReceiveStream;
    decoder: NodeJS.ReadWriteStream & { destroy: () => void };
    onSpeechStart: () => void;
    onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void;
  }): Promise<UserAudioSession> {
    const vadInstance = await RealTimeVAD.new({
      model: "v5",
      sampleRate: 16000,
      frameSamples: UserAudioSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        options.onSpeechStart();
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        options.onSpeechEnd(
          {
            speakerId: options.speakerId,
            speakerName: options.speakerName,
            // The transcript is filled in later by the speech-to-text step.
            text: "",
          },
          audio,
        );
      },
    });

    const session = new UserAudioSession(
      options.logger,
      options.speakerId,
      options.speakerName,
      options.receiveStream,
      options.decoder,
      vadInstance,
    );

    session.decoder.on("data", (chunk: Buffer) => {
      session.pushPcmChunk(chunk);
    });
    session.decoder.on("error", (error) => {
      options.logger.warn("PCM decoder error", options.speakerId, error);
    });
    session.receiveStream.on("error", (error) => {
      options.logger.warn("Audio receive stream error", options.speakerId, error);
    });
    return session;
  }

  /** Down-samples one decoded PCM chunk and queues every complete frame for the VAD. */
  private pushPcmChunk(chunk: Buffer): void {
    if (this.destroyed) {
      return;
    }
    const mono16k = this.downsampler.pushStereo48kChunk(chunk);
    if (mono16k.length === 0) {
      return;
    }
    for (const sample of mono16k) {
      this.pendingSamples.push(sample);
    }
    while (true) {
      const frame = takeFrame(this.pendingSamples, UserAudioSession.FRAME_SAMPLES);
      if (!frame) {
        return;
      }
      const floatFrame = int16ArrayToFloat32(frame);
      this.processing = this.processing
        .then(async () => {
          // Frames queued before destroy() must not touch a VAD that is being torn down.
          if (this.destroyed) {
            return;
          }
          await this.vad.processAudio(floatFrame);
        })
        .catch((error) => {
          this.logger.warn("VAD frame processing failed", this.speakerId, this.speakerName, error);
        });
    }
  }

  /** Stops receiving audio and releases the decoder and the VAD. Safe to call more than once. */
  destroy(): void {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;
    this.receiveStream.destroy();
    this.decoder.destroy();
    // Destroy the VAD only after the in-flight frame (if any) has finished.
    void this.processing
      .then(() => this.vad.destroy())
      .catch((error) => {
        this.logger.warn("VAD destroy failed", this.speakerId, this.speakerName, error);
      });
  }
}
/**
 * One voice-assistant session for a single guild.
 *
 * The session joins a voice channel, tracks every speaking user through a
 * UserAudioSession, turns finished utterances into text (STT), asks the LLM for
 * a reply and plays the synthesised reply (TTS) through a single audio player.
 * Speech jobs are played strictly one at a time from an internal queue; any
 * detected speech start interrupts playback (barge-in).
 */
export class GuildVoiceSession extends EventEmitter {
  readonly guildId: string;
  readonly voiceChannelId: string;
  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  private readonly memory: ConversationMemory;
  /** Fully initialised per-speaker pipelines, keyed by user id. */
  private readonly trackedUsers = new Map<string, UserAudioSession>();
  /** In-flight per-speaker setups, so concurrent "start" events share one setup. */
  private readonly pendingUsers = new Map<string, Promise<void>>();
  private readonly queue: SpeechJob[] = [];
  private draining = false;
  /** Set once destroy() has run; late async work checks it before touching resources. */
  private destroyed = false;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechPlayback | null = null;
  private textChannelId?: string;

  private constructor(private readonly options: GuildVoiceSessionOptions) {
    super();
    this.guildId = options.guild.id;
    this.voiceChannelId = options.voiceChannel.id;
    this.textChannelId = options.textChannelId;
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
    this.player = createAudioPlayer({
      behaviors: {
        noSubscriber: NoSubscriberBehavior.Pause,
      },
    });
    this.connection = joinVoiceChannel({
      guildId: options.guild.id,
      channelId: options.voiceChannel.id,
      adapterCreator: options.guild.voiceAdapterCreator,
      selfDeaf: false,
      selfMute: false,
    });
  }

  /**
   * Joins the voice channel and resolves once the connection is ready.
   *
   * @throws Whatever initialisation rejects with (for example when the connection
   *   does not become ready in time). The half-open connection is destroyed
   *   before the error is rethrown so a failed join does not leave the bot
   *   sitting in the channel.
   */
  static async create(options: GuildVoiceSessionOptions): Promise<GuildVoiceSession> {
    const session = new GuildVoiceSession(options);
    try {
      await session.initialize();
    } catch (error) {
      await session.destroy();
      throw error;
    }
    return session;
  }

  /** Wires player and connection events and waits (up to 30 s) for the connection to become ready. */
  private async initialize(): Promise<void> {
    this.player.on("error", (error) => {
      this.options.logger.warn("Audio player error", this.guildId, error);
    });
    this.connection.on("stateChange", (_oldState, newState) => {
      if (newState.status === VoiceConnectionStatus.Destroyed) {
        this.options.logger.info("Voice connection destroyed", this.guildId);
      }
    });
    this.connection.subscribe(this.player);
    await entersState(this.connection, VoiceConnectionStatus.Ready, 30_000);
    this.connection.receiver.speaking.on("start", (userId: string) => {
      // Never listen to the bot's own audio.
      if (userId === this.options.client.user?.id) {
        return;
      }
      // Speaker setup can fail; log instead of leaving an unhandled rejection.
      this.ensureTrackedUser(userId).catch((error: unknown) => {
        this.options.logger.warn("Failed to track speaker", this.guildId, userId, error);
      });
    });
  }

  /** Changes (or clears) the text channel used for announcements. */
  setTextChannel(textChannelId?: string): void {
    this.textChannelId = textChannelId;
  }

  /** Forgets the conversation history and stops any current or queued playback. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("conversation-reset");
  }

  /** Returns a multi-line, user-facing status summary (one "label: value" pair per line). */
  statusSummary(): string {
    const playbackState = this.player.state.status;
    return [
      `세션 활성: 예`,
      `음성 채널: ${this.options.voiceChannel.name}`,
      `추적 유저 수: ${this.trackedUsers.size}`,
      `재생 상태: ${playbackState}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /**
   * Queues text for playback. If no drain is already running, the returned
   * promise resolves once the queue has been played; otherwise it resolves
   * immediately and the running drain picks the job up.
   */
  async speakText(text: string): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued speech, aborts an in-flight synthesis request and stops the player.
   *
   * @param reason Short tag written to the log when there was something to interrupt.
   */
  interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.player.state.status !== AudioPlayerStatus.Idle) {
      this.options.logger.info("Interrupting playback", this.guildId, reason);
    }
    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    this.player.stop(true);
  }

  /** Stops playback, tears down every speaker pipeline and leaves the voice channel. Idempotent. */
  async destroy(): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;
    this.interruptPlayback("session-destroy");
    for (const session of this.trackedUsers.values()) {
      session.destroy();
    }
    this.trackedUsers.clear();
    this.pendingUsers.clear();
    // Guard against a connection that was already destroyed elsewhere.
    if (this.connection.state.status !== VoiceConnectionStatus.Destroyed) {
      this.connection.destroy();
    }
  }

  /** Ensures a speaker pipeline exists for the user, sharing one setup between concurrent callers. */
  private async ensureTrackedUser(userId: string): Promise<void> {
    if (this.destroyed || this.trackedUsers.has(userId)) {
      return;
    }
    const existing = this.pendingUsers.get(userId);
    if (existing) {
      await existing;
      return;
    }
    const pending = this.createTrackedUser(userId).finally(() => {
      this.pendingUsers.delete(userId);
    });
    this.pendingUsers.set(userId, pending);
    await pending;
  }

  /** Subscribes to the user's audio, builds the decode/VAD pipeline and registers it. */
  private async createTrackedUser(userId: string): Promise<void> {
    const speakerName = await this.resolveSpeakerName(userId);
    if (this.destroyed) {
      return;
    }
    const receiveStream = this.connection.receiver.subscribe(userId, {
      end: {
        behavior: EndBehaviorType.Manual,
      },
    });
    const decoder = new prism.opus.Decoder({
      rate: 48000,
      channels: 2,
      frameSize: 960,
    }) as NodeJS.ReadWriteStream & { destroy: () => void };
    receiveStream.pipe(decoder);

    let session: UserAudioSession;
    try {
      session = await UserAudioSession.create({
        logger: this.options.logger,
        speakerId: userId,
        speakerName,
        receiveStream,
        decoder,
        onSpeechStart: () => {
          this.interruptPlayback(`barge-in:${speakerName}`);
        },
        onSpeechEnd: (utterance, audio) => {
          this.handleSpeechEnd(utterance, audio).catch((error: unknown) => {
            this.options.logger.warn("Utterance handling failed", this.guildId, userId, error);
          });
        },
      });
    } catch (error) {
      // The pipeline was never registered, so nobody else would release these streams.
      receiveStream.destroy();
      decoder.destroy();
      throw error;
    }

    // The session may have been destroyed while the VAD was loading.
    if (this.destroyed) {
      session.destroy();
      return;
    }
    this.trackedUsers.set(userId, session);
    this.options.logger.info("Tracking speaker", this.guildId, userId, speakerName);
  }

  /** Returns the user's display name, or a short id-based placeholder when the lookup fails. */
  private async resolveSpeakerName(userId: string): Promise<string> {
    try {
      const user = await this.options.client.users.fetch(userId);
      return user.globalName ?? user.username;
    } catch {
      return `user-${userId.slice(-6)}`;
    }
  }

  /** Runs the STT -> memory -> LLM -> TTS pipeline for one finished utterance. */
  private async handleSpeechEnd(utterance: UserUtterance, audio: Float32Array): Promise<void> {
    // Ignore clips shorter than a quarter of a second of 16 kHz audio.
    if (this.destroyed || audio.length < 16000 * 0.25) {
      return;
    }
    const pcmBuffer = float32ToPcm16Buffer(audio);
    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(pcmBuffer);
    } catch (error) {
      this.options.logger.warn("STT failed", this.guildId, utterance.speakerId, error);
      await this.announce(`음성 인식 실패: ${utterance.speakerName}`);
      return;
    }
    if (this.destroyed || !transcript || transcript.trim().length === 0) {
      return;
    }
    const hydratedUtterance: UserUtterance = {
      ...utterance,
      text: transcript.trim(),
    };
    this.options.logger.info("Transcript committed", this.guildId, hydratedUtterance.speakerName, hydratedUtterance.text);
    this.memory.addUserTurn(hydratedUtterance);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🗣️ ${hydratedUtterance.speakerName}: ${hydratedUtterance.text}`);
    }
    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, hydratedUtterance);
    } catch (error) {
      this.options.logger.warn("LLM failed", this.guildId, utterance.speakerId, error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }
    // Do not spend a TTS request on a session that was torn down in the meantime.
    if (this.destroyed) {
      return;
    }
    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🤖 ${reply}`);
    }
    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /** Plays queued jobs one at a time. Only one drain loop runs; concurrent callers return immediately. */
  private async drainQueue(): Promise<void> {
    if (this.draining) {
      return;
    }
    this.draining = true;
    try {
      while (!this.destroyed && this.queue.length > 0) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }
        const abortController = new AbortController();
        this.currentAbortController = abortController;

        let playback: PreparedSpeechPlayback;
        try {
          playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          if (abortController.signal.aborted) {
            continue;
          }
          this.options.logger.warn("TTS synthesis failed", this.guildId, job.source, error);
          await this.announce("음성 출력 생성에 실패했습니다.");
          continue;
        }

        // An interruption may land just as synthesis resolves; do not start stale audio.
        if (abortController.signal.aborted) {
          playback.dispose();
          continue;
        }

        this.currentPlayback = playback;
        try {
          this.player.play(playback.resource);
          await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);
          await entersState(this.player, AudioPlayerStatus.Idle, 300_000);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Audio playback failed", this.guildId, error);
          }
        } finally {
          // interruptPlayback() may already have disposed and cleared this playback.
          if (this.currentPlayback === playback) {
            playback.dispose();
            this.currentPlayback = null;
          }
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /** Best-effort message to the configured text channel; every failure is ignored. */
  private async announce(message: string): Promise<void> {
    if (!this.textChannelId) {
      return;
    }
    const channel = await this.options.client.channels.fetch(this.textChannelId).catch(() => null);
    if (!channel?.isTextBased() || !("send" in channel) || typeof channel.send !== "function") {
      return;
    }
    await channel.send(message).catch(() => null);
  }
}

60
src/audio/pcm.ts Normal file
View File

@@ -0,0 +1,60 @@
/**
 * Streaming converter from interleaved 16-bit little-endian stereo PCM at
 * 48 kHz to mono PCM at 16 kHz.
 *
 * Each stereo frame is averaged into one mono sample, and every three
 * consecutive mono samples are averaged into one output sample. Mono samples
 * that do not yet fill a group of three are kept for the next call.
 */
export class Stereo48kToMono16kDownsampler {
  private readonly pendingMono48k: number[] = [];

  /**
   * @param chunk Interleaved stereo samples; trailing bytes that do not form a
   *   whole 4-byte stereo frame are ignored.
   * @returns The newly available 16 kHz samples (possibly empty).
   */
  pushStereo48kChunk(chunk: Buffer): Int16Array {
    const bytesPerStereoFrame = 4;
    if (chunk.length < bytesPerStereoFrame) {
      return new Int16Array();
    }

    // Down-mix: one mono sample per complete left/right pair.
    const frameCount = Math.floor(chunk.length / bytesPerStereoFrame);
    for (let frame = 0; frame < frameCount; frame += 1) {
      const byteOffset = frame * bytesPerStereoFrame;
      const leftSample = chunk.readInt16LE(byteOffset);
      const rightSample = chunk.readInt16LE(byteOffset + 2);
      this.pendingMono48k.push(Math.round((leftSample + rightSample) / 2));
    }

    // Decimate 3:1 by averaging each complete group of three.
    const groupCount = Math.floor(this.pendingMono48k.length / 3);
    if (groupCount === 0) {
      return new Int16Array();
    }
    const consumed = this.pendingMono48k.splice(0, groupCount * 3);
    const decimated = new Int16Array(groupCount);
    for (let group = 0; group < groupCount; group += 1) {
      const base = group * 3;
      decimated[group] = Math.round((consumed[base] + consumed[base + 1] + consumed[base + 2]) / 3);
    }
    return decimated;
  }
}
/**
 * Converts signed 16-bit PCM samples to floats by dividing each sample by 32768,
 * so the output lies in [-1, 1).
 */
export function int16ArrayToFloat32(input: Int16Array): Float32Array {
  return Float32Array.from(input, (sample) => sample / 32768);
}
/**
 * Encodes float samples as signed 16-bit little-endian PCM.
 *
 * Samples are clamped to [-1, 1]; negative values are scaled by 32768 and
 * non-negative values by 32767 so both ends of the int16 range are reachable.
 */
export function float32ToPcm16Buffer(input: Float32Array): Buffer {
  const bytesPerSample = 2;
  const pcm = Buffer.allocUnsafe(input.length * bytesPerSample);
  input.forEach((sample, position) => {
    const clamped = Math.max(-1, Math.min(1, sample));
    const scale = clamped < 0 ? 32768 : 32767;
    pcm.writeInt16LE(Math.round(clamped * scale), position * bytesPerSample);
  });
  return pcm;
}
/**
 * Removes the first `frameSize` samples from `source` (mutating it) and returns
 * them as an Int16Array, or returns null and leaves `source` untouched when
 * fewer than `frameSize` samples are buffered.
 */
export function takeFrame(source: number[], frameSize: number): Int16Array | null {
  const frameIsIncomplete = source.length < frameSize;
  return frameIsIncomplete ? null : Int16Array.from(source.splice(0, frameSize));
}

29
src/config.ts Normal file
View File

@@ -0,0 +1,29 @@
import { config as loadDotenv } from "dotenv";
import { z } from "zod";
loadDotenv();
/**
 * Schema for the environment variables the bot reads.
 * Variables without `.optional()` or `.default(...)` are required, and parsing
 * fails when they are missing or empty.
 */
const envSchema = z.object({
DISCORD_BOT_TOKEN: z.string().min(1),
DISCORD_APPLICATION_ID: z.string().min(1),
// Optional guild id for command registration.
DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
OPENAI_API_KEY: z.string().min(1),
OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
ELEVENLABS_API_KEY: z.string().min(1),
ELEVENLABS_VOICE_ID: z.string().min(1),
ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
// Coerced from the env string to an integer between 4 and 30.
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
// Only the exact string "true" enables this flag; anything else (including unset) is false.
DEBUG_TEXT_EVENTS: z
.string()
.optional()
.transform((value) => value === "true"),
LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});
/** Validated configuration shape, derived from the schema so the two cannot drift apart. */
export type AppConfig = z.infer<typeof envSchema>;
/**
 * Validates `process.env` against the environment schema and returns the typed
 * configuration. Throws the schema's validation error when a variable is
 * missing or malformed.
 */
export function loadConfig(): AppConfig {
  const validated: AppConfig = envSchema.parse(process.env);
  return validated;
}

240
src/index.ts Normal file
View File

@@ -0,0 +1,240 @@
import process from "node:process";
import {
GatewayIntentBits,
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
// Configuration is validated once at startup; loadConfig() throws on invalid environment.
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);
// Slash command definitions, serialised to the JSON payload sent when registering commands.
const commands = [
new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
new SlashCommandBuilder()
.setName("say")
.setDescription("텍스트를 바로 음성으로 읽습니다.")
.addStringOption((option) =>
option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
),
].map((command) => command.toJSON());
const client = new DiscordClient({
intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});
// Service instances are created once and shared by every guild session.
const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);
// Active voice sessions keyed by guild id (at most one per guild).
const sessions = new Map<string, GuildVoiceSession>();
/**
 * Returns the voice channel the invoking member is currently in, or null.
 *
 * `interaction.member` is only a full GuildMember when the guild is cached;
 * otherwise it is a raw API member object with no `voice` property. The voice
 * state is therefore read with optional chaining so that case yields null
 * instead of a TypeError.
 */
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  const member = interaction.member as GuildMember | null;
  const voiceState = member?.voice as GuildMember["voice"] | undefined;
  return voiceState?.channel ?? null;
}
/**
 * Uploads the slash command definitions: to a single guild when
 * DISCORD_COMMAND_GUILD_ID is configured, otherwise as global commands.
 *
 * @param appClient Currently unused; kept so existing call sites keep working.
 */
async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  const applicationId = config.DISCORD_APPLICATION_ID;
  const targetGuildId = config.DISCORD_COMMAND_GUILD_ID;

  const route = targetGuildId
    ? Routes.applicationGuildCommands(applicationId, targetGuildId)
    : Routes.applicationCommands(applicationId);
  await rest.put(route, { body: commands });

  if (targetGuildId) {
    logger.info("Registered guild commands", targetGuildId);
  } else {
    logger.info("Registered global commands");
  }
}
/**
 * Returns the voice session for the caller's current voice channel.
 *
 * A session already running in that channel is reused, and its announcement
 * channel is pointed at the channel the command came from. A session running
 * in a different channel of the same guild is destroyed first and replaced.
 *
 * @throws Error when used outside a guild or when the caller is not in a voice channel.
 */
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  const guild = interaction.guild;
  if (!guild) {
    throw new Error("Guild interaction required");
  }
  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }

  const current = sessions.get(guild.id);
  if (current) {
    if (current.voiceChannelId === voiceChannel.id) {
      current.setTextChannel(interaction.channelId);
      return current;
    }
    // The bot is in another channel of this guild: tear that session down first.
    await current.destroy();
    sessions.delete(guild.id);
  }

  const created = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });
  sessions.set(guild.id, created);
  return created;
}
/** `/join`: starts (or reuses) the voice session and reports the joined channel, or the failure reason. */
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });
  try {
    const session = await createSession(interaction);
    // The channel name is taken from the second line of the status summary.
    const channelLine = session.statusSummary().split("\n")[1];
    const channelName = channelLine?.replace("음성 채널: ", "") ?? "알 수 없음";
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
  } catch (error) {
    let failureMessage = "세션 생성에 실패했습니다.";
    if (error instanceof Error) {
      failureMessage = error.message;
    }
    await interaction.editReply(failureMessage);
  }
}
/** `/leave`: destroys the guild's voice session, if there is one. */
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guild?.id;
  const session = guildId === undefined ? undefined : sessions.get(guildId);
  if (!session || guildId === undefined) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }
  await session.destroy();
  sessions.delete(guildId);
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}
/** `/status`: replies with the session's status summary, or a notice when no session is active. */
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;
  const content = session ? session.statusSummary() : "현재 활성화된 음성 세션이 없습니다.";
  await interaction.reply({ content, ephemeral: true });
}
/** `/reset`: clears the session's conversation memory and playback queue. */
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;
  if (session) {
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
    return;
  }
  await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
}
/**
 * `/say`: queues the given text for speech playback.
 *
 * The confirmation is sent as soon as the text is queued. Waiting for
 * `speakText` would hold the reply until playback had finished, which
 * contradicts the "added to the queue" message. Text that is empty after
 * trimming is rejected instead of being sent to the TTS service.
 */
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });
  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }
  const text = interaction.options.getString("text", true).trim();
  if (text.length === 0) {
    await interaction.editReply("읽을 문장을 입력해 주세요.");
    return;
  }
  // speakText() enqueues synchronously before its first await, so the job is
  // already queued by the time the confirmation below is sent.
  void session.speakText(text).catch((error: unknown) => {
    logger.warn("Manual speech failed", error);
  });
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
/** Guards against running the shutdown sequence twice (e.g. SIGINT followed by SIGTERM). */
let shutdownStarted = false;

/**
 * Destroys every voice session and the Discord client, then exits the process.
 *
 * The process exits even when cleanup fails; in that case a zero exit code is
 * replaced by 1 so the failure is visible to the supervisor.
 *
 * @param exitCode Exit code used when cleanup succeeds.
 */
async function shutdown(exitCode = 0): Promise<void> {
  if (shutdownStarted) {
    return;
  }
  shutdownStarted = true;
  logger.info("Shutting down");
  let finalExitCode = exitCode;
  try {
    for (const session of sessions.values()) {
      await session.destroy().catch((error) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();
    await client.destroy();
  } catch (error) {
    logger.error("Shutdown failed", error);
    if (finalExitCode === 0) {
      finalExitCode = 1;
    }
  } finally {
    process.exit(finalExitCode);
  }
}
/** Slash command name -> handler. Unknown names get a generic "unknown command" reply. */
const commandHandlers = new Map<string, (interaction: ChatInputCommandInteraction) => Promise<void>>([
  ["join", handleJoin],
  ["leave", handleLeave],
  ["status", handleStatus],
  ["reset", handleReset],
  ["say", handleSay],
]);

// Register slash commands once the gateway connection is ready; a failure is logged, not fatal.
client.once("ready", async () => {
  logger.info("Discord client ready", client.user?.tag ?? "unknown");
  await registerCommands(client).catch((error: unknown) => {
    logger.error("Command registration failed", error);
  });
});

// Route chat-input commands to their handlers and report handler failures to the user.
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }
  try {
    const handler = commandHandlers.get(interaction.commandName);
    if (handler) {
      await handler(interaction);
      return;
    }
    await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
  } catch (error) {
    logger.error("Interaction handler failed", error);
    // An interaction that was already acknowledged can only be edited, not replied to again.
    const alreadyAcknowledged = interaction.deferred || interaction.replied;
    if (alreadyAcknowledged) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
    } else {
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  }
});

// Shut down cleanly on the usual termination signals.
for (const signal of ["SIGINT", "SIGTERM"] as const) {
  process.on(signal, () => {
    void shutdown(0);
  });
}
/** Entry point: logs the Discord client in with the configured bot token. */
async function main(): Promise<void> {
  const token = config.DISCORD_BOT_TOKEN;
  await client.login(token);
}

/** Logs a startup failure and terminates the process with exit code 1. */
const exitOnStartupFailure = (error: unknown): void => {
  logger.error("Fatal startup error", error);
  process.exit(1);
};

void main().catch(exitOnStartupFailure);

63
src/logger.ts Normal file
View File

@@ -0,0 +1,63 @@
/** Supported log severities, from least to most severe. */
type LogLevel = "debug" | "info" | "warn" | "error";
// Numeric rank per level; a message is written when its rank is >= the configured level's rank.
const levelOrder: Record<LogLevel, number> = {
debug: 10,
info: 20,
warn: 30,
error: 40,
};
/**
 * Renders log arguments as one space-separated string.
 *
 * @param parts Values passed to a logger method.
 * @returns The formatted line fragment; formatting never throws.
 */
function formatParts(parts: unknown[]): string {
  return parts.map(formatPart).join(" ");
}

/**
 * Renders one log argument: errors as "Name: message", strings verbatim, and
 * everything else as JSON. Values JSON cannot represent (circular structures,
 * `undefined`, functions, symbols) fall back to `String(value)` so a log call
 * can never crash the caller or silently drop an argument.
 */
function formatPart(part: unknown): string {
  if (part instanceof Error) {
    return `${part.name}: ${part.message}`;
  }
  if (typeof part === "string") {
    return part;
  }
  if (typeof part === "bigint") {
    return part.toString();
  }
  try {
    // JSON.stringify returns undefined for undefined/functions/symbols and throws on
    // circular references; nested BigInt values are rendered as strings.
    const json: string | undefined = JSON.stringify(part, (_key, value: unknown) =>
      typeof value === "bigint" ? value.toString() : value,
    );
    return json ?? String(part);
  } catch {
    return String(part);
  }
}
/**
 * Minimal levelled console logger.
 *
 * Messages below the configured level are dropped. Each written line has the
 * form `[ISO timestamp] [LEVEL] message`; errors go to `console.error`,
 * warnings to `console.warn` and everything else to `console.log`.
 */
export class Logger {
  constructor(private readonly level: LogLevel) {}

  debug(...parts: unknown[]): void {
    this.write("debug", ...parts);
  }

  info(...parts: unknown[]): void {
    this.write("info", ...parts);
  }

  warn(...parts: unknown[]): void {
    this.write("warn", ...parts);
  }

  error(...parts: unknown[]): void {
    this.write("error", ...parts);
  }

  /** True when `target` is at least as severe as the configured level. */
  private shouldLog(target: LogLevel): boolean {
    const threshold = levelOrder[this.level];
    return levelOrder[target] >= threshold;
  }

  /** Formats the message and sends it to the console method matching its level. */
  private write(target: LogLevel, ...parts: unknown[]): void {
    if (!this.shouldLog(target)) {
      return;
    }
    const timestamp = new Date().toISOString();
    const line = `[${timestamp}] [${target.toUpperCase()}] ${formatParts(parts)}`;
    switch (target) {
      case "error":
        console.error(line);
        break;
      case "warn":
        console.warn(line);
        break;
      default:
        console.log(line);
    }
  }
}

View File

@@ -0,0 +1,77 @@
/** One stored entry of the conversation history. */
export interface ConversationTurn {
role: "user" | "assistant";
text: string;
// Speaker fields are only set on user turns.
speakerId?: string;
speakerName?: string;
// Creation time as returned by Date.now() (milliseconds since the Unix epoch).
createdAt: number;
}
/** A single utterance attributed to one speaker. */
export interface UserUtterance {
speakerId: string;
speakerName: string;
// Utterance text as stored in history and rendered into the prompt.
text: string;
}
/**
 * Bounded, in-memory conversation history that also renders the text prompt
 * sent to the language model.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  /** @param maxTurns Maximum number of turns kept; older turns are discarded first. */
  constructor(private readonly maxTurns: number) {}

  /** Appends a user turn and discards the oldest turns beyond the limit. */
  addUserTurn(utterance: UserUtterance): void {
    this.turns.push({
      role: "user",
      text: utterance.text,
      speakerId: utterance.speakerId,
      speakerName: utterance.speakerName,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Appends an assistant turn and discards the oldest turns beyond the limit. */
  addAssistantTurn(text: string): void {
    this.turns.push({
      role: "assistant",
      text,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Removes every stored turn. */
  clear(): void {
    this.turns.splice(0, this.turns.length);
  }

  /** Returns a shallow copy of the stored turns, oldest first. */
  recentTurns(): ConversationTurn[] {
    return [...this.turns];
  }

  /**
   * Renders the recent history followed by the utterance being answered.
   *
   * The current utterance has usually been stored with `addUserTurn` before the
   * prompt is built, so the most recent stored user turn with the same speaker
   * and text is left out of the history section. Without this the utterance
   * would appear twice, once as history and once as the current utterance.
   *
   * @param currentUtterance The utterance the model should respond to.
   * @returns The prompt text.
   */
  buildPrompt(currentUtterance: UserUtterance): string {
    const recent = this.historyWithout(currentUtterance)
      .slice(-this.maxTurns)
      .map((turn) => {
        if (turn.role === "assistant") {
          return `[assistant]\n${turn.text}`;
        }
        return `[user speaker_id=${turn.speakerId ?? "unknown"} speaker_name=${turn.speakerName ?? "unknown"}]\n${turn.text}`;
      })
      .join("\n\n");
    const historyBlock = recent.length > 0 ? recent : "(이전 대화 없음)";
    return [
      "최근 대화:",
      historyBlock,
      "",
      "이번 발화:",
      `[user speaker_id=${currentUtterance.speakerId} speaker_name=${currentUtterance.speakerName}]`,
      currentUtterance.text,
    ].join("\n");
  }

  /** Returns the stored turns minus the most recent user turn that matches `current`. */
  private historyWithout(current: UserUtterance): ConversationTurn[] {
    for (let index = this.turns.length - 1; index >= 0; index -= 1) {
      const turn = this.turns[index];
      if (
        turn &&
        turn.role === "user" &&
        turn.speakerId === current.speakerId &&
        turn.text === current.text
      ) {
        return [...this.turns.slice(0, index), ...this.turns.slice(index + 1)];
      }
    }
    return this.turns;
  }

  /** Drops the oldest turns so that at most `maxTurns` remain. */
  private trim(): void {
    const overflow = this.turns.length - this.maxTurns;
    if (overflow > 0) {
      this.turns.splice(0, overflow);
    }
  }
}

View File

@@ -0,0 +1,124 @@
import WebSocket from "ws";
import type { AppConfig } from "../config.js";
/** Subset of the JSON messages received over the realtime STT socket that this client reads. */
interface ElevenLabsMessage {
// Discriminator used to dispatch the message (e.g. "session_started", "committed_transcript").
message_type?: string;
// Transcript text carried by transcript messages.
text?: string;
// Error description; used as the error message when an unrecognised message type arrives.
error?: string;
}
// Message types that end a transcription with "no transcript" (null) instead of an error.
const NON_FATAL_ERROR_TYPES = new Set([
"insufficient_audio_activity",
]);
/**
 * One-shot speech-to-text over the ElevenLabs realtime WebSocket API: opens a
 * socket per utterance, sends the whole clip as a single committed chunk and
 * resolves with the first non-empty committed transcript.
 */
export class ElevenLabsSttService {
  constructor(private readonly config: AppConfig) {}

  /**
   * Transcribes one utterance.
   *
   * @param pcm16MonoAudio Audio sent as `pcm_16000` with `sample_rate: 16000`.
   * @returns The trimmed transcript, or null when the input is empty, the
   *   service reports a non-fatal condition, the socket closes first, or
   *   nothing arrives within 15 seconds.
   * @throws Error on socket errors, unparsable messages, or any other
   *   unrecognised message type (treated as a service error).
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const endpoint = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    const query: Array<[string, string]> = [
      ["model_id", this.config.ELEVENLABS_STT_MODEL],
      ["language_code", this.config.BOT_DEFAULT_LANGUAGE],
      ["audio_format", "pcm_16000"],
      ["commit_strategy", "manual"],
      ["include_timestamps", "false"],
      ["include_language_detection", "false"],
      ["enable_logging", "false"],
    ];
    for (const [key, value] of query) {
      endpoint.searchParams.set(key, value);
    }

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(endpoint, {
        headers: {
          "xi-api-key": this.config.ELEVENLABS_API_KEY,
        },
      });
      let settled = false;

      // Give up with "no transcript" if nothing was committed in time.
      const timeout = setTimeout(() => {
        settle(null);
      }, 15_000);

      /** Settles the promise exactly once, stops the timer and closes the socket. */
      const settle = (result: string | null, error?: Error): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeout);
        try {
          socket.close();
        } catch {
          // Ignore close race.
        }
        if (error) {
          reject(error);
        } else {
          resolve(result);
        }
      };

      socket.on("message", (raw) => {
        let message: ElevenLabsMessage;
        try {
          message = JSON.parse(raw.toString()) as ElevenLabsMessage;
        } catch (error) {
          settle(null, error as Error);
          return;
        }

        const messageType = message.message_type;
        if (messageType === "session_started") {
          // The whole utterance goes out as one chunk that is committed immediately.
          socket.send(
            JSON.stringify({
              message_type: "input_audio_chunk",
              audio_base_64: pcm16MonoAudio.toString("base64"),
              commit: true,
              sample_rate: 16000,
            }),
          );
          return;
        }
        if (messageType === "partial_transcript") {
          return;
        }
        if (messageType === "committed_transcript" || messageType === "committed_transcript_with_timestamps") {
          const transcript = message.text?.trim() ?? "";
          // An empty commit is ignored; the timeout or socket close then yields null.
          if (transcript.length > 0) {
            settle(transcript);
          }
          return;
        }
        // Messages without a type are ignored.
        if (!messageType) {
          return;
        }
        if (NON_FATAL_ERROR_TYPES.has(messageType)) {
          settle(null);
          return;
        }
        // Every other message type is treated as a service error.
        settle(null, new Error(message.error ?? `ElevenLabs STT error: ${messageType ?? "unknown"}`));
      });

      socket.on("error", (error) => {
        settle(null, error as Error);
      });

      socket.on("close", () => {
        if (!settled) {
          settle(null);
        }
      });
    });
  }
}

View File

@@ -0,0 +1,83 @@
import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media";
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
import type { AppConfig } from "../config.js";
/** A synthesised reply that is ready to be handed to an audio player. */
export interface PreparedSpeechPlayback {
// Audio resource to pass to the audio player.
resource: AudioResource;
// Releases the streams that back the resource.
dispose: () => void;
}
/**
 * Text-to-speech through the ElevenLabs streaming HTTP API. The MP3 response is
 * piped through FFmpeg to produce raw 48 kHz stereo PCM for the voice connection.
 */
export class ElevenLabsTtsService {
  constructor(private readonly config: AppConfig) {
    // Expose the bundled ffmpeg binary through FFMPEG_PATH unless the operator set one.
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests synthesis of `text` and returns a playable audio resource.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal that cancels the HTTP request.
   * @returns The audio resource plus a `dispose` function that releases its streams.
   * @throws Error when the request fails, is aborted, or returns a non-OK status.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
    // Encode the voice id so an unusual value cannot alter the request path.
    const voiceId = encodeURIComponent(this.config.ELEVENLABS_VOICE_ID);
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });
    if (!response.ok || !response.body) {
      // Release the unread error body so the underlying connection is not held open.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }
    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });
    // pipe() does not forward errors. Without a listener, a network failure or
    // abort in the middle of the stream would surface as an uncaught exception.
    // Forwarding it to FFmpeg ends the audio resource and lets the consumer of
    // the resource observe the failure.
    input.once("error", (error: Error) => {
      ffmpeg.destroy(error);
    });
    input.pipe(ffmpeg);
    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });
    return {
      resource,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}

View File

@@ -0,0 +1,64 @@
import OpenAI from "openai";
import type { AppConfig } from "../config.js";
import type { ConversationMemory, UserUtterance } from "./conversation.js";
// System instructions sent with every request, in Korean: act as a Korean voice assistant in a
// Discord voice channel, answer in one or at most two short practical sentences, use the speaker
// metadata only when it helps, ask again briefly when unsure, and avoid lists/markdown/code blocks.
const ASSISTANT_INSTRUCTIONS = [
"너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다.",
"답변은 짧고 실용적으로 한다.",
"기본은 한 문장, 길어도 두 문장을 넘기지 않는다.",
"말투는 자연스러운 한국어로 유지한다.",
"speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다.",
"잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.",
"목록, 마크다운, 코드블록은 쓰지 않는다.",
].join(" ");
/**
 * Collapses whitespace and limits a reply to a length suitable for speech.
 *
 * Replies of up to 180 characters are returned as-is (after whitespace
 * collapsing). Longer replies are reduced to their first two sentences and
 * then cut to 180 characters.
 *
 * Compared with a plain split on single punctuation marks, this keeps runs of
 * terminal punctuation ("...", "?!") attached to their sentence, joins
 * sentences with exactly one space, and never cuts a surrogate pair in half.
 */
function normalizeReply(text: string): string {
  const maxLength = 180;
  const compact = text.replace(/\s+/g, " ").trim();
  if (compact.length <= maxLength) {
    return compact;
  }

  // A sentence is a run of non-terminators followed by any run of terminators.
  const sentences = (compact.match(/[^.!?]+[.!?]*/g) ?? [])
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0);
  const candidate = sentences.length > 0 ? sentences.slice(0, 2).join(" ") : compact;

  let clipped = candidate.slice(0, maxLength);
  // Drop a dangling high surrogate left behind by cutting through an astral character.
  const lastUnit = clipped.charCodeAt(clipped.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    clipped = clipped.slice(0, -1);
  }
  return clipped.trim();
}
/** Generates short spoken replies with the OpenAI Responses API. */
export class OpenAiLlmService {
  private readonly client: OpenAI;

  constructor(private readonly config: AppConfig) {
    const apiKey = this.config.OPENAI_API_KEY;
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Asks the model for a reply to `utterance`, given the conversation so far.
   *
   * @param memory Conversation history used to build the prompt.
   * @param utterance The utterance to answer.
   * @returns The normalised reply, or a fixed "please repeat" sentence when the
   *   model returns no text.
   * @throws Whatever the OpenAI client throws for a failed request.
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const prompt = memory.buildPrompt(utterance);
    const response = await this.client.responses.create({
      model: this.config.OPENAI_MODEL,
      instructions: ASSISTANT_INSTRUCTIONS,
      input: [
        {
          role: "user",
          content: [{ type: "input_text", text: prompt }],
        },
      ],
      max_output_tokens: 120,
    });

    const replyText = response.output_text?.trim();
    return replyText ? normalizeReply(replyText) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
  }
}