feat: scaffold realtime Korean voice assistant bot
This commit is contained in:
452
src/audio/guild-voice-session.ts
Normal file
452
src/audio/guild-voice-session.ts
Normal file
@@ -0,0 +1,452 @@
|
||||
import { EventEmitter } from "node:events";
|
||||
|
||||
import prism from "prism-media";
|
||||
import { RealTimeVAD } from "avr-vad";
|
||||
import {
|
||||
AudioPlayerStatus,
|
||||
EndBehaviorType,
|
||||
NoSubscriberBehavior,
|
||||
VoiceConnectionStatus,
|
||||
createAudioPlayer,
|
||||
entersState,
|
||||
joinVoiceChannel,
|
||||
type AudioPlayer,
|
||||
type AudioReceiveStream,
|
||||
type VoiceConnection,
|
||||
} from "@discordjs/voice";
|
||||
import type { Client, Guild, VoiceBasedChannel } from "discord.js";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import { Logger } from "../logger.js";
|
||||
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "../services/openai-llm.js";
|
||||
|
||||
/**
 * Everything a {@link GuildVoiceSession} needs to run: the Discord context it
 * lives in plus the speech services it drives.
 */
interface GuildVoiceSessionOptions {
  /** Discord client; used to look up users and text channels and to recognise the bot's own user id. */
  client: Client;
  /** Application configuration; the session reads MAX_CONVERSATION_TURNS and DEBUG_TEXT_EVENTS. */
  config: AppConfig;
  /** Destination for the session's diagnostics. */
  logger: Logger;
  /** Guild whose voice channel is joined; also supplies the voice adapter. */
  guild: Guild;
  /** Voice channel the bot joins. */
  voiceChannel: VoiceBasedChannel;
  /** Optional text channel for announcements; nothing is announced when unset. */
  textChannelId?: string;
  /** Speech-to-text backend. */
  stt: ElevenLabsSttService;
  /** Text-to-speech backend. */
  tts: ElevenLabsTtsService;
  /** Reply generator. */
  llm: OpenAiLlmService;
}
|
||||
|
||||
/** One entry of the playback queue. */
interface SpeechJob {
  /** Text to synthesise and play. */
  text: string;
  /** Who asked for it: an LLM reply ("assistant") or an explicit speak request ("manual"). */
  source: "assistant" | "manual";
}
|
||||
|
||||
/**
 * Per-speaker audio pipeline.
 *
 * Decoded 48 kHz stereo PCM from one user is downsampled to 16 kHz mono, cut
 * into fixed-size frames and handed, strictly in order, to a voice-activity
 * detector. The detector's callbacks report when speech starts and deliver the
 * captured audio when it ends.
 */
class UserAudioSession {
  /** Samples per VAD frame; the same value is passed to the detector as `frameSamples`. */
  private static readonly FRAME_SAMPLES = 1536;

  private readonly downsampler = new Stereo48kToMono16kDownsampler();

  /** 16 kHz mono samples waiting to fill the next complete frame. */
  private readonly pendingSamples: number[] = [];

  /** Tail of the promise chain that serialises detector calls; it never rejects. */
  private processing: Promise<void> = Promise.resolve();

  /** Set by `destroy()`; stops further frames from reaching the detector. */
  private destroyed = false;

  private constructor(
    private readonly logger: Logger,
    private readonly speakerId: string,
    private readonly speakerName: string,
    private readonly receiveStream: AudioReceiveStream,
    private readonly decoder: NodeJS.ReadWriteStream & { destroy: () => void },
    private readonly vad: RealTimeVAD,
  ) {}

  /**
   * Builds the detector and wires the decoder output into it.
   *
   * @param options.receiveStream Stream the caller subscribed for this speaker; destroyed by `destroy()`.
   * @param options.decoder Decoder whose "data" events carry PCM chunks; destroyed by `destroy()`.
   * @param options.onSpeechStart Called when the detector reports the start of speech.
   * @param options.onSpeechEnd Called with a transcript-less utterance (`text` is "") and the captured audio.
   * @returns The ready session.
   */
  static async create(options: {
    logger: Logger;
    speakerId: string;
    speakerName: string;
    receiveStream: AudioReceiveStream;
    decoder: NodeJS.ReadWriteStream & { destroy: () => void };
    onSpeechStart: () => void;
    onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void;
  }): Promise<UserAudioSession> {
    // NOTE(review): frameSamples 1536 together with model "v5" — confirm against the
    // avr-vad documentation that this frame size is valid for that model.
    // NOTE(review): confirm whether the detector needs an explicit start/resume call
    // before processAudio() produces events; none is made here.
    const vadInstance = await RealTimeVAD.new({
      model: "v5",
      sampleRate: 16000,
      frameSamples: UserAudioSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        options.onSpeechStart();
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        options.onSpeechEnd(
          {
            speakerId: options.speakerId,
            speakerName: options.speakerName,
            text: "",
          },
          audio,
        );
      },
    });

    const session = new UserAudioSession(
      options.logger,
      options.speakerId,
      options.speakerName,
      options.receiveStream,
      options.decoder,
      vadInstance,
    );

    session.decoder.on("data", (chunk: Buffer) => {
      session.pushPcmChunk(chunk);
    });

    session.decoder.on("error", (error) => {
      options.logger.warn("PCM decoder error", options.speakerId, error);
    });

    session.receiveStream.on("error", (error) => {
      options.logger.warn("Audio receive stream error", options.speakerId, error);
    });

    return session;
  }

  /** Downsamples one decoded chunk and queues every complete frame for the detector. */
  private pushPcmChunk(chunk: Buffer): void {
    if (this.destroyed) {
      return;
    }

    const mono16k = this.downsampler.pushStereo48kChunk(chunk);
    if (mono16k.length === 0) {
      return;
    }

    for (const sample of mono16k) {
      this.pendingSamples.push(sample);
    }

    let frame = takeFrame(this.pendingSamples, UserAudioSession.FRAME_SAMPLES);
    while (frame !== null) {
      const floatFrame = int16ArrayToFloat32(frame);
      // Chain instead of awaiting so frames are processed in arrival order
      // without blocking the stream's "data" handler.
      this.processing = this.processing
        .then(async () => {
          // Frames queued before destroy() must not reach a detector that is being torn down.
          if (this.destroyed) {
            return;
          }
          await this.vad.processAudio(floatFrame);
        })
        .catch((error: unknown) => {
          this.logger.warn("VAD frame processing failed", this.speakerId, this.speakerName, error);
        });
      frame = takeFrame(this.pendingSamples, UserAudioSession.FRAME_SAMPLES);
    }
  }

  /** Stops the streams and releases the detector. Safe to call more than once. */
  destroy(): void {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;
    this.pendingSamples.length = 0;

    this.receiveStream.destroy();
    this.decoder.destroy();

    // Release the detector only after any in-flight frame has settled.
    void this.processing
      .then(() => this.vad.destroy())
      .catch((error: unknown) => {
        this.logger.warn("VAD destroy failed", this.speakerId, this.speakerName, error);
      });
  }
}
|
||||
|
||||
/**
 * A live voice-assistant session bound to one guild voice channel.
 *
 * The session joins the channel, tracks every human speaker through a
 * `UserAudioSession`, and for each detected utterance runs
 * speech-to-text -> reply generation -> text-to-speech. Replies are played
 * one at a time from a FIFO queue; speech detected from any tracked user
 * interrupts whatever is playing or queued (barge-in).
 */
export class GuildVoiceSession extends EventEmitter {
  readonly guildId: string;
  readonly voiceChannelId: string;

  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  private readonly memory: ConversationMemory;
  /** Speakers with a running audio pipeline, keyed by user id. */
  private readonly trackedUsers = new Map<string, UserAudioSession>();
  /** Pipelines still being created, so concurrent "start" events share one creation. */
  private readonly pendingUsers = new Map<string, Promise<void>>();
  private readonly queue: SpeechJob[] = [];

  /** True while `drainQueue` is running; keeps playback single-threaded. */
  private draining = false;
  /** Set by `destroy()`; late asynchronous work checks it before touching the session. */
  private destroyed = false;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechPlayback | null = null;
  private textChannelId?: string;

  private constructor(private readonly options: GuildVoiceSessionOptions) {
    super();

    this.guildId = options.guild.id;
    this.voiceChannelId = options.voiceChannel.id;
    this.textChannelId = options.textChannelId;
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
    this.player = createAudioPlayer({
      behaviors: {
        noSubscriber: NoSubscriberBehavior.Pause,
      },
    });
    this.connection = joinVoiceChannel({
      guildId: options.guild.id,
      channelId: options.voiceChannel.id,
      adapterCreator: options.guild.voiceAdapterCreator,
      selfDeaf: false,
      selfMute: false,
    });
  }

  /**
   * Joins the voice channel and resolves once the connection is ready.
   *
   * @throws Whatever `initialize` throws (for example when the connection does
   *   not become ready in time); the half-open connection is destroyed first.
   */
  static async create(options: GuildVoiceSessionOptions): Promise<GuildVoiceSession> {
    const session = new GuildVoiceSession(options);
    try {
      await session.initialize();
    } catch (error) {
      // The constructor already joined the channel; do not leave it dangling.
      await session.destroy();
      throw error;
    }
    return session;
  }

  /** Wires player/connection listeners and waits up to 30 s for the connection to become ready. */
  private async initialize(): Promise<void> {
    this.player.on("error", (error) => {
      this.options.logger.warn("Audio player error", this.guildId, error);
    });

    this.connection.on("stateChange", (_oldState, newState) => {
      if (newState.status === VoiceConnectionStatus.Destroyed) {
        this.options.logger.info("Voice connection destroyed", this.guildId);
      }
    });

    this.connection.subscribe(this.player);
    await entersState(this.connection, VoiceConnectionStatus.Ready, 30_000);

    this.connection.receiver.speaking.on("start", (userId: string) => {
      // Never listen to the bot's own voice.
      if (userId === this.options.client.user?.id) {
        return;
      }

      this.ensureTrackedUser(userId).catch((error: unknown) => {
        this.options.logger.warn("Failed to track speaker", this.guildId, userId, error);
      });
    });
  }

  /** Changes (or clears) the text channel used for announcements. */
  setTextChannel(textChannelId?: string): void {
    this.textChannelId = textChannelId;
  }

  /** Forgets the conversation history and stops any current or queued speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("conversation-reset");
  }

  /** Returns a multi-line, human-readable snapshot of the session state. */
  statusSummary(): string {
    const playbackState = this.player.state.status;
    return [
      `세션 활성: 예`,
      `음성 채널: ${this.options.voiceChannel.name}`,
      `추적 유저 수: ${this.trackedUsers.size}`,
      `재생 상태: ${playbackState}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /**
   * Queues `text` for playback.
   *
   * Resolves when the queue has drained, or immediately when another drain is
   * already in progress (that drain will pick the job up).
   */
  async speakText(text: string): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /**
   * Drops every queued job, aborts the synthesis in flight and stops the player.
   *
   * @param reason Short tag written to the log when there was something to interrupt.
   */
  interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.player.state.status !== AudioPlayerStatus.Idle) {
      this.options.logger.info("Interrupting playback", this.guildId, reason);
    }

    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    this.player.stop(true);
  }

  /** Stops playback, releases every speaker pipeline and leaves the channel. Idempotent. */
  async destroy(): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;

    this.interruptPlayback("session-destroy");
    for (const session of this.trackedUsers.values()) {
      session.destroy();
    }
    this.trackedUsers.clear();
    this.pendingUsers.clear();

    // Guarded so an already-destroyed connection cannot make teardown fail.
    if (this.connection.state.status !== VoiceConnectionStatus.Destroyed) {
      this.connection.destroy();
    }
  }

  /** Makes sure a pipeline exists for `userId`, sharing any creation already in flight. */
  private async ensureTrackedUser(userId: string): Promise<void> {
    if (this.destroyed || this.trackedUsers.has(userId)) {
      return;
    }

    const existing = this.pendingUsers.get(userId);
    if (existing) {
      await existing;
      return;
    }

    const pending = this.createTrackedUser(userId).finally(() => {
      this.pendingUsers.delete(userId);
    });
    this.pendingUsers.set(userId, pending);
    await pending;
  }

  /** Subscribes to the user's audio, builds the decode/VAD pipeline and registers it. */
  private async createTrackedUser(userId: string): Promise<void> {
    const speakerName = await this.resolveSpeakerName(userId);
    if (this.destroyed) {
      return;
    }

    const receiveStream = this.connection.receiver.subscribe(userId, {
      end: {
        behavior: EndBehaviorType.Manual,
      },
    });

    const decoder = new prism.opus.Decoder({
      rate: 48000,
      channels: 2,
      frameSize: 960,
    }) as NodeJS.ReadWriteStream & { destroy: () => void };

    receiveStream.pipe(decoder);

    let session: UserAudioSession;
    try {
      session = await UserAudioSession.create({
        logger: this.options.logger,
        speakerId: userId,
        speakerName,
        receiveStream,
        decoder,
        onSpeechStart: () => {
          this.interruptPlayback(`barge-in:${speakerName}`);
        },
        onSpeechEnd: (utterance, audio) => {
          this.handleSpeechEnd(utterance, audio).catch((error: unknown) => {
            this.options.logger.warn("Speech handling failed", this.guildId, utterance.speakerId, error);
          });
        },
      });
    } catch (error) {
      // Nothing owns the streams yet, so release them here.
      receiveStream.destroy();
      decoder.destroy();
      throw error;
    }

    // The session may have been torn down while the pipeline was being built.
    if (this.destroyed) {
      session.destroy();
      return;
    }

    this.trackedUsers.set(userId, session);
    this.options.logger.info("Tracking speaker", this.guildId, userId, speakerName);
  }

  /** Returns the user's global name or username, or a `user-<last 6 id chars>` placeholder if the lookup fails. */
  private async resolveSpeakerName(userId: string): Promise<string> {
    try {
      const user = await this.options.client.users.fetch(userId);
      return user.globalName ?? user.username;
    } catch {
      return `user-${userId.slice(-6)}`;
    }
  }

  /**
   * Runs one utterance through transcription, reply generation and playback.
   *
   * @param utterance Speaker identity; its `text` is replaced by the transcript.
   * @param audio Captured speech samples from the detector.
   */
  private async handleSpeechEnd(utterance: UserUtterance, audio: Float32Array): Promise<void> {
    // Ignore captures shorter than a quarter second at 16 kHz.
    if (this.destroyed || audio.length < 16000 * 0.25) {
      return;
    }

    const pcmBuffer = float32ToPcm16Buffer(audio);
    let transcript: string | null = null;

    try {
      transcript = await this.options.stt.transcribePcm16(pcmBuffer);
    } catch (error) {
      this.options.logger.warn("STT failed", this.guildId, utterance.speakerId, error);
      await this.announce(`음성 인식 실패: ${utterance.speakerName}`);
      return;
    }

    if (this.destroyed || !transcript || transcript.trim().length === 0) {
      return;
    }

    const hydratedUtterance: UserUtterance = {
      ...utterance,
      text: transcript.trim(),
    };

    this.options.logger.info("Transcript committed", this.guildId, hydratedUtterance.speakerName, hydratedUtterance.text);
    // NOTE(review): the turn is stored before the reply is requested, so the memory
    // handed to the LLM already contains this utterance — confirm the LLM service
    // does not present it twice.
    this.memory.addUserTurn(hydratedUtterance);

    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🗣️ ${hydratedUtterance.speakerName}: ${hydratedUtterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, hydratedUtterance);
    } catch (error) {
      this.options.logger.warn("LLM failed", this.guildId, utterance.speakerId, error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    if (this.destroyed) {
      return;
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🤖 ${reply}`);
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /** Plays queued jobs one after another; returns at once if a drain is already running. */
  private async drainQueue(): Promise<void> {
    if (this.draining) {
      return;
    }

    this.draining = true;
    try {
      let job = this.queue.shift();
      while (job !== undefined) {
        await this.playJob(job);
        job = this.queue.shift();
      }
    } finally {
      this.draining = false;
    }
  }

  /** Synthesises and plays a single job, giving up quietly if it is interrupted. */
  private async playJob(job: SpeechJob): Promise<void> {
    const abortController = new AbortController();
    const { signal } = abortController;
    this.currentAbortController = abortController;

    let playback: PreparedSpeechPlayback;
    try {
      playback = await this.options.tts.preparePlayback(job.text, signal);
    } catch (error) {
      if (this.currentAbortController === abortController) {
        this.currentAbortController = null;
      }
      if (signal.aborted) {
        return;
      }

      this.options.logger.warn("TTS synthesis failed", this.guildId, job.source, error);
      await this.announce("음성 출력 생성에 실패했습니다.");
      return;
    }

    // Interrupted while synthesising: the audio is stale, so drop it instead of playing it.
    if (signal.aborted) {
      playback.dispose();
      return;
    }

    this.currentPlayback = playback;
    try {
      this.player.play(playback.resource);

      // Best effort: failing to reach Playing in time is not fatal by itself.
      await this.waitForPlayerStatus(AudioPlayerStatus.Playing, 20_000, signal).catch(() => null);
      if (!signal.aborted) {
        await this.waitForPlayerStatus(AudioPlayerStatus.Idle, 300_000, signal);
      }
    } catch (error) {
      if (!signal.aborted) {
        this.options.logger.warn("Audio playback failed", this.guildId, error);
      }
    } finally {
      // interruptPlayback() may already have disposed and cleared this playback.
      if (this.currentPlayback === playback) {
        playback.dispose();
        this.currentPlayback = null;
      }
      if (this.currentAbortController === abortController) {
        this.currentAbortController = null;
      }
    }
  }

  /**
   * Waits for the player to reach `status`, giving up after `timeoutMs` or as
   * soon as `interrupt` aborts, so an interrupted job never blocks the queue
   * for the full timeout.
   *
   * @throws When the timeout elapses or the job is interrupted first.
   */
  private async waitForPlayerStatus(
    status: AudioPlayerStatus,
    timeoutMs: number,
    interrupt: AbortSignal,
  ): Promise<void> {
    if (interrupt.aborted) {
      throw new Error("Playback interrupted");
    }

    const waitController = new AbortController();
    const cancel = (): void => {
      waitController.abort();
    };
    const timer = setTimeout(cancel, timeoutMs);
    interrupt.addEventListener("abort", cancel, { once: true });

    try {
      await entersState(this.player, status, waitController.signal);
    } finally {
      clearTimeout(timer);
      interrupt.removeEventListener("abort", cancel);
    }
  }

  /** Posts `message` to the configured text channel; lookup and send failures are ignored. */
  private async announce(message: string): Promise<void> {
    if (!this.textChannelId) {
      return;
    }

    const channel = await this.options.client.channels.fetch(this.textChannelId).catch(() => null);
    if (!channel?.isTextBased() || !("send" in channel) || typeof channel.send !== "function") {
      return;
    }

    await channel.send(message).catch(() => null);
  }
}
|
||||
60
src/audio/pcm.ts
Normal file
60
src/audio/pcm.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
/**
 * Streaming converter from interleaved 16-bit little-endian stereo PCM at
 * 48 kHz to 16-bit mono PCM at 16 kHz.
 *
 * Each stereo pair is averaged into one mono sample, then every three
 * consecutive mono samples are averaged into one output sample. Mono samples
 * that do not yet form a group of three, and bytes that do not yet form a
 * whole stereo pair, are kept for the next call.
 */
export class Stereo48kToMono16kDownsampler {
  /** Bytes per interleaved stereo pair (two 16-bit samples). */
  private static readonly BYTES_PER_STEREO_FRAME = 4;

  /** 48 kHz samples averaged into each 16 kHz output sample. */
  private static readonly DECIMATION = 3;

  private readonly pendingMono48k: number[] = [];

  /** Trailing bytes of the previous chunk that did not complete a stereo pair. */
  private carry: Buffer = Buffer.alloc(0);

  /**
   * Feeds one chunk of stereo PCM and returns the 16 kHz mono samples that
   * became available.
   *
   * @param chunk Interleaved little-endian int16 stereo samples; any length, including empty.
   * @returns Newly produced mono samples; empty when fewer than three 48 kHz samples are buffered.
   */
  pushStereo48kChunk(chunk: Buffer): Int16Array {
    const frameBytes = Stereo48kToMono16kDownsampler.BYTES_PER_STEREO_FRAME;
    const step = Stereo48kToMono16kDownsampler.DECIMATION;

    const data = this.carry.length > 0 ? Buffer.concat([this.carry, chunk]) : chunk;
    const usableBytes = data.length - (data.length % frameBytes);
    // Copy the remainder so the (possibly large) source chunk is not kept alive.
    this.carry = Buffer.from(data.subarray(usableBytes));

    for (let offset = 0; offset < usableBytes; offset += frameBytes) {
      const left = data.readInt16LE(offset);
      const right = data.readInt16LE(offset + 2);
      this.pendingMono48k.push(Math.round((left + right) / 2));
    }

    const outputLength = Math.floor(this.pendingMono48k.length / step);
    if (outputLength === 0) {
      return new Int16Array();
    }

    const consumed = this.pendingMono48k.splice(0, outputLength * step);
    const output = new Int16Array(outputLength);
    for (let index = 0; index < outputLength; index += 1) {
      const base = index * step;
      const sum = (consumed[base] ?? 0) + (consumed[base + 1] ?? 0) + (consumed[base + 2] ?? 0);
      output[index] = Math.round(sum / step);
    }
    return output;
  }
}
|
||||
|
||||
/**
 * Converts signed 16-bit samples to floats by dividing each one by 32768,
 * so the most negative sample maps to exactly -1.
 *
 * @param input Samples to convert.
 * @returns A new array of the same length.
 */
export function int16ArrayToFloat32(input: Int16Array): Float32Array {
  return Float32Array.from(input, (sample) => sample / 32768);
}
|
||||
|
||||
/**
 * Encodes float samples as 16-bit little-endian PCM.
 *
 * Each sample is clamped to [-1, 1]; negative values are scaled by 32768 and
 * non-negative values by 32767, then rounded.
 *
 * @param input Samples to encode.
 * @returns A buffer holding two bytes per input sample.
 */
export function float32ToPcm16Buffer(input: Float32Array): Buffer {
  const pcm = Buffer.allocUnsafe(input.length * 2);
  input.forEach((sample, position) => {
    const clamped = Math.max(-1, Math.min(1, sample));
    const fullScale = clamped < 0 ? 32768 : 32767;
    pcm.writeInt16LE(Math.round(clamped * fullScale), position * 2);
  });
  return pcm;
}
|
||||
|
||||
/**
 * Removes the first `frameSize` values from `source` and returns them as a frame.
 *
 * @param source Sample queue; mutated in place when a frame is taken.
 * @param frameSize Number of samples per frame.
 * @returns The frame, or `null` (leaving `source` untouched) when fewer than `frameSize` values are queued.
 */
export function takeFrame(source: number[], frameSize: number): Int16Array | null {
  if (source.length < frameSize) {
    return null;
  }

  return new Int16Array(source.splice(0, frameSize));
}
|
||||
29
src/config.ts
Normal file
29
src/config.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { config as loadDotenv } from "dotenv";
|
||||
import { z } from "zod";
|
||||
|
||||
loadDotenv();
|
||||
|
||||
/** Schema for a string that must be present and non-empty. */
const nonEmptyString = () => z.string().min(1);

/**
 * Shape of the environment variables the application reads.
 * Values without a default are required unless marked optional.
 */
const envSchema = z.object({
  // Discord credentials and optional guild-scoped command registration.
  DISCORD_BOT_TOKEN: nonEmptyString(),
  DISCORD_APPLICATION_ID: nonEmptyString(),
  DISCORD_COMMAND_GUILD_ID: nonEmptyString().optional(),

  // Reply generation.
  OPENAI_API_KEY: nonEmptyString(),
  OPENAI_MODEL: nonEmptyString().default("gpt-5.4-mini"),

  // Speech recognition and synthesis.
  ELEVENLABS_API_KEY: nonEmptyString(),
  ELEVENLABS_VOICE_ID: nonEmptyString(),
  ELEVENLABS_STT_MODEL: nonEmptyString().default("scribe_v2_realtime"),
  ELEVENLABS_TTS_MODEL: nonEmptyString().default("eleven_flash_v2_5"),

  // Bot behaviour.
  BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
  MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
  // Only the exact string "true" enables this; anything else, or unset, is false.
  DEBUG_TEXT_EVENTS: z
    .string()
    .optional()
    .transform((value) => value === "true"),
  LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});

/** Parsed, defaulted configuration produced by {@link envSchema}. */
export type AppConfig = z.infer<typeof envSchema>;
|
||||
|
||||
/**
 * Validates `process.env` against the environment schema.
 *
 * @returns The parsed configuration with defaults applied.
 * @throws The schema's validation error when a variable is missing or invalid.
 */
export function loadConfig(): AppConfig {
  const environment = process.env;
  return envSchema.parse(environment);
}
|
||||
240
src/index.ts
Normal file
240
src/index.ts
Normal file
@@ -0,0 +1,240 @@
|
||||
import process from "node:process";
|
||||
|
||||
import {
|
||||
GatewayIntentBits,
|
||||
REST,
|
||||
Routes,
|
||||
SlashCommandBuilder,
|
||||
type ChatInputCommandInteraction,
|
||||
type Client,
|
||||
type GuildMember,
|
||||
type VoiceBasedChannel,
|
||||
} from "discord.js";
|
||||
import { Client as DiscordClient } from "discord.js";
|
||||
|
||||
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { loadConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "./services/openai-llm.js";
|
||||
|
||||
// Configuration is validated once at module load; a failure here aborts startup.
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);

/** `/say` is the only command that takes an option: the text to read aloud (at most 400 characters). */
const sayCommand = new SlashCommandBuilder()
  .setName("say")
  .setDescription("텍스트를 바로 음성으로 읽습니다.")
  .addStringOption((option) =>
    option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
  );

/** Slash-command definitions, serialised for the registration request. */
const commands = [
  new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
  new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
  new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
  new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
  sayCommand,
].map((command) => command.toJSON());

const client = new DiscordClient({
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});

// Service instances shared by every guild session.
const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);

/** Active voice sessions keyed by guild id. */
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
/**
 * Returns the voice channel the invoking member is currently in.
 *
 * @param interaction The slash-command interaction.
 * @returns The member's voice channel, or `null` when the interaction is not
 *   from a cached guild or the member is not in a voice channel.
 */
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  // Outside a cached guild `interaction.member` is raw API data without a
  // `voice` state, so narrow instead of asserting the type.
  const member: GuildMember | null = interaction.inCachedGuild() ? interaction.member : null;
  return member?.voice.channel ?? null;
}
|
||||
|
||||
/**
 * Uploads the slash-command definitions to Discord.
 *
 * When DISCORD_COMMAND_GUILD_ID is configured the commands are registered for
 * that guild only; otherwise they are registered globally.
 *
 * @param appClient Logged-in client (currently not read by this function).
 */
async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  const applicationId = config.DISCORD_APPLICATION_ID;
  const guildId = config.DISCORD_COMMAND_GUILD_ID;

  if (!guildId) {
    await rest.put(Routes.applicationCommands(applicationId), { body: commands });
    logger.info("Registered global commands");
    return;
  }

  await rest.put(Routes.applicationGuildCommands(applicationId, guildId), { body: commands });
  logger.info("Registered guild commands", guildId);
}
|
||||
|
||||
/**
 * Returns the guild's voice session for the caller's current voice channel,
 * creating one (and replacing a session bound to a different channel) when needed.
 *
 * @param interaction The slash-command interaction requesting the session.
 * @returns The existing or newly created session, with announcements pointed
 *   at the interaction's channel.
 * @throws Error when used outside a guild, when the caller is not in a voice
 *   channel, or when tearing down or creating a session fails.
 */
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  const guild = interaction.guild;
  if (!guild) {
    throw new Error("Guild interaction required");
  }

  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }

  const existing = sessions.get(guild.id);
  if (existing) {
    if (existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }

    // Unregister before tearing down so a failing destroy() cannot strand a
    // dead session in the registry and block every later join.
    sessions.delete(guild.id);
    await existing.destroy();
  }

  const session = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });
  sessions.set(guild.id, session);
  return session;
}
|
||||
|
||||
/**
 * Handles `/join`: starts (or reuses) the guild's voice session and reports
 * the joined channel, or the failure reason, in an ephemeral reply.
 */
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  try {
    const session = await createSession(interaction);
    // Look the channel up by id rather than parsing the human-readable status text.
    const channelName = interaction.guild?.channels.cache.get(session.voiceChannelId)?.name ?? "알 수 없음";
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
  } catch (error) {
    const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
    await interaction.editReply(message);
  }
}
|
||||
|
||||
/**
 * Handles `/leave`: ends the guild's voice session, or tells the caller that
 * none is active.
 */
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guildId;
  const session = guildId ? sessions.get(guildId) : undefined;
  if (!guildId || !session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  // Unregister first so the registry never keeps a session whose teardown failed.
  sessions.delete(guildId);
  await session.destroy();
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}
|
||||
|
||||
/** Handles `/status`: replies ephemerally with the session summary, or a notice that no session is active. */
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  const content = session ? session.statusSummary() : "현재 활성화된 음성 세션이 없습니다.";
  await interaction.reply({ content, ephemeral: true });
}
|
||||
|
||||
/** Handles `/reset`: clears the session's conversation and playback queue, or reports that no session is active. */
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  if (session) {
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
    return;
  }

  await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
}
|
||||
|
||||
/**
 * Handles `/say`: queues the given text for playback in the guild's session
 * and confirms ephemerally.
 */
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }

  const text = interaction.options.getString("text", true).trim();
  if (text.length === 0) {
    // Whitespace-only input would otherwise be sent to speech synthesis.
    await interaction.editReply("읽을 문장을 입력해 주세요.");
    return;
  }

  // speakText() settles only after the playback queue has drained, so start it
  // without awaiting and confirm right away that the request was queued.
  void session.speakText(text).catch((error: unknown) => {
    logger.warn("Manual speech failed", error);
  });
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
|
||||
|
||||
/**
 * Tears down every voice session and the Discord client, then exits.
 *
 * The process exits even when teardown fails, so a signal handler can never
 * leave the bot running half shut down.
 *
 * @param exitCode Exit status passed to `process.exit`.
 */
async function shutdown(exitCode = 0): Promise<void> {
  logger.info("Shutting down");
  try {
    for (const session of sessions.values()) {
      await session.destroy().catch((error: unknown) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();
    await client.destroy();
  } catch (error) {
    logger.error("Shutdown failed", error);
  } finally {
    process.exit(exitCode);
  }
}
|
||||
|
||||
// Register slash commands once the client is ready; a failure is logged, not fatal.
client.once("ready", async () => {
  logger.info("Discord client ready", client.user?.tag ?? "unknown");
  await registerCommands(client).catch((error: unknown) => {
    logger.error("Command registration failed", error);
  });
});

/** Slash-command name -> handler. */
const commandHandlers = new Map<string, (interaction: ChatInputCommandInteraction) => Promise<void>>([
  ["join", handleJoin],
  ["leave", handleLeave],
  ["status", handleStatus],
  ["reset", handleReset],
  ["say", handleSay],
]);

// Dispatch chat-input commands; any handler error becomes a generic ephemeral reply.
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }

  try {
    const handler = commandHandlers.get(interaction.commandName);
    if (handler) {
      await handler(interaction);
    } else {
      await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
    }
  } catch (error) {
    logger.error("Interaction handler failed", error);

    // An interaction that was already acknowledged can only be edited.
    const alreadyAcknowledged = interaction.deferred || interaction.replied;
    if (alreadyAcknowledged) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
    } else {
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  }
});

// Shut down cleanly on either termination signal.
for (const signal of ["SIGINT", "SIGTERM"] as const) {
  process.on(signal, () => {
    void shutdown(0);
  });
}
|
||||
|
||||
/** Logs the client in with the configured bot token. */
async function main(): Promise<void> {
  const token = config.DISCORD_BOT_TOKEN;
  await client.login(token);
}

// A failed login is fatal: log it and exit with a non-zero status.
void main().catch((error) => {
  logger.error("Fatal startup error", error);
  process.exit(1);
});
|
||||
63
src/logger.ts
Normal file
63
src/logger.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
/** Severity names understood by the logger. */
type LogLevel = "debug" | "info" | "warn" | "error";

/** Numeric rank of each level; a higher number means more severe. */
const levelOrder: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40 };
|
||||
|
||||
/**
 * Renders one log argument as text without ever throwing.
 *
 * Errors become `name: message`, strings pass through unchanged, and other
 * values are JSON-encoded with BigInt values written as decimal strings.
 * Values JSON cannot represent (such as `undefined`, or objects with circular
 * references) fall back to their string form.
 */
function formatPart(part: unknown): string {
  if (part instanceof Error) {
    return `${part.name}: ${part.message}`;
  }
  if (typeof part === "string") {
    return part;
  }

  try {
    const json = JSON.stringify(part, (_key, value: unknown) =>
      typeof value === "bigint" ? value.toString() : value,
    );
    // JSON.stringify yields undefined for undefined, functions and symbols.
    return json ?? String(part);
  } catch {
    try {
      return String(part);
    } catch {
      // String() itself can throw, e.g. for objects without a usable toString.
      return Object.prototype.toString.call(part);
    }
  }
}

/**
 * Joins log arguments into a single space-separated line.
 *
 * @param parts Values passed to a logger call.
 * @returns The formatted line; empty for no parts.
 */
function formatParts(parts: unknown[]): string {
  return parts.map(formatPart).join(" ");
}
|
||||
|
||||
/**
 * Minimal console logger with a severity threshold.
 *
 * Each line is written as `[ISO timestamp] [LEVEL] message`. Errors go to
 * `console.error`, warnings to `console.warn`, everything else to `console.log`.
 */
export class Logger {
  /** @param level Lowest severity that is written; anything below it is dropped. */
  constructor(private readonly level: LogLevel) {}

  /** True when `target` is at or above the configured threshold. */
  private shouldLog(target: LogLevel): boolean {
    const threshold = levelOrder[this.level];
    return levelOrder[target] >= threshold;
  }

  /** Formats the parts and writes them to the console stream matching `target`. */
  private write(target: LogLevel, ...parts: unknown[]): void {
    if (!this.shouldLog(target)) {
      return;
    }

    const timestamp = new Date().toISOString();
    const line = `[${timestamp}] [${target.toUpperCase()}] ${formatParts(parts)}`;

    switch (target) {
      case "error":
        console.error(line);
        break;
      case "warn":
        console.warn(line);
        break;
      default:
        console.log(line);
    }
  }

  /** Writes a debug-level line. */
  debug(...parts: unknown[]): void {
    this.write("debug", ...parts);
  }

  /** Writes an info-level line. */
  info(...parts: unknown[]): void {
    this.write("info", ...parts);
  }

  /** Writes a warn-level line. */
  warn(...parts: unknown[]): void {
    this.write("warn", ...parts);
  }

  /** Writes an error-level line. */
  error(...parts: unknown[]): void {
    this.write("error", ...parts);
  }
}
|
||||
77
src/services/conversation.ts
Normal file
77
src/services/conversation.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
/** One stored message of the conversation. */
export interface ConversationTurn {
  role: "user" | "assistant";
  text: string;
  /** Set for user turns only. */
  speakerId?: string;
  /** Set for user turns only. */
  speakerName?: string;
  /** Creation time as returned by `Date.now()`. */
  createdAt: number;
}

/** Something a user said, attributed to the speaker. */
export interface UserUtterance {
  speakerId: string;
  speakerName: string;
  text: string;
}

/**
 * Bounded, in-order history of user and assistant turns, plus a helper that
 * renders it into a text prompt.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  /** @param maxTurns Maximum number of turns kept; the oldest are dropped first. */
  constructor(private readonly maxTurns: number) {}

  /** Appends a user turn and trims the history to the limit. */
  addUserTurn(utterance: UserUtterance): void {
    this.turns.push({
      role: "user",
      text: utterance.text,
      speakerId: utterance.speakerId,
      speakerName: utterance.speakerName,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Appends an assistant turn and trims the history to the limit. */
  addAssistantTurn(text: string): void {
    this.turns.push({
      role: "assistant",
      text,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Removes every stored turn. */
  clear(): void {
    this.turns.splice(0, this.turns.length);
  }

  /** Returns a copy of the stored turns, oldest first. */
  recentTurns(): ConversationTurn[] {
    return [...this.turns];
  }

  /**
   * Renders the stored history followed by the current utterance.
   *
   * When the newest stored turn is the current utterance itself (same speaker
   * and text, which happens when the caller records the utterance before
   * building the prompt), it is left out of the history section so it appears
   * only once.
   *
   * @param currentUtterance The utterance that needs a reply.
   * @returns The prompt text.
   */
  buildPrompt(currentUtterance: UserUtterance): string {
    const history = this.turns.slice(-this.maxTurns);

    const newest = history[history.length - 1];
    if (
      newest !== undefined &&
      newest.role === "user" &&
      newest.speakerId === currentUtterance.speakerId &&
      newest.text === currentUtterance.text
    ) {
      history.pop();
    }

    const recent = history
      .map((turn) =>
        turn.role === "assistant"
          ? `[assistant]\n${turn.text}`
          : `${ConversationMemory.userHeader(turn.speakerId, turn.speakerName)}\n${turn.text}`,
      )
      .join("\n\n");

    const historyBlock = recent.length > 0 ? recent : "(이전 대화 없음)";

    return [
      "최근 대화:",
      historyBlock,
      "",
      "이번 발화:",
      ConversationMemory.userHeader(currentUtterance.speakerId, currentUtterance.speakerName),
      currentUtterance.text,
    ].join("\n");
  }

  /** Builds the bracketed header line that introduces a user turn. */
  private static userHeader(speakerId?: string, speakerName?: string): string {
    const id = ConversationMemory.sanitizeLabel(speakerId ?? "unknown");
    const name = ConversationMemory.sanitizeLabel(speakerName ?? "unknown");
    return `[user speaker_id=${id} speaker_name=${name}]`;
  }

  /**
   * Strips characters that could break out of the single-line bracketed header
   * (square brackets, control characters including newlines) from a
   * user-controlled label; an empty result becomes "unknown".
   */
  private static sanitizeLabel(value: string): string {
    const cleaned = value
      .replace(/[\[\]\u0000-\u001f\u007f]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();
    return cleaned.length > 0 ? cleaned : "unknown";
  }

  /** Drops the oldest turns once the history exceeds `maxTurns`. */
  private trim(): void {
    const overflow = this.turns.length - this.maxTurns;
    if (overflow > 0) {
      this.turns.splice(0, overflow);
    }
  }
}
|
||||
124
src/services/elevenlabs-stt.ts
Normal file
124
src/services/elevenlabs-stt.ts
Normal file
@@ -0,0 +1,124 @@
|
||||
import WebSocket from "ws";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/**
 * Fields read from a JSON message received on the speech-to-text socket.
 * All are optional because incoming messages are parsed without validation.
 */
interface ElevenLabsMessage {
  /** Message discriminator, e.g. "session_started" or "committed_transcript". */
  message_type?: string;
  /** Transcript text, when the message carries one. */
  text?: string;
  /** Error description, when the message reports a failure. */
  error?: string;
}
|
||||
|
||||
/** Message types that end a transcription with "no transcript" instead of an error. */
const NON_FATAL_ERROR_TYPES = new Set<string>(["insufficient_audio_activity"]);
|
||||
|
||||
/**
 * One-shot speech-to-text client built on the ElevenLabs realtime WebSocket
 * endpoint: each call opens a socket, sends one utterance, and closes it.
 */
export class ElevenLabsSttService {
  constructor(private readonly config: AppConfig) {}

  /**
   * Transcribes one complete utterance.
   *
   * @param pcm16MonoAudio Audio bytes, sent as a single base64 chunk declared
   *   as `pcm_16000` with `sample_rate: 16000`.
   * @returns The trimmed transcript, or `null` when the input is empty, the
   *   service reports a non-fatal condition, the socket closes first, or no
   *   non-empty committed transcript arrives within 15 seconds.
   * @throws Rejects on socket errors, on frames that are not valid JSON, and
   *   on any other message type that is not explicitly handled.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const endpoint = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    const query = endpoint.searchParams;
    query.set("model_id", this.config.ELEVENLABS_STT_MODEL);
    query.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
    query.set("audio_format", "pcm_16000");
    query.set("commit_strategy", "manual");
    query.set("include_timestamps", "false");
    query.set("include_language_detection", "false");
    query.set("enable_logging", "false");

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(endpoint, {
        headers: { "xi-api-key": this.config.ELEVENLABS_API_KEY },
      });

      let done = false;
      let latestTranscript = "";

      // Settles the promise exactly once; later calls are ignored.
      const finish = (result: string | null, error?: Error) => {
        if (done) {
          return;
        }
        done = true;
        clearTimeout(watchdog);

        try {
          socket.close();
        } catch {
          // Ignore close race.
        }

        if (error) {
          reject(error);
        } else {
          resolve(result);
        }
      };

      // Give up after 15 s so a silent server cannot stall the caller.
      const watchdog = setTimeout(() => finish(latestTranscript || null), 15_000);

      socket.on("message", (raw) => {
        let message: ElevenLabsMessage;
        try {
          message = JSON.parse(raw.toString()) as ElevenLabsMessage;
        } catch (error) {
          finish(null, error as Error);
          return;
        }

        const type = message.message_type;

        if (type === "session_started") {
          // Send the whole utterance in one chunk and commit it immediately.
          const chunk = {
            message_type: "input_audio_chunk",
            audio_base_64: pcm16MonoAudio.toString("base64"),
            commit: true,
            sample_rate: 16000,
          };
          socket.send(JSON.stringify(chunk));
          return;
        }

        if (type === "partial_transcript") {
          return;
        }

        if (type === "committed_transcript" || type === "committed_transcript_with_timestamps") {
          const transcript = message.text?.trim() ?? "";
          // An empty committed transcript does not settle; keep waiting.
          if (transcript.length > 0) {
            latestTranscript = transcript;
            finish(transcript);
          }
          return;
        }

        // Frames without a message type are ignored.
        if (!type) {
          return;
        }

        if (NON_FATAL_ERROR_TYPES.has(type)) {
          finish(null);
          return;
        }

        // Every other message type is treated as a fatal error.
        finish(null, new Error(message.error ?? `ElevenLabs STT error: ${message.message_type ?? "unknown"}`));
      });

      socket.on("error", (error) => {
        finish(null, error as Error);
      });

      socket.on("close", () => {
        if (done) {
          return;
        }
        finish(latestTranscript || null);
      });
    });
  }
}
|
||||
83
src/services/elevenlabs-tts.ts
Normal file
83
src/services/elevenlabs-tts.ts
Normal file
@@ -0,0 +1,83 @@
|
||||
import { Readable } from "node:stream";
|
||||
|
||||
import ffmpegStatic from "ffmpeg-static";
|
||||
import prism from "prism-media";
|
||||
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/**
 * A synthesized utterance ready to be handed to an audio player, together
 * with the hook that tears down the streams backing it.
 */
export interface PreparedSpeechPlayback {
  /** Audio resource wrapping the decoded speech stream. */
  resource: AudioResource;
  /** Destroys the underlying streams. */
  dispose: () => void;
}
|
||||
|
||||
/**
 * Streams speech from the ElevenLabs text-to-speech HTTP API and converts it,
 * through ffmpeg, into an audio resource for `@discordjs/voice`.
 */
export class ElevenLabsTtsService {
  constructor(private readonly config: AppConfig) {
    // Publish the ffmpeg-static binary path through FFMPEG_PATH, but never
    // override a value the environment already provides.
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests streaming synthesis of `text` and wires the MP3 response through
   * ffmpeg into a playable resource.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal passed to the HTTP request.
   * @returns The audio resource plus a `dispose` hook that destroys the HTTP
   *   body stream and the ffmpeg transcoder.
   * @throws Error when the HTTP response is not OK or carries no body, and
   *   whatever `fetch` throws (including an abort before the response starts).
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Cancel the unread body so the connection is released; a failure to
      // cancel must not mask the real error below.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    const input = Readable.fromWeb(response.body as never);

    // Decode the MP3 stream to raw s16le / 48 kHz / stereo PCM, the format
    // declared to createAudioResource below as StreamType.Raw.
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    // pipe() neither handles source errors nor ends the destination when the
    // source fails. Without this listener an abort or network failure in the
    // middle of the stream raises an unhandled 'error' event and leaves
    // ffmpeg waiting on stdin. End ffmpeg's input instead so the audio that
    // already arrived plays out and the transcoder exits.
    input.on("error", () => {
      if (!ffmpeg.destroyed && !ffmpeg.writableEnded) {
        ffmpeg.end();
      }
    });

    input.pipe(ffmpeg);

    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });

    return {
      resource,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}
|
||||
64
src/services/openai-llm.ts
Normal file
64
src/services/openai-llm.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { ConversationMemory, UserUtterance } from "./conversation.js";
|
||||
|
||||
/** Individual system-prompt rules, one sentence each. */
const ASSISTANT_INSTRUCTION_LINES: readonly string[] = [
  "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다.",
  "답변은 짧고 실용적으로 한다.",
  "기본은 한 문장, 길어도 두 문장을 넘기지 않는다.",
  "말투는 자연스러운 한국어로 유지한다.",
  "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다.",
  "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.",
  "목록, 마크다운, 코드블록은 쓰지 않는다.",
];

/** System prompt sent with every request: the rules joined by single spaces. */
const ASSISTANT_INSTRUCTIONS = ASSISTANT_INSTRUCTION_LINES.join(" ");
|
||||
|
||||
/** Maximum reply length, in UTF-16 code units. */
const MAX_REPLY_LENGTH = 180;

/** Number of leading sentences kept when a reply has to be shortened. */
const MAX_REPLY_SENTENCES = 2;

/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and trims the result.
 * If the cut would land inside a surrogate pair, the dangling high surrogate
 * is dropped so the output never ends in half a character.
 */
function truncateReply(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text.trim();
  }

  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.slice(0, end).trim();
}

/**
 * Normalizes a model reply: collapses all whitespace runs to single spaces
 * and, when the result exceeds 180 code units, keeps only the first two
 * sentences (cut again to 180 if they are still too long).
 *
 * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace
 * or the end of the text, so decimals such as "3.5" are not split and
 * repeated punctuation ("?!", "...") is kept intact.
 *
 * @param text Raw reply text.
 * @returns The compacted, length-bounded reply.
 */
function normalizeReply(text: string): string {
  const compact = text.replace(/\s+/g, " ").trim();
  if (compact.length <= MAX_REPLY_LENGTH) {
    return compact;
  }

  // Each match after the first begins with the separating space, so trim the
  // pieces before re-joining; otherwise the join produces double spaces.
  const sentences = (compact.match(/.+?(?:[.!?]+(?=\s|$)|$)/g) ?? [])
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0)
    .slice(0, MAX_REPLY_SENTENCES);

  const shortened = sentences.length > 0 ? sentences.join(" ") : compact;
  return truncateReply(shortened, MAX_REPLY_LENGTH);
}
|
||||
|
||||
/**
 * Generates the assistant's reply text through the OpenAI Responses API.
 */
export class OpenAiLlmService {
  private readonly client: OpenAI;

  constructor(private readonly config: AppConfig) {
    const apiKey = this.config.OPENAI_API_KEY;
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Asks the model for a reply to `utterance`, using the prompt that `memory`
   * builds for it.
   *
   * @param memory Conversation memory used to build the prompt text.
   * @param utterance The utterance being answered.
   * @returns The normalized reply, or a fixed Korean "please repeat" message
   *   when the model returns no text.
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const userMessage = {
      role: "user" as const,
      content: [
        {
          type: "input_text" as const,
          text: memory.buildPrompt(utterance),
        },
      ],
    };

    const response = await this.client.responses.create({
      model: this.config.OPENAI_MODEL,
      instructions: ASSISTANT_INSTRUCTIONS,
      input: [userMessage],
      max_output_tokens: 120,
    });

    const reply = response.output_text?.trim();
    return reply ? normalizeReply(reply) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
  }
}
|
||||
Reference in New Issue
Block a user