feat: add local audio test mode

This commit is contained in:
2026-04-30 02:37:54 +09:00
parent 9dee708b64
commit cf6398f50a
12 changed files with 766 additions and 256 deletions

View File

@@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
BOT_DEFAULT_LANGUAGE=ko
MAX_CONVERSATION_TURNS=12
LOCAL_AUDIO_SOURCE=
LOCAL_AUDIO_SINK=
LOCAL_SPEAKER_NAME=local-user
DEBUG_TEXT_EVENTS=false
LOG_LEVEL=info

View File

@@ -1,10 +1,11 @@
# realtime_voice_bot
디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
## 현재 구현 범위
- Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력
- `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
- 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
@@ -28,16 +29,25 @@
필수:
- `DISCORD_BOT_TOKEN`
- `DISCORD_APPLICATION_ID`
- `OPENAI_API_KEY`
- `ELEVENLABS_API_KEY`
- `ELEVENLABS_VOICE_ID`
Discord 모드에서만 필수:
- `DISCORD_BOT_TOKEN`
- `DISCORD_APPLICATION_ID`
선택:
- `DISCORD_COMMAND_GUILD_ID`
- 테스트 서버에만 slash command를 즉시 반영하려면 설정
- `LOCAL_AUDIO_SOURCE`
- `pw-record --target` 에 넣을 PipeWire source id 또는 node name
- `LOCAL_AUDIO_SINK`
- `pw-play --target` 에 넣을 PipeWire sink id 또는 node name
- `LOCAL_SPEAKER_NAME`
- 로컬 테스트에서 프롬프트에 넣을 화자 이름
- `OPENAI_MODEL`
- 기본값: `gpt-5.4-mini`
- `ELEVENLABS_STT_MODEL`
@@ -51,13 +61,24 @@
```bash
bun install
bun run start
```
개발 모드:
디스코드 모드:
```bash
bun run dev
bun run start:discord
```
로컬 장치 목록:
```bash
bun run audio:devices
```
로컬 테스트 모드:
```bash
bun run start:local
```
타입 체크:
@@ -74,9 +95,17 @@ bun run check
4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
로컬 테스트:
1. `bun run audio:devices` 로 source/sink id 또는 이름 확인
2. 필요하면 `.env`에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정
3. `bun run start:local`
4. 마이크로 바로 말해서 응답 확인
## 설계 메모
- 입력은 유저별 병렬 처리
- 출력은 길드 세션당 단일 큐
- 로컬 모드는 단일 화자 입력 기준
- 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
- 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.

View File

@@ -5,7 +5,10 @@
"type": "module",
"scripts": {
"dev": "bun --watch src/index.ts",
"start": "bun src/index.ts",
"start": "bun src/index.ts discord",
"start:discord": "bun src/index.ts discord",
"start:local": "bun src/index.ts local",
"audio:devices": "bun src/index.ts local-devices",
"check": "tsc --noEmit",
"build": "tsc -p tsconfig.json"
},

View File

@@ -8,8 +8,10 @@ import {
NoSubscriberBehavior,
VoiceConnectionStatus,
createAudioPlayer,
createAudioResource,
entersState,
joinVoiceChannel,
StreamType,
type AudioPlayer,
type AudioReceiveStream,
type VoiceConnection,
@@ -21,7 +23,7 @@ import { Logger } from "../logger.js";
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
import { OpenAiLlmService } from "../services/openai-llm.js";
interface GuildVoiceSessionOptions {
@@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter {
private draining = false;
private currentAbortController: AbortController | null = null;
private currentPlayback: PreparedSpeechPlayback | null = null;
private currentPlayback: PreparedSpeechAudio | null = null;
private textChannelId?: string;
private constructor(private readonly options: GuildVoiceSessionOptions) {
@@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter {
}
try {
const resource = this.currentPlayback.resource;
const resource = createAudioResource(this.currentPlayback.stream, {
inputType: StreamType.Raw,
});
this.player.play(resource);
await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);

View File

@@ -0,0 +1,339 @@
import { spawn, type ChildProcessByStdio } from "node:child_process";
import { once } from "node:events";
import type { Readable, Writable } from "node:stream";
import { RealTimeVAD } from "avr-vad";
import type { AssistantRuntimeConfig } from "../config.js";
import { Logger } from "../logger.js";
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
import { OpenAiLlmService } from "../services/openai-llm.js";
/** Dependencies handed to the LocalVoiceSession constructor. */
interface LocalVoiceSessionOptions {
// Runtime configuration; the session reads MAX_CONVERSATION_TURNS, LOCAL_AUDIO_SOURCE,
// LOCAL_AUDIO_SINK, LOCAL_SPEAKER_NAME and DEBUG_TEXT_EVENTS from it.
config: AssistantRuntimeConfig;
// Logger used for debug/info/warn output of the session and its child processes.
logger: Logger;
// Speech-to-text service; the session calls `transcribePcm16` on it.
stt: ElevenLabsSttService;
// Text-to-speech service; the session calls `preparePlayback` on it.
tts: ElevenLabsTtsService;
// Reply generator; the session calls `generateReply` on it.
llm: OpenAiLlmService;
}
/** One queued piece of text waiting to be synthesized and played. */
interface SpeechJob {
// Text to synthesize.
text: string;
// Origin of the job: "assistant" for generated replies, "manual" for `speakText` requests.
source: "assistant" | "manual";
}
/**
 * Single-speaker voice loop for local testing.
 *
 * Audio is captured as 16 kHz mono s16 PCM from a `pw-record` child process,
 * segmented into utterances by a VAD, transcribed, answered by the LLM,
 * synthesized, and finally written to a `pw-play` child process. Detecting new
 * speech interrupts whatever is currently queued or playing (barge-in).
 */
export class LocalVoiceSession {
  /** Sample rate requested from `pw-record` and expected by the VAD. */
  private static readonly SAMPLE_RATE = 16000;
  /** Number of samples handed to the VAD per frame. */
  private static readonly FRAME_SAMPLES = 1536;

  private readonly memory: ConversationMemory;
  private readonly queue: SpeechJob[] = [];
  private readonly pendingSamples: number[] = [];
  /**
   * Trailing byte of an odd-length stdout chunk. Pipe reads are not guaranteed
   * to end on a sample boundary, so the byte is kept and prepended to the next
   * chunk instead of being dropped (which would misalign every later sample).
   */
  private pendingByte: number | null = null;
  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Promise chain that serializes VAD frame processing in arrival order. */
  private processing = Promise.resolve();
  private draining = false;
  private destroyed = false;

  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }

  /**
   * Creates the VAD, spawns the recorder and starts feeding captured audio
   * into the VAD. Resolves once the recorder has been spawned; it does not
   * wait for the recorder to exit.
   */
  async start(): Promise<void> {
    this.vad = await RealTimeVAD.new({
      model: "v5",
      sampleRate: LocalVoiceSession.SAMPLE_RATE,
      frameSamples: LocalVoiceSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        // Barge-in: the user started talking, so stop talking over them.
        this.interruptPlayback("local-barge-in");
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        void this.handleSpeechEnd(audio);
      },
    });

    const recorder = this.spawnRecorder();
    this.recorder = recorder;

    // A failed spawn (e.g. pw-record not installed) is reported through the
    // 'error' event; without a listener it would crash the process.
    recorder.on("error", (error) => {
      if (!this.destroyed) {
        this.options.logger.error("pw-record failed", error);
      }
    });
    recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });
    recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
    recorder.on("exit", (code, signal) => {
      if (!this.destroyed) {
        this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
      }
    });
  }

  /**
   * Stops playback, terminates the recorder and releases the VAD.
   * Safe to call when the recorder has already exited or never started.
   */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");

    const recorder = this.recorder;
    this.recorder = null;
    // `killed` only says whether kill() was called, not whether the process is
    // still alive. Waiting for 'exit' on a process that already exited would
    // never resolve, so check the exit state instead.
    if (recorder && recorder.exitCode === null && recorder.signalCode === null) {
      const exited = once(recorder, "exit").catch(() => null);
      if (recorder.kill("SIGTERM")) {
        await exited;
      }
    }

    const vad = this.vad;
    this.vad = null;
    if (vad) {
      await vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
    }
  }

  /** Clears the conversation memory and drops any queued or playing speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }

  /**
   * Queues `text` for synthesis and playback.
   * Resolves when the queue has been drained, or immediately if another
   * drain is already in progress.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /** Returns a multi-line, human-readable summary of the session state. */
  statusSummary(): string {
    return [
      "모드: local",
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /** Spawns `pw-record` writing raw 16 kHz mono s16 PCM to its stdout. */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const args = [
      "--rate",
      "16000",
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }
    // "-" tells pw-record to write to stdout.
    args.push("-");

    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });
    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /**
   * Decodes a chunk of little-endian s16 PCM, buffers the samples and feeds
   * every complete frame to the VAD in order.
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed || !this.vad) {
      return;
    }

    let data = chunk;
    if (this.pendingByte !== null) {
      data = Buffer.concat([Buffer.from([this.pendingByte]), chunk]);
      this.pendingByte = null;
    }
    const evenLength = data.length - (data.length % 2);
    for (let offset = 0; offset < evenLength; offset += 2) {
      this.pendingSamples.push(data.readInt16LE(offset));
    }
    if (evenLength < data.length) {
      this.pendingByte = data[evenLength] ?? null;
    }

    while (true) {
      const frame = takeFrame(this.pendingSamples, LocalVoiceSession.FRAME_SAMPLES);
      if (!frame) {
        return;
      }
      const floatFrame = int16ArrayToFloat32(frame);
      // Chain onto the previous frame so the VAD sees frames sequentially.
      this.processing = this.processing
        .then(() => this.vad?.processAudio(floatFrame))
        .catch((error) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }

  /**
   * Handles one finished utterance: transcribe, record it in memory, ask the
   * LLM for a reply and queue that reply for playback.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    // Ignore segments shorter than a quarter of a second.
    if (audio.length < LocalVoiceSession.SAMPLE_RATE * 0.25) {
      return;
    }

    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text: "",
    };

    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }
    if (!transcript || transcript.trim().length === 0) {
      return;
    }

    utterance.text = transcript.trim();
    this.memory.addUserTurn(utterance);
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      // Speak a fallback so the user still gets audible feedback.
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued jobs, aborts the in-flight synthesis and kills the
   * current player, if any. `reason` is only used for logging.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }

    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;

    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }

  /**
   * Plays queued jobs one at a time until the queue is empty. Only one drain
   * loop runs at a time; concurrent callers return immediately.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }

    this.draining = true;
    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }

        const abortController = new AbortController();
        this.currentAbortController = abortController;

        try {
          this.currentPlayback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local TTS synthesis failed", error);
          }
          // Do not leave a controller for a job that no longer exists.
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          continue;
        }

        try {
          await this.playToSink(this.currentPlayback, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local playback failed", error);
          }
        } finally {
          this.currentPlayback?.dispose();
          this.currentPlayback = null;
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /**
   * Pipes `playback.stream` into a `pw-play` child process and resolves when
   * the player exits. The stream is played as raw 48 kHz stereo s16.
   * NOTE(review): assumes `preparePlayback` produces exactly that format; confirm.
   *
   * @throws Error when the player exits with a non-zero code without having
   *   been aborted, or when the player could not be spawned.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    // An abort that landed before we got here would never reach the listener
    // registered below, so honour it up front.
    if (signal.aborted) {
      return;
    }

    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }
    // "-" tells pw-play to read from stdin.
    args.push("-");

    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;

    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-play]", text);
      }
    });
    // Killing the player while audio is still being piped makes the next
    // write fail (EPIPE). Without a listener that error would be unhandled.
    player.stdin.on("error", (error: Error) => {
      if (!signal.aborted) {
        this.options.logger.debug("[pw-play] stdin error", error);
      }
    });

    const onAbort = (): void => {
      playback.stream.destroy();
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };
    signal.addEventListener("abort", onAbort, { once: true });

    try {
      playback.stream.pipe(player.stdin);
      // `once` rejects if the child emits 'error' (e.g. spawn failure).
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];
      if (signal.aborted) {
        return;
      }
      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }
}

View File

@@ -4,17 +4,20 @@ import { z } from "zod";
loadDotenv();
const envSchema = z.object({
DISCORD_BOT_TOKEN: z.string().min(1),
DISCORD_APPLICATION_ID: z.string().min(1),
DISCORD_BOT_TOKEN: z.string().min(1).optional(),
DISCORD_APPLICATION_ID: z.string().min(1).optional(),
DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
OPENAI_API_KEY: z.string().min(1),
OPENAI_API_KEY: z.string().min(1).optional(),
OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
ELEVENLABS_API_KEY: z.string().min(1),
ELEVENLABS_VOICE_ID: z.string().min(1),
ELEVENLABS_API_KEY: z.string().min(1).optional(),
ELEVENLABS_VOICE_ID: z.string().min(1).optional(),
ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
LOCAL_AUDIO_SOURCE: z.string().min(1).optional(),
LOCAL_AUDIO_SINK: z.string().min(1).optional(),
LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
DEBUG_TEXT_EVENTS: z
.string()
.optional()
@@ -23,7 +26,41 @@ const envSchema = z.object({
});
/** Configuration shape inferred from `envSchema`. */
export type AppConfig = z.infer<typeof envSchema>;
/**
 * `AppConfig` with the OpenAI and ElevenLabs credentials typed as plain
 * strings. Values of this type are produced by `requireAssistantRuntimeConfig`.
 */
export type AssistantRuntimeConfig = AppConfig & {
OPENAI_API_KEY: string;
ELEVENLABS_API_KEY: string;
ELEVENLABS_VOICE_ID: string;
};
/**
 * `AssistantRuntimeConfig` with the Discord credentials typed as plain
 * strings. Values of this type are produced by `requireDiscordRuntimeConfig`.
 */
export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
DISCORD_BOT_TOKEN: string;
DISCORD_APPLICATION_ID: string;
};
/**
 * Parses `process.env` with `envSchema` and returns the result.
 * Whatever `envSchema.parse` throws on invalid input propagates to the caller.
 */
export function loadConfig(): AppConfig {
  const parsed = envSchema.parse(process.env);
  return parsed;
}
/**
 * Returns `value` when it is a non-empty string.
 *
 * @param value Candidate value; `undefined` and `""` both count as missing.
 * @param name Environment variable name used in the error message.
 * @throws Error when the value is missing.
 */
function requirePresent(value: string | undefined, name: string): string {
  if (value) {
    return value;
  }
  throw new Error(`${name} 환경변수가 필요합니다.`);
}
/**
 * Checks that the credentials needed by the assistant (OpenAI key,
 * ElevenLabs key and voice id) are present and returns the config with those
 * fields narrowed to `string`.
 *
 * @throws Error naming the first missing variable, checked in the order
 *   OPENAI_API_KEY, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID.
 */
export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
  const openAiApiKey = requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY");
  const elevenLabsApiKey = requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY");
  const elevenLabsVoiceId = requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID");

  return {
    ...config,
    OPENAI_API_KEY: openAiApiKey,
    ELEVENLABS_API_KEY: elevenLabsApiKey,
    ELEVENLABS_VOICE_ID: elevenLabsVoiceId,
  };
}
/**
 * Checks everything `requireAssistantRuntimeConfig` checks plus the Discord
 * bot token and application id, and returns the config with those fields
 * narrowed to `string`.
 *
 * @throws Error naming the first missing variable; assistant credentials are
 *   checked before the Discord ones.
 */
export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
  const assistantConfig = requireAssistantRuntimeConfig(config);
  const botToken = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
  const applicationId = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");

  return {
    ...assistantConfig,
    DISCORD_BOT_TOKEN: botToken,
    DISCORD_APPLICATION_ID: applicationId,
  };
}

234
src/discord-main.ts Normal file
View File

@@ -0,0 +1,234 @@
import process from "node:process";
import {
GatewayIntentBits,
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { type DiscordRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
/**
 * Runs the assistant as a Discord bot.
 *
 * Creates the Discord client and the STT/TTS/LLM services, registers the
 * slash commands once the client is ready, routes command interactions to
 * their handlers and logs in. SIGINT/SIGTERM destroy all sessions and exit
 * the process.
 *
 * @param config Validated configuration including the Discord credentials.
 * @param logger Logger used for lifecycle and error output.
 */
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
    new SlashCommandBuilder()
      .setName("say")
      .setDescription("텍스트를 바로 음성으로 읽습니다.")
      .addStringOption((option) =>
        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
      ),
  ].map((command) => command.toJSON());

  const client = new DiscordClient({
    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
  });

  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);

  /** Active voice sessions keyed by guild id. */
  const sessions = new Map<string, GuildVoiceSession>();
  /** Set once shutdown has begun so a second signal does not tear down twice. */
  let shuttingDown = false;

  /** Returns the voice channel the invoking member is in, or null. */
  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    const member = interaction.member as GuildMember | null;
    return member?.voice.channel ?? null;
  }

  /** Looks up the active session for the interaction's guild, if any. */
  function findSession(interaction: ChatInputCommandInteraction): GuildVoiceSession | undefined {
    return interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  }

  /**
   * Registers the slash commands: for one guild when DISCORD_COMMAND_GUILD_ID
   * is set, globally otherwise.
   */
  async function registerCommands(_appClient: Client): Promise<void> {
    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);

    if (config.DISCORD_COMMAND_GUILD_ID) {
      await rest.put(
        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
        {
          body: commands,
        },
      );
      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
      return;
    }

    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
      body: commands,
    });
    logger.info("Registered global commands");
  }

  /**
   * Returns the guild's session for the caller's voice channel, reusing the
   * existing one when it is already in that channel and replacing it otherwise.
   *
   * @throws Error when used outside a guild or when the caller is not in a voice channel.
   */
  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
    if (!interaction.guild) {
      throw new Error("Guild interaction required");
    }

    const voiceChannel = getVoiceChannel(interaction);
    if (!voiceChannel) {
      throw new Error("먼저 음성 채널에 들어가 주세요.");
    }

    const guildId = interaction.guild.id;
    const existing = sessions.get(guildId);
    if (existing && existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }

    if (existing) {
      try {
        await existing.destroy();
      } finally {
        // Drop the entry even if teardown failed, so a broken session is
        // never handed out again.
        sessions.delete(guildId);
      }
    }

    const session = await GuildVoiceSession.create({
      client,
      config,
      logger,
      guild: interaction.guild,
      voiceChannel,
      textChannelId: interaction.channelId,
      stt,
      tts,
      llm,
    });

    sessions.set(guildId, session);
    return session;
  }

  /** `/join`: starts (or reuses) a session and reports the joined channel. */
  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });

    try {
      const session = await createSession(interaction);
      // The channel name is taken from the second line of the status summary.
      const channelName = session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음";
      await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
    } catch (error) {
      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
      await interaction.editReply(message);
    }
  }

  /** `/leave`: destroys the guild's session. */
  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
    const guildId = interaction.guild?.id;
    const session = guildId ? sessions.get(guildId) : undefined;
    if (!guildId || !session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    try {
      await session.destroy();
    } finally {
      // Remove the entry even when destroy() throws; otherwise later commands
      // would keep operating on a half-destroyed session.
      sessions.delete(guildId);
    }
    await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
  }

  /** `/status`: replies with the session's status summary. */
  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    await interaction.reply({
      content: session.statusSummary(),
      ephemeral: true,
    });
  }

  /** `/reset`: clears conversation context and the playback queue. */
  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
  }

  /** `/say`: speaks the given text through the guild's session. */
  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });

    const session = findSession(interaction);
    if (!session) {
      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
      return;
    }

    const text = interaction.options.getString("text", true).trim();
    // A whitespace-only option passes Discord's "required" check but leaves
    // nothing to synthesize.
    if (text.length === 0) {
      await interaction.editReply("읽을 문장이 비어 있습니다.");
      return;
    }

    await session.speakText(text);
    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  }

  /** Destroys every session and the client, then exits the process. */
  async function shutdown(exitCode = 0): Promise<void> {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;

    logger.info("Shutting down");
    for (const session of sessions.values()) {
      await session.destroy().catch((error) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();
    await client.destroy();
    process.exit(exitCode);
  }

  client.once("ready", async () => {
    logger.info("Discord client ready", client.user?.tag ?? "unknown");
    try {
      await registerCommands(client);
    } catch (error) {
      logger.error("Command registration failed", error);
    }
  });

  client.on("interactionCreate", async (interaction) => {
    if (!interaction.isChatInputCommand()) {
      return;
    }

    try {
      switch (interaction.commandName) {
        case "join":
          await handleJoin(interaction);
          return;
        case "leave":
          await handleLeave(interaction);
          return;
        case "status":
          await handleStatus(interaction);
          return;
        case "reset":
          await handleReset(interaction);
          return;
        case "say":
          await handleSay(interaction);
          return;
        default:
          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
      }
    } catch (error) {
      logger.error("Interaction handler failed", error);
      // An interaction that was already acknowledged can only be edited.
      if (interaction.deferred || interaction.replied) {
        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
        return;
      }
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  });

  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await client.login(config.DISCORD_BOT_TOKEN);
}

View File

@@ -1,237 +1,28 @@
import process from "node:process";
import {
GatewayIntentBits,
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { loadConfig } from "./config.js";
import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
import { runDiscordBot } from "./discord-main.js";
import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js";
// Run mode taken from the first CLI argument; falls back to "discord".
const mode = process.argv[2] ?? "discord";
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);
// Slash command definitions, serialized to JSON for registration.
const commands = [
new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
new SlashCommandBuilder()
.setName("say")
.setDescription("텍스트를 바로 음성으로 읽습니다.")
.addStringOption((option) =>
option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
),
].map((command) => command.toJSON());
// Discord client subscribed to guild and voice-state events.
const client = new DiscordClient({
intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});
// Service instances; `createSession` hands the same three to every session it creates.
const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);
// Active voice sessions keyed by guild id.
const sessions = new Map<string, GuildVoiceSession>();
/** Returns the voice channel the invoking member is currently in, or null. */
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  const invoker = interaction.member as GuildMember | null;
  if (!invoker) {
    return null;
  }
  return invoker.voice.channel ?? null;
}
/**
 * Registers the slash commands with Discord: scoped to one guild when
 * DISCORD_COMMAND_GUILD_ID is set, globally otherwise.
 */
async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  const applicationId = config.DISCORD_APPLICATION_ID;
  const guildId = config.DISCORD_COMMAND_GUILD_ID;

  if (!guildId) {
    await rest.put(Routes.applicationCommands(applicationId), { body: commands });
    logger.info("Registered global commands");
    return;
  }

  await rest.put(Routes.applicationGuildCommands(applicationId, guildId), { body: commands });
  logger.info("Registered guild commands", guildId);
}
/**
 * Returns the guild's session for the caller's voice channel, reusing the
 * existing one when it is already in that channel and replacing it otherwise.
 *
 * @throws Error when used outside a guild or when the caller is not in a voice channel.
 */
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  if (!interaction.guild) {
    throw new Error("Guild interaction required");
  }

  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }

  const guildId = interaction.guild.id;
  const existing = sessions.get(guildId);
  if (existing && existing.voiceChannelId === voiceChannel.id) {
    existing.setTextChannel(interaction.channelId);
    return existing;
  }

  if (existing) {
    try {
      await existing.destroy();
    } finally {
      // Drop the entry even if teardown failed, so a broken session is never
      // handed out again by a later call.
      sessions.delete(guildId);
    }
  }

  const session = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild: interaction.guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });

  sessions.set(guildId, session);
  return session;
}
/**
 * `/join`: starts (or reuses) a voice session and reports the joined channel.
 * Any failure is reported to the user through the deferred reply.
 */
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  try {
    const session = await createSession(interaction);
    // The channel name is taken from the second line of the status summary.
    const channelLine = session.statusSummary().split("\n")[1];
    const channelName = channelLine?.replace("음성 채널: ", "") ?? "알 수 없음";
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
  } catch (error) {
    let message = "세션 생성에 실패했습니다.";
    if (error instanceof Error) {
      message = error.message;
    }
    await interaction.editReply(message);
  }
}
/**
 * `/leave`: destroys the guild's voice session and confirms to the user.
 * The session is removed from the registry even when teardown throws.
 */
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guild?.id;
  const session = guildId ? sessions.get(guildId) : undefined;
  if (!guildId || !session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  try {
    await session.destroy();
  } finally {
    // Without this, a failed destroy() would leave a half-destroyed session
    // registered and later commands would keep using it.
    sessions.delete(guildId);
  }
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}
/** `/status`: replies with the guild session's status summary, if one exists. */
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  if (session) {
    await interaction.reply({ content: session.statusSummary(), ephemeral: true });
    return;
  }

  await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
}
/** `/reset`: clears the guild session's conversation context and playback queue. */
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  if (session) {
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
    return;
  }

  await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
}
/**
 * `/say`: speaks the given text through the guild's voice session.
 * Whitespace-only text is rejected instead of being sent to synthesis.
 */
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }

  const text = interaction.options.getString("text", true).trim();
  // A whitespace-only option passes Discord's "required" check but leaves
  // nothing to synthesize.
  if (text.length === 0) {
    await interaction.editReply("읽을 문장이 비어 있습니다.");
    return;
  }

  await session.speakText(text);
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
/** Set once shutdown has begun so a repeated signal does not tear down twice. */
let shutdownStarted = false;

/**
 * Destroys every voice session and the Discord client, then exits the
 * process with `exitCode`. Calls made after the first one are ignored.
 */
async function shutdown(exitCode = 0): Promise<void> {
  if (shutdownStarted) {
    return;
  }
  shutdownStarted = true;

  logger.info("Shutting down");
  for (const session of sessions.values()) {
    // Keep going when one session fails so the rest still get cleaned up.
    await session.destroy().catch((error) => {
      logger.warn("Session shutdown failed", error);
    });
  }
  sessions.clear();
  await client.destroy();
  process.exit(exitCode);
}
// Register slash commands as soon as the client reports ready; a failure is
// logged and does not stop the bot.
client.once("ready", async () => {
  const tag = client.user?.tag ?? "unknown";
  logger.info("Discord client ready", tag);
  await registerCommands(client).catch((error) => {
    logger.error("Command registration failed", error);
  });
});

// Slash command name -> handler.
const commandHandlers = new Map<string, (interaction: ChatInputCommandInteraction) => Promise<void>>([
  ["join", handleJoin],
  ["leave", handleLeave],
  ["status", handleStatus],
  ["reset", handleReset],
  ["say", handleSay],
]);

// Route chat-input commands to their handler; any handler failure is logged
// and reported back to the user.
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }

  try {
    const handler = commandHandlers.get(interaction.commandName);
    if (handler) {
      await handler(interaction);
      return;
    }
    await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
  } catch (error) {
    logger.error("Interaction handler failed", error);
    const alreadyAcknowledged = interaction.deferred || interaction.replied;
    if (!alreadyAcknowledged) {
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
      return;
    }
    // An acknowledged interaction can only be edited, not replied to again.
    await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
  }
});

// Both termination signals trigger the same shutdown path.
for (const signalName of ["SIGINT", "SIGTERM"] as const) {
  process.on(signalName, () => {
    void shutdown(0);
  });
}
// Entry point: runs the program variant selected by `mode`.
// NOTE(review): this span comes from a diff hunk, so the `client.login` line may
// belong to the previous version of `main` rather than the current one — confirm.
async function main(): Promise<void> {
await client.login(config.DISCORD_BOT_TOKEN);
switch (mode) {
case "discord":
// Discord mode validates the Discord credentials before starting.
await runDiscordBot(requireDiscordRuntimeConfig(config), logger);
return;
case "local":
// Local mode only needs the assistant (OpenAI / ElevenLabs) credentials.
await runLocalAssistant(requireAssistantRuntimeConfig(config), logger);
return;
case "local-devices":
// Listing devices does not validate any credentials.
await printLocalAudioDevices();
return;
default:
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`);
}
}
void main().catch((error) => {

75
src/local-main.ts Normal file
View File

@@ -0,0 +1,75 @@
import { spawn } from "node:child_process";
import process from "node:process";
import type { AssistantRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { LocalVoiceSession } from "./audio/local-voice-session.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
/**
 * Prints the local audio devices by running `wpctl status` and then
 * `wpctl status -n`, each under its own header. The child processes write
 * straight to this process's stdout/stderr.
 *
 * @throws Error when `wpctl` cannot be spawned or exits with a non-zero code.
 */
export async function printLocalAudioDevices(): Promise<void> {
  const invocations = [
    { label: "wpctl status", args: ["status"] },
    { label: "wpctl status -n", args: ["status", "-n"] },
  ] as const;

  // Runs wpctl once and settles when the process exits or fails to start.
  const runWpctl = (args: readonly string[]): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      const child = spawn("wpctl", args, {
        stdio: ["ignore", "inherit", "inherit"],
      });
      child.on("error", reject);
      child.on("exit", (code) => {
        if (code !== 0) {
          reject(new Error(`wpctl exited with code ${code ?? "null"}`));
          return;
        }
        resolve();
      });
    });

  for (const { label, args } of invocations) {
    console.log(`\n=== ${label} ===`);
    await runWpctl(args);
  }
}
/**
 * Runs the assistant against the local microphone and speakers.
 *
 * Builds the STT/TTS/LLM services and a `LocalVoiceSession`, prints the
 * session status, installs SIGINT/SIGTERM handlers that destroy the session
 * and exit, and then starts the session.
 *
 * @param config Validated configuration including the assistant credentials.
 * @param logger Logger used for lifecycle and error output.
 */
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);
  const session = new LocalVoiceSession({
    config,
    logger,
    stt,
    tts,
    llm,
  });

  console.log(session.statusSummary());
  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
  if (config.DEBUG_TEXT_EVENTS) {
    console.log("텍스트 로그 출력이 켜져 있습니다.");
  }

  // A second Ctrl+C while teardown is still running must not destroy the
  // session again concurrently.
  let shuttingDown = false;
  const shutdown = async (exitCode = 0): Promise<void> => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    await session.destroy().catch((error) => {
      logger.warn("Local session shutdown failed", error);
    });
    process.exit(exitCode);
  };

  // Handlers are installed before start() so an early signal still cleans up.
  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await session.start();
}

View File

@@ -1,6 +1,6 @@
import WebSocket from "ws";
import type { AppConfig } from "../config.js";
import type { AssistantRuntimeConfig } from "../config.js";
interface ElevenLabsMessage {
message_type?: string;
@@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([
]);
export class ElevenLabsSttService {
constructor(private readonly config: AppConfig) {}
constructor(private readonly config: AssistantRuntimeConfig) {}
async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
if (pcm16MonoAudio.byteLength === 0) {

View File

@@ -2,24 +2,23 @@ import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media";
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
import type { AppConfig } from "../config.js";
import type { AssistantRuntimeConfig } from "../config.js";
export interface PreparedSpeechPlayback {
resource: AudioResource;
export interface PreparedSpeechAudio {
stream: Readable;
dispose: () => void;
}
export class ElevenLabsTtsService {
constructor(private readonly config: AppConfig) {
constructor(private readonly config: AssistantRuntimeConfig) {
const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
process.env.FFMPEG_PATH = resolvedFfmpegPath;
}
}
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
url.searchParams.set("output_format", "mp3_44100_128");
url.searchParams.set("enable_logging", "false");
@@ -68,12 +67,8 @@ export class ElevenLabsTtsService {
input.pipe(ffmpeg);
const resource = createAudioResource(ffmpeg, {
inputType: StreamType.Raw,
});
return {
resource,
stream: ffmpeg,
dispose: () => {
input.destroy();
ffmpeg.destroy();

View File

@@ -1,6 +1,6 @@
import OpenAI from "openai";
import type { AppConfig } from "../config.js";
import type { AssistantRuntimeConfig } from "../config.js";
import type { ConversationMemory, UserUtterance } from "./conversation.js";
const ASSISTANT_INSTRUCTIONS = [
@@ -30,7 +30,7 @@ function normalizeReply(text: string): string {
export class OpenAiLlmService {
private readonly client: OpenAI;
constructor(private readonly config: AppConfig) {
constructor(private readonly config: AssistantRuntimeConfig) {
this.client = new OpenAI({
apiKey: this.config.OPENAI_API_KEY,
});