feat: add local audio test mode

This commit is contained in:
2026-04-30 02:37:54 +09:00
parent 9dee708b64
commit cf6398f50a
12 changed files with 766 additions and 256 deletions

View File

@@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
BOT_DEFAULT_LANGUAGE=ko BOT_DEFAULT_LANGUAGE=ko
MAX_CONVERSATION_TURNS=12 MAX_CONVERSATION_TURNS=12
LOCAL_AUDIO_SOURCE=
LOCAL_AUDIO_SINK=
LOCAL_SPEAKER_NAME=local-user
DEBUG_TEXT_EVENTS=false DEBUG_TEXT_EVENTS=false
LOG_LEVEL=info LOG_LEVEL=info

View File

@@ -1,10 +1,11 @@
# realtime_voice_bot # realtime_voice_bot
디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다. 디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
## 현재 구현 범위 ## 현재 구현 범위
- Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say` - Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력
- `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신 - `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
- 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리 - 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지 - Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
@@ -28,16 +29,25 @@
필수: 필수:
- `DISCORD_BOT_TOKEN`
- `DISCORD_APPLICATION_ID`
- `OPENAI_API_KEY` - `OPENAI_API_KEY`
- `ELEVENLABS_API_KEY` - `ELEVENLABS_API_KEY`
- `ELEVENLABS_VOICE_ID` - `ELEVENLABS_VOICE_ID`
Discord 모드에서만 필수:
- `DISCORD_BOT_TOKEN`
- `DISCORD_APPLICATION_ID`
선택: 선택:
- `DISCORD_COMMAND_GUILD_ID` - `DISCORD_COMMAND_GUILD_ID`
- 테스트 서버에만 slash command를 즉시 반영하려면 설정 - 테스트 서버에만 slash command를 즉시 반영하려면 설정
- `LOCAL_AUDIO_SOURCE`
- `pw-record --target` 에 넣을 PipeWire source id 또는 node name
- `LOCAL_AUDIO_SINK`
- `pw-play --target` 에 넣을 PipeWire sink id 또는 node name
- `LOCAL_SPEAKER_NAME`
- 로컬 테스트에서 프롬프트에 넣을 화자 이름
- `OPENAI_MODEL` - `OPENAI_MODEL`
- 기본값: `gpt-5.4-mini` - 기본값: `gpt-5.4-mini`
- `ELEVENLABS_STT_MODEL` - `ELEVENLABS_STT_MODEL`
@@ -51,13 +61,24 @@
```bash ```bash
bun install bun install
bun run start
``` ```
개발 모드: 디스코드 모드:
```bash ```bash
bun run dev bun run start:discord
```
로컬 장치 목록:
```bash
bun run audio:devices
```
로컬 테스트 모드:
```bash
bun run start:local
``` ```
타입 체크: 타입 체크:
@@ -74,9 +95,17 @@ bun run check
4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다. 4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다. 5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
로컬 테스트:
1. `bun run audio:devices` 로 source/sink id 또는 이름 확인
2. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정
3. `bun run start:local`
4. 마이크로 바로 말해서 응답 확인
## 설계 메모 ## 설계 메모
- 입력은 유저별 병렬 처리 - 입력은 유저별 병렬 처리
- 출력은 길드 세션당 단일 큐 - 출력은 길드 세션당 단일 큐
- 로컬 모드는 단일 화자 입력 기준
- 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함 - 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
- 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다. - 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.

View File

@@ -5,7 +5,10 @@
"type": "module", "type": "module",
"scripts": { "scripts": {
"dev": "bun --watch src/index.ts", "dev": "bun --watch src/index.ts",
"start": "bun src/index.ts", "start": "bun src/index.ts discord",
"start:discord": "bun src/index.ts discord",
"start:local": "bun src/index.ts local",
"audio:devices": "bun src/index.ts local-devices",
"check": "tsc --noEmit", "check": "tsc --noEmit",
"build": "tsc -p tsconfig.json" "build": "tsc -p tsconfig.json"
}, },

View File

@@ -8,8 +8,10 @@ import {
NoSubscriberBehavior, NoSubscriberBehavior,
VoiceConnectionStatus, VoiceConnectionStatus,
createAudioPlayer, createAudioPlayer,
createAudioResource,
entersState, entersState,
joinVoiceChannel, joinVoiceChannel,
StreamType,
type AudioPlayer, type AudioPlayer,
type AudioReceiveStream, type AudioReceiveStream,
type VoiceConnection, type VoiceConnection,
@@ -21,7 +23,7 @@ import { Logger } from "../logger.js";
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js"; import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js"; import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
import { OpenAiLlmService } from "../services/openai-llm.js"; import { OpenAiLlmService } from "../services/openai-llm.js";
interface GuildVoiceSessionOptions { interface GuildVoiceSessionOptions {
@@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter {
private draining = false; private draining = false;
private currentAbortController: AbortController | null = null; private currentAbortController: AbortController | null = null;
private currentPlayback: PreparedSpeechPlayback | null = null; private currentPlayback: PreparedSpeechAudio | null = null;
private textChannelId?: string; private textChannelId?: string;
private constructor(private readonly options: GuildVoiceSessionOptions) { private constructor(private readonly options: GuildVoiceSessionOptions) {
@@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter {
} }
try { try {
const resource = this.currentPlayback.resource; const resource = createAudioResource(this.currentPlayback.stream, {
inputType: StreamType.Raw,
});
this.player.play(resource); this.player.play(resource);
await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null); await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);

View File

@@ -0,0 +1,339 @@
import { spawn, type ChildProcessByStdio } from "node:child_process";
import { once } from "node:events";
import type { Readable, Writable } from "node:stream";
import { RealTimeVAD } from "avr-vad";
import type { AssistantRuntimeConfig } from "../config.js";
import { Logger } from "../logger.js";
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
import { OpenAiLlmService } from "../services/openai-llm.js";
/**
 * Dependencies handed to a `LocalVoiceSession` at construction time.
 */
interface LocalVoiceSessionOptions {
// Runtime configuration; the session reads MAX_CONVERSATION_TURNS, LOCAL_AUDIO_SOURCE,
// LOCAL_AUDIO_SINK, LOCAL_SPEAKER_NAME and DEBUG_TEXT_EVENTS from it.
config: AssistantRuntimeConfig;
// Logger used for recorder/player diagnostics and pipeline failures.
logger: Logger;
// Speech-to-text service; the session calls `transcribePcm16` with each finished utterance.
stt: ElevenLabsSttService;
// Text-to-speech service; the session calls `preparePlayback` for each queued reply.
tts: ElevenLabsTtsService;
// LLM service; the session calls `generateReply` with the conversation memory and the utterance.
llm: OpenAiLlmService;
}
/**
 * One pending text-to-speech request in the session's playback queue.
 */
interface SpeechJob {
// Text to synthesize and play.
text: string;
// Origin of the request: "assistant" for LLM replies, "manual" for `speakText` calls.
source: "assistant" | "manual";
}
/**
 * Voice session that runs against the local machine instead of Discord.
 *
 * Audio is captured by a `pw-record` child process (16 kHz, mono, signed 16-bit raw PCM),
 * segmented into utterances by the VAD, transcribed, answered by the LLM, synthesized by
 * the TTS service and finally written to a `pw-play` child process (48 kHz, stereo,
 * signed 16-bit raw PCM). All input is attributed to a single speaker with the fixed id
 * `"local-user"`.
 */
export class LocalVoiceSession {
  /** Sample rate requested from `pw-record` and passed to the VAD. */
  private static readonly sampleRate = 16000;
  /** Number of samples handed to the VAD per `processAudio` call. */
  private static readonly frameSamples = 1536;

  private readonly memory: ConversationMemory;
  private readonly queue: SpeechJob[] = [];
  private readonly pendingSamples: number[] = [];
  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  /** Settles once the recorder has exited or failed to spawn; resolved while no recorder exists. */
  private recorderExit: Promise<void> = Promise.resolve();
  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Promise chain that serializes VAD frame processing in arrival order. */
  private processing = Promise.resolve();
  /** Leftover low byte when a recorder chunk ended in the middle of a 16-bit sample. */
  private trailingByte: number | null = null;
  private draining = false;
  private destroyed = false;

  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }

  /**
   * Creates the VAD and starts the `pw-record` child process.
   *
   * Resolves once the recorder has been spawned; it does not wait for audio. A recorder
   * that fails to spawn or exits early is logged rather than thrown, because the failure
   * is reported asynchronously by the child process.
   */
  async start(): Promise<void> {
    const vad = await RealTimeVAD.new({
      model: "v5",
      sampleRate: LocalVoiceSession.sampleRate,
      frameSamples: LocalVoiceSession.frameSamples,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        // Barge-in: the user started talking, so stop whatever is being played.
        this.interruptPlayback("local-barge-in");
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        void this.handleSpeechEnd(audio).catch((error: unknown) => {
          this.options.logger.warn("Local utterance handling failed", error);
        });
      },
    });

    if (this.destroyed) {
      // destroy() ran while the VAD was still loading; do not start capturing.
      await vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
      return;
    }
    this.vad = vad;

    const recorder = this.spawnRecorder();
    this.recorder = recorder;
    this.recorderExit = new Promise<void>((resolve) => {
      recorder.once("exit", (code, signal) => {
        if (!this.destroyed) {
          this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
        }
        resolve();
      });
      // Without this listener a missing pw-record binary surfaces as an unhandled
      // 'error' event and takes the whole process down.
      recorder.once("error", (error) => {
        this.options.logger.error("pw-record failed", error);
        resolve();
      });
    });

    recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });

    recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
  }

  /**
   * Stops playback, terminates the recorder and releases the VAD.
   *
   * Safe to call more than once and safe to call after the recorder has already exited
   * or failed to spawn.
   */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");

    const recorder = this.recorder;
    this.recorder = null;
    if (recorder) {
      // `killed` only says a signal was sent; it stays false for a process that exited
      // on its own, so liveness is decided by exitCode/signalCode instead.
      if (recorder.exitCode === null && recorder.signalCode === null) {
        recorder.kill("SIGTERM");
      }
      // Already settled when the recorder is gone, so this cannot hang on a dead child.
      await this.recorderExit;
    }

    const vad = this.vad;
    this.vad = null;
    if (vad) {
      await vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
    }
  }

  /** Clears the conversation memory and drops any queued or in-flight playback. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }

  /**
   * Queues `text` for synthesis and playback.
   *
   * Resolves when the queue has been drained by this call, or immediately when another
   * drain is already running or the session has been destroyed.
   */
  async speakText(text: string): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /** Returns a multi-line, human-readable description of the session state. */
  statusSummary(): string {
    return [
      "모드: local",
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /** Spawns `pw-record` writing raw 16 kHz mono s16 PCM to its stdout. */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const args = [
      "--rate",
      String(LocalVoiceSession.sampleRate),
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }

    // "-" makes pw-record write to stdout instead of a file.
    args.push("-");

    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });

    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /**
   * Decodes a recorder chunk into 16-bit samples and feeds complete frames to the VAD.
   *
   * Pipe reads are not guaranteed to end on a sample boundary, so an odd trailing byte is
   * carried over to the next chunk instead of being dropped.
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed || !this.vad) {
      return;
    }

    let data = chunk;
    if (this.trailingByte !== null) {
      data = Buffer.concat([Buffer.from([this.trailingByte]), chunk]);
      this.trailingByte = null;
    }

    const alignedLength = data.length - (data.length % 2);
    for (let offset = 0; offset < alignedLength; offset += 2) {
      this.pendingSamples.push(data.readInt16LE(offset));
    }
    if (alignedLength < data.length) {
      this.trailingByte = data.readUInt8(alignedLength);
    }

    while (true) {
      const frame = takeFrame(this.pendingSamples, LocalVoiceSession.frameSamples);
      if (!frame) {
        return;
      }

      const floatFrame = int16ArrayToFloat32(frame);
      // Chain onto the previous frame so the VAD always sees frames in order.
      this.processing = this.processing
        .then(() => this.vad?.processAudio(floatFrame))
        .catch((error) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }

  /**
   * Handles one finished utterance: transcribe, ask the LLM, queue the reply for playback.
   *
   * Utterances shorter than a quarter of a second and empty transcripts are ignored.
   * An LLM failure is answered with a fixed apology sentence instead of silence.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    if (audio.length < LocalVoiceSession.sampleRate * 0.25) {
      return;
    }

    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text: "",
    };

    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }

    if (!transcript || transcript.trim().length === 0) {
      return;
    }

    utterance.text = transcript.trim();
    this.memory.addUserTurn(utterance);
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Empties the queue and stops the job currently being synthesized or played.
   *
   * @param reason Short tag that is written to the log when something was interrupted.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }

    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;

    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }

  /**
   * Synthesizes and plays queued jobs one at a time until the queue is empty.
   *
   * Only one drain runs at a time; concurrent callers return immediately and their jobs
   * are picked up by the running loop. Failures are logged and the loop moves on, except
   * when the job was aborted on purpose, in which case nothing is logged.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }

    this.draining = true;
    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }

        const abortController = new AbortController();
        this.currentAbortController = abortController;
        let playback: PreparedSpeechAudio | null = null;

        try {
          playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
          this.currentPlayback = playback;
          await this.playToSink(playback, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn(
              playback ? "Local playback failed" : "Local TTS synthesis failed",
              error,
            );
          }
        } finally {
          // interruptPlayback() disposes and clears the field itself; only dispose here
          // when this job still owns it, so the audio is never disposed twice.
          if (playback && this.currentPlayback === playback) {
            playback.dispose();
            this.currentPlayback = null;
          }
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /**
   * Pipes prepared audio into a fresh `pw-play` process and waits for it to exit.
   *
   * @param playback Prepared audio whose `stream` is written to the player's stdin.
   * @param signal Aborting it destroys the stream and kills the player; the call then
   *   resolves without error.
   * @throws If the audio stream fails, if `pw-play` cannot be spawned, or if it exits
   *   with a non-zero status while the signal is not aborted.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    if (signal.aborted) {
      // An 'abort' listener added now would never fire, so bail out before spawning.
      return;
    }

    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }

    // "-" makes pw-play read from stdin instead of a file.
    args.push("-");

    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;

    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-play]", text);
      }
    });

    // Killing the player while audio is still being piped makes the next write fail with
    // EPIPE; without a listener that becomes an unhandled stream error.
    player.stdin.on("error", (error: Error) => {
      if (!signal.aborted) {
        this.options.logger.debug("[pw-play] stdin error", error);
      }
    });

    const killPlayer = (): void => {
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };

    const streamFailure: { occurred: boolean; error: unknown } = { occurred: false, error: undefined };
    // pipe() does not forward source errors, so a failing stream would leave pw-play
    // waiting on stdin forever. Stop the player and report the failure after it exits.
    playback.stream.once("error", (error: unknown) => {
      streamFailure.occurred = true;
      streamFailure.error = error;
      killPlayer();
    });

    const onAbort = (): void => {
      playback.stream.destroy();
      killPlayer();
    };
    signal.addEventListener("abort", onAbort, { once: true });

    playback.stream.pipe(player.stdin);

    try {
      // once() rejects if the child emits 'error' first, e.g. when pw-play is not installed.
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];

      if (signal.aborted) {
        return;
      }

      if (streamFailure.occurred) {
        throw streamFailure.error instanceof Error
          ? streamFailure.error
          : new Error(`TTS audio stream failed: ${String(streamFailure.error)}`);
      }

      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }
}

View File

@@ -4,17 +4,20 @@ import { z } from "zod";
loadDotenv(); loadDotenv();
const envSchema = z.object({ const envSchema = z.object({
DISCORD_BOT_TOKEN: z.string().min(1), DISCORD_BOT_TOKEN: z.string().min(1).optional(),
DISCORD_APPLICATION_ID: z.string().min(1), DISCORD_APPLICATION_ID: z.string().min(1).optional(),
DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(), DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
OPENAI_API_KEY: z.string().min(1), OPENAI_API_KEY: z.string().min(1).optional(),
OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"), OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
ELEVENLABS_API_KEY: z.string().min(1), ELEVENLABS_API_KEY: z.string().min(1).optional(),
ELEVENLABS_VOICE_ID: z.string().min(1), ELEVENLABS_VOICE_ID: z.string().min(1).optional(),
ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"), ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"), ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12), MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
LOCAL_AUDIO_SOURCE: z.string().min(1).optional(),
LOCAL_AUDIO_SINK: z.string().min(1).optional(),
LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
DEBUG_TEXT_EVENTS: z DEBUG_TEXT_EVENTS: z
.string() .string()
.optional() .optional()
@@ -23,7 +26,41 @@ const envSchema = z.object({
}); });
export type AppConfig = z.infer<typeof envSchema>; export type AppConfig = z.infer<typeof envSchema>;
/**
 * `AppConfig` narrowed so that the OpenAI and ElevenLabs credentials are guaranteed to be
 * present. Produced by `requireAssistantRuntimeConfig`.
 */
export type AssistantRuntimeConfig = AppConfig & {
OPENAI_API_KEY: string;
ELEVENLABS_API_KEY: string;
ELEVENLABS_VOICE_ID: string;
};
/**
 * `AssistantRuntimeConfig` narrowed further so that the Discord bot token and application
 * id are guaranteed to be present. Produced by `requireDiscordRuntimeConfig`.
 */
export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
DISCORD_BOT_TOKEN: string;
DISCORD_APPLICATION_ID: string;
};
export function loadConfig(): AppConfig { export function loadConfig(): AppConfig {
return envSchema.parse(process.env); return envSchema.parse(process.env);
} }
/**
 * Returns `value` unchanged when it is a non-empty string.
 *
 * @param value Candidate environment value.
 * @param name Environment variable name used in the error message.
 * @throws Error when `value` is `undefined` or the empty string.
 */
function requirePresent(value: string | undefined, name: string): string {
  if (value !== undefined && value !== "") {
    return value;
  }
  throw new Error(`${name} 환경변수가 필요합니다.`);
}
/**
 * Validates that the credentials needed by the assistant pipeline are set.
 *
 * Checks run in the order OPENAI_API_KEY, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID, so the
 * first missing one is the one reported.
 *
 * @returns A copy of `config` with those three fields typed as required strings.
 * @throws Error when any of the three variables is missing.
 */
export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
  const openAiApiKey = requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY");
  const elevenLabsApiKey = requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY");
  const elevenLabsVoiceId = requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID");

  return {
    ...config,
    OPENAI_API_KEY: openAiApiKey,
    ELEVENLABS_API_KEY: elevenLabsApiKey,
    ELEVENLABS_VOICE_ID: elevenLabsVoiceId,
  };
}
/**
 * Validates everything the Discord bot needs: the assistant credentials first, then
 * DISCORD_BOT_TOKEN and DISCORD_APPLICATION_ID.
 *
 * @returns A copy of `config` with all of those fields typed as required strings.
 * @throws Error naming the first missing variable.
 */
export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
  const assistantConfig = requireAssistantRuntimeConfig(config);
  const botToken = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
  const applicationId = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");

  return {
    ...assistantConfig,
    DISCORD_BOT_TOKEN: botToken,
    DISCORD_APPLICATION_ID: applicationId,
  };
}

234
src/discord-main.ts Normal file
View File

@@ -0,0 +1,234 @@
import process from "node:process";
import {
GatewayIntentBits,
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { type DiscordRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
/**
 * Starts the Discord bot: builds the slash commands, wires the interaction handlers,
 * installs SIGINT/SIGTERM shutdown hooks and logs the client in.
 *
 * Slash commands are registered once the client reports ready. When
 * `DISCORD_COMMAND_GUILD_ID` is set they are registered for that guild only, otherwise
 * globally.
 *
 * @param config Validated configuration including the Discord credentials.
 * @param logger Logger shared with the voice sessions.
 * @returns Resolves when `client.login` has completed; the bot keeps running afterwards.
 */
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
    new SlashCommandBuilder()
      .setName("say")
      .setDescription("텍스트를 바로 음성으로 읽습니다.")
      .addStringOption((option) =>
        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
      ),
  ].map((command) => command.toJSON());

  const client = new DiscordClient({
    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
  });

  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);
  // One voice session per guild, keyed by guild id.
  const sessions = new Map<string, GuildVoiceSession>();
  // Set by the first shutdown request so repeated signals do not tear things down twice.
  let shuttingDown = false;

  /** Returns the voice channel the invoking member is currently in, if any. */
  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    const member = interaction.member as GuildMember | null;
    return member?.voice.channel ?? null;
  }

  /** Uploads the slash command definitions, guild-scoped when a guild id is configured. */
  async function registerCommands(_appClient: Client): Promise<void> {
    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
    if (config.DISCORD_COMMAND_GUILD_ID) {
      await rest.put(
        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
        {
          body: commands,
        },
      );
      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
      return;
    }

    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
      body: commands,
    });
    logger.info("Registered global commands");
  }

  /**
   * Returns the guild's session for the caller's voice channel, creating it if needed.
   * A session bound to a different channel in the same guild is destroyed first.
   *
   * @throws Error when used outside a guild or when the caller is not in a voice channel.
   */
  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
    if (!interaction.guild) {
      throw new Error("Guild interaction required");
    }

    const voiceChannel = getVoiceChannel(interaction);
    if (!voiceChannel) {
      throw new Error("먼저 음성 채널에 들어가 주세요.");
    }

    const existing = sessions.get(interaction.guild.id);
    if (existing && existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }

    if (existing) {
      await existing.destroy();
      sessions.delete(interaction.guild.id);
    }

    const session = await GuildVoiceSession.create({
      client,
      config,
      logger,
      guild: interaction.guild,
      voiceChannel,
      textChannelId: interaction.channelId,
      stt,
      tts,
      llm,
    });

    sessions.set(interaction.guild.id, session);
    return session;
  }

  /** `/join`: starts (or reuses) the session and reports the channel, or the failure reason. */
  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });

    try {
      const session = await createSession(interaction);
      await interaction.editReply(
        `음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`,
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
      await interaction.editReply(message);
    }
  }

  /** `/leave`: destroys the guild's session and removes it from the session map. */
  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
    const guildId = interaction.guild?.id;
    const session = guildId === undefined ? undefined : sessions.get(guildId);
    if (guildId === undefined || !session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    await session.destroy();
    sessions.delete(guildId);
    await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
  }

  /** `/status`: replies with the session's status summary. */
  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    await interaction.reply({
      content: session.statusSummary(),
      ephemeral: true,
    });
  }

  /** `/reset`: clears the session's conversation state. */
  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
  }

  /** `/say`: hands the trimmed `text` option to the session for speech output. */
  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
      return;
    }

    const text = interaction.options.getString("text", true).trim();
    await session.speakText(text);
    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  }

  /**
   * Destroys every session and the client, then exits the process.
   *
   * Only the first call does any work. The process exits even when the client teardown
   * throws, so a failed teardown cannot leave the bot hanging.
   */
  async function shutdown(exitCode = 0): Promise<void> {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;

    logger.info("Shutting down");
    try {
      for (const session of sessions.values()) {
        await session.destroy().catch((error) => {
          logger.warn("Session shutdown failed", error);
        });
      }
      sessions.clear();
      await client.destroy();
    } catch (error) {
      logger.warn("Client shutdown failed", error);
    } finally {
      process.exit(exitCode);
    }
  }

  client.once("ready", async () => {
    logger.info("Discord client ready", client.user?.tag ?? "unknown");
    try {
      await registerCommands(client);
    } catch (error) {
      // Registration failure is logged only; the bot keeps running.
      logger.error("Command registration failed", error);
    }
  });

  client.on("interactionCreate", async (interaction) => {
    if (!interaction.isChatInputCommand()) {
      return;
    }

    try {
      switch (interaction.commandName) {
        case "join":
          await handleJoin(interaction);
          return;
        case "leave":
          await handleLeave(interaction);
          return;
        case "status":
          await handleStatus(interaction);
          return;
        case "reset":
          await handleReset(interaction);
          return;
        case "say":
          await handleSay(interaction);
          return;
        default:
          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
      }
    } catch (error) {
      logger.error("Interaction handler failed", error);
      // An interaction that was already deferred or answered must be edited, not replied to.
      if (interaction.deferred || interaction.replied) {
        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
        return;
      }
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  });

  process.on("SIGINT", () => {
    void shutdown(0);
  });

  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await client.login(config.DISCORD_BOT_TOKEN);
}

View File

@@ -1,237 +1,28 @@
import process from "node:process"; import process from "node:process";
import { import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
GatewayIntentBits, import { runDiscordBot } from "./discord-main.js";
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { loadConfig } from "./config.js";
import { Logger } from "./logger.js"; import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
const mode = process.argv[2] ?? "discord";
const config = loadConfig(); const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL); const logger = new Logger(config.LOG_LEVEL);
const commands = [
new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
new SlashCommandBuilder()
.setName("say")
.setDescription("텍스트를 바로 음성으로 읽습니다.")
.addStringOption((option) =>
option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
),
].map((command) => command.toJSON());
const client = new DiscordClient({
intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});
const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);
const sessions = new Map<string, GuildVoiceSession>();
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
const member = interaction.member as GuildMember | null;
return member?.voice.channel ?? null;
}
async function registerCommands(appClient: Client): Promise<void> {
const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
if (config.DISCORD_COMMAND_GUILD_ID) {
await rest.put(
Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
{
body: commands,
},
);
logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
return;
}
await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
body: commands,
});
logger.info("Registered global commands");
}
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
if (!interaction.guild) {
throw new Error("Guild interaction required");
}
const voiceChannel = getVoiceChannel(interaction);
if (!voiceChannel) {
throw new Error("먼저 음성 채널에 들어가 주세요.");
}
const existing = sessions.get(interaction.guild.id);
if (existing && existing.voiceChannelId === voiceChannel.id) {
existing.setTextChannel(interaction.channelId);
return existing;
}
if (existing) {
await existing.destroy();
sessions.delete(interaction.guild.id);
}
const session = await GuildVoiceSession.create({
client,
config,
logger,
guild: interaction.guild,
voiceChannel,
textChannelId: interaction.channelId,
stt,
tts,
llm,
});
sessions.set(interaction.guild.id, session);
return session;
}
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
await interaction.deferReply({ ephemeral: true });
try {
const session = await createSession(interaction);
await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`);
} catch (error) {
const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
await interaction.editReply(message);
}
}
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
if (!session) {
await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
return;
}
await session.destroy();
sessions.delete(interaction.guildId!);
await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
if (!session) {
await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
return;
}
await interaction.reply({
content: session.statusSummary(),
ephemeral: true,
});
}
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
if (!session) {
await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
return;
}
session.clearConversation();
await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
}
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
await interaction.deferReply({ ephemeral: true });
const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
if (!session) {
await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
return;
}
const text = interaction.options.getString("text", true).trim();
await session.speakText(text);
await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
async function shutdown(exitCode = 0): Promise<void> {
logger.info("Shutting down");
for (const session of sessions.values()) {
await session.destroy().catch((error) => {
logger.warn("Session shutdown failed", error);
});
}
sessions.clear();
await client.destroy();
process.exit(exitCode);
}
client.once("ready", async () => {
logger.info("Discord client ready", client.user?.tag ?? "unknown");
try {
await registerCommands(client);
} catch (error) {
logger.error("Command registration failed", error);
}
});
client.on("interactionCreate", async (interaction) => {
if (!interaction.isChatInputCommand()) {
return;
}
try {
switch (interaction.commandName) {
case "join":
await handleJoin(interaction);
return;
case "leave":
await handleLeave(interaction);
return;
case "status":
await handleStatus(interaction);
return;
case "reset":
await handleReset(interaction);
return;
case "say":
await handleSay(interaction);
return;
default:
await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
}
} catch (error) {
logger.error("Interaction handler failed", error);
if (interaction.deferred || interaction.replied) {
await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
return;
}
await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
}
});
process.on("SIGINT", () => {
void shutdown(0);
});
process.on("SIGTERM", () => {
void shutdown(0);
});
async function main(): Promise<void> { async function main(): Promise<void> {
await client.login(config.DISCORD_BOT_TOKEN); switch (mode) {
case "discord":
await runDiscordBot(requireDiscordRuntimeConfig(config), logger);
return;
case "local":
await runLocalAssistant(requireAssistantRuntimeConfig(config), logger);
return;
case "local-devices":
await printLocalAudioDevices();
return;
default:
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`);
}
} }
void main().catch((error) => { void main().catch((error) => {

75
src/local-main.ts Normal file
View File

@@ -0,0 +1,75 @@
import { spawn } from "node:child_process";
import process from "node:process";
import type { AssistantRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { LocalVoiceSession } from "./audio/local-voice-session.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";
/**
 * Prints the local audio device listing by running `wpctl status` and then
 * `wpctl status -n`, one after the other, with their output going straight to this
 * process's stdout and stderr.
 *
 * @throws Error when `wpctl` cannot be started or exits with a status other than 0; the
 *   second listing is not attempted if the first fails.
 */
export async function printLocalAudioDevices(): Promise<void> {
  const invocations = [
    { label: "wpctl status", args: ["status"] },
    { label: "wpctl status -n", args: ["status", "-n"] },
  ] as const;

  // Runs wpctl once and settles when the child exits or fails to start.
  const runWpctl = (args: readonly string[]): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      const child = spawn("wpctl", args, {
        stdio: ["ignore", "inherit", "inherit"],
      });

      child.on("exit", (code) => {
        if (code !== 0) {
          reject(new Error(`wpctl exited with code ${code ?? "null"}`));
          return;
        }
        resolve();
      });
      child.on("error", reject);
    });

  for (const { label, args } of invocations) {
    console.log(`\n=== ${label} ===`);
    await runWpctl(args);
  }
}
/**
 * Starts the local (non-Discord) voice assistant session.
 *
 * Builds the STT, TTS and LLM services from `config`, wires them into a
 * `LocalVoiceSession`, prints the session status and a startup banner, installs
 * SIGINT/SIGTERM handlers that tear the session down and exit the process, and
 * finally starts the session.
 *
 * @param config - Assistant runtime configuration shared by all services.
 * @param logger - Logger handed to the session and used for shutdown warnings.
 * @returns A promise that settles when `session.start()` settles.
 */
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  // Services are constructed in STT -> TTS -> LLM order, matching the
  // property order of the options object.
  const session = new LocalVoiceSession({
    config,
    logger,
    stt: new ElevenLabsSttService(config),
    tts: new ElevenLabsTtsService(config),
    llm: new OpenAiLlmService(config),
  });

  console.log(session.statusSummary());
  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
  if (config.DEBUG_TEXT_EVENTS) console.log("텍스트 로그 출력이 켜져 있습니다.");

  // Tears the session down (logging, not propagating, any failure) and then
  // exits the process with the requested code.
  async function terminate(exitCode = 0): Promise<void> {
    await session.destroy().catch((reason) => {
      logger.warn("Local session shutdown failed", reason);
    });
    process.exit(exitCode);
  }

  // Register the handlers before starting so a signal received while the
  // session is running still triggers a clean teardown.
  for (const signalName of ["SIGINT", "SIGTERM"] as const) {
    process.on(signalName, () => {
      void terminate(0);
    });
  }

  await session.start();
}

View File

@@ -1,6 +1,6 @@
import WebSocket from "ws"; import WebSocket from "ws";
import type { AppConfig } from "../config.js"; import type { AssistantRuntimeConfig } from "../config.js";
interface ElevenLabsMessage { interface ElevenLabsMessage {
message_type?: string; message_type?: string;
@@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([
]); ]);
export class ElevenLabsSttService { export class ElevenLabsSttService {
constructor(private readonly config: AppConfig) {} constructor(private readonly config: AssistantRuntimeConfig) {}
async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> { async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
if (pcm16MonoAudio.byteLength === 0) { if (pcm16MonoAudio.byteLength === 0) {

View File

@@ -2,24 +2,23 @@ import { Readable } from "node:stream";
import ffmpegStatic from "ffmpeg-static"; import ffmpegStatic from "ffmpeg-static";
import prism from "prism-media"; import prism from "prism-media";
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
import type { AppConfig } from "../config.js"; import type { AssistantRuntimeConfig } from "../config.js";
export interface PreparedSpeechPlayback { export interface PreparedSpeechAudio {
resource: AudioResource; stream: Readable;
dispose: () => void; dispose: () => void;
} }
export class ElevenLabsTtsService { export class ElevenLabsTtsService {
constructor(private readonly config: AppConfig) { constructor(private readonly config: AssistantRuntimeConfig) {
const resolvedFfmpegPath = ffmpegStatic as unknown as string | null; const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
process.env.FFMPEG_PATH = resolvedFfmpegPath; process.env.FFMPEG_PATH = resolvedFfmpegPath;
} }
} }
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> { async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`); const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
url.searchParams.set("output_format", "mp3_44100_128"); url.searchParams.set("output_format", "mp3_44100_128");
url.searchParams.set("enable_logging", "false"); url.searchParams.set("enable_logging", "false");
@@ -68,12 +67,8 @@ export class ElevenLabsTtsService {
input.pipe(ffmpeg); input.pipe(ffmpeg);
const resource = createAudioResource(ffmpeg, {
inputType: StreamType.Raw,
});
return { return {
resource, stream: ffmpeg,
dispose: () => { dispose: () => {
input.destroy(); input.destroy();
ffmpeg.destroy(); ffmpeg.destroy();

View File

@@ -1,6 +1,6 @@
import OpenAI from "openai"; import OpenAI from "openai";
import type { AppConfig } from "../config.js"; import type { AssistantRuntimeConfig } from "../config.js";
import type { ConversationMemory, UserUtterance } from "./conversation.js"; import type { ConversationMemory, UserUtterance } from "./conversation.js";
const ASSISTANT_INSTRUCTIONS = [ const ASSISTANT_INSTRUCTIONS = [
@@ -30,7 +30,7 @@ function normalizeReply(text: string): string {
export class OpenAiLlmService { export class OpenAiLlmService {
private readonly client: OpenAI; private readonly client: OpenAI;
constructor(private readonly config: AppConfig) { constructor(private readonly config: AssistantRuntimeConfig) {
this.client = new OpenAI({ this.client = new OpenAI({
apiKey: this.config.OPENAI_API_KEY, apiKey: this.config.OPENAI_API_KEY,
}); });