From cf6398f50a7d1736ead74639958b5f16b5033789 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Thu, 30 Apr 2026 02:37:54 +0900 Subject: [PATCH] feat: add local audio test mode --- .env.example | 3 + README.md | 41 +++- package.json | 5 +- src/audio/guild-voice-session.ts | 10 +- src/audio/local-voice-session.ts | 339 +++++++++++++++++++++++++++++++ src/config.ts | 47 ++++- src/discord-main.ts | 234 +++++++++++++++++++++ src/index.ts | 243 ++-------------------- src/local-main.ts | 75 +++++++ src/services/elevenlabs-stt.ts | 4 +- src/services/elevenlabs-tts.ts | 17 +- src/services/openai-llm.ts | 4 +- 12 files changed, 766 insertions(+), 256 deletions(-) create mode 100644 src/audio/local-voice-session.ts create mode 100644 src/discord-main.ts create mode 100644 src/local-main.ts diff --git a/.env.example b/.env.example index 81bfc03..257d0c1 100644 --- a/.env.example +++ b/.env.example @@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5 BOT_DEFAULT_LANGUAGE=ko MAX_CONVERSATION_TURNS=12 +LOCAL_AUDIO_SOURCE= +LOCAL_AUDIO_SINK= +LOCAL_SPEAKER_NAME=local-user DEBUG_TEXT_EVENTS=false LOG_LEVEL=info diff --git a/README.md b/README.md index b25544e..2ca0cc1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # realtime_voice_bot -디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다. +디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다. ## 현재 구현 범위 - Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say` +- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력 - `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신 - 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리 - Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지 @@ -28,16 +29,25 @@ 필수: -- `DISCORD_BOT_TOKEN` -- `DISCORD_APPLICATION_ID` - `OPENAI_API_KEY` - `ELEVENLABS_API_KEY` - `ELEVENLABS_VOICE_ID` +Discord 모드에서만 필수: + +- `DISCORD_BOT_TOKEN` +- `DISCORD_APPLICATION_ID` + 선택: - `DISCORD_COMMAND_GUILD_ID` - 테스트 서버에만 slash command를 즉시 반영하려면 설정 +- `LOCAL_AUDIO_SOURCE` + - `pw-record --target` 에 넣을 PipeWire source id 또는 node name +- `LOCAL_AUDIO_SINK` + - `pw-play --target` 에 넣을 PipeWire sink id 또는 node name +- `LOCAL_SPEAKER_NAME` + - 로컬 테스트에서 프롬프트에 넣을 화자 이름 - `OPENAI_MODEL` - 기본값: `gpt-5.4-mini` - `ELEVENLABS_STT_MODEL` @@ -51,13 +61,24 @@ ```bash bun install -bun run start ``` -개발 모드: +디스코드 모드: ```bash -bun run dev +bun run start:discord +``` + +로컬 장치 목록: + +```bash +bun run audio:devices +``` + +로컬 테스트 모드: + +```bash +bun run start:local ``` 타입 체크: @@ -74,9 +95,17 @@ bun run check 4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다. 5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다. +로컬 테스트: + +1. `bun run audio:devices` 로 source/sink id 또는 이름 확인 +2. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정 +3. `bun run start:local` +4. 마이크로 바로 말해서 응답 확인 + ## 설계 메모 - 입력은 유저별 병렬 처리 - 출력은 길드 세션당 단일 큐 +- 로컬 모드는 단일 화자 입력 기준 - 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함 - 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다. diff --git a/package.json b/package.json index 2e95195..723214c 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,10 @@ "type": "module", "scripts": { "dev": "bun --watch src/index.ts", - "start": "bun src/index.ts", + "start": "bun src/index.ts discord", + "start:discord": "bun src/index.ts discord", + "start:local": "bun src/index.ts local", + "audio:devices": "bun src/index.ts local-devices", "check": "tsc --noEmit", "build": "tsc -p tsconfig.json" }, diff --git a/src/audio/guild-voice-session.ts b/src/audio/guild-voice-session.ts index 5644d49..5437016 100644 --- a/src/audio/guild-voice-session.ts +++ b/src/audio/guild-voice-session.ts @@ -8,8 +8,10 @@ import { NoSubscriberBehavior, VoiceConnectionStatus, createAudioPlayer, + createAudioResource, entersState, joinVoiceChannel, + StreamType, type AudioPlayer, type AudioReceiveStream, type VoiceConnection, @@ -21,7 +23,7 @@ import { Logger } from "../logger.js"; import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js"; import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; -import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js"; +import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js"; import { OpenAiLlmService } from "../services/openai-llm.js"; interface GuildVoiceSessionOptions { @@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter { private draining = false; private currentAbortController: AbortController | null = null; - private currentPlayback: PreparedSpeechPlayback | null = null; + private currentPlayback: PreparedSpeechAudio | null = null; private textChannelId?: string; private constructor(private readonly options: GuildVoiceSessionOptions) { @@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter { } try { - const resource = this.currentPlayback.resource; + const resource = createAudioResource(this.currentPlayback.stream, { + inputType: StreamType.Raw, + }); this.player.play(resource); await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null); diff --git a/src/audio/local-voice-session.ts b/src/audio/local-voice-session.ts new file mode 100644 index 0000000..da0ce4a --- /dev/null +++ b/src/audio/local-voice-session.ts @@ -0,0 +1,339 @@ +import { spawn, type ChildProcessByStdio } from "node:child_process"; +import { once } from "node:events"; +import type { Readable, Writable } from "node:stream"; + +import { RealTimeVAD } from "avr-vad"; + +import type { AssistantRuntimeConfig } from "../config.js"; +import { Logger } from "../logger.js"; +import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js"; +import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; +import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; +import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js"; +import { OpenAiLlmService } from "../services/openai-llm.js"; + +interface LocalVoiceSessionOptions { + config: AssistantRuntimeConfig; + logger: Logger; + stt: ElevenLabsSttService; + tts: ElevenLabsTtsService; + llm: OpenAiLlmService; +} + +interface SpeechJob { + text: string; + source: "assistant" | "manual"; +} + +export class LocalVoiceSession { + private readonly memory: ConversationMemory; + private readonly queue: SpeechJob[] = []; + private readonly pendingSamples: number[] = []; + + private vad: RealTimeVAD | null = null; + private recorder: ChildProcessByStdio | null = null; + private currentPlayer: ChildProcessByStdio | null = null; + private currentAbortController: AbortController | null = null; + private currentPlayback: PreparedSpeechAudio | null = null; + private processing = Promise.resolve(); + private draining = false; + private destroyed = false; + + constructor(private readonly options: LocalVoiceSessionOptions) { + this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS); + } + + async start(): Promise { + this.vad = await RealTimeVAD.new({ + model: "v5", + sampleRate: 16000, + frameSamples: 1536, + positiveSpeechThreshold: 0.55, + negativeSpeechThreshold: 0.35, + redemptionFrames: 8, + preSpeechPadFrames: 2, + minSpeechFrames: 3, + onFrameProcessed: () => undefined, + onVADMisfire: () => undefined, + onSpeechStart: () => { + this.interruptPlayback("local-barge-in"); + }, + onSpeechRealStart: () => undefined, + onSpeechEnd: (audio: Float32Array) => { + void this.handleSpeechEnd(audio); + }, + }); + + this.recorder = this.spawnRecorder(); + this.recorder.stdout.on("data", (chunk: Buffer) => { + this.pushPcm16Chunk(chunk); + }); + this.recorder.stderr.on("data", (chunk: Buffer) => { + const text = chunk.toString().trim(); + if (text.length > 0) { + this.options.logger.debug("[pw-record]", text); + } + }); + this.recorder.on("exit", (code, signal) => { + if (!this.destroyed) { + this.options.logger.warn("pw-record exited unexpectedly", { code, signal }); + } + }); + } + + async destroy(): Promise { + this.destroyed = true; + this.interruptPlayback("local-shutdown"); + + if (this.recorder && !this.recorder.killed) { + this.recorder.kill("SIGTERM"); + await once(this.recorder, "exit").catch(() => null); + } + + if (this.vad) { + await this.vad.destroy().catch((error) => { + this.options.logger.warn("Local VAD destroy failed", error); + }); + this.vad = null; + } + } + + clearConversation(): void { + this.memory.clear(); + this.interruptPlayback("local-reset"); + } + + async speakText(text: string): Promise { + this.queue.push({ + text, + source: "manual", + }); + await this.drainQueue(); + } + + statusSummary(): string { + return [ + "모드: local", + `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`, + `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`, + `대기열: ${this.queue.length}`, + `최근 대화 턴: ${this.memory.recentTurns().length}`, + ].join("\n"); + } + + private spawnRecorder(): ChildProcessByStdio { + const args = [ + "--rate", + "16000", + "--channels", + "1", + "--format", + "s16", + "--raw", + ]; + + if (this.options.config.LOCAL_AUDIO_SOURCE) { + args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE); + } + + args.push("-"); + + this.options.logger.info("Starting local recorder", { + source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default", + }); + + return spawn("pw-record", args, { + stdio: ["ignore", "pipe", "pipe"], + }); + } + + private pushPcm16Chunk(chunk: Buffer): void { + if (this.destroyed || !this.vad) { + return; + } + + for (let offset = 0; offset + 1 < chunk.length; offset += 2) { + this.pendingSamples.push(chunk.readInt16LE(offset)); + } + + while (true) { + const frame = takeFrame(this.pendingSamples, 1536); + if (!frame) { + return; + } + + const floatFrame = int16ArrayToFloat32(frame); + this.processing = this.processing + .then(() => this.vad?.processAudio(floatFrame)) + .catch((error) => { + this.options.logger.warn("Local VAD processing failed", error); + }); + } + } + + private async handleSpeechEnd(audio: Float32Array): Promise { + if (audio.length < 16000 * 0.25) { + return; + } + + const utterance: UserUtterance = { + speakerId: "local-user", + speakerName: this.options.config.LOCAL_SPEAKER_NAME, + text: "", + }; + + let transcript: string | null = null; + try { + transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio)); + } catch (error) { + this.options.logger.warn("Local STT failed", error); + return; + } + + if (!transcript || transcript.trim().length === 0) { + return; + } + + utterance.text = transcript.trim(); + this.memory.addUserTurn(utterance); + this.options.logger.info("Local transcript", utterance.text); + if (this.options.config.DEBUG_TEXT_EVENTS) { + console.log(`\n[you] ${utterance.text}`); + } + + let reply: string; + try { + reply = await this.options.llm.generateReply(this.memory, utterance); + } catch (error) { + this.options.logger.warn("Local LLM failed", error); + reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요."; + } + + this.memory.addAssistantTurn(reply); + if (this.options.config.DEBUG_TEXT_EVENTS) { + console.log(`[bot] ${reply}\n`); + } + + this.queue.push({ + text: reply, + source: "assistant", + }); + await this.drainQueue(); + } + + private interruptPlayback(reason: string): void { + if (this.queue.length > 0 || this.currentPlayer) { + this.options.logger.info("Interrupting local playback", reason); + } + + this.queue.splice(0, this.queue.length); + this.currentAbortController?.abort(); + this.currentAbortController = null; + this.currentPlayback?.dispose(); + this.currentPlayback = null; + + if (this.currentPlayer && !this.currentPlayer.killed) { + this.currentPlayer.kill("SIGKILL"); + } + this.currentPlayer = null; + } + + private async drainQueue(): Promise { + if (this.draining || this.destroyed) { + return; + } + + this.draining = true; + + try { + while (this.queue.length > 0 && !this.destroyed) { + const job = this.queue.shift(); + if (!job) { + continue; + } + + const abortController = new AbortController(); + this.currentAbortController = abortController; + + try { + this.currentPlayback = await this.options.tts.preparePlayback(job.text, abortController.signal); + } catch (error) { + if (!abortController.signal.aborted) { + this.options.logger.warn("Local TTS synthesis failed", error); + } + continue; + } + + try { + await this.playToSink(this.currentPlayback, abortController.signal); + } catch (error) { + if (!abortController.signal.aborted) { + this.options.logger.warn("Local playback failed", error); + } + } finally { + this.currentPlayback?.dispose(); + this.currentPlayback = null; + if (this.currentAbortController === abortController) { + this.currentAbortController = null; + } + } + } + } finally { + this.draining = false; + } + } + + private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise { + const args = [ + "--rate", + "48000", + "--channels", + "2", + "--format", + "s16", + "--raw", + ]; + + if (this.options.config.LOCAL_AUDIO_SINK) { + args.push("--target", this.options.config.LOCAL_AUDIO_SINK); + } + + args.push("-"); + + const player = spawn("pw-play", args, { + stdio: ["pipe", "ignore", "pipe"], + }); + this.currentPlayer = player; + + player.stderr.on("data", (chunk: Buffer) => { + const text = chunk.toString().trim(); + if (text.length > 0) { + this.options.logger.debug("[pw-play]", text); + } + }); + + signal.addEventListener( + "abort", + () => { + playback.stream.destroy(); + if (!player.killed) { + player.kill("SIGKILL"); + } + }, + { once: true }, + ); + + playback.stream.pipe(player.stdin); + + const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null]; + this.currentPlayer = null; + + if (signal.aborted) { + return; + } + + if (code !== 0) { + throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`); + } + } +} diff --git a/src/config.ts b/src/config.ts index a122c90..8cf45a0 100644 --- a/src/config.ts +++ b/src/config.ts @@ -4,17 +4,20 @@ import { z } from "zod"; loadDotenv(); const envSchema = z.object({ - DISCORD_BOT_TOKEN: z.string().min(1), - DISCORD_APPLICATION_ID: z.string().min(1), + DISCORD_BOT_TOKEN: z.string().min(1).optional(), + DISCORD_APPLICATION_ID: z.string().min(1).optional(), DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(), - OPENAI_API_KEY: z.string().min(1), + OPENAI_API_KEY: z.string().min(1).optional(), OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"), - ELEVENLABS_API_KEY: z.string().min(1), - ELEVENLABS_VOICE_ID: z.string().min(1), + ELEVENLABS_API_KEY: z.string().min(1).optional(), + ELEVENLABS_VOICE_ID: z.string().min(1).optional(), ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"), ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12), + LOCAL_AUDIO_SOURCE: z.string().min(1).optional(), + LOCAL_AUDIO_SINK: z.string().min(1).optional(), + LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"), DEBUG_TEXT_EVENTS: z .string() .optional() @@ -23,7 +26,41 @@ const envSchema = z.object({ }); export type AppConfig = z.infer; +export type AssistantRuntimeConfig = AppConfig & { + OPENAI_API_KEY: string; + ELEVENLABS_API_KEY: string; + ELEVENLABS_VOICE_ID: string; +}; +export type DiscordRuntimeConfig = AssistantRuntimeConfig & { + DISCORD_BOT_TOKEN: string; + DISCORD_APPLICATION_ID: string; +}; export function loadConfig(): AppConfig { return envSchema.parse(process.env); } + +function requirePresent(value: string | undefined, name: string): string { + if (!value) { + throw new Error(`${name} 환경변수가 필요합니다.`); + } + return value; +} + +export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig { + return { + ...config, + OPENAI_API_KEY: requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY"), + ELEVENLABS_API_KEY: requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY"), + ELEVENLABS_VOICE_ID: requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID"), + }; +} + +export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig { + const assistant = requireAssistantRuntimeConfig(config); + return { + ...assistant, + DISCORD_BOT_TOKEN: requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN"), + DISCORD_APPLICATION_ID: requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID"), + }; +} diff --git a/src/discord-main.ts b/src/discord-main.ts new file mode 100644 index 0000000..cf8060c --- /dev/null +++ b/src/discord-main.ts @@ -0,0 +1,234 @@ +import process from "node:process"; + +import { + GatewayIntentBits, + REST, + Routes, + SlashCommandBuilder, + type ChatInputCommandInteraction, + type Client, + type GuildMember, + type VoiceBasedChannel, +} from "discord.js"; +import { Client as DiscordClient } from "discord.js"; + +import { GuildVoiceSession } from "./audio/guild-voice-session.js"; +import { type DiscordRuntimeConfig } from "./config.js"; +import { Logger } from "./logger.js"; +import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; +import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; +import { OpenAiLlmService } from "./services/openai-llm.js"; + +export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise { + const commands = [ + new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."), + new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."), + new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."), + new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."), + new SlashCommandBuilder() + .setName("say") + .setDescription("텍스트를 바로 음성으로 읽습니다.") + .addStringOption((option) => + option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400), + ), + ].map((command) => command.toJSON()); + + const client = new DiscordClient({ + intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates], + }); + + const stt = new ElevenLabsSttService(config); + const tts = new ElevenLabsTtsService(config); + const llm = new OpenAiLlmService(config); + const sessions = new Map(); + + function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null { + const member = interaction.member as GuildMember | null; + return member?.voice.channel ?? null; + } + + async function registerCommands(_appClient: Client): Promise { + const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN); + if (config.DISCORD_COMMAND_GUILD_ID) { + await rest.put( + Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID), + { + body: commands, + }, + ); + logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID); + return; + } + + await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), { + body: commands, + }); + logger.info("Registered global commands"); + } + + async function createSession(interaction: ChatInputCommandInteraction): Promise { + if (!interaction.guild) { + throw new Error("Guild interaction required"); + } + + const voiceChannel = getVoiceChannel(interaction); + if (!voiceChannel) { + throw new Error("먼저 음성 채널에 들어가 주세요."); + } + + const existing = sessions.get(interaction.guild.id); + if (existing && existing.voiceChannelId === voiceChannel.id) { + existing.setTextChannel(interaction.channelId); + return existing; + } + + if (existing) { + await existing.destroy(); + sessions.delete(interaction.guild.id); + } + + const session = await GuildVoiceSession.create({ + client, + config, + logger, + guild: interaction.guild, + voiceChannel, + textChannelId: interaction.channelId, + stt, + tts, + llm, + }); + sessions.set(interaction.guild.id, session); + return session; + } + + async function handleJoin(interaction: ChatInputCommandInteraction): Promise { + await interaction.deferReply({ ephemeral: true }); + + try { + const session = await createSession(interaction); + await interaction.editReply( + `음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`, + ); + } catch (error) { + const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다."; + await interaction.editReply(message); + } + } + + async function handleLeave(interaction: ChatInputCommandInteraction): Promise { + const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; + if (!session) { + await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); + return; + } + + await session.destroy(); + sessions.delete(interaction.guildId!); + await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true }); + } + + async function handleStatus(interaction: ChatInputCommandInteraction): Promise { + const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; + if (!session) { + await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); + return; + } + + await interaction.reply({ + content: session.statusSummary(), + ephemeral: true, + }); + } + + async function handleReset(interaction: ChatInputCommandInteraction): Promise { + const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; + if (!session) { + await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); + return; + } + + session.clearConversation(); + await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true }); + } + + async function handleSay(interaction: ChatInputCommandInteraction): Promise { + await interaction.deferReply({ ephemeral: true }); + + const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; + if (!session) { + await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요."); + return; + } + + const text = interaction.options.getString("text", true).trim(); + await session.speakText(text); + await interaction.editReply("읽기 요청을 대기열에 추가했습니다."); + } + + async function shutdown(exitCode = 0): Promise { + logger.info("Shutting down"); + for (const session of sessions.values()) { + await session.destroy().catch((error) => { + logger.warn("Session shutdown failed", error); + }); + } + sessions.clear(); + await client.destroy(); + process.exit(exitCode); + } + + client.once("ready", async () => { + logger.info("Discord client ready", client.user?.tag ?? "unknown"); + try { + await registerCommands(client); + } catch (error) { + logger.error("Command registration failed", error); + } + }); + + client.on("interactionCreate", async (interaction) => { + if (!interaction.isChatInputCommand()) { + return; + } + + try { + switch (interaction.commandName) { + case "join": + await handleJoin(interaction); + return; + case "leave": + await handleLeave(interaction); + return; + case "status": + await handleStatus(interaction); + return; + case "reset": + await handleReset(interaction); + return; + case "say": + await handleSay(interaction); + return; + default: + await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true }); + } + } catch (error) { + logger.error("Interaction handler failed", error); + if (interaction.deferred || interaction.replied) { + await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null); + return; + } + await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null); + } + }); + + process.on("SIGINT", () => { + void shutdown(0); + }); + + process.on("SIGTERM", () => { + void shutdown(0); + }); + + await client.login(config.DISCORD_BOT_TOKEN); +} diff --git a/src/index.ts b/src/index.ts index 59cd390..c931977 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,237 +1,28 @@ import process from "node:process"; -import { - GatewayIntentBits, - REST, - Routes, - SlashCommandBuilder, - type ChatInputCommandInteraction, - type Client, - type GuildMember, - type VoiceBasedChannel, -} from "discord.js"; -import { Client as DiscordClient } from "discord.js"; - -import { GuildVoiceSession } from "./audio/guild-voice-session.js"; -import { loadConfig } from "./config.js"; +import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js"; +import { runDiscordBot } from "./discord-main.js"; import { Logger } from "./logger.js"; -import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; -import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; -import { OpenAiLlmService } from "./services/openai-llm.js"; +import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js"; +const mode = process.argv[2] ?? "discord"; const config = loadConfig(); const logger = new Logger(config.LOG_LEVEL); -const commands = [ - new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."), - new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."), - new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."), - new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."), - new SlashCommandBuilder() - .setName("say") - .setDescription("텍스트를 바로 음성으로 읽습니다.") - .addStringOption((option) => - option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400), - ), -].map((command) => command.toJSON()); - -const client = new DiscordClient({ - intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates], -}); - -const stt = new ElevenLabsSttService(config); -const tts = new ElevenLabsTtsService(config); -const llm = new OpenAiLlmService(config); -const sessions = new Map(); - -function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null { - const member = interaction.member as GuildMember | null; - return member?.voice.channel ?? null; -} - -async function registerCommands(appClient: Client): Promise { - const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN); - if (config.DISCORD_COMMAND_GUILD_ID) { - await rest.put( - Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID), - { - body: commands, - }, - ); - logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID); - return; - } - - await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), { - body: commands, - }); - logger.info("Registered global commands"); -} - -async function createSession(interaction: ChatInputCommandInteraction): Promise { - if (!interaction.guild) { - throw new Error("Guild interaction required"); - } - - const voiceChannel = getVoiceChannel(interaction); - if (!voiceChannel) { - throw new Error("먼저 음성 채널에 들어가 주세요."); - } - - const existing = sessions.get(interaction.guild.id); - if (existing && existing.voiceChannelId === voiceChannel.id) { - existing.setTextChannel(interaction.channelId); - return existing; - } - - if (existing) { - await existing.destroy(); - sessions.delete(interaction.guild.id); - } - - const session = await GuildVoiceSession.create({ - client, - config, - logger, - guild: interaction.guild, - voiceChannel, - textChannelId: interaction.channelId, - stt, - tts, - llm, - }); - sessions.set(interaction.guild.id, session); - return session; -} - -async function handleJoin(interaction: ChatInputCommandInteraction): Promise { - await interaction.deferReply({ ephemeral: true }); - - try { - const session = await createSession(interaction); - await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`); - } catch (error) { - const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다."; - await interaction.editReply(message); - } -} - -async function handleLeave(interaction: ChatInputCommandInteraction): Promise { - const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; - if (!session) { - await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); - return; - } - - await session.destroy(); - sessions.delete(interaction.guildId!); - await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true }); -} - -async function handleStatus(interaction: ChatInputCommandInteraction): Promise { - const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; - if (!session) { - await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); - return; - } - - await interaction.reply({ - content: session.statusSummary(), - ephemeral: true, - }); -} - -async function handleReset(interaction: ChatInputCommandInteraction): Promise { - const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; - if (!session) { - await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true }); - return; - } - - session.clearConversation(); - await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true }); -} - -async function handleSay(interaction: ChatInputCommandInteraction): Promise { - await interaction.deferReply({ ephemeral: true }); - - const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined; - if (!session) { - await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요."); - return; - } - - const text = interaction.options.getString("text", true).trim(); - await session.speakText(text); - await interaction.editReply("읽기 요청을 대기열에 추가했습니다."); -} - -async function shutdown(exitCode = 0): Promise { - logger.info("Shutting down"); - for (const session of sessions.values()) { - await session.destroy().catch((error) => { - logger.warn("Session shutdown failed", error); - }); - } - sessions.clear(); - await client.destroy(); - process.exit(exitCode); -} - -client.once("ready", async () => { - logger.info("Discord client ready", client.user?.tag ?? "unknown"); - try { - await registerCommands(client); - } catch (error) { - logger.error("Command registration failed", error); - } -}); - -client.on("interactionCreate", async (interaction) => { - if (!interaction.isChatInputCommand()) { - return; - } - - try { - switch (interaction.commandName) { - case "join": - await handleJoin(interaction); - return; - case "leave": - await handleLeave(interaction); - return; - case "status": - await handleStatus(interaction); - return; - case "reset": - await handleReset(interaction); - return; - case "say": - await handleSay(interaction); - return; - default: - await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true }); - } - } catch (error) { - logger.error("Interaction handler failed", error); - if (interaction.deferred || interaction.replied) { - await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null); - return; - } - await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null); - } -}); - -process.on("SIGINT", () => { - void shutdown(0); -}); - -process.on("SIGTERM", () => { - void shutdown(0); -}); - async function main(): Promise { - await client.login(config.DISCORD_BOT_TOKEN); + switch (mode) { + case "discord": + await runDiscordBot(requireDiscordRuntimeConfig(config), logger); + return; + case "local": + await runLocalAssistant(requireAssistantRuntimeConfig(config), logger); + return; + case "local-devices": + await printLocalAudioDevices(); + return; + default: + throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`); + } } void main().catch((error) => { diff --git a/src/local-main.ts b/src/local-main.ts new file mode 100644 index 0000000..b4420c6 --- /dev/null +++ b/src/local-main.ts @@ -0,0 +1,75 @@ +import { spawn } from "node:child_process"; +import process from "node:process"; + +import type { AssistantRuntimeConfig } from "./config.js"; +import { Logger } from "./logger.js"; +import { LocalVoiceSession } from "./audio/local-voice-session.js"; +import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; +import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; +import { OpenAiLlmService } from "./services/openai-llm.js"; + +export async function printLocalAudioDevices(): Promise { + const runs = [ + { + label: "wpctl status", + args: ["status"], + }, + { + label: "wpctl status -n", + args: ["status", "-n"], + }, + ] as const; + + for (const run of runs) { + console.log(`\n=== ${run.label} ===`); + await new Promise((resolve, reject) => { + const child = spawn("wpctl", run.args, { + stdio: ["ignore", "inherit", "inherit"], + }); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + reject(new Error(`wpctl exited with code ${code ?? "null"}`)); + }); + child.on("error", reject); + }); + } +} + +export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise { + const stt = new ElevenLabsSttService(config); + const tts = new ElevenLabsTtsService(config); + const llm = new OpenAiLlmService(config); + const session = new LocalVoiceSession({ + config, + logger, + stt, + tts, + llm, + }); + + console.log(session.statusSummary()); + console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + if (config.DEBUG_TEXT_EVENTS) { + console.log("텍스트 로그 출력이 켜져 있습니다."); + } + + const shutdown = async (exitCode = 0) => { + await session.destroy().catch((error) => { + logger.warn("Local session shutdown failed", error); + }); + process.exit(exitCode); + }; + + process.on("SIGINT", () => { + void shutdown(0); + }); + + process.on("SIGTERM", () => { + void shutdown(0); + }); + + await session.start(); +} diff --git a/src/services/elevenlabs-stt.ts b/src/services/elevenlabs-stt.ts index 1c3719b..67b7979 100644 --- a/src/services/elevenlabs-stt.ts +++ b/src/services/elevenlabs-stt.ts @@ -1,6 +1,6 @@ import WebSocket from "ws"; -import type { AppConfig } from "../config.js"; +import type { AssistantRuntimeConfig } from "../config.js"; interface ElevenLabsMessage { message_type?: string; @@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([ ]); export class ElevenLabsSttService { - constructor(private readonly config: AppConfig) {} + constructor(private readonly config: AssistantRuntimeConfig) {} async transcribePcm16(pcm16MonoAudio: Buffer): Promise { if (pcm16MonoAudio.byteLength === 0) { diff --git a/src/services/elevenlabs-tts.ts b/src/services/elevenlabs-tts.ts index 24f83a3..d22bed4 100644 --- a/src/services/elevenlabs-tts.ts +++ b/src/services/elevenlabs-tts.ts @@ -2,24 +2,23 @@ import { Readable } from "node:stream"; import ffmpegStatic from "ffmpeg-static"; import prism from "prism-media"; -import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice"; -import type { AppConfig } from "../config.js"; +import type { AssistantRuntimeConfig } from "../config.js"; -export interface PreparedSpeechPlayback { - resource: AudioResource; +export interface PreparedSpeechAudio { + stream: Readable; dispose: () => void; } export class ElevenLabsTtsService { - constructor(private readonly config: AppConfig) { + constructor(private readonly config: AssistantRuntimeConfig) { const resolvedFfmpegPath = ffmpegStatic as unknown as string | null; if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { process.env.FFMPEG_PATH = resolvedFfmpegPath; } } - async preparePlayback(text: string, signal?: AbortSignal): Promise { + async preparePlayback(text: string, signal?: AbortSignal): Promise { const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`); url.searchParams.set("output_format", "mp3_44100_128"); url.searchParams.set("enable_logging", "false"); @@ -68,12 +67,8 @@ export class ElevenLabsTtsService { input.pipe(ffmpeg); - const resource = createAudioResource(ffmpeg, { - inputType: StreamType.Raw, - }); - return { - resource, + stream: ffmpeg, dispose: () => { input.destroy(); ffmpeg.destroy(); diff --git a/src/services/openai-llm.ts b/src/services/openai-llm.ts index c6b02e9..d866d5f 100644 --- a/src/services/openai-llm.ts +++ b/src/services/openai-llm.ts @@ -1,6 +1,6 @@ import OpenAI from "openai"; -import type { AppConfig } from "../config.js"; +import type { AssistantRuntimeConfig } from "../config.js"; import type { ConversationMemory, UserUtterance } from "./conversation.js"; const ASSISTANT_INSTRUCTIONS = [ @@ -30,7 +30,7 @@ function normalizeReply(text: string): string { export class OpenAiLlmService { private readonly client: OpenAI; - constructor(private readonly config: AppConfig) { + constructor(private readonly config: AssistantRuntimeConfig) { this.client = new OpenAI({ apiKey: this.config.OPENAI_API_KEY, });