feat: add local audio test mode

2026-04-30 02:37:54 +09:00
parent 9dee708b64
commit cf6398f50a
12 changed files with 766 additions and 256 deletions
--- a/.env.example
+++ b/.env.example
@@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
 BOT_DEFAULT_LANGUAGE=ko
 MAX_CONVERSATION_TURNS=12
 LOCAL_AUDIO_SOURCE=
 LOCAL_AUDIO_SINK=
 LOCAL_SPEAKER_NAME=local-user
 DEBUG_TEXT_EVENTS=false
 LOG_LEVEL=info
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # realtime_voice_bot
-디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
+디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
 ## 현재 구현 범위
 - Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
 - 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력
 - `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
 - 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
 - Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
@@ -28,16 +29,25 @@
 필수:
 - `DISCORD_BOT_TOKEN`
 - `DISCORD_APPLICATION_ID`
 - `OPENAI_API_KEY`
 - `ELEVENLABS_API_KEY`
 - `ELEVENLABS_VOICE_ID`
 Discord 모드에서만 필수:
 - `DISCORD_BOT_TOKEN`
 - `DISCORD_APPLICATION_ID`
 선택:
 - `DISCORD_COMMAND_GUILD_ID`
  - 테스트 서버에만 slash command를 즉시 반영하려면 설정
 - `LOCAL_AUDIO_SOURCE`
  - `pw-record --target` 에 넣을 PipeWire source id 또는 node name
 - `LOCAL_AUDIO_SINK`
  - `pw-play --target` 에 넣을 PipeWire sink id 또는 node name
 - `LOCAL_SPEAKER_NAME`
  - 로컬 테스트에서 프롬프트에 넣을 화자 이름
 - `OPENAI_MODEL`
  - 기본값: `gpt-5.4-mini`
 - `ELEVENLABS_STT_MODEL`
@@ -51,13 +61,24 @@
 ```bash
 bun install
 bun run start
 ```
-개발 모드:
+디스코드 모드:
 ```bash
-bun run dev
+bun run start:discord
 ```
 로컬 장치 목록:
 ```bash
 bun run audio:devices
 ```
 로컬 테스트 모드:
 ```bash
 bun run start:local
 ```
 타입 체크:
@@ -74,9 +95,17 @@ bun run check
 4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
 5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
 로컬 테스트:
 1. `bun run audio:devices` 로 source/sink id 또는 이름 확인
 2. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정
 3. `bun run start:local`
 4. 마이크로 바로 말해서 응답 확인
 ## 설계 메모
 - 입력은 유저별 병렬 처리
 - 출력은 길드 세션당 단일 큐
 - 로컬 모드는 단일 화자 입력 기준
 - 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
 - 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.
--- a/package.json
+++ b/package.json
@@ -5,7 +5,10 @@
  "type": "module",
  "scripts": {
    "dev": "bun --watch src/index.ts",
-    "start": "bun src/index.ts",
+    "start": "bun src/index.ts discord",
    "start:discord": "bun src/index.ts discord",
    "start:local": "bun src/index.ts local",
    "audio:devices": "bun src/index.ts local-devices",
    "check": "tsc --noEmit",
    "build": "tsc -p tsconfig.json"
  },
--- a/src/audio/guild-voice-session.ts
+++ b/src/audio/guild-voice-session.ts
@@ -8,8 +8,10 @@ import {
  NoSubscriberBehavior,
  VoiceConnectionStatus,
  createAudioPlayer,
  createAudioResource,
  entersState,
  joinVoiceChannel,
  StreamType,
  type AudioPlayer,
  type AudioReceiveStream,
  type VoiceConnection,
@@ -21,7 +23,7 @@ import { Logger } from "../logger.js";
 import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
 import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
 import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
-import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
+import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "../services/openai-llm.js";
 interface GuildVoiceSessionOptions {
@@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter {
  private draining = false;
  private currentAbortController: AbortController | null = null;
-  private currentPlayback: PreparedSpeechPlayback | null = null;
+  private currentPlayback: PreparedSpeechAudio | null = null;
  private textChannelId?: string;
  private constructor(private readonly options: GuildVoiceSessionOptions) {
@@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter {
        }
        try {
-          const resource = this.currentPlayback.resource;
+          const resource = createAudioResource(this.currentPlayback.stream, {
            inputType: StreamType.Raw,
          });
          this.player.play(resource);
          await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);
--- a/src/audio/local-voice-session.ts
+++ b/src/audio/local-voice-session.ts
@@ -0,0 +1,339 @@
 import { spawn, type ChildProcessByStdio } from "node:child_process";
 import { once } from "node:events";
 import type { Readable, Writable } from "node:stream";
 import { RealTimeVAD } from "avr-vad";
 import type { AssistantRuntimeConfig } from "../config.js";
 import { Logger } from "../logger.js";
 import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
 import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
 import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
 import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "../services/openai-llm.js";
 /** Dependencies injected into a {@link LocalVoiceSession}. */
 interface LocalVoiceSessionOptions {
  /** Runtime configuration; the session reads the LOCAL_* device settings, MAX_CONVERSATION_TURNS and DEBUG_TEXT_EVENTS from it. */
  config: AssistantRuntimeConfig;
  /** Logger used for recorder/player diagnostics and pipeline failures. */
  logger: Logger;
  /** Speech-to-text service; called with 16-bit PCM built from the VAD's speech segment. */
  stt: ElevenLabsSttService;
  /** Text-to-speech service; asked to prepare a playback stream for each queued job. */
  tts: ElevenLabsTtsService;
  /** LLM service that generates the assistant reply for a transcribed utterance. */
  llm: OpenAiLlmService;
 }
 /** One pending text-to-speech item in the session's playback queue. */
 interface SpeechJob {
  /** Text to synthesize and play. */
  text: string;
  /** Origin of the job: "assistant" for LLM replies, "manual" for `speakText` requests. */
  source: "assistant" | "manual";
 }
 /**
  * Single-speaker voice assistant session backed by local PipeWire tools.
  *
  * Audio is captured by a `pw-record` child process, segmented by a VAD,
  * transcribed, answered by the LLM, synthesized, and played through a
  * `pw-play` child process. Detected speech start interrupts any playback in
  * progress (barge-in).
  */
 export class LocalVoiceSession {
  /** Sample rate (Hz) requested from `pw-record` and configured on the VAD. */
  private static readonly INPUT_SAMPLE_RATE = 16000;
  /** Number of samples passed to the VAD per `processAudio` call. */
  private static readonly VAD_FRAME_SAMPLES = 1536;
  private readonly memory: ConversationMemory;
  private readonly queue: SpeechJob[] = [];
  private readonly pendingSamples: number[] = [];
  /** Trailing byte of an odd-length recorder chunk, kept until the next chunk arrives. */
  private carryByte: number | null = null;
  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  /** True once the recorder child reported `exit` or `error`; `destroy` must not wait on it then. */
  private recorderExited = false;
  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Promise chain that serializes VAD frame processing in arrival order. */
  private processing = Promise.resolve();
  private draining = false;
  private destroyed = false;
  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }
  /**
   * Creates the VAD and starts the `pw-record` capture process.
   *
   * If `destroy` was called while the VAD was still being created, the VAD is
   * released again and no recorder is spawned.
   */
  async start(): Promise<void> {
    const vad = await RealTimeVAD.new({
      model: "v5",
      sampleRate: LocalVoiceSession.INPUT_SAMPLE_RATE,
      frameSamples: LocalVoiceSession.VAD_FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        this.interruptPlayback("local-barge-in");
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        void this.handleSpeechEnd(audio);
      },
    });
    if (this.destroyed) {
      // Shutdown raced with startup: do not leave a recorder process behind.
      await vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
      return;
    }
    this.vad = vad;
    const recorder = this.spawnRecorder();
    this.recorder = recorder;
    this.recorderExited = false;
    recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });
    recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
    // Without this listener a failed spawn (e.g. pw-record not installed)
    // surfaces as an unhandled 'error' event on the child process.
    recorder.on("error", (error) => {
      this.recorderExited = true;
      this.options.logger.error("pw-record failed", error);
    });
    recorder.on("exit", (code, signal) => {
      this.recorderExited = true;
      if (!this.destroyed) {
        this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
      }
    });
  }
  /**
   * Stops playback, terminates the recorder and releases the VAD.
   *
   * The recorder's `exit` event is only awaited while the process is still
   * running; waiting on an already-exited child would never resolve.
   */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");
    if (this.recorder && !this.recorderExited) {
      // Subscribe before signalling so the exit event cannot be missed.
      const exited = once(this.recorder, "exit").catch(() => null);
      this.recorder.kill("SIGTERM");
      await exited;
    }
    if (this.vad) {
      await this.vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
      this.vad = null;
    }
  }
  /** Clears the conversation memory and drops queued/in-progress playback. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }
  /**
   * Queues `text` for synthesis and drains the playback queue.
   *
   * @param text Text to speak.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }
  /** Returns a multi-line, human-readable summary of the session state. */
  statusSummary(): string {
    return [
      "모드: local",
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }
  /** Spawns `pw-record` writing raw mono s16 audio to its stdout. */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const args = [
      "--rate",
      String(LocalVoiceSession.INPUT_SAMPLE_RATE),
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }
    args.push("-");
    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });
    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }
  /**
   * Decodes a recorder chunk into 16-bit samples and feeds complete frames to
   * the VAD in order.
   *
   * Pipe reads are not guaranteed to end on a sample boundary, so a trailing
   * odd byte is carried over to the next chunk instead of being dropped
   * (dropping it would shift every following sample by one byte).
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed || !this.vad) {
      return;
    }
    let data = chunk;
    if (this.carryByte !== null) {
      data = Buffer.concat([Buffer.from([this.carryByte]), chunk]);
      this.carryByte = null;
    }
    const evenLength = data.length - (data.length % 2);
    for (let offset = 0; offset < evenLength; offset += 2) {
      this.pendingSamples.push(data.readInt16LE(offset));
    }
    if (evenLength < data.length) {
      this.carryByte = data.readUInt8(evenLength);
    }
    while (true) {
      const frame = takeFrame(this.pendingSamples, LocalVoiceSession.VAD_FRAME_SAMPLES);
      if (!frame) {
        return;
      }
      const floatFrame = int16ArrayToFloat32(frame);
      // Chain onto the previous frame so frames are processed sequentially.
      this.processing = this.processing
        .then(() => this.vad?.processAudio(floatFrame))
        .catch((error) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }
  /**
   * Handles one speech segment reported by the VAD: transcribe, record the
   * user turn, generate a reply, record it, and queue it for playback.
   *
   * Segments shorter than 0.25 s of samples at the input rate are ignored.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    if (audio.length < LocalVoiceSession.INPUT_SAMPLE_RATE * 0.25) {
      return;
    }
    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text: "",
    };
    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }
    if (!transcript || transcript.trim().length === 0) {
      return;
    }
    utterance.text = transcript.trim();
    this.memory.addUserTurn(utterance);
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }
    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      // Speak a fallback so the user gets audible feedback about the failure.
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }
    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }
    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }
  /**
   * Empties the queue, aborts the in-flight job, disposes its audio and kills
   * the running `pw-play` process.
   *
   * @param reason Label written to the log when something was interrupted.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }
    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }
  /**
   * Plays queued jobs one at a time. Re-entrant calls return immediately; the
   * loop already running picks up jobs pushed in the meantime.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }
    this.draining = true;
    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }
        const abortController = new AbortController();
        this.currentAbortController = abortController;
        try {
          this.currentPlayback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          // An aborted synthesis is an expected outcome of barge-in, not a failure.
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local TTS synthesis failed", error);
          }
          continue;
        }
        try {
          await this.playToSink(this.currentPlayback, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local playback failed", error);
          }
        } finally {
          this.currentPlayback?.dispose();
          this.currentPlayback = null;
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }
  /**
   * Pipes prepared audio into a `pw-play` process (raw 48 kHz stereo s16) and
   * resolves when that process exits.
   *
   * @param playback Prepared audio whose `stream` is piped to the player's stdin.
   * @param signal Abort signal; aborting destroys the stream and kills the player.
   * @throws Error when the player exits with a non-zero code without an abort.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    // An 'abort' listener added to an already-aborted signal never fires, so
    // a job interrupted before this point must not start playing at all.
    if (signal.aborted) {
      return;
    }
    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }
    args.push("-");
    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;
    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-play]", text);
      }
    });
    // Killing the player while data is still being written raises EPIPE on
    // stdin; without a listener that becomes an unhandled 'error' event.
    player.stdin.on("error", (error) => {
      if (!signal.aborted) {
        this.options.logger.debug("[pw-play] stdin error", error);
      }
    });
    const onAbort = (): void => {
      playback.stream.destroy();
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };
    signal.addEventListener("abort", onAbort, { once: true });
    playback.stream.pipe(player.stdin);
    try {
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];
      if (signal.aborted) {
        return;
      }
      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      // Only clear the slot if it still refers to this player.
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }
 }
--- a/src/config.ts
+++ b/src/config.ts
@@ -4,17 +4,20 @@ import { z } from "zod";
 loadDotenv();
 const envSchema = z.object({
-  DISCORD_BOT_TOKEN: z.string().min(1),
+  DISCORD_BOT_TOKEN: z.string().min(1).optional(),
-  DISCORD_APPLICATION_ID: z.string().min(1),
+  DISCORD_APPLICATION_ID: z.string().min(1).optional(),
  DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
-  OPENAI_API_KEY: z.string().min(1),
+  OPENAI_API_KEY: z.string().min(1).optional(),
  OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
-  ELEVENLABS_API_KEY: z.string().min(1),
+  ELEVENLABS_API_KEY: z.string().min(1).optional(),
-  ELEVENLABS_VOICE_ID: z.string().min(1),
+  ELEVENLABS_VOICE_ID: z.string().min(1).optional(),
  ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
  ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
  BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
  MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
  LOCAL_AUDIO_SOURCE: z.string().min(1).optional(),
  LOCAL_AUDIO_SINK: z.string().min(1).optional(),
  LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
  DEBUG_TEXT_EVENTS: z
    .string()
    .optional()
@@ -23,7 +26,41 @@ const envSchema = z.object({
 });
 /** Parsed environment configuration, as inferred from `envSchema`. */
 export type AppConfig = z.infer<typeof envSchema>;
 /**
  * {@link AppConfig} with the OpenAI and ElevenLabs credentials typed as
  * required strings. Produced by `requireAssistantRuntimeConfig`.
  */
 export type AssistantRuntimeConfig = AppConfig & {
  OPENAI_API_KEY: string;
  ELEVENLABS_API_KEY: string;
  ELEVENLABS_VOICE_ID: string;
 };
 /**
  * {@link AssistantRuntimeConfig} with the Discord bot token and application id
  * typed as required strings. Produced by `requireDiscordRuntimeConfig`.
  */
 export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
  DISCORD_BOT_TOKEN: string;
  DISCORD_APPLICATION_ID: string;
 };
 /**
  * Validates `process.env` against `envSchema` and returns the parsed result.
  *
  * @returns The parsed configuration.
  * @throws Whatever `envSchema.parse` throws when validation fails.
  */
 export function loadConfig(): AppConfig {
  const parsed: AppConfig = envSchema.parse(process.env);
  return parsed;
 }
 /**
  * Returns `value` when it is a non-empty string, otherwise throws.
  *
  * @param value Candidate value (may be undefined or empty).
  * @param name Environment variable name used in the error message.
  * @returns The given value, unchanged.
  * @throws Error when `value` is undefined or an empty string.
  */
 function requirePresent(value: string | undefined, name: string): string {
  if (value) {
    return value;
  }
  throw new Error(`${name} 환경변수가 필요합니다.`);
 }
 /**
  * Checks that the credentials needed by the assistant pipeline are present.
  *
  * @param config Parsed application configuration.
  * @returns A copy of `config` with the three credentials typed as strings.
  * @throws Error naming the first missing variable, checked in the order
  *   OPENAI_API_KEY, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID.
  */
 export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
  const OPENAI_API_KEY = requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY");
  const ELEVENLABS_API_KEY = requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY");
  const ELEVENLABS_VOICE_ID = requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID");
  return { ...config, OPENAI_API_KEY, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID };
 }
 /**
  * Checks that both the assistant credentials and the Discord credentials are
  * present.
  *
  * @param config Parsed application configuration.
  * @returns The assistant runtime config extended with the Discord token and
  *   application id typed as strings.
  * @throws Error naming the first missing variable; assistant credentials are
  *   checked before the Discord ones.
  */
 export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
  const base = requireAssistantRuntimeConfig(config);
  const DISCORD_BOT_TOKEN = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
  const DISCORD_APPLICATION_ID = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");
  return { ...base, DISCORD_BOT_TOKEN, DISCORD_APPLICATION_ID };
 }
--- a/src/discord-main.ts
+++ b/src/discord-main.ts
@@ -0,0 +1,234 @@
 import process from "node:process";
 import {
  GatewayIntentBits,
  REST,
  Routes,
  SlashCommandBuilder,
  type ChatInputCommandInteraction,
  type Client,
  type GuildMember,
  type VoiceBasedChannel,
 } from "discord.js";
 import { Client as DiscordClient } from "discord.js";
 import { GuildVoiceSession } from "./audio/guild-voice-session.js";
 import { type DiscordRuntimeConfig } from "./config.js";
 import { Logger } from "./logger.js";
 import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
 import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "./services/openai-llm.js";
 /**
  * Runs the Discord voice bot: registers the slash commands once the client is
  * ready, dispatches `/join`, `/leave`, `/status`, `/reset` and `/say`, keeps
  * one voice session per guild, and shuts down on SIGINT/SIGTERM.
  *
  * @param config Runtime configuration including the Discord credentials.
  * @param logger Logger shared with the sessions created here.
  * @returns Resolves once `client.login` has completed.
  */
 export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
    new SlashCommandBuilder()
      .setName("say")
      .setDescription("텍스트를 바로 음성으로 읽습니다.")
      .addStringOption((option) =>
        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
      ),
  ].map((command) => command.toJSON());
  const client = new DiscordClient({
    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
  });
  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);
  // Active sessions keyed by guild id.
  const sessions = new Map<string, GuildVoiceSession>();
  // Set on the first shutdown request so repeated signals do not re-run it.
  let shuttingDown = false;
  /** Returns the voice channel the invoking member is currently in, if any. */
  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    const member = interaction.member as GuildMember | null;
    return member?.voice.channel ?? null;
  }
  /** Looks up the active session for the guild the interaction came from. */
  function findSession(interaction: ChatInputCommandInteraction): GuildVoiceSession | undefined {
    return interaction.guildId ? sessions.get(interaction.guildId) : undefined;
  }
  /**
   * Uploads the command definitions: to a single guild when
   * DISCORD_COMMAND_GUILD_ID is set, otherwise as global commands.
   */
  async function registerCommands(_appClient: Client): Promise<void> {
    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
    if (config.DISCORD_COMMAND_GUILD_ID) {
      await rest.put(
        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
        {
          body: commands,
        },
      );
      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
      return;
    }
    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
      body: commands,
    });
    logger.info("Registered global commands");
  }
  /**
   * Returns the guild's session for the caller's voice channel, reusing an
   * existing one for the same channel and replacing one bound to another.
   *
   * @throws Error when used outside a guild or when the caller is not in a
   *   voice channel.
   */
  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
    if (!interaction.guild) {
      throw new Error("Guild interaction required");
    }
    const voiceChannel = getVoiceChannel(interaction);
    if (!voiceChannel) {
      throw new Error("먼저 음성 채널에 들어가 주세요.");
    }
    const existing = sessions.get(interaction.guild.id);
    if (existing && existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }
    if (existing) {
      await existing.destroy();
      sessions.delete(interaction.guild.id);
    }
    const session = await GuildVoiceSession.create({
      client,
      config,
      logger,
      guild: interaction.guild,
      voiceChannel,
      textChannelId: interaction.channelId,
      stt,
      tts,
      llm,
    });
    sessions.set(interaction.guild.id, session);
    return session;
  }
  /** `/join`: starts (or reuses) a session and reports the channel. */
  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });
    try {
      const session = await createSession(interaction);
      // The channel label is taken from the second line of the status summary.
      const channelLabel = session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음";
      await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelLabel}`);
    } catch (error) {
      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
      await interaction.editReply(message);
    }
  }
  /** `/leave`: destroys the guild's session and forgets it. */
  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
    const guildId = interaction.guildId;
    const session = findSession(interaction);
    if (!session || !guildId) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    try {
      await session.destroy();
    } finally {
      // Drop the entry even if teardown failed so a broken session is not reused.
      sessions.delete(guildId);
    }
    await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
  }
  /** `/status`: replies with the session's status summary. */
  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    await interaction.reply({
      content: session.statusSummary(),
      ephemeral: true,
    });
  }
  /** `/reset`: clears the session's conversation state. */
  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
  }
  /** `/say`: hands the given text to the session for speaking. */
  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });
    const session = findSession(interaction);
    if (!session) {
      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
      return;
    }
    const text = interaction.options.getString("text", true).trim();
    if (text.length === 0) {
      // Whitespace-only input would otherwise be sent to speech synthesis.
      await interaction.editReply("읽을 문장을 입력해 주세요.");
      return;
    }
    await session.speakText(text);
    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  }
  /** Destroys all sessions and the client, then exits the process. Runs at most once. */
  async function shutdown(exitCode = 0): Promise<void> {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    logger.info("Shutting down");
    for (const session of sessions.values()) {
      await session.destroy().catch((error) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();
    try {
      await client.destroy();
    } catch (error) {
      // A failing client teardown must not keep the process alive.
      logger.warn("Client shutdown failed", error);
    }
    process.exit(exitCode);
  }
  client.once("ready", async () => {
    logger.info("Discord client ready", client.user?.tag ?? "unknown");
    try {
      await registerCommands(client);
    } catch (error) {
      logger.error("Command registration failed", error);
    }
  });
  client.on("interactionCreate", async (interaction) => {
    if (!interaction.isChatInputCommand()) {
      return;
    }
    try {
      switch (interaction.commandName) {
        case "join":
          await handleJoin(interaction);
          return;
        case "leave":
          await handleLeave(interaction);
          return;
        case "status":
          await handleStatus(interaction);
          return;
        case "reset":
          await handleReset(interaction);
          return;
        case "say":
          await handleSay(interaction);
          return;
        default:
          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
      }
    } catch (error) {
      logger.error("Interaction handler failed", error);
      // An interaction that was already acknowledged can only be edited.
      if (interaction.deferred || interaction.replied) {
        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
        return;
      }
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  });
  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });
  await client.login(config.DISCORD_BOT_TOKEN);
 }
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,237 +1,28 @@
 import process from "node:process";
-import {
+import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
-  GatewayIntentBits,
+import { runDiscordBot } from "./discord-main.js";
  REST,
  Routes,
  SlashCommandBuilder,
  type ChatInputCommandInteraction,
  type Client,
  type GuildMember,
  type VoiceBasedChannel,
 } from "discord.js";
 import { Client as DiscordClient } from "discord.js";
 import { GuildVoiceSession } from "./audio/guild-voice-session.js";
 import { loadConfig } from "./config.js";
 import { Logger } from "./logger.js";
-import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
+import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js";
 import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "./services/openai-llm.js";
 const mode = process.argv[2] ?? "discord";
 const config = loadConfig();
 const logger = new Logger(config.LOG_LEVEL);
 const commands = [
  new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
  new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
  new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
  new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
  new SlashCommandBuilder()
    .setName("say")
    .setDescription("텍스트를 바로 음성으로 읽습니다.")
    .addStringOption((option) =>
      option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
    ),
 ].map((command) => command.toJSON());
 const client = new DiscordClient({
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
 });
 const stt = new ElevenLabsSttService(config);
 const tts = new ElevenLabsTtsService(config);
 const llm = new OpenAiLlmService(config);
 const sessions = new Map<string, GuildVoiceSession>();
 function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  const member = interaction.member as GuildMember | null;
  return member?.voice.channel ?? null;
 }
 async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  if (config.DISCORD_COMMAND_GUILD_ID) {
    await rest.put(
      Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
      {
        body: commands,
      },
    );
    logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
    return;
  }
  await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
    body: commands,
  });
  logger.info("Registered global commands");
 }
 async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  if (!interaction.guild) {
    throw new Error("Guild interaction required");
  }
  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }
  const existing = sessions.get(interaction.guild.id);
  if (existing && existing.voiceChannelId === voiceChannel.id) {
    existing.setTextChannel(interaction.channelId);
    return existing;
  }
  if (existing) {
    await existing.destroy();
    sessions.delete(interaction.guild.id);
  }
  const session = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild: interaction.guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });
  sessions.set(interaction.guild.id, session);
  return session;
 }
 async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });
  try {
    const session = await createSession(interaction);
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`);
  } catch (error) {
    const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
    await interaction.editReply(message);
  }
 }
 async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }
  await session.destroy();
  sessions.delete(interaction.guildId!);
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
 }
 async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }
  await interaction.reply({
    content: session.statusSummary(),
    ephemeral: true,
  });
 }
 async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }
  session.clearConversation();
  await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
 }
 async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });
  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }
  const text = interaction.options.getString("text", true).trim();
  await session.speakText(text);
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
 }
 async function shutdown(exitCode = 0): Promise<void> {
  logger.info("Shutting down");
  for (const session of sessions.values()) {
    await session.destroy().catch((error) => {
      logger.warn("Session shutdown failed", error);
    });
  }
  sessions.clear();
  await client.destroy();
  process.exit(exitCode);
 }
 client.once("ready", async () => {
  logger.info("Discord client ready", client.user?.tag ?? "unknown");
  try {
    await registerCommands(client);
  } catch (error) {
    logger.error("Command registration failed", error);
  }
 });
 client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }
  try {
    switch (interaction.commandName) {
      case "join":
        await handleJoin(interaction);
        return;
      case "leave":
        await handleLeave(interaction);
        return;
      case "status":
        await handleStatus(interaction);
        return;
      case "reset":
        await handleReset(interaction);
        return;
      case "say":
        await handleSay(interaction);
        return;
      default:
        await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
    }
  } catch (error) {
    logger.error("Interaction handler failed", error);
    if (interaction.deferred || interaction.replied) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
      return;
    }
    await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
  }
 });
 process.on("SIGINT", () => {
  void shutdown(0);
 });
 process.on("SIGTERM", () => {
  void shutdown(0);
 });
 async function main(): Promise<void> {
-  await client.login(config.DISCORD_BOT_TOKEN);
+  switch (mode) {
    case "discord":
      await runDiscordBot(requireDiscordRuntimeConfig(config), logger);
      return;
    case "local":
      await runLocalAssistant(requireAssistantRuntimeConfig(config), logger);
      return;
    case "local-devices":
      await printLocalAudioDevices();
      return;
    default:
      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`);
  }
 }
 void main().catch((error) => {
--- a/src/local-main.ts
+++ b/src/local-main.ts
@@ -0,0 +1,75 @@
 import { spawn } from "node:child_process";
 import process from "node:process";
 import type { AssistantRuntimeConfig } from "./config.js";
 import { Logger } from "./logger.js";
 import { LocalVoiceSession } from "./audio/local-voice-session.js";
 import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
 import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "./services/openai-llm.js";
 /**
  * Prints the local audio device listing by running `wpctl status` and then
  * `wpctl status -n`, each under its own banner line.
  *
  * The child's stdout/stderr are inherited, so the output goes straight to
  * this process's terminal. The two commands run one after the other so
  * their output does not interleave.
  *
  * @returns A promise that resolves once both commands have exited with code 0.
  * @throws Error if `wpctl` cannot be spawned, exits with a non-zero code, or
  *   is terminated by a signal.
  */
 export async function printLocalAudioDevices(): Promise<void> {
  const invocations = [
    {
      label: "wpctl status",
      args: ["status"],
    },
    {
      label: "wpctl status -n",
      args: ["status", "-n"],
    },
  ] as const;
  for (const { label, args } of invocations) {
    console.log(`\n=== ${label} ===`);
    await new Promise<void>((resolve, reject) => {
      const child = spawn("wpctl", args, {
        stdio: ["ignore", "inherit", "inherit"],
      });
      // Spawn failures (for example a missing binary) arrive on "error".
      child.on("error", reject);
      child.on("exit", (code, signal) => {
        if (code === 0) {
          resolve();
          return;
        }
        // A child killed by a signal exits with code null; name the signal
        // instead of reporting the unhelpful "code null".
        if (code === null && signal !== null) {
          reject(new Error(`wpctl was terminated by signal ${signal}`));
          return;
        }
        reject(new Error(`wpctl exited with code ${code ?? "null"}`));
      });
    });
  }
 }
 /**
  * Runs the assistant against a local voice session.
  *
  * Builds the STT, TTS and LLM services from `config`, wires them into a
  * LocalVoiceSession, prints the session status summary and a start banner,
  * installs SIGINT/SIGTERM hooks that destroy the session and exit the
  * process, and finally starts the session.
  *
  * @param config Runtime configuration passed to every service and the session.
  * @param logger Logger handed to the session and used for shutdown warnings.
  * @returns A promise that settles when `session.start()` settles.
  */
 export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  // Services are constructed in STT, TTS, LLM order, before the session.
  const session = new LocalVoiceSession({
    config,
    logger,
    stt: new ElevenLabsSttService(config),
    tts: new ElevenLabsTtsService(config),
    llm: new OpenAiLlmService(config),
  });
  console.log(session.statusSummary());
  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
  if (config.DEBUG_TEXT_EVENTS) {
    console.log("텍스트 로그 출력이 켜져 있습니다.");
  }
  // Tears the session down and then exits; a failed teardown is only logged.
  const stopAndExit = async (exitCode = 0) => {
    const teardown = session.destroy();
    await teardown.catch((teardownError) => {
      logger.warn("Local session shutdown failed", teardownError);
    });
    process.exit(exitCode);
  };
  for (const signal of ["SIGINT", "SIGTERM"] as const) {
    process.on(signal, () => {
      void stopAndExit(0);
    });
  }
  await session.start();
 }
--- a/src/services/elevenlabs-stt.ts
+++ b/src/services/elevenlabs-stt.ts
@@ -1,6 +1,6 @@
 import WebSocket from "ws";
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
 interface ElevenLabsMessage {
  message_type?: string;
@@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([
 ]);
 export class ElevenLabsSttService {
-  constructor(private readonly config: AppConfig) {}
+  constructor(private readonly config: AssistantRuntimeConfig) {}
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
--- a/src/services/elevenlabs-tts.ts
+++ b/src/services/elevenlabs-tts.ts
@@ -2,24 +2,23 @@ import { Readable } from "node:stream";
 import ffmpegStatic from "ffmpeg-static";
 import prism from "prism-media";
 import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
-export interface PreparedSpeechPlayback {
+export interface PreparedSpeechAudio {
  // Result type of ElevenLabsTtsService.preparePlayback.
-  resource: AudioResource;
  // Readable audio stream; preparePlayback fills this with its ffmpeg stream.
+  stream: Readable;
  // Releases the underlying streams; the preparePlayback implementation
  // destroys both its input stream and the ffmpeg stream here.
  dispose: () => void;
 }
 export class ElevenLabsTtsService {
-  constructor(private readonly config: AppConfig) {
+  constructor(private readonly config: AssistantRuntimeConfig) {
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }
-  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
+  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");
@@ -68,12 +67,8 @@ export class ElevenLabsTtsService {
    input.pipe(ffmpeg);
    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });
    return {
-      resource,
+      stream: ffmpeg,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
--- a/src/services/openai-llm.ts
+++ b/src/services/openai-llm.ts
@@ -1,6 +1,6 @@
 import OpenAI from "openai";
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
 import type { ConversationMemory, UserUtterance } from "./conversation.js";
 const ASSISTANT_INSTRUCTIONS = [
@@ -30,7 +30,7 @@ function normalizeReply(text: string): string {
 export class OpenAiLlmService {
  private readonly client: OpenAI;
-  constructor(private readonly config: AppConfig) {
+  constructor(private readonly config: AssistantRuntimeConfig) {
    this.client = new OpenAI({
      apiKey: this.config.OPENAI_API_KEY,
    });