From cf6398f50a7d1736ead74639958b5f16b5033789 Mon Sep 17 00:00:00 2001
From: claude-bot <claude-bot@tkrmagid.kr>
Date: Thu, 30 Apr 2026 02:37:54 +0900
Subject: [PATCH] feat: add local audio test mode

---
 .env.example                     |   3 +
 README.md                        |  41 +++-
 package.json                     |   5 +-
 src/audio/guild-voice-session.ts |  10 +-
 src/audio/local-voice-session.ts | 339 +++++++++++++++++++++++++++++++
 src/config.ts                    |  47 ++++-
 src/discord-main.ts              | 234 +++++++++++++++++++++
 src/index.ts                     | 243 ++--------------------
 src/local-main.ts                |  75 +++++++
 src/services/elevenlabs-stt.ts   |   4 +-
 src/services/elevenlabs-tts.ts   |  17 +-
 src/services/openai-llm.ts       |   4 +-
 12 files changed, 766 insertions(+), 256 deletions(-)
 create mode 100644 src/audio/local-voice-session.ts
 create mode 100644 src/discord-main.ts
 create mode 100644 src/local-main.ts

diff --git a/.env.example b/.env.example
index 81bfc03..257d0c1 100644
--- a/.env.example
+++ b/.env.example
@@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
 
 BOT_DEFAULT_LANGUAGE=ko
 MAX_CONVERSATION_TURNS=12
+# LOCAL_AUDIO_SOURCE=
+# LOCAL_AUDIO_SINK=
+LOCAL_SPEAKER_NAME=local-user
 DEBUG_TEXT_EVENTS=false
 LOG_LEVEL=info
diff --git a/README.md b/README.md
index b25544e..2ca0cc1 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # realtime_voice_bot
 
-디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
+디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
 
 ## 현재 구현 범위
 
 - Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
+- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력
 - `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
 - 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
 - Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
@@ -28,16 +29,25 @@
 
 필수:
 
-- `DISCORD_BOT_TOKEN`
-- `DISCORD_APPLICATION_ID`
 - `OPENAI_API_KEY`
 - `ELEVENLABS_API_KEY`
 - `ELEVENLABS_VOICE_ID`
 
+Discord 모드에서만 필수:
+
+- `DISCORD_BOT_TOKEN`
+- `DISCORD_APPLICATION_ID`
+
 선택:
 
 - `DISCORD_COMMAND_GUILD_ID`
   - 테스트 서버에만 slash command를 즉시 반영하려면 설정
+- `LOCAL_AUDIO_SOURCE`
+  - `pw-record --target` 에 넣을 PipeWire source id 또는 node name
+- `LOCAL_AUDIO_SINK`
+  - `pw-play --target` 에 넣을 PipeWire sink id 또는 node name
+- `LOCAL_SPEAKER_NAME`
+  - 로컬 테스트에서 프롬프트에 넣을 화자 이름
 - `OPENAI_MODEL`
   - 기본값: `gpt-5.4-mini`
 - `ELEVENLABS_STT_MODEL`
@@ -51,13 +61,24 @@
 
 ```bash
 bun install
-bun run start
 ```
 
-개발 모드:
+디스코드 모드:
 
 ```bash
-bun run dev
+bun run start:discord
+```
+
+로컬 장치 목록:
+
+```bash
+bun run audio:devices
+```
+
+로컬 테스트 모드:
+
+```bash
+bun run start:local
 ```
 
 타입 체크:
@@ -74,9 +95,17 @@ bun run check
 4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
 5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
 
+로컬 테스트:
+
+1. `bun run audio:devices` 로 source/sink id 또는 이름 확인
+2. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정
+3. `bun run start:local`
+4. 마이크로 바로 말해서 응답 확인
+
 ## 설계 메모
 
 - 입력은 유저별 병렬 처리
 - 출력은 길드 세션당 단일 큐
+- 로컬 모드는 단일 화자 입력 기준
 - 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
 - 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.
diff --git a/package.json b/package.json
index 2e95195..723214c 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,10 @@
   "type": "module",
   "scripts": {
     "dev": "bun --watch src/index.ts",
-    "start": "bun src/index.ts",
+    "start": "bun src/index.ts discord",
+    "start:discord": "bun src/index.ts discord",
+    "start:local": "bun src/index.ts local",
+    "audio:devices": "bun src/index.ts local-devices",
     "check": "tsc --noEmit",
     "build": "tsc -p tsconfig.json"
   },
diff --git a/src/audio/guild-voice-session.ts b/src/audio/guild-voice-session.ts
index 5644d49..5437016 100644
--- a/src/audio/guild-voice-session.ts
+++ b/src/audio/guild-voice-session.ts
@@ -8,8 +8,10 @@ import {
   NoSubscriberBehavior,
   VoiceConnectionStatus,
   createAudioPlayer,
+  createAudioResource,
   entersState,
   joinVoiceChannel,
+  StreamType,
   type AudioPlayer,
   type AudioReceiveStream,
   type VoiceConnection,
@@ -21,7 +23,7 @@ import { Logger } from "../logger.js";
 import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
 import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
 import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
-import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
+import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
 import { OpenAiLlmService } from "../services/openai-llm.js";
 
 interface GuildVoiceSessionOptions {
@@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter {
 
   private draining = false;
   private currentAbortController: AbortController | null = null;
-  private currentPlayback: PreparedSpeechPlayback | null = null;
+  private currentPlayback: PreparedSpeechAudio | null = null;
   private textChannelId?: string;
 
   private constructor(private readonly options: GuildVoiceSessionOptions) {
@@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter {
         }
 
         try {
-          const resource = this.currentPlayback.resource;
+          const resource = createAudioResource(this.currentPlayback.stream, {
+            inputType: StreamType.Raw,
+          });
           this.player.play(resource);
 
           await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);
diff --git a/src/audio/local-voice-session.ts b/src/audio/local-voice-session.ts
new file mode 100644
index 0000000..da0ce4a
--- /dev/null
+++ b/src/audio/local-voice-session.ts
@@ -0,0 +1,339 @@
+import { spawn, type ChildProcessByStdio } from "node:child_process";
+import { once } from "node:events";
+import type { Readable, Writable } from "node:stream";
+
+import { RealTimeVAD } from "avr-vad";
+
+import type { AssistantRuntimeConfig } from "../config.js";
+import { Logger } from "../logger.js";
+import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
+import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
+import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
+import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
+import { OpenAiLlmService } from "../services/openai-llm.js";
+
+interface LocalVoiceSessionOptions {
+  config: AssistantRuntimeConfig;
+  logger: Logger;
+  stt: ElevenLabsSttService;
+  tts: ElevenLabsTtsService;
+  llm: OpenAiLlmService;
+}
+
+interface SpeechJob {
+  text: string;
+  source: "assistant" | "manual";
+}
+
+export class LocalVoiceSession {
+  private readonly memory: ConversationMemory;
+  private readonly queue: SpeechJob[] = [];
+  private readonly pendingSamples: number[] = [];
+
+  private vad: RealTimeVAD | null = null;
+  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
+  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
+  private currentAbortController: AbortController | null = null;
+  private currentPlayback: PreparedSpeechAudio | null = null;
+  private processing = Promise.resolve();
+  private draining = false;
+  private destroyed = false;
+
+  constructor(private readonly options: LocalVoiceSessionOptions) {
+    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
+  }
+
+  async start(): Promise<void> {
+    this.vad = await RealTimeVAD.new({
+      model: "v5",
+      sampleRate: 16000,
+      frameSamples: 1536,
+      positiveSpeechThreshold: 0.55,
+      negativeSpeechThreshold: 0.35,
+      redemptionFrames: 8,
+      preSpeechPadFrames: 2,
+      minSpeechFrames: 3,
+      onFrameProcessed: () => undefined,
+      onVADMisfire: () => undefined,
+      onSpeechStart: () => {
+        this.interruptPlayback("local-barge-in");
+      },
+      onSpeechRealStart: () => undefined,
+      onSpeechEnd: (audio: Float32Array) => {
+        void this.handleSpeechEnd(audio);
+      },
+    });
+
+    this.recorder = this.spawnRecorder();
+    this.recorder.stdout.on("data", (chunk: Buffer) => {
+      this.pushPcm16Chunk(chunk);
+    });
+    this.recorder.stderr.on("data", (chunk: Buffer) => {
+      const text = chunk.toString().trim();
+      if (text.length > 0) {
+        this.options.logger.debug("[pw-record]", text);
+      }
+    });
+    this.recorder.on("exit", (code, signal) => {
+      if (!this.destroyed) {
+        this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
+      }
+    });
+  }
+
+  /** Stops playback, the recorder and the VAD. Does not hang when pw-record has already exited. */
+  async destroy(): Promise<void> {
+    this.destroyed = true;
+    this.interruptPlayback("local-shutdown");
+
+    // `killed` only records that a signal was sent, so test the exit state before waiting for "exit".
+    const recorder = this.recorder;
+    if (recorder && recorder.exitCode === null && recorder.signalCode === null) {
+      const exited = once(recorder, "exit").catch(() => null);
+      if (recorder.kill("SIGTERM")) await exited;
+    }
+
+    const vad = this.vad;
+    this.vad = null;
+    await vad?.destroy().catch((error) => this.options.logger.warn("Local VAD destroy failed", error));
+  }
+
+  clearConversation(): void {
+    this.memory.clear();
+    this.interruptPlayback("local-reset");
+  }
+
+  async speakText(text: string): Promise<void> {
+    this.queue.push({
+      text,
+      source: "manual",
+    });
+    await this.drainQueue();
+  }
+
+  statusSummary(): string {
+    return [
+      "모드: local",
+      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
+      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
+      `대기열: ${this.queue.length}`,
+      `최근 대화 턴: ${this.memory.recentTurns().length}`,
+    ].join("\n");
+  }
+
+  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
+    const args = [
+      "--rate",
+      "16000",
+      "--channels",
+      "1",
+      "--format",
+      "s16",
+      "--raw",
+    ];
+
+    if (this.options.config.LOCAL_AUDIO_SOURCE) {
+      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
+    }
+
+    args.push("-");
+
+    this.options.logger.info("Starting local recorder", {
+      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
+    });
+
+    return spawn("pw-record", args, {
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+  }
+
+  private pushPcm16Chunk(chunk: Buffer): void {
+    if (this.destroyed || !this.vad) {
+      return;
+    }
+
+    for (let offset = 0; offset + 1 < chunk.length; offset += 2) {
+      this.pendingSamples.push(chunk.readInt16LE(offset));
+    }
+
+    while (true) {
+      const frame = takeFrame(this.pendingSamples, 1536);
+      if (!frame) {
+        return;
+      }
+
+      const floatFrame = int16ArrayToFloat32(frame);
+      this.processing = this.processing
+        .then(() => this.vad?.processAudio(floatFrame))
+        .catch((error) => {
+          this.options.logger.warn("Local VAD processing failed", error);
+        });
+    }
+  }
+
+  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
+    if (audio.length < 16000 * 0.25) {
+      return;
+    }
+
+    const utterance: UserUtterance = {
+      speakerId: "local-user",
+      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
+      text: "",
+    };
+
+    let transcript: string | null = null;
+    try {
+      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
+    } catch (error) {
+      this.options.logger.warn("Local STT failed", error);
+      return;
+    }
+
+    if (!transcript || transcript.trim().length === 0) {
+      return;
+    }
+
+    utterance.text = transcript.trim();
+    this.memory.addUserTurn(utterance);
+    this.options.logger.info("Local transcript", utterance.text);
+    if (this.options.config.DEBUG_TEXT_EVENTS) {
+      console.log(`\n[you] ${utterance.text}`);
+    }
+
+    let reply: string;
+    try {
+      reply = await this.options.llm.generateReply(this.memory, utterance);
+    } catch (error) {
+      this.options.logger.warn("Local LLM failed", error);
+      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
+    }
+
+    this.memory.addAssistantTurn(reply);
+    if (this.options.config.DEBUG_TEXT_EVENTS) {
+      console.log(`[bot] ${reply}\n`);
+    }
+
+    this.queue.push({
+      text: reply,
+      source: "assistant",
+    });
+    await this.drainQueue();
+  }
+
+  private interruptPlayback(reason: string): void {
+    if (this.queue.length > 0 || this.currentPlayer) {
+      this.options.logger.info("Interrupting local playback", reason);
+    }
+
+    this.queue.splice(0, this.queue.length);
+    this.currentAbortController?.abort();
+    this.currentAbortController = null;
+    this.currentPlayback?.dispose();
+    this.currentPlayback = null;
+
+    if (this.currentPlayer && !this.currentPlayer.killed) {
+      this.currentPlayer.kill("SIGKILL");
+    }
+    this.currentPlayer = null;
+  }
+
+  private async drainQueue(): Promise<void> {
+    if (this.draining || this.destroyed) {
+      return;
+    }
+
+    this.draining = true;
+
+    try {
+      while (this.queue.length > 0 && !this.destroyed) {
+        const job = this.queue.shift();
+        if (!job) {
+          continue;
+        }
+
+        const abortController = new AbortController();
+        this.currentAbortController = abortController;
+
+        try {
+          this.currentPlayback = await this.options.tts.preparePlayback(job.text, abortController.signal);
+        } catch (error) {
+          if (!abortController.signal.aborted) {
+            this.options.logger.warn("Local TTS synthesis failed", error);
+          }
+          continue;
+        }
+
+        try {
+          await this.playToSink(this.currentPlayback, abortController.signal);
+        } catch (error) {
+          if (!abortController.signal.aborted) {
+            this.options.logger.warn("Local playback failed", error);
+          }
+        } finally {
+          this.currentPlayback?.dispose();
+          this.currentPlayback = null;
+          if (this.currentAbortController === abortController) {
+            this.currentAbortController = null;
+          }
+        }
+      }
+    } finally {
+      this.draining = false;
+    }
+  }
+
+  /**
+   * Pipes the prepared audio stream into `pw-play` (told to read raw 48 kHz stereo s16) and resolves on exit.
+   * Resolves quietly when `signal` aborts; throws if pw-play fails to start or exits non-zero.
+   */
+  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
+    // An "abort" listener never fires for an already-aborted signal, so bail out before spawning.
+    if (signal.aborted) {
+      return;
+    }
+
+    const args = ["--rate", "48000", "--channels", "2", "--format", "s16", "--raw"];
+    if (this.options.config.LOCAL_AUDIO_SINK) {
+      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
+    }
+    args.push("-");
+
+    const player = spawn("pw-play", args, {
+      stdio: ["pipe", "ignore", "pipe"],
+    });
+    this.currentPlayer = player;
+
+    player.stderr.on("data", (chunk: Buffer) => {
+      const text = chunk.toString().trim();
+      if (text.length > 0) {
+        this.options.logger.debug("[pw-play]", text);
+      }
+    });
+    // Killing the player mid-stream surfaces EPIPE on stdin; without a listener that is an uncaught error.
+    player.stdin.on("error", (error) => {
+      this.options.logger.debug("[pw-play] stdin closed", error);
+    });
+
+    const onAbort = (): void => {
+      playback.stream.destroy();
+      if (!player.killed) {
+        player.kill("SIGKILL");
+      }
+    };
+    signal.addEventListener("abort", onAbort, { once: true });
+
+    try {
+      playback.stream.pipe(player.stdin);
+      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];
+      if (!signal.aborted && code !== 0) {
+        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
+      }
+    } finally {
+      signal.removeEventListener("abort", onAbort);
+      if (this.currentPlayer === player) {
+        this.currentPlayer = null;
+      }
+    }
+  }
+}
diff --git a/src/config.ts b/src/config.ts
index a122c90..8cf45a0 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -4,17 +4,20 @@ import { z } from "zod";
 loadDotenv();
 
 const envSchema = z.object({
-  DISCORD_BOT_TOKEN: z.string().min(1),
-  DISCORD_APPLICATION_ID: z.string().min(1),
+  DISCORD_BOT_TOKEN: z.string().min(1).optional(),
+  DISCORD_APPLICATION_ID: z.string().min(1).optional(),
   DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
-  OPENAI_API_KEY: z.string().min(1),
+  OPENAI_API_KEY: z.string().min(1).optional(),
   OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
-  ELEVENLABS_API_KEY: z.string().min(1),
-  ELEVENLABS_VOICE_ID: z.string().min(1),
+  ELEVENLABS_API_KEY: z.string().min(1).optional(),
+  ELEVENLABS_VOICE_ID: z.string().min(1).optional(),
   ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
   ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
   BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
   MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
+  LOCAL_AUDIO_SOURCE: z.string().min(1).optional(),
+  LOCAL_AUDIO_SINK: z.string().min(1).optional(),
+  LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
   DEBUG_TEXT_EVENTS: z
     .string()
     .optional()
@@ -23,7 +26,41 @@ const envSchema = z.object({
 });
 
 export type AppConfig = z.infer<typeof envSchema>;
+export type AssistantRuntimeConfig = AppConfig & {
+  OPENAI_API_KEY: string;
+  ELEVENLABS_API_KEY: string;
+  ELEVENLABS_VOICE_ID: string;
+};
+export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
+  DISCORD_BOT_TOKEN: string;
+  DISCORD_APPLICATION_ID: string;
+};
 
 export function loadConfig(): AppConfig {
   return envSchema.parse(process.env);
 }
+
+/** Returns `value` unchanged, or throws an error naming `name` when it is undefined or empty. */
+function requirePresent(value: string | undefined, name: string): string {
+  if (value) return value;
+
+  throw new Error(`${name} 환경변수가 필요합니다.`);
+}
+
+/** Narrows `config` to one whose OpenAI and ElevenLabs settings are present; throws on the first missing one. */
+export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
+  const OPENAI_API_KEY = requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY");
+  const ELEVENLABS_API_KEY = requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY");
+  const ELEVENLABS_VOICE_ID = requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID");
+
+  return { ...config, OPENAI_API_KEY, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID };
+}
+
+/** Runs the assistant checks, then also requires the Discord bot token and application id. */
+export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
+  const base = requireAssistantRuntimeConfig(config);
+  const DISCORD_BOT_TOKEN = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
+  const DISCORD_APPLICATION_ID = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");
+
+  return { ...base, DISCORD_BOT_TOKEN, DISCORD_APPLICATION_ID };
+}
diff --git a/src/discord-main.ts b/src/discord-main.ts
new file mode 100644
index 0000000..cf8060c
--- /dev/null
+++ b/src/discord-main.ts
@@ -0,0 +1,234 @@
+import process from "node:process";
+
+import {
+  GatewayIntentBits,
+  REST,
+  Routes,
+  SlashCommandBuilder,
+  type ChatInputCommandInteraction,
+  type Client,
+  type GuildMember,
+  type VoiceBasedChannel,
+} from "discord.js";
+import { Client as DiscordClient } from "discord.js";
+
+import { GuildVoiceSession } from "./audio/guild-voice-session.js";
+import { type DiscordRuntimeConfig } from "./config.js";
+import { Logger } from "./logger.js";
+import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
+import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
+import { OpenAiLlmService } from "./services/openai-llm.js";
+
+export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
+  const commands = [
+    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
+    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
+    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
+    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
+    new SlashCommandBuilder()
+      .setName("say")
+      .setDescription("텍스트를 바로 음성으로 읽습니다.")
+      .addStringOption((option) =>
+        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
+      ),
+  ].map((command) => command.toJSON());
+
+  const client = new DiscordClient({
+    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
+  });
+
+  const stt = new ElevenLabsSttService(config);
+  const tts = new ElevenLabsTtsService(config);
+  const llm = new OpenAiLlmService(config);
+  const sessions = new Map<string, GuildVoiceSession>();
+
+  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
+    const member = interaction.member as GuildMember | null;
+    return member?.voice.channel ?? null;
+  }
+
+  async function registerCommands(_appClient: Client): Promise<void> {
+    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
+    if (config.DISCORD_COMMAND_GUILD_ID) {
+      await rest.put(
+        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
+        {
+          body: commands,
+        },
+      );
+      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
+      return;
+    }
+
+    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
+      body: commands,
+    });
+    logger.info("Registered global commands");
+  }
+
+  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
+    if (!interaction.guild) {
+      throw new Error("Guild interaction required");
+    }
+
+    const voiceChannel = getVoiceChannel(interaction);
+    if (!voiceChannel) {
+      throw new Error("먼저 음성 채널에 들어가 주세요.");
+    }
+
+    const existing = sessions.get(interaction.guild.id);
+    if (existing && existing.voiceChannelId === voiceChannel.id) {
+      existing.setTextChannel(interaction.channelId);
+      return existing;
+    }
+
+    if (existing) {
+      await existing.destroy();
+      sessions.delete(interaction.guild.id);
+    }
+
+    const session = await GuildVoiceSession.create({
+      client,
+      config,
+      logger,
+      guild: interaction.guild,
+      voiceChannel,
+      textChannelId: interaction.channelId,
+      stt,
+      tts,
+      llm,
+    });
+    sessions.set(interaction.guild.id, session);
+    return session;
+  }
+
+  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
+    await interaction.deferReply({ ephemeral: true });
+
+    try {
+      const session = await createSession(interaction);
+      await interaction.editReply(
+        `음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`,
+      );
+    } catch (error) {
+      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
+      await interaction.editReply(message);
+    }
+  }
+
+  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
+    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
+    if (!session) {
+      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
+      return;
+    }
+
+    await session.destroy();
+    sessions.delete(interaction.guildId!);
+    await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
+  }
+
+  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
+    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
+    if (!session) {
+      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
+      return;
+    }
+
+    await interaction.reply({
+      content: session.statusSummary(),
+      ephemeral: true,
+    });
+  }
+
+  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
+    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
+    if (!session) {
+      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
+      return;
+    }
+
+    session.clearConversation();
+    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
+  }
+
+  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
+    await interaction.deferReply({ ephemeral: true });
+
+    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
+    if (!session) {
+      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
+      return;
+    }
+
+    const text = interaction.options.getString("text", true).trim();
+    await session.speakText(text);
+    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
+  }
+
+  async function shutdown(exitCode = 0): Promise<void> {
+    logger.info("Shutting down");
+    for (const session of sessions.values()) {
+      await session.destroy().catch((error) => {
+        logger.warn("Session shutdown failed", error);
+      });
+    }
+    sessions.clear();
+    await client.destroy();
+    process.exit(exitCode);
+  }
+
+  client.once("ready", async () => {
+    logger.info("Discord client ready", client.user?.tag ?? "unknown");
+    try {
+      await registerCommands(client);
+    } catch (error) {
+      logger.error("Command registration failed", error);
+    }
+  });
+
+  client.on("interactionCreate", async (interaction) => {
+    if (!interaction.isChatInputCommand()) {
+      return;
+    }
+
+    try {
+      switch (interaction.commandName) {
+        case "join":
+          await handleJoin(interaction);
+          return;
+        case "leave":
+          await handleLeave(interaction);
+          return;
+        case "status":
+          await handleStatus(interaction);
+          return;
+        case "reset":
+          await handleReset(interaction);
+          return;
+        case "say":
+          await handleSay(interaction);
+          return;
+        default:
+          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
+      }
+    } catch (error) {
+      logger.error("Interaction handler failed", error);
+      if (interaction.deferred || interaction.replied) {
+        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
+        return;
+      }
+      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
+    }
+  });
+
+  process.on("SIGINT", () => {
+    void shutdown(0);
+  });
+
+  process.on("SIGTERM", () => {
+    void shutdown(0);
+  });
+
+  await client.login(config.DISCORD_BOT_TOKEN);
+}
diff --git a/src/index.ts b/src/index.ts
index 59cd390..c931977 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,237 +1,28 @@
 import process from "node:process";
 
-import {
-  GatewayIntentBits,
-  REST,
-  Routes,
-  SlashCommandBuilder,
-  type ChatInputCommandInteraction,
-  type Client,
-  type GuildMember,
-  type VoiceBasedChannel,
-} from "discord.js";
-import { Client as DiscordClient } from "discord.js";
-
-import { GuildVoiceSession } from "./audio/guild-voice-session.js";
-import { loadConfig } from "./config.js";
+import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
+import { runDiscordBot } from "./discord-main.js";
 import { Logger } from "./logger.js";
-import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
-import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
-import { OpenAiLlmService } from "./services/openai-llm.js";
+import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js";
 
+const mode = process.argv[2] ?? "discord";
 const config = loadConfig();
 const logger = new Logger(config.LOG_LEVEL);
 
-const commands = [
-  new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
-  new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
-  new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
-  new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
-  new SlashCommandBuilder()
-    .setName("say")
-    .setDescription("텍스트를 바로 음성으로 읽습니다.")
-    .addStringOption((option) =>
-      option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
-    ),
-].map((command) => command.toJSON());
-
-const client = new DiscordClient({
-  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
-});
-
-const stt = new ElevenLabsSttService(config);
-const tts = new ElevenLabsTtsService(config);
-const llm = new OpenAiLlmService(config);
-const sessions = new Map<string, GuildVoiceSession>();
-
-function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
-  const member = interaction.member as GuildMember | null;
-  return member?.voice.channel ?? null;
-}
-
-async function registerCommands(appClient: Client): Promise<void> {
-  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
-  if (config.DISCORD_COMMAND_GUILD_ID) {
-    await rest.put(
-      Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
-      {
-        body: commands,
-      },
-    );
-    logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
-    return;
-  }
-
-  await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
-    body: commands,
-  });
-  logger.info("Registered global commands");
-}
-
-async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
-  if (!interaction.guild) {
-    throw new Error("Guild interaction required");
-  }
-
-  const voiceChannel = getVoiceChannel(interaction);
-  if (!voiceChannel) {
-    throw new Error("먼저 음성 채널에 들어가 주세요.");
-  }
-
-  const existing = sessions.get(interaction.guild.id);
-  if (existing && existing.voiceChannelId === voiceChannel.id) {
-    existing.setTextChannel(interaction.channelId);
-    return existing;
-  }
-
-  if (existing) {
-    await existing.destroy();
-    sessions.delete(interaction.guild.id);
-  }
-
-  const session = await GuildVoiceSession.create({
-    client,
-    config,
-    logger,
-    guild: interaction.guild,
-    voiceChannel,
-    textChannelId: interaction.channelId,
-    stt,
-    tts,
-    llm,
-  });
-  sessions.set(interaction.guild.id, session);
-  return session;
-}
-
-async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
-  await interaction.deferReply({ ephemeral: true });
-
-  try {
-    const session = await createSession(interaction);
-    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`);
-  } catch (error) {
-    const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
-    await interaction.editReply(message);
-  }
-}
-
-async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
-  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
-  if (!session) {
-    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
-    return;
-  }
-
-  await session.destroy();
-  sessions.delete(interaction.guildId!);
-  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
-}
-
-async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
-  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
-  if (!session) {
-    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
-    return;
-  }
-
-  await interaction.reply({
-    content: session.statusSummary(),
-    ephemeral: true,
-  });
-}
-
-async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
-  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
-  if (!session) {
-    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
-    return;
-  }
-
-  session.clearConversation();
-  await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
-}
-
-async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
-  await interaction.deferReply({ ephemeral: true });
-
-  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
-  if (!session) {
-    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
-    return;
-  }
-
-  const text = interaction.options.getString("text", true).trim();
-  await session.speakText(text);
-  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
-}
-
-async function shutdown(exitCode = 0): Promise<void> {
-  logger.info("Shutting down");
-  for (const session of sessions.values()) {
-    await session.destroy().catch((error) => {
-      logger.warn("Session shutdown failed", error);
-    });
-  }
-  sessions.clear();
-  await client.destroy();
-  process.exit(exitCode);
-}
-
-client.once("ready", async () => {
-  logger.info("Discord client ready", client.user?.tag ?? "unknown");
-  try {
-    await registerCommands(client);
-  } catch (error) {
-    logger.error("Command registration failed", error);
-  }
-});
-
-client.on("interactionCreate", async (interaction) => {
-  if (!interaction.isChatInputCommand()) {
-    return;
-  }
-
-  try {
-    switch (interaction.commandName) {
-      case "join":
-        await handleJoin(interaction);
-        return;
-      case "leave":
-        await handleLeave(interaction);
-        return;
-      case "status":
-        await handleStatus(interaction);
-        return;
-      case "reset":
-        await handleReset(interaction);
-        return;
-      case "say":
-        await handleSay(interaction);
-        return;
-      default:
-        await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
-    }
-  } catch (error) {
-    logger.error("Interaction handler failed", error);
-    if (interaction.deferred || interaction.replied) {
-      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
-      return;
-    }
-    await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
-  }
-});
-
-process.on("SIGINT", () => {
-  void shutdown(0);
-});
-
-process.on("SIGTERM", () => {
-  void shutdown(0);
-});
-
 async function main(): Promise<void> {
-  await client.login(config.DISCORD_BOT_TOKEN);
+  switch (mode) {
+    case "discord":
+      await runDiscordBot(requireDiscordRuntimeConfig(config), logger);
+      return;
+    case "local":
+      await runLocalAssistant(requireAssistantRuntimeConfig(config), logger);
+      return;
+    case "local-devices":
+      await printLocalAudioDevices();
+      return;
+    default:
+      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`);
+  }
 }
 
 void main().catch((error) => {
diff --git a/src/local-main.ts b/src/local-main.ts
new file mode 100644
index 0000000..b4420c6
--- /dev/null
+++ b/src/local-main.ts
@@ -0,0 +1,75 @@
+import { spawn } from "node:child_process";
+import process from "node:process";
+
+import type { AssistantRuntimeConfig } from "./config.js";
+import { Logger } from "./logger.js";
+import { LocalVoiceSession } from "./audio/local-voice-session.js";
+import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
+import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
+import { OpenAiLlmService } from "./services/openai-llm.js";
+
+/**
+ * Runs `wpctl status` and then `wpctl status -n`, one after the other, with stdout/stderr inherited.
+ * Rejects when wpctl fails to spawn, exits non-zero, or is terminated by a signal.
+ */
+export async function printLocalAudioDevices(): Promise<void> {
+  const runs = [
+    { label: "wpctl status", args: ["status"] },
+    { label: "wpctl status -n", args: ["status", "-n"] },
+  ] as const;
+
+  for (const run of runs) {
+    console.log(`\n=== ${run.label} ===`);
+    await new Promise<void>((resolve, reject) => {
+      const child = spawn("wpctl", run.args, {
+        stdio: ["ignore", "inherit", "inherit"],
+      });
+      // When the child is killed by a signal `code` is null, so report the signal instead.
+      child.on("exit", (code, signal) => {
+        if (code === 0) {
+          resolve();
+          return;
+        }
+        const reason = code === null ? `signal ${signal ?? "unknown"}` : `code ${code}`;
+        reject(new Error(`wpctl exited with ${reason}`));
+      });
+      child.on("error", reject);
+    });
+  }
+}
+
+/**
+ * Builds the STT/TTS/LLM services and a LocalVoiceSession, prints its status, then starts it.
+ * SIGINT/SIGTERM destroy the session (at most once) and exit the process with code 0.
+ */
+export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
+  const stt = new ElevenLabsSttService(config);
+  const tts = new ElevenLabsTtsService(config);
+  const llm = new OpenAiLlmService(config);
+  const session = new LocalVoiceSession({ config, logger, stt, tts, llm });
+
+  console.log(session.statusSummary());
+  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
+  if (config.DEBUG_TEXT_EVENTS) {
+    console.log("텍스트 로그 출력이 켜져 있습니다.");
+  }
+
+  let shuttingDown = false;
+  const shutdown = async (exitCode = 0) => {
+    // A repeated Ctrl+C must not start a second, overlapping destroy().
+    if (shuttingDown) {
+      return;
+    }
+    shuttingDown = true;
+    await session.destroy().catch((error) => {
+      logger.warn("Local session shutdown failed", error);
+    });
+    process.exit(exitCode);
+  };
+
+  for (const signal of ["SIGINT", "SIGTERM"] as const) {
+    process.on(signal, () => void shutdown(0));
+  }
+
+  await session.start();
+}
diff --git a/src/services/elevenlabs-stt.ts b/src/services/elevenlabs-stt.ts
index 1c3719b..67b7979 100644
--- a/src/services/elevenlabs-stt.ts
+++ b/src/services/elevenlabs-stt.ts
@@ -1,6 +1,6 @@
 import WebSocket from "ws";
 
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
 
 interface ElevenLabsMessage {
   message_type?: string;
@@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([
 ]);
 
 export class ElevenLabsSttService {
-  constructor(private readonly config: AppConfig) {}
+  constructor(private readonly config: AssistantRuntimeConfig) {}
 
   async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
     if (pcm16MonoAudio.byteLength === 0) {
diff --git a/src/services/elevenlabs-tts.ts b/src/services/elevenlabs-tts.ts
index 24f83a3..d22bed4 100644
--- a/src/services/elevenlabs-tts.ts
+++ b/src/services/elevenlabs-tts.ts
@@ -2,24 +2,23 @@ import { Readable } from "node:stream";
 
 import ffmpegStatic from "ffmpeg-static";
 import prism from "prism-media";
-import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
 
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
 
-export interface PreparedSpeechPlayback {
-  resource: AudioResource;
+export interface PreparedSpeechAudio {
+  stream: Readable;
   dispose: () => void;
 }
 
 export class ElevenLabsTtsService {
-  constructor(private readonly config: AppConfig) {
+  constructor(private readonly config: AssistantRuntimeConfig) {
     const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
     if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
       process.env.FFMPEG_PATH = resolvedFfmpegPath;
     }
   }
 
-  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
+  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
     const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
     url.searchParams.set("output_format", "mp3_44100_128");
     url.searchParams.set("enable_logging", "false");
@@ -68,12 +67,8 @@ export class ElevenLabsTtsService {
 
     input.pipe(ffmpeg);
 
-    const resource = createAudioResource(ffmpeg, {
-      inputType: StreamType.Raw,
-    });
-
     return {
-      resource,
+      stream: ffmpeg,
       dispose: () => {
         input.destroy();
         ffmpeg.destroy();
diff --git a/src/services/openai-llm.ts b/src/services/openai-llm.ts
index c6b02e9..d866d5f 100644
--- a/src/services/openai-llm.ts
+++ b/src/services/openai-llm.ts
@@ -1,6 +1,6 @@
 import OpenAI from "openai";
 
-import type { AppConfig } from "../config.js";
+import type { AssistantRuntimeConfig } from "../config.js";
 import type { ConversationMemory, UserUtterance } from "./conversation.js";
 
 const ASSISTANT_INSTRUCTIONS = [
@@ -30,7 +30,7 @@ function normalizeReply(text: string): string {
 export class OpenAiLlmService {
   private readonly client: OpenAI;
 
-  constructor(private readonly config: AppConfig) {
+  constructor(private readonly config: AssistantRuntimeConfig) {
     this.client = new OpenAI({
       apiKey: this.config.OPENAI_API_KEY,
     });