Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/bot/src/bridge.ts
+++ b/bot/src/bridge.ts
@@ -0,0 +1,52 @@
+/**
+ * HTTP client for the Python brain bridge (bridge/server.py).
+ * All AI work (STT, reply engine, TTS) lives behind these calls.
+ */
+import { config } from "./config.ts";
+
+/** Shape the bot expects in the JSON body returned by the bridge's /converse endpoint. */
+export interface ConverseResult {
+  /** Text transcribed from the submitted utterance. */
+  transcript: string;
+  /** Presumably the language detected during STT — TODO confirm against bridge/server.py. */
+  language?: string | null;
+  /** Text of the assistant's reply. */
+  reply: string;
+  /** Presumably an error description set by the bridge when the turn failed — TODO confirm. */
+  error?: string | null;
+  /** base64-encoded 16-bit PCM WAV of the spoken reply, or null if TTS off */
+  audio_b64?: string | null;
+}
+
+/** Shape the bot expects in the JSON body returned by the bridge's /text endpoint. */
+export interface TextResult {
+  /** Text of the assistant's reply. */
+  reply: string;
+  /** Presumably an error description set by the bridge when the turn failed — TODO confirm. */
+  error?: string | null;
+  /** Presumably the spoken reply as base64 WAV, as in ConverseResult — TODO confirm. */
+  audio_b64?: string | null;
+}
+
+/**
+ * Runs one full voice turn through the bridge: WAV in -> {transcript, reply, reply audio}.
+ *
+ * @param wav - Complete WAV file contents, sent as the raw request body.
+ * @returns The parsed JSON body, typed as ConverseResult (not validated at runtime).
+ * @throws Error containing the HTTP status and response text when the bridge answers non-2xx.
+ */
+export async function converse(wav: Buffer): Promise<ConverseResult> {
+  const endpoint = `${config.bridgeUrl}/converse`;
+  const response = await fetch(endpoint, {
+    method: "POST",
+    headers: { "content-type": "audio/wav" },
+    body: wav,
+  });
+  if (response.ok) {
+    return (await response.json()) as ConverseResult;
+  }
+  const detail = await response.text();
+  throw new Error(`bridge /converse ${response.status}: ${detail}`);
+}
+
+/**
+ * Text-only turn (used by /자비스 ask).
+ *
+ * @param text - The question, sent to the bridge as JSON `{ text }`.
+ * @returns The parsed JSON body, typed as TextResult (not validated at runtime).
+ * @throws Error containing the HTTP status and response text when the bridge answers non-2xx.
+ */
+export async function ask(text: string): Promise<TextResult> {
+  const endpoint = `${config.bridgeUrl}/text`;
+  const payload = JSON.stringify({ text });
+  const response = await fetch(endpoint, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: payload,
+  });
+  if (response.ok) {
+    return (await response.json()) as TextResult;
+  }
+  const detail = await response.text();
+  throw new Error(`bridge /text ${response.status}: ${detail}`);
+}
+
+/**
+ * Fetches the bridge's /health document and returns its parsed JSON body.
+ *
+ * The HTTP status is deliberately not inspected here; the body is parsed
+ * whatever the status. Rejects if the request fails or the body is not JSON.
+ */
+export async function health(): Promise<any> {
+  const endpoint = `${config.bridgeUrl}/health`;
+  const response = await fetch(endpoint);
+  return response.json();
+}
+
+/**
+ * Decodes a base64 audio payload into raw bytes.
+ *
+ * @param audio_b64 - base64 text, or null/undefined/empty when there is no audio.
+ * @returns The decoded bytes, or null when no payload was supplied.
+ */
+export function decodeWav(audio_b64?: string | null): Buffer | null {
+  return audio_b64 ? Buffer.from(audio_b64, "base64") : null;
+}
--- a/bot/src/config.ts
+++ b/bot/src/config.ts
@@ -0,0 +1,55 @@
+/**
+ * Centralised, typed configuration loaded from environment (.env at repo root).
+ * Nothing else in the bot reads process.env directly.
+ */
+import "dotenv/config";
+
+/**
+ * Reads a mandatory environment variable.
+ *
+ * @param name - Environment variable name.
+ * @returns Its value.
+ * @throws Error when the variable is unset or empty.
+ */
+function req(name: string): string {
+  const value = process.env[name];
+  if (value) return value;
+  throw new Error(`Missing required env var: ${name} (see .env.example)`);
+}
+
+/**
+ * Reads an optional environment variable.
+ *
+ * A variable that is present but empty (a blank `KEY=` line in .env) is
+ * treated the same as an unset one, matching how `req` treats empty values.
+ * Otherwise a blank line would silently replace a default such as the bridge
+ * URL with an empty string.
+ *
+ * @param name - Environment variable name.
+ * @param fallback - Value used when the variable is unset or empty.
+ * @returns The variable's value, or `fallback`.
+ */
+function opt(name: string, fallback = ""): string {
+  const value = process.env[name];
+  return value === undefined || value === "" ? fallback : value;
+}
+
+export type StreamBackend = "selfbot" | "novnc" | "screenshot" | "none";
+
+/** Every value STREAM_BACKEND may take; keep in sync with StreamBackend. */
+const STREAM_BACKENDS: readonly StreamBackend[] = ["selfbot", "novnc", "screenshot", "none"];
+
+/**
+ * Reads an optional positive-integer environment variable.
+ *
+ * Unset or blank yields `fallback`. Any other value that does not parse to an
+ * integer >= 1 throws at startup instead of leaking NaN/0 into timers and
+ * encoder settings (a NaN interval makes setInterval fire almost continuously).
+ */
+function optInt(name: string, fallback: number): number {
+  const raw = opt(name).trim();
+  if (raw === "") return fallback;
+  const parsed = parseInt(raw, 10);
+  if (!Number.isFinite(parsed) || parsed < 1) {
+    throw new Error(`Invalid env var ${name}="${raw}": expected a positive integer (see .env.example)`);
+  }
+  return parsed;
+}
+
+/**
+ * Resolves STREAM_BACKEND to a known backend.
+ *
+ * An unrecognised value falls back to "none" with a warning, so the typed
+ * config value is truthful and a typo is visible in the log.
+ */
+function streamBackendFromEnv(): StreamBackend {
+  const raw = opt("STREAM_BACKEND", "selfbot");
+  const known = STREAM_BACKENDS.find((b) => b === raw);
+  if (known) return known;
+  console.warn(
+    `Unknown STREAM_BACKEND="${raw}"; screen sharing disabled (expected one of: ${STREAM_BACKENDS.join(", ")})`,
+  );
+  return "none";
+}
+
+export const config = {
+  // --- Normal Discord bot (voice I/O, slash commands) ---
+  botToken: req("DISCORD_BOT_TOKEN"),
+  appId: req("DISCORD_APP_ID"),
+  guildId: req("DISCORD_GUILD_ID"),
+
+  // --- Python brain bridge ---
+  bridgeUrl: opt("BRIDGE_URL", "http://127.0.0.1:8765"),
+
+  // --- VNC screen broadcast ---
+  // selfbot   = real live "Go Live" stream via a user (burner) account token
+  // novnc     = post a noVNC web link the channel can open in a browser
+  // screenshot= periodically upload VNC screenshots
+  // none      = disable screen sharing
+  streamBackend: streamBackendFromEnv(),
+
+  // x11grab source for the VNC display (TigerVNC runs the desktop on :1)
+  vncDisplay: opt("VNC_DISPLAY", ":1"),
+  vncResolution: opt("VNC_RESOLUTION", "1920x1080"),
+  vncFramerate: optInt("VNC_FRAMERATE", 30),
+  vncBitrateKbps: optInt("VNC_BITRATE_KBPS", 4000),
+
+  // selfbot backend (ToS-risk; use a throwaway account token, never your main)
+  selfbotToken: opt("DISCORD_SELFBOT_TOKEN"),
+
+  // novnc backend
+  novncUrl: opt("NOVNC_URL", ""),
+
+  // screenshot backend: seconds between uploaded screenshots
+  screenshotIntervalSec: optInt("SCREENSHOT_INTERVAL_SEC", 5),
+
+  // --- Voice behaviour ---
+  // Silence (ms) after which a speaker's utterance is considered finished and
+  // is forwarded to the brain.
+  silenceMs: optInt("VOICE_SILENCE_MS", 800),
+};
+
+export type AppConfig = typeof config;
--- a/bot/src/index.ts
+++ b/bot/src/index.ts
@@ -0,0 +1,148 @@
+/**
+ * Jarvis bot entry point.
+ *
+ * A normal Discord bot that:
+ *  - exposes /자비스 (join / leave / ask / stream / stop / status)
+ *  - replies to every slash command EPHEMERALLY (only the invoker sees it)
+ *  - joins the caller's voice channel for live voice conversation (brain in bridge/)
+ *  - broadcasts the VNC screen via a pluggable backend (selfbot / novnc / screenshot)
+ */
+import {
+  Client,
+  GatewayIntentBits,
+  MessageFlags,
+  type ChatInputCommandInteraction,
+  type GuildMember,
+  type TextBasedChannel,
+} from "discord.js";
+import { AttachmentBuilder } from "discord.js";
+import { config } from "./config.ts";
+import { ask, health } from "./bridge.ts";
+import { joinChannel, leaveGuild, getSession } from "./voice.ts";
+import { createStreamer, type ScreenStreamer, type StreamContext } from "./stream/index.ts";
+
+// Gateway client. Only the Guilds and GuildVoiceStates intents are requested.
+const client = new Client({
+  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
+});
+
+// Screen streamers created so far, keyed by guild id (filled lazily by getStreamer).
+const streamers = new Map<string, ScreenStreamer>();
+
+/**
+ * Returns the guild's screen streamer, creating and caching one from the
+ * current config on first use.
+ */
+async function getStreamer(guildId: string): Promise<ScreenStreamer> {
+  const cached = streamers.get(guildId);
+  if (cached) return cached;
+
+  const created = await createStreamer(config);
+  streamers.set(guildId, created);
+  return created;
+}
+
+// Spread into reply/deferReply options to mark the response as ephemeral.
+const eph = { flags: MessageFlags.Ephemeral } as const;
+
+// Log the bot's tag and the configured stream backend once the client is ready.
+client.once("clientReady", () => {
+  console.log(`✓ 로그인: ${client.user?.tag} | stream backend: ${config.streamBackend}`);
+});
+
+/**
+ * Dispatches /자비스 subcommands to their handlers.
+ *
+ * Any handler failure is logged and reported back to the invoker. Sending
+ * that report is itself guarded, because a rejection escaping this async
+ * listener would surface as an unhandled promise rejection.
+ */
+client.on("interactionCreate", async (interaction) => {
+  if (!interaction.isChatInputCommand()) return;
+  if (interaction.commandName !== "자비스") return;
+  const i = interaction as ChatInputCommandInteraction;
+  // Non-throwing lookup: a missing subcommand falls through to the default case.
+  const sub = i.options.getSubcommand(false) ?? "";
+
+  try {
+    switch (sub) {
+      case "join":
+        await handleJoin(i);
+        break;
+      case "leave":
+        await handleLeave(i);
+        break;
+      case "ask":
+        await handleAsk(i);
+        break;
+      case "stream":
+        await handleStream(i);
+        break;
+      case "stop":
+        await handleStop(i);
+        break;
+      case "status":
+        await handleStatus(i);
+        break;
+      default:
+        // Registered command shape and this code are out of sync; answer
+        // rather than letting the interaction time out.
+        await i.reply({ content: `알 수 없는 하위 명령입니다: ${sub}`, ...eph });
+    }
+  } catch (err) {
+    console.error(`[/자비스 ${sub}]`, err);
+    const detail = err instanceof Error ? err.message : String(err);
+    // Stay under Discord's message length limit (same bound handleAsk uses).
+    const msg = `오류: ${detail}`.slice(0, 1900);
+    try {
+      if (i.deferred || i.replied) await i.editReply(msg);
+      else await i.reply({ content: msg, ...eph });
+    } catch (replyErr) {
+      console.error(`[/자비스 ${sub}] 오류 응답 전송 실패:`, replyErr);
+    }
+  }
+});
+
+/**
+ * /자비스 join — joins the invoker's current voice channel and starts a voice
+ * session. Each completed turn is logged to the console.
+ */
+async function handleJoin(i: ChatInputCommandInteraction) {
+  const caller = i.member as GuildMember;
+  const voiceChannel = caller?.voice?.channel;
+  if (!voiceChannel) {
+    return i.reply({ content: "먼저 음성 채널에 들어간 뒤 다시 호출해주세요.", ...eph });
+  }
+
+  await i.deferReply(eph);
+  const session = await joinChannel(voiceChannel);
+  session.onTurn = (turn) => {
+    console.log(`🗣️  ${turn.transcript}\n🤖 ${turn.reply}`);
+  };
+  await i.editReply(`🎙️ '${voiceChannel.name}' 채널에 접속했습니다. 말씀하세요.`);
+}
+
+/** /자비스 leave — ends the guild's voice session, if any, and reports which case applied. */
+async function handleLeave(i: ChatInputCommandInteraction) {
+  const hadSession = leaveGuild(i.guildId!);
+  const content = hadSession ? "음성 채널에서 나갔습니다." : "접속 중인 세션이 없습니다.";
+  await i.reply({ content, ...eph });
+}
+
+/**
+ * /자비스 ask — forwards the question text to the bridge and shows the reply
+ * (or the bridge's error, or a placeholder), cut to 1900 characters.
+ */
+async function handleAsk(i: ChatInputCommandInteraction) {
+  const question = i.options.getString("질문", true);
+  await i.deferReply(eph);
+
+  const result = await ask(question);
+  const text = result.reply || result.error || "(응답 없음)";
+  await i.editReply(text.slice(0, 1900));
+}
+
+/**
+ * /자비스 stream — starts the configured screen-broadcast backend for this guild.
+ *
+ * The streamer receives the invoker's voice channel id (empty string when not
+ * in voice) and a callback that posts an image to the invoking channel. The
+ * selfbot backend is refused when the invoker is not in a voice channel.
+ */
+async function handleStream(i: ChatInputCommandInteraction) {
+  const caller = i.member as GuildMember;
+  await i.deferReply(eph);
+  const streamer = await getStreamer(i.guildId!);
+
+  const postImage = async (png: Buffer, name: string): Promise<void> => {
+    const target = i.channel as TextBasedChannel | null;
+    if (!target || !("send" in target)) return;
+    await (target as any).send({ files: [new AttachmentBuilder(png, { name })] });
+  };
+  const ctx: StreamContext = {
+    guildId: i.guildId!,
+    voiceChannelId: caller?.voice?.channelId ?? "",
+    postImage,
+  };
+
+  const needsVoiceChannel = config.streamBackend === "selfbot";
+  if (needsVoiceChannel && !ctx.voiceChannelId) {
+    return i.editReply("셀프봇 송출은 음성 채널 안에서 호출해야 합니다. 음성 채널에 들어간 뒤 다시 시도하세요.");
+  }
+
+  const status = await streamer.start(ctx);
+  await i.editReply(status);
+}
+
+/** /자비스 stop — stops this guild's streamer if one has been created. */
+async function handleStop(i: ChatInputCommandInteraction) {
+  const existing = streamers.get(i.guildId!);
+  if (existing) {
+    await existing.stop();
+    await i.reply({ content: "송출을 중단했습니다.", ...eph });
+    return;
+  }
+  return i.reply({ content: "송출 중이 아닙니다.", ...eph });
+}
+
+/**
+ * /자비스 status — reports bridge health, whether a voice session exists for
+ * the guild, and the stream backend with its active/idle state.
+ */
+async function handleStatus(i: ChatInputCommandInteraction) {
+  await i.deferReply(eph);
+
+  // Any failure while querying the bridge is reported as "unreachable".
+  const describeBrain = async (): Promise<string> => {
+    try {
+      const h = await health();
+      if (h.brain_ready) return "ready";
+      const reason = h.brain_error ? " (" + h.brain_error + ")" : "";
+      return `not-ready${reason}`;
+    } catch {
+      return "unreachable";
+    }
+  };
+
+  const brain = await describeBrain();
+  const voiceState = getSession(i.guildId!) ? "접속 중" : "없음";
+  const streamState = streamers.get(i.guildId!)?.isActive() ? "활성" : "대기";
+
+  const report = [
+    `브릿지 두뇌: ${brain}`,
+    `음성 세션: ${voiceState}`,
+    `송출 백엔드: ${config.streamBackend} (${streamState})`,
+  ].join("\n");
+  await i.editReply(report);
+}
+
+// Log in; a failed login is fatal, so report it and exit non-zero instead of
+// leaving an unhandled promise rejection.
+client.login(config.botToken).catch((e) => {
+  console.error("디스코드 로그인 실패:", e);
+  process.exit(1);
+});
--- a/bot/src/register-commands.ts
+++ b/bot/src/register-commands.ts
@@ -0,0 +1,42 @@
+/**
+ * Registers the /자비스 slash command (guild-scoped for instant availability).
+ * Run once after changing the command shape:  bun run register
+ */
+import { REST, Routes, SlashCommandBuilder } from "discord.js";
+import { config } from "./config.ts";
+
+/**
+ * Definition of the /자비스 slash command and its six subcommands
+ * (join, leave, ask, stream, stop, status). Only `ask` takes an option:
+ * the required string `질문`.
+ */
+export const jarvisCommand = new SlashCommandBuilder()
+  .setName("자비스")
+  .setDescription("자비스 음성 비서를 제어합니다")
+  .addSubcommand((sub) => {
+    return sub
+      .setName("join")
+      .setDescription("당신이 있는 음성 채널에 접속해 듣기 시작합니다");
+  })
+  .addSubcommand((sub) => {
+    return sub.setName("leave").setDescription("음성 채널에서 나갑니다");
+  })
+  .addSubcommand((sub) => {
+    return sub
+      .setName("ask")
+      .setDescription("텍스트로 자비스에게 질문합니다")
+      .addStringOption((option) => {
+        return option.setName("질문").setDescription("질문 내용").setRequired(true);
+      });
+  })
+  .addSubcommand((sub) => {
+    return sub.setName("stream").setDescription("VNC 화면을 디스코드에 송출합니다");
+  })
+  .addSubcommand((sub) => {
+    return sub.setName("stop").setDescription("VNC 화면 송출을 중단합니다");
+  })
+  .addSubcommand((sub) => {
+    return sub.setName("status").setDescription("브릿지/세션 상태를 봅니다");
+  });
+
+/**
+ * Uploads the /자비스 command definition as the configured guild's command
+ * list via a PUT to the guild-commands route, then logs success.
+ */
+export async function registerCommands() {
+  const { botToken, appId, guildId } = config;
+  const rest = new REST({ version: "10" }).setToken(botToken);
+  const route = Routes.applicationGuildCommands(appId, guildId);
+  const payload = { body: [jarvisCommand.toJSON()] };
+
+  await rest.put(route, payload);
+  console.log("✓ /자비스 명령어 등록 완료 (guild:", guildId, ")");
+}
+
+// When this file is the entry point, register the command and exit non-zero on failure.
+if (import.meta.main) {
+  const onFailure = (e: unknown) => {
+    console.error("명령어 등록 실패:", e);
+    process.exit(1);
+  };
+  registerCommands().catch(onFailure);
+}
--- a/bot/src/stream/index.ts
+++ b/bot/src/stream/index.ts
@@ -0,0 +1,51 @@
+/**
+ * Pluggable VNC screen-broadcast backends.
+ *
+ * Per the chosen design (option 1): the streaming method is swappable via
+ * STREAM_BACKEND in .env. The default is the real live "Go Live" stream via a
+ * selfbot account (only way to get a native Discord video broadcast), with safe
+ * fallbacks (noVNC link / periodic screenshots) available without code changes.
+ */
+import type { AppConfig } from "../config.ts";
+
+/** Per-invocation information handed to a streamer's start(). */
+export interface StreamContext {
+  /** Guild the stream was requested in. */
+  guildId: string;
+  /** Voice channel to broadcast into; callers pass "" when the invoker is not in voice. */
+  voiceChannelId: string;
+  /** Post an image to the invoking text channel (used by the screenshot backend). */
+  postImage?: (png: Buffer, name: string) => Promise<void>;
+}
+
+/** Contract every screen-broadcast backend implements. */
+export interface ScreenStreamer {
+  /** Which STREAM_BACKEND value this implementation corresponds to. */
+  readonly kind: AppConfig["streamBackend"];
+  /** Start broadcasting. Returns a short user-facing status/link message. */
+  start(ctx: StreamContext): Promise<string>;
+  /** Stop broadcasting. */
+  stop(): Promise<void>;
+  /** Whether the backend currently considers itself broadcasting. */
+  isActive(): boolean;
+}
+
+/**
+ * Builds the streamer selected by `config.streamBackend`.
+ *
+ * Backend modules are imported dynamically, so only the chosen one is loaded.
+ * "none" and any unrecognised value yield a stub that never becomes active.
+ */
+export async function createStreamer(config: AppConfig): Promise<ScreenStreamer> {
+  const backend = config.streamBackend;
+
+  if (backend === "selfbot") {
+    const mod = await import("./selfbot.ts");
+    return new mod.SelfbotStreamer(config);
+  }
+  if (backend === "novnc") {
+    const mod = await import("./novnc.ts");
+    return new mod.NoVncStreamer(config);
+  }
+  if (backend === "screenshot") {
+    const mod = await import("./screenshot.ts");
+    return new mod.ScreenshotStreamer(config);
+  }
+
+  const disabled: ScreenStreamer = {
+    kind: "none",
+    start: async () => "화면 송출이 비활성화되어 있습니다 (STREAM_BACKEND=none).",
+    stop: async () => {},
+    isActive: () => false,
+  };
+  return disabled;
+}
--- a/bot/src/stream/novnc.ts
+++ b/bot/src/stream/novnc.ts
@@ -0,0 +1,34 @@
+/**
+ * noVNC link backend (safe, real-time, no ban risk).
+ *
+ * Does not broadcast natively into Discord. Instead it shares a noVNC web URL
+ * that anyone can open in a browser to watch (and optionally control) the VNC
+ * desktop live. Set NOVNC_URL in .env (e.g. http://192.168.10.9:6080/vnc.html).
+ *
+ * Stand up noVNC once on the host with websockify, e.g.:
+ *   websockify --web=/usr/share/novnc 6080 localhost:5901
+ */
+import type { AppConfig } from "../config.ts";
+import type { ScreenStreamer, StreamContext } from "./index.ts";
+
+/**
+ * Backend that "streams" by handing out the configured noVNC URL.
+ * It only tracks an active flag; nothing is launched or connected here.
+ */
+export class NoVncStreamer implements ScreenStreamer {
+  readonly kind = "novnc" as const;
+  private config: AppConfig;
+  private sharing = false;
+
+  constructor(config: AppConfig) {
+    this.config = config;
+  }
+
+  /** True between a successful start() and the next stop(). */
+  isActive() {
+    return this.sharing;
+  }
+
+  /** Returns the viewing link, or a hint when NOVNC_URL is not configured. */
+  async start(_ctx: StreamContext): Promise<string> {
+    const url = this.config.novncUrl;
+    if (url) {
+      this.sharing = true;
+      return `🖥️ VNC 화면 실시간 보기 (브라우저): ${this.config.novncUrl}`;
+    }
+    return "NOVNC_URL이 설정되지 않았습니다 (.env). 예: http://192.168.10.9:6080/vnc.html";
+  }
+
+  /** Clears the active flag. */
+  async stop(): Promise<void> {
+    this.sharing = false;
+  }
+}
--- a/bot/src/stream/screenshot.ts
+++ b/bot/src/stream/screenshot.ts
@@ -0,0 +1,62 @@
+/**
+ * Screenshot backend (safe, no ban risk, not real-time).
+ *
+ * Periodically grabs a frame from the VNC X display with ffmpeg's x11grab and
+ * posts it to the invoking text channel. Low FPS, but works with a normal bot
+ * account and never touches Discord's selfbot surface.
+ */
+import { spawn } from "node:child_process";
+import type { AppConfig } from "../config.ts";
+import type { ScreenStreamer, StreamContext } from "./index.ts";
+
+/**
+ * Captures a single PNG frame from an X display using ffmpeg's x11grab.
+ *
+ * @param display - X display to grab, passed to ffmpeg as the input (e.g. ":1").
+ * @param size - Capture size passed as `-video_size` (e.g. "1920x1080").
+ * @returns The PNG bytes written to ffmpeg's stdout.
+ * @throws Error when ffmpeg cannot be spawned, exits non-zero (the message
+ *   includes what ffmpeg printed on stderr), or produces no output.
+ */
+function grabFrame(display: string, size: string): Promise<Buffer> {
+  return new Promise((resolve, reject) => {
+    const ff = spawn("ffmpeg", [
+      "-loglevel", "error",
+      "-f", "x11grab",
+      "-video_size", size,
+      "-i", display,
+      "-frames:v", "1",
+      "-f", "image2pipe",
+      "-vcodec", "png",
+      "pipe:1",
+    ]);
+    const frame: Buffer[] = [];
+    const diagnostics: Buffer[] = [];
+    ff.stdout.on("data", (c: Buffer) => frame.push(c));
+    // Drain stderr so the pipe never fills, and keep it for the error message.
+    ff.stderr.on("data", (c: Buffer) => diagnostics.push(c));
+    ff.on("error", reject);
+    ff.on("close", (code) => {
+      if (code !== 0) {
+        const detail = Buffer.concat(diagnostics).toString("utf8").trim();
+        reject(new Error(`ffmpeg exited ${code}${detail ? `: ${detail}` : ""}`));
+        return;
+      }
+      const png = Buffer.concat(frame);
+      if (png.length === 0) {
+        reject(new Error("ffmpeg exited 0 but produced no image data"));
+        return;
+      }
+      resolve(png);
+    });
+  });
+}
+
+/**
+ * Backend that uploads a screenshot of the VNC display at a fixed interval
+ * through the context's postImage callback.
+ */
+export class ScreenshotStreamer implements ScreenStreamer {
+  readonly kind = "screenshot" as const;
+  private timer: ReturnType<typeof setInterval> | null = null;
+  constructor(private config: AppConfig) {}
+
+  /** True while the periodic capture timer is running. */
+  isActive() {
+    return this.timer !== null;
+  }
+
+  /**
+   * Starts periodic capture and posts one frame immediately.
+   *
+   * @returns A user-facing status message (also returned, without starting
+   *   anything, when there is no postImage callback or capture is already running).
+   */
+  async start(ctx: StreamContext): Promise<string> {
+    if (!ctx.postImage) return "스크린샷을 올릴 텍스트 채널 컨텍스트가 없습니다.";
+    if (this.timer) return "이미 스크린샷 송출 중입니다.";
+    const postImage = ctx.postImage;
+
+    // A NaN or sub-second interval would make setInterval fire almost
+    // continuously and spawn ffmpeg in a tight loop; fall back to 5 seconds.
+    const configured = this.config.screenshotIntervalSec;
+    const intervalSec = Number.isFinite(configured) && configured >= 1 ? configured : 5;
+
+    // Skip a tick while the previous capture/upload is still running, so slow
+    // uploads cannot pile up overlapping ffmpeg processes.
+    let inFlight = false;
+    const tick = async (): Promise<void> => {
+      if (inFlight) return;
+      inFlight = true;
+      try {
+        const png = await grabFrame(this.config.vncDisplay, this.config.vncResolution);
+        // Stopped (or restarted) while capturing: drop the frame.
+        if (this.timer !== timer) return;
+        await postImage(png, "vnc.png");
+      } catch (e) {
+        console.error("[screenshot] grab failed:", e);
+      } finally {
+        inFlight = false;
+      }
+    };
+    const timer = setInterval(() => void tick(), intervalSec * 1000);
+    this.timer = timer;
+    void tick();
+    return `📸 ${intervalSec}초마다 VNC 스크린샷을 이 채널에 올립니다.`;
+  }
+
+  /** Cancels the periodic capture; a capture already in flight is discarded. */
+  async stop(): Promise<void> {
+    if (this.timer) clearInterval(this.timer);
+    this.timer = null;
+  }
+}
--- a/bot/src/stream/selfbot.ts
+++ b/bot/src/stream/selfbot.ts
@@ -0,0 +1,116 @@
+/**
+ * Selfbot live-stream backend (default).
+ *
+ * Streams the VNC X display (:1) into the voice channel as a real Discord
+ * "Go Live" broadcast. Discord blocks video from *bot* accounts, so this path
+ * requires a USER account token (a "selfbot"), which violates Discord ToS and
+ * can get the account banned. Use a throwaway/burner account, never your main.
+ *
+ * Dependencies are optional (native): install with
+ *   bun add discord.js-selfbot-v13 @dank074/discord-video-stream
+ * They are dynamically imported so the core bot installs/runs without them.
+ *
+ * Library API targets @dank074/discord-video-stream v6 (Streamer / prepareStream
+ * / playStream). If a different major is installed, the import guard below will
+ * point you at the docs rather than crash cryptically.
+ */
+import type { AppConfig } from "../config.ts";
+import type { ScreenStreamer, StreamContext } from "./index.ts";
+
+/**
+ * Backend that logs in a user ("selfbot") account and broadcasts the VNC
+ * display into a voice channel through @dank074/discord-video-stream.
+ *
+ * Lifecycle: `controller` is non-null from the moment a start begins until the
+ * stream is stopped or ends, and is what guards against concurrent starts.
+ * Every failure path and a stream that ends on its own go through stop(), so
+ * the logged-in client is never left behind.
+ */
+export class SelfbotStreamer implements ScreenStreamer {
+  readonly kind = "selfbot" as const;
+  private config: AppConfig;
+  private streamer: any = null;
+  private controller: AbortController | null = null;
+  private active = false;
+
+  constructor(config: AppConfig) {
+    this.config = config;
+  }
+
+  /** True while a stream is being played. */
+  isActive() {
+    return this.active;
+  }
+
+  /**
+   * Dynamically imports the two optional libraries and checks that the video
+   * library exposes Streamer / prepareStream / playStream.
+   *
+   * @throws Error with install/version guidance when either check fails.
+   */
+  private async loadLib() {
+    let selfbot: any, videoStream: any;
+    try {
+      selfbot = await import("discord.js-selfbot-v13");
+      // Optional native dep; resolved at runtime only. Version/name can vary by
+      // upstream release, so we don't hard-bind its types at compile time.
+      // @ts-ignore - optional dependency, may be absent until `bun add`ed
+      videoStream = await import("@dank074/discord-video-stream");
+    } catch (e) {
+      throw new Error(
+        "셀프봇 송출 의존성이 없습니다. 설치: bun add discord.js-selfbot-v13 @dank074/discord-video-stream\n" +
+          `원본 오류: ${(e as Error).message}`,
+      );
+    }
+    if (!videoStream.Streamer || !videoStream.prepareStream || !videoStream.playStream) {
+      throw new Error(
+        "@dank074/discord-video-stream v6 API(Streamer/prepareStream/playStream)를 찾지 못했습니다. " +
+          "package.json 버전을 ^4.2.1(=v6 npm 태그)로 맞추거나 docs를 확인하세요.",
+      );
+    }
+    return { selfbot, videoStream };
+  }
+
+  /**
+   * Logs the selfbot account in, joins the voice channel and starts streaming.
+   *
+   * @returns A user-facing status message (also returned, without starting
+   *   anything, when a stream is already running/starting or no token is set).
+   * @throws Whatever the libraries throw during setup; in that case the
+   *   partially created client is torn down first.
+   */
+  async start(ctx: StreamContext): Promise<string> {
+    // `controller` is set for the whole start..stop window, so this also
+    // rejects a second start issued while the first is still logging in.
+    if (this.active || this.controller) return "이미 송출 중입니다.";
+    if (!this.config.selfbotToken) {
+      return "DISCORD_SELFBOT_TOKEN이 설정되지 않았습니다 (.env). 버너 계정 토큰을 넣어주세요.";
+    }
+
+    const controller = new AbortController();
+    this.controller = controller;
+    try {
+      const { selfbot, videoStream } = await this.loadLib();
+      const { Streamer, prepareStream, playStream, Utils } = videoStream;
+
+      this.streamer = new Streamer(new selfbot.Client());
+      await this.streamer.client.login(this.config.selfbotToken);
+      await this.streamer.joinVoice(ctx.guildId, ctx.voiceChannelId);
+
+      // Fall back to 1920x1080 when VNC_RESOLUTION is not "<width>x<height>".
+      const [rawWidth, rawHeight] = this.config.vncResolution.split("x");
+      const width = parseInt(rawWidth ?? "1920", 10);
+      const height = parseInt(rawHeight ?? "1080", 10);
+
+      // Grab the VNC X display with ffmpeg's x11grab and let the library
+      // encode/transport it.
+      // NOTE(review): nothing here selects a hardware encoder, although an
+      // earlier comment said NVENC is used if available — confirm.
+      // NOTE(review): confirm that the installed library version accepts the
+      // "x11grab:<display>" input form and the inputFormat/inputSize options.
+      const input = `x11grab:${this.config.vncDisplay}`;
+      const { command, output } = prepareStream(
+        input,
+        {
+          width: Number.isFinite(width) ? width : 1920,
+          height: Number.isFinite(height) ? height : 1080,
+          frameRate: this.config.vncFramerate,
+          bitrateVideo: this.config.vncBitrateKbps,
+          videoCodec: Utils?.normalizeVideoCodec ? Utils.normalizeVideoCodec("H264") : "H264",
+          // x11grab needs to be set as the input format for ffmpeg
+          customHeaders: undefined,
+          inputFormat: "x11grab",
+          inputSize: this.config.vncResolution,
+        },
+        controller.signal,
+      );
+
+      command.on("error", (err: Error) => {
+        // An error caused by our own abort is expected; anything else is logged.
+        if (!controller.signal.aborted) console.error("[selfbot] ffmpeg error:", err);
+      });
+
+      this.active = true;
+      // Fire-and-forget; resolves when the stream ends.
+      playStream(output, this.streamer, { type: "go-live" })
+        .catch((err: Error) => console.error("[selfbot] playStream:", err))
+        .finally(() => {
+          // Only clean up if this is still the current stream: after a
+          // stop()/start() cycle the controller belongs to a newer stream
+          // whose state must not be touched.
+          if (this.controller === controller) void this.stop();
+        });
+    } catch (err) {
+      // Setup failed part-way: release the client/voice connection so the
+      // next start() begins from a clean state.
+      await this.stop();
+      throw err;
+    }
+
+    return "🔴 셀프봇으로 VNC 화면을 음성채널에 실시간 송출 중입니다 (Go Live).";
+  }
+
+  /** Aborts the encoder, leaves voice, destroys the selfbot client and resets state. */
+  async stop(): Promise<void> {
+    this.controller?.abort();
+    this.controller = null;
+    try {
+      this.streamer?.leaveVoice?.();
+      this.streamer?.client?.destroy?.();
+    } catch {
+      /* ignore */
+    }
+    this.streamer = null;
+    this.active = false;
+  }
+}
--- a/bot/src/voice.ts
+++ b/bot/src/voice.ts
@@ -0,0 +1,169 @@
+/**
+ * Discord voice I/O.
+ *
+ * - Joins the caller's voice channel.
+ * - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
+ *   forwards the utterance (as a WAV) to the brain bridge.
+ * - Plays the brain's spoken reply back into the channel.
+ *
+ * No AI logic here — capture in, audio out. The brain lives in bridge/.
+ */
+import { Readable } from "node:stream";
+import {
+  joinVoiceChannel,
+  createAudioPlayer,
+  createAudioResource,
+  EndBehaviorType,
+  StreamType,
+  VoiceConnection,
+  VoiceConnectionStatus,
+  entersState,
+  type AudioPlayer,
+} from "@discordjs/voice";
+import prism from "prism-media";
+import type { VoiceBasedChannel } from "discord.js";
+import { converse, decodeWav } from "./bridge.ts";
+import { config } from "./config.ts";
+
+// Sample rate (Hz) used for the Opus decoder and for the WAV sent to the bridge.
+const DISCORD_RATE = 48000;
+// Channel count the Opus decoder is configured to output (stereo).
+const DISCORD_CHANNELS = 2;
+
+/**
+ * Wraps raw little-endian 16-bit mono samples in a 44-byte canonical WAV header.
+ *
+ * @param pcm - Raw PCM16 mono sample bytes.
+ * @param sampleRate - Sample rate in Hz written into the header.
+ * @returns Header followed by the unmodified sample bytes.
+ */
+function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
+  const HEADER_BYTES = 44;
+  const CHANNELS = 1;
+  const BYTES_PER_SAMPLE = 2;
+  const payloadBytes = pcm.length;
+
+  const hdr = Buffer.alloc(HEADER_BYTES);
+  // RIFF container: size field excludes the 8 bytes of "RIFF" + the size itself.
+  hdr.write("RIFF", 0);
+  hdr.writeUInt32LE(HEADER_BYTES - 8 + payloadBytes, 4);
+  hdr.write("WAVE", 8);
+  // "fmt " chunk: 16-byte body, format tag 1 = PCM.
+  hdr.write("fmt ", 12);
+  hdr.writeUInt32LE(16, 16);
+  hdr.writeUInt16LE(1, 20);
+  hdr.writeUInt16LE(CHANNELS, 22);
+  hdr.writeUInt32LE(sampleRate, 24);
+  hdr.writeUInt32LE(sampleRate * CHANNELS * BYTES_PER_SAMPLE, 28); // byte rate
+  hdr.writeUInt16LE(CHANNELS * BYTES_PER_SAMPLE, 32); // block align
+  hdr.writeUInt16LE(BYTES_PER_SAMPLE * 8, 34); // bits per sample
+  // "data" chunk header.
+  hdr.write("data", 36);
+  hdr.writeUInt32LE(payloadBytes, 40);
+
+  return Buffer.concat([hdr, pcm]);
+}
+
+/**
+ * Downmix interleaved stereo PCM16 to mono PCM16 by averaging left and right.
+ *
+ * A trailing partial frame (input length not a multiple of 4 bytes) is
+ * ignored; previously it produced a fractional sample count and an
+ * out-of-range read.
+ *
+ * @param stereo - Interleaved little-endian 16-bit L/R samples.
+ * @returns One 16-bit sample per complete stereo frame.
+ */
+function stereoToMono(stereo: Buffer): Buffer {
+  const frames = Math.floor(stereo.length / 4); // 2 ch * 2 bytes per frame
+  const mono = Buffer.alloc(frames * 2);
+  for (let i = 0; i < frames; i++) {
+    const l = stereo.readInt16LE(i * 4);
+    const r = stereo.readInt16LE(i * 4 + 2);
+    mono.writeInt16LE((l + r) >> 1, i * 2);
+  }
+  return mono;
+}
+
+/**
+ * One guild's voice connection: captures each speaker's utterance, sends it
+ * to the bridge, and plays the spoken reply back into the channel.
+ */
+export class VoiceSession {
+  readonly guildId: string;
+  private connection: VoiceConnection;
+  private player: AudioPlayer;
+  /** User ids with a capture (or its bridge round-trip) currently in progress. */
+  private listening = new Set<string>();
+  /** Optional callback to surface transcripts/replies to a text channel. */
+  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;
+
+  /** Joins `channel` (undeafened, unmuted), wires up playback and starts listening. */
+  constructor(channel: VoiceBasedChannel) {
+    this.guildId = channel.guild.id;
+    this.connection = joinVoiceChannel({
+      channelId: channel.id,
+      guildId: channel.guild.id,
+      adapterCreator: channel.guild.voiceAdapterCreator,
+      selfDeaf: false, // we need to hear users
+      selfMute: false,
+    });
+    this.player = createAudioPlayer();
+    // Without a listener, an "error" event from the player (e.g. a reply that
+    // cannot be decoded) would be thrown as an uncaught exception.
+    this.player.on("error", (err) => console.error("[voice] playback failed:", err));
+    this.connection.subscribe(this.player);
+    this.attachReceiver();
+  }
+
+  /** Resolves once the connection reaches Ready; rejects after 20 seconds. */
+  async ready(): Promise<void> {
+    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
+  }
+
+  /** Starts a capture whenever a user who is not already being captured begins speaking. */
+  private attachReceiver() {
+    const receiver = this.connection.receiver;
+    receiver.speaking.on("start", (userId: string) => {
+      if (this.listening.has(userId)) return;
+      this.listening.add(userId);
+      this.captureUtterance(userId)
+        .catch((err) => console.error("[voice] capture failed:", err))
+        .finally(() => this.listening.delete(userId));
+    });
+  }
+
+  /**
+   * Records one utterance from `userId` (until `config.silenceMs` of silence),
+   * sends it to the bridge as a mono WAV and plays back the reply audio.
+   */
+  private async captureUtterance(userId: string): Promise<void> {
+    const opusStream = this.connection.receiver.subscribe(userId, {
+      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
+    });
+    const decoder = new prism.opus.Decoder({
+      frameSize: 960,
+      channels: DISCORD_CHANNELS,
+      rate: DISCORD_RATE,
+    });
+    const chunks: Buffer[] = [];
+    const pcmStream = opusStream.pipe(decoder);
+    pcmStream.on("data", (c: Buffer) => chunks.push(c));
+
+    // Wait until decoding is over for any reason. pipe() neither forwards
+    // errors nor ends the decoder when the source is destroyed, so waiting for
+    // "end" alone can hang forever and leave the user stuck in `listening`.
+    await new Promise<void>((resolve) => {
+      pcmStream.once("end", () => resolve());
+      pcmStream.once("close", () => resolve());
+      pcmStream.once("error", (err: Error) => {
+        console.error("[voice] opus decode failed:", err);
+        resolve();
+      });
+      opusStream.once("error", (err: Error) => {
+        console.error("[voice] receive stream failed:", err);
+        decoder.destroy();
+      });
+      opusStream.once("close", () => {
+        // Source went away without a normal end (e.g. connection destroyed).
+        if (!decoder.writableEnded) decoder.destroy();
+      });
+    });
+
+    if (!chunks.length) return;
+    // The session was torn down while the user was speaking: nothing to answer.
+    if (this.connection.state.status === VoiceConnectionStatus.Destroyed) return;
+    const mono = stereoToMono(Buffer.concat(chunks));
+    // Ignore blips shorter than ~300ms (likely noise / key clicks).
+    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
+    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
+
+    try {
+      const result = await converse(wav);
+      if (result.transcript) {
+        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
+      }
+      const audio = decodeWav(result.audio_b64);
+      if (audio) this.play(audio);
+    } catch (err) {
+      console.error("[voice] converse failed:", err);
+    }
+  }
+
+  /** Play a WAV buffer into the channel. */
+  play(wav: Buffer) {
+    const resource = createAudioResource(Readable.from(wav), {
+      inputType: StreamType.Arbitrary,
+    });
+    this.player.play(resource);
+  }
+
+  /** Tears down the voice connection; safe to call more than once. */
+  destroy() {
+    try {
+      this.connection.destroy();
+    } catch {
+      /* already gone */
+    }
+  }
+}
+
+/** One session per guild. */
+const sessions = new Map<string, VoiceSession>();
+
+/**
+ * Replaces the guild's voice session with a new one in `channel` and waits
+ * for it to become ready.
+ *
+ * @returns The ready session.
+ * @throws If the connection does not become ready in time. The failed session
+ *   is destroyed and unregistered first, so it is not reported as connected
+ *   and does not linger in the channel.
+ */
+export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
+  const guildId = channel.guild.id;
+  sessions.get(guildId)?.destroy();
+  const session = new VoiceSession(channel);
+  sessions.set(guildId, session);
+  try {
+    await session.ready();
+  } catch (err) {
+    session.destroy();
+    // Only unregister if a newer join has not already replaced this session.
+    if (sessions.get(guildId) === session) sessions.delete(guildId);
+    throw err;
+  }
+  return session;
+}
+
+/**
+ * Destroys and forgets the guild's voice session.
+ *
+ * @returns true when a session existed, false otherwise.
+ */
+export function leaveGuild(guildId: string): boolean {
+  const current = sessions.get(guildId);
+  if (current) {
+    current.destroy();
+    sessions.delete(guildId);
+    return true;
+  }
+  return false;
+}
+
+/** Looks up the guild's registered voice session, if any. */
+export function getSession(guildId: string): VoiceSession | undefined {
+  const current = sessions.get(guildId);
+  return current;
+}