Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

52
bot/src/bridge.ts Normal file
View File

@@ -0,0 +1,52 @@
/**
* HTTP client for the Python brain bridge (bridge/server.py).
* All AI work (STT, reply engine, TTS) lives behind these calls.
*/
import { config } from "./config.ts";
/** Reply payload returned by the bridge's text endpoint. */
export interface TextResult {
  reply: string;
  error?: string | null;
  /** base64-encoded 16-bit PCM WAV of the spoken reply, or null if TTS off */
  audio_b64?: string | null;
}

/** Reply payload of a full voice turn: the text result plus what was transcribed. */
export interface ConverseResult extends TextResult {
  transcript: string;
  language?: string | null;
}
/**
 * Full voice turn: POSTs a WAV to the bridge's `/converse` endpoint and
 * returns the parsed JSON body ({transcript, reply, reply audio}).
 *
 * @param wav - WAV bytes, sent as the request body with content-type audio/wav.
 * @returns The bridge's JSON response.
 * @throws Error carrying the HTTP status and response text when the status is not OK.
 */
export async function converse(wav: Buffer): Promise<ConverseResult> {
  const endpoint = `${config.bridgeUrl}/converse`;
  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "content-type": "audio/wav" },
    body: wav,
  });
  if (response.ok) {
    return (await response.json()) as ConverseResult;
  }
  const detail = await response.text();
  throw new Error(`bridge /converse ${response.status}: ${detail}`);
}
/**
 * Text-only turn (used by /자비스 ask): POSTs `{ text }` as JSON to the
 * bridge's `/text` endpoint and returns the parsed JSON body.
 *
 * @param text - The question to forward.
 * @returns The bridge's JSON response.
 * @throws Error carrying the HTTP status and response text when the status is not OK.
 */
export async function ask(text: string): Promise<TextResult> {
  const endpoint = `${config.bridgeUrl}/text`;
  const payload = JSON.stringify({ text });
  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: payload,
  });
  if (response.ok) {
    return (await response.json()) as TextResult;
  }
  const detail = await response.text();
  throw new Error(`bridge /text ${response.status}: ${detail}`);
}
/**
 * Fetches the bridge's `/health` endpoint and returns its parsed JSON body.
 *
 * The body is returned untyped; callers read fields such as `brain_ready`
 * from it directly.
 *
 * @throws Error carrying the HTTP status and response text when the status is
 *   not OK, matching the error format used by `converse` and `ask`. Without
 *   this check a non-JSON error page would surface as an opaque parse error.
 */
export async function health(): Promise<any> {
  const res = await fetch(`${config.bridgeUrl}/health`);
  if (!res.ok) throw new Error(`bridge /health ${res.status}: ${await res.text()}`);
  return res.json();
}
/**
 * Decodes a base64 payload into raw bytes.
 *
 * @param audio_b64 - Base64 text, or null/undefined/empty when there is no audio.
 * @returns The decoded bytes, or null when no payload was given.
 */
export function decodeWav(audio_b64?: string | null): Buffer | null {
  return audio_b64 ? Buffer.from(audio_b64, "base64") : null;
}

55
bot/src/config.ts Normal file
View File

@@ -0,0 +1,55 @@
/**
* Centralised, typed configuration loaded from environment (.env at repo root).
* Nothing else in the bot reads process.env directly.
*/
import "dotenv/config";
/**
 * Reads a required environment variable.
 *
 * @param name - Variable name.
 * @returns The variable's value.
 * @throws Error when the variable is unset or empty.
 */
function req(name: string): string {
  const value = process.env[name];
  if (value) return value;
  throw new Error(`Missing required env var: ${name} (see .env.example)`);
}
/**
 * Reads an optional environment variable.
 *
 * @param name - Variable name.
 * @param fallback - Value used when the variable is unset (an empty value is kept as-is).
 * @returns The variable's value, or the fallback.
 */
function opt(name: string, fallback = ""): string {
  const value = process.env[name];
  if (value === undefined) return fallback;
  return value;
}
/** Screen-broadcast backends selectable through STREAM_BACKEND. */
export type StreamBackend = "selfbot" | "novnc" | "screenshot" | "none";

/** Every accepted STREAM_BACKEND value, used to validate the env var. */
const STREAM_BACKENDS: readonly StreamBackend[] = ["selfbot", "novnc", "screenshot", "none"];

/**
 * Reads a stream backend from the environment.
 * An unrecognised value disables screen sharing ("none") with a warning
 * instead of being cast blindly to the union type.
 */
function streamBackendOpt(name: string, fallback: StreamBackend): StreamBackend {
  const raw = opt(name, fallback);
  const known = STREAM_BACKENDS.find((backend) => backend === raw);
  if (known) return known;
  console.warn(
    `[config] ${name}="${raw}" is not one of ${STREAM_BACKENDS.join("/")}; screen sharing disabled.`,
  );
  return "none";
}

/**
 * Reads a positive integer from the environment.
 * Falls back (with a warning) when the value is empty, non-numeric or not
 * positive, so NaN never reaches timers or ffmpeg arguments.
 */
function positiveIntOpt(name: string, fallback: number): number {
  const raw = opt(name, String(fallback));
  const parsed = parseInt(raw, 10);
  if (Number.isFinite(parsed) && parsed > 0) return parsed;
  console.warn(`[config] ${name}="${raw}" is not a positive integer; using ${fallback}.`);
  return fallback;
}

export const config = {
  // --- Normal Discord bot (voice I/O, slash commands) ---
  botToken: req("DISCORD_BOT_TOKEN"),
  appId: req("DISCORD_APP_ID"),
  guildId: req("DISCORD_GUILD_ID"),

  // --- Python brain bridge ---
  bridgeUrl: opt("BRIDGE_URL", "http://127.0.0.1:8765"),

  // --- VNC screen broadcast ---
  // selfbot    = real live "Go Live" stream via a user (burner) account token
  // novnc      = post a noVNC web link the channel can open in a browser
  // screenshot = periodically upload VNC screenshots
  // none       = disable screen sharing
  streamBackend: streamBackendOpt("STREAM_BACKEND", "selfbot"),
  // x11grab source for the VNC display (TigerVNC runs the desktop on :1)
  vncDisplay: opt("VNC_DISPLAY", ":1"),
  vncResolution: opt("VNC_RESOLUTION", "1920x1080"),
  vncFramerate: positiveIntOpt("VNC_FRAMERATE", 30),
  vncBitrateKbps: positiveIntOpt("VNC_BITRATE_KBPS", 4000),

  // selfbot backend (ToS-risk; use a throwaway account token, never your main)
  selfbotToken: opt("DISCORD_SELFBOT_TOKEN"),

  // novnc backend
  novncUrl: opt("NOVNC_URL", ""),

  // screenshot backend
  screenshotIntervalSec: positiveIntOpt("SCREENSHOT_INTERVAL_SEC", 5),

  // --- Voice behaviour ---
  // Silence duration (ms) after which a captured utterance is considered finished.
  silenceMs: positiveIntOpt("VOICE_SILENCE_MS", 800),
};

/** Shape of the loaded configuration object. */
export type AppConfig = typeof config;

148
bot/src/index.ts Normal file
View File

@@ -0,0 +1,148 @@
/**
* Javis bot entry point.
*
* A normal Discord bot that:
* - exposes /자비스 (join / leave / ask / stream / stop / status)
* - replies to every slash command EPHEMERALLY (only the invoker sees it)
* - joins the caller's voice channel for live voice conversation (brain in bridge/)
* - broadcasts the VNC screen via a pluggable backend (selfbot / novnc / screenshot)
*/
import {
Client,
GatewayIntentBits,
MessageFlags,
type ChatInputCommandInteraction,
type GuildMember,
type TextBasedChannel,
} from "discord.js";
import { AttachmentBuilder } from "discord.js";
import { config } from "./config.ts";
import { ask, health } from "./bridge.ts";
import { joinChannel, leaveGuild, getSession } from "./voice.ts";
import { createStreamer, type ScreenStreamer, type StreamContext } from "./stream/index.ts";
// Gateway intents requested by the bot: guild data and voice-state updates.
const gatewayIntents = [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates];

/** The single Discord client used by every handler in this file. */
const client = new Client({ intents: gatewayIntents });
/** Screen streamers created so far, keyed by guild id. */
const streamers = new Map<string, ScreenStreamer>();

/**
 * Returns the streamer for a guild, creating and caching one on first use.
 *
 * @param guildId - Guild whose streamer is wanted.
 */
async function getStreamer(guildId: string): Promise<ScreenStreamer> {
  const cached = streamers.get(guildId);
  if (cached) return cached;

  const created = await createStreamer(config);
  streamers.set(guildId, created);
  return created;
}
/** Reply options that mark a response as ephemeral. */
const eph = { flags: MessageFlags.Ephemeral } as const;

/** Logs the logged-in account tag and the configured stream backend. */
function announceReady(): void {
  const tag = client.user?.tag;
  console.log(`✓ 로그인: ${tag} | stream backend: ${config.streamBackend}`);
}

client.once("clientReady", announceReady);
/**
 * Dispatches /자비스 subcommands to their handlers.
 *
 * Everything after the command-name check runs inside one try block so that
 * no failure escapes this async listener as an unhandled promise rejection:
 * - reading the subcommand can itself throw, so it happens inside the try;
 * - an unrecognised subcommand gets an explicit reply instead of silence;
 * - a failure while sending the error reply is logged, not rethrown.
 */
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) return;
  if (interaction.commandName !== "자비스") return;
  const i = interaction as ChatInputCommandInteraction;

  let sub = "?";
  try {
    sub = i.options.getSubcommand();
    switch (sub) {
      case "join":
        await handleJoin(i);
        break;
      case "leave":
        await handleLeave(i);
        break;
      case "ask":
        await handleAsk(i);
        break;
      case "stream":
        await handleStream(i);
        break;
      case "stop":
        await handleStop(i);
        break;
      case "status":
        await handleStatus(i);
        break;
      default:
        await i.reply({ content: `알 수 없는 하위 명령입니다: ${sub}`, ...eph });
    }
  } catch (err) {
    console.error(`[/자비스 ${sub}]`, err);
    const detail = err instanceof Error ? err.message : String(err);
    // Same length cap as handleAsk, so an oversized error text cannot make the reply itself fail.
    const msg = `오류: ${detail}`.slice(0, 1900);
    try {
      if (i.deferred || i.replied) await i.editReply(msg);
      else await i.reply({ content: msg, ...eph });
    } catch (replyErr) {
      // The error reply itself failed; log it and stop here.
      console.error(`[/자비스 ${sub}] 오류 응답 전송 실패:`, replyErr);
    }
  }
});
/**
 * /자비스 join — joins the invoker's current voice channel.
 * Replies with a hint (and does nothing else) when the invoker is not in a
 * voice channel. Once joined, each turn's transcript and reply are logged.
 */
async function handleJoin(i: ChatInputCommandInteraction) {
  const caller = i.member as GuildMember;
  const voiceChannel = caller?.voice?.channel;
  if (!voiceChannel) {
    return i.reply({ content: "먼저 음성 채널에 들어간 뒤 다시 호출해주세요.", ...eph });
  }

  await i.deferReply(eph);
  const session = await joinChannel(voiceChannel);
  session.onTurn = (turn) => {
    console.log(`🗣️ ${turn.transcript}\n🤖 ${turn.reply}`);
  };
  await i.editReply(`🎙️ '${voiceChannel.name}' 채널에 접속했습니다. 말씀하세요.`);
}
/** /자비스 leave — ends the guild's voice session and reports whether one existed. */
async function handleLeave(i: ChatInputCommandInteraction) {
  const content = leaveGuild(i.guildId!)
    ? "음성 채널에서 나갔습니다."
    : "접속 중인 세션이 없습니다.";
  await i.reply({ content, ...eph });
}
/**
 * /자비스 ask — forwards the "질문" option to the bridge and shows the answer.
 * Falls back to the bridge's error text, then to a placeholder, and truncates
 * the output to 1900 characters.
 */
async function handleAsk(i: ChatInputCommandInteraction) {
  const question = i.options.getString("질문", true);
  await i.deferReply(eph);

  const { reply, error } = await ask(question);
  const text = reply || error || "(응답 없음)";
  await i.editReply(text.slice(0, 1900));
}
/**
 * /자비스 stream — starts the guild's screen streamer.
 * The context handed to the streamer carries the invoker's voice channel id
 * ("" when not in one) and a callback that uploads an image to the invoking
 * channel. The selfbot backend is refused when there is no voice channel.
 */
async function handleStream(i: ChatInputCommandInteraction) {
  const caller = i.member as GuildMember;
  await i.deferReply(eph);
  const streamer = await getStreamer(i.guildId!);

  // Uploads one image to the channel the command came from, if it can be sent to.
  const postImage = async (png: Buffer, name: string): Promise<void> => {
    const target = i.channel as TextBasedChannel | null;
    if (!target || !("send" in target)) return;
    await (target as any).send({ files: [new AttachmentBuilder(png, { name })] });
  };

  const ctx: StreamContext = {
    guildId: i.guildId!,
    voiceChannelId: caller?.voice?.channelId ?? "",
    postImage,
  };

  const needsVoiceChannel = config.streamBackend === "selfbot";
  if (needsVoiceChannel && !ctx.voiceChannelId) {
    return i.editReply("셀프봇 송출은 음성 채널 안에서 호출해야 합니다. 음성 채널에 들어간 뒤 다시 시도하세요.");
  }

  const status = await streamer.start(ctx);
  await i.editReply(status);
}
/** /자비스 stop — stops the guild's streamer, or says none exists yet. */
async function handleStop(i: ChatInputCommandInteraction) {
  const existing = streamers.get(i.guildId!);
  if (existing === undefined) {
    return i.reply({ content: "송출 중이 아닙니다.", ...eph });
  }
  await existing.stop();
  await i.reply({ content: "송출을 중단했습니다.", ...eph });
}
/**
 * /자비스 status — reports bridge health, whether a voice session exists and
 * the streaming backend's state. Any failure while querying or reading the
 * health response is shown as "unreachable".
 */
async function handleStatus(i: ChatInputCommandInteraction) {
  await i.deferReply(eph);

  const describeBrain = async (): Promise<string> => {
    try {
      const h = await health();
      if (h.brain_ready) return "ready";
      const suffix = h.brain_error ? " (" + h.brain_error + ")" : "";
      return `not-ready${suffix}`;
    } catch {
      return "unreachable";
    }
  };

  const brain = await describeBrain();
  const session = getSession(i.guildId!);
  const streamer = streamers.get(i.guildId!);

  const lines = [
    `브릿지 두뇌: ${brain}`,
    `음성 세션: ${session ? "접속 중" : "없음"}`,
    `송출 백엔드: ${config.streamBackend} (${streamer?.isActive() ? "활성" : "대기"})`,
  ];
  await i.editReply(lines.join("\n"));
}
// Log in; a rejected login is reported and ends the process with a non-zero
// exit code instead of surfacing as an unhandled promise rejection.
client.login(config.botToken).catch((err: unknown) => {
  console.error("디스코드 로그인 실패:", err);
  process.exit(1);
});

View File

@@ -0,0 +1,42 @@
/**
* Registers the /자비스 slash command (guild-scoped for instant availability).
* Run once after changing the command shape: bun run register
*/
import { REST, Routes, SlashCommandBuilder } from "discord.js";
import { config } from "./config.ts";
/**
 * Builder for the /자비스 slash command.
 *
 * Declares six subcommands: join, leave, ask, stream, stop and status.
 * Only `ask` takes an option: a required string option named "질문".
 */
export const jarvisCommand = new SlashCommandBuilder()
.setName("자비스")
.setDescription("자비스 음성 비서를 제어합니다")
.addSubcommand((s) =>
s.setName("join").setDescription("당신이 있는 음성 채널에 접속해 듣기 시작합니다"),
)
.addSubcommand((s) => s.setName("leave").setDescription("음성 채널에서 나갑니다"))
.addSubcommand((s) =>
s
.setName("ask")
.setDescription("텍스트로 자비스에게 질문합니다")
.addStringOption((o) =>
o.setName("질문").setDescription("질문 내용").setRequired(true),
),
)
.addSubcommand((s) =>
s.setName("stream").setDescription("VNC 화면을 디스코드에 송출합니다"),
)
.addSubcommand((s) => s.setName("stop").setDescription("VNC 화면 송출을 중단합니다"))
.addSubcommand((s) => s.setName("status").setDescription("브릿지/세션 상태를 봅니다"));
/**
 * Uploads the /자비스 command definition as the configured guild's command
 * list (a PUT to the application-guild-commands route), then logs success.
 */
export async function registerCommands() {
  const { appId, guildId, botToken } = config;
  const rest = new REST({ version: "10" }).setToken(botToken);
  const route = Routes.applicationGuildCommands(appId, guildId);

  await rest.put(route, { body: [jarvisCommand.toJSON()] });
  console.log("✓ /자비스 명령어 등록 완료 (guild:", guildId, ")");
}
// When this file is the entry module, register the command; on failure log
// the error and exit with status 1.
if (import.meta.main) {
  const onFailure = (e: unknown) => {
    console.error("명령어 등록 실패:", e);
    process.exit(1);
  };
  registerCommands().catch(onFailure);
}

51
bot/src/stream/index.ts Normal file
View File

@@ -0,0 +1,51 @@
/**
* Pluggable VNC screen-broadcast backends.
*
* Per the chosen design (option 1): the streaming method is swappable via
* STREAM_BACKEND in .env. The default is the real live "Go Live" stream via a
* selfbot account (only way to get a native Discord video broadcast), with safe
* fallbacks (noVNC link / periodic screenshots) available without code changes.
*/
import type { AppConfig } from "../config.ts";
/** Per-invocation data passed to a streamer's `start()`. */
export interface StreamContext {
/** Id of the guild the broadcast is started for. */
guildId: string;
/** Id of the voice channel to broadcast into. */
voiceChannelId: string;
/** Post an image to the invoking text channel (used by the screenshot backend). */
postImage?: (png: Buffer, name: string) => Promise<void>;
}
/** Contract implemented by every screen-broadcast backend. */
export interface ScreenStreamer {
/** Which backend this streamer is; one of the configured `streamBackend` values. */
readonly kind: AppConfig["streamBackend"];
/** Start broadcasting. Returns a short user-facing status/link message. */
start(ctx: StreamContext): Promise<string>;
/** Stop broadcasting. */
stop(): Promise<void>;
/** Whether a broadcast is currently running. */
isActive(): boolean;
}
/**
 * Builds the streamer selected by `config.streamBackend`.
 *
 * Backend modules are imported only when selected. "none" and any
 * unrecognised value yield an inert streamer whose `start()` just reports
 * that screen sharing is disabled.
 */
export async function createStreamer(config: AppConfig): Promise<ScreenStreamer> {
  const backend = config.streamBackend;

  if (backend === "selfbot") {
    const mod = await import("./selfbot.ts");
    return new mod.SelfbotStreamer(config);
  }
  if (backend === "novnc") {
    const mod = await import("./novnc.ts");
    return new mod.NoVncStreamer(config);
  }
  if (backend === "screenshot") {
    const mod = await import("./screenshot.ts");
    return new mod.ScreenshotStreamer(config);
  }

  // "none" or anything unexpected: a streamer that never becomes active.
  const disabled: ScreenStreamer = {
    kind: "none",
    start: async () => "화면 송출이 비활성화되어 있습니다 (STREAM_BACKEND=none).",
    stop: async () => {},
    isActive: () => false,
  };
  return disabled;
}

34
bot/src/stream/novnc.ts Normal file
View File

@@ -0,0 +1,34 @@
/**
* noVNC link backend (safe, real-time, no ban risk).
*
* Does not broadcast natively into Discord. Instead it shares a noVNC web URL
* that anyone can open in a browser to watch (and optionally control) the VNC
* desktop live. Set NOVNC_URL in .env (e.g. http://192.168.10.9:6080/vnc.html).
*
* Stand up noVNC once on the host with websockify, e.g.:
* websockify --web=/usr/share/novnc 6080 localhost:5901
*/
import type { AppConfig } from "../config.ts";
import type { ScreenStreamer, StreamContext } from "./index.ts";
/**
 * Streamer that only hands out the configured noVNC URL.
 * It counts as active between a successful `start()` and the next `stop()`.
 */
export class NoVncStreamer implements ScreenStreamer {
  readonly kind = "novnc" as const;
  private active = false;
  private config: AppConfig;

  constructor(config: AppConfig) {
    this.config = config;
  }

  /** Whether the link has been shared and not yet stopped. */
  isActive() {
    return this.active;
  }

  /**
   * Returns a message containing the noVNC URL and marks the streamer active,
   * or a configuration hint when no URL is configured.
   */
  async start(_ctx: StreamContext): Promise<string> {
    const url = this.config.novncUrl;
    if (url) {
      this.active = true;
      return `🖥️ VNC 화면 실시간 보기 (브라우저): ${url}`;
    }
    return "NOVNC_URL이 설정되지 않았습니다 (.env). 예: http://192.168.10.9:6080/vnc.html";
  }

  /** Marks the streamer inactive. */
  async stop(): Promise<void> {
    this.active = false;
  }
}

View File

@@ -0,0 +1,62 @@
/**
* Screenshot backend (safe, no ban risk, not real-time).
*
* Periodically grabs a frame from the VNC X display with ffmpeg's x11grab and
* posts it to the invoking text channel. Low FPS, but works with a normal bot
* account and never touches Discord's selfbot surface.
*/
import { spawn } from "node:child_process";
import type { AppConfig } from "../config.ts";
import type { ScreenStreamer, StreamContext } from "./index.ts";
/**
 * Captures one frame of an X display as PNG using ffmpeg's x11grab input.
 *
 * @param display - X display passed to ffmpeg as the input (`-i`).
 * @param size - Capture size passed to ffmpeg as `-video_size`.
 * @returns The PNG bytes written by ffmpeg to stdout.
 * @throws Error when ffmpeg cannot be spawned, exits non-zero (the message
 *   includes whatever ffmpeg printed on stderr), or exits successfully but
 *   produces no image data.
 */
function grabFrame(display: string, size: string): Promise<Buffer> {
  return new Promise((resolve, reject) => {
    const ff = spawn("ffmpeg", [
      "-loglevel", "error",
      "-f", "x11grab",
      "-video_size", size,
      "-i", display,
      "-frames:v", "1",
      "-f", "image2pipe",
      "-vcodec", "png",
      "pipe:1",
    ]);
    const frame: Buffer[] = [];
    const diagnostics: Buffer[] = [];
    ff.stdout.on("data", (c: Buffer) => frame.push(c));
    // Read stderr so the pipe is drained and ffmpeg's own error text can be reported.
    ff.stderr.on("data", (c: Buffer) => diagnostics.push(c));
    ff.on("error", reject);
    ff.on("close", (code) => {
      if (code !== 0) {
        const detail = Buffer.concat(diagnostics).toString("utf8").trim();
        reject(new Error(`ffmpeg exited ${code}${detail ? `: ${detail}` : ""}`));
        return;
      }
      const png = Buffer.concat(frame);
      if (png.length === 0) {
        reject(new Error("ffmpeg produced no image data"));
        return;
      }
      resolve(png);
    });
  });
}
/**
 * Streamer that periodically captures the VNC display and posts the image
 * through the context's `postImage` callback.
 *
 * Captures never overlap: a tick that fires while the previous capture or
 * upload is still running is skipped. A capture that finishes after `stop()`
 * (or after a newer `start()`) is discarded rather than posted.
 */
export class ScreenshotStreamer implements ScreenStreamer {
  readonly kind = "screenshot" as const;
  private timer: ReturnType<typeof setInterval> | null = null;
  /** True while a capture/upload is in progress. */
  private busy = false;

  constructor(private config: AppConfig) {}

  /** Whether the periodic capture timer is running. */
  isActive() {
    return this.timer !== null;
  }

  /**
   * Starts the periodic capture and triggers a first capture immediately.
   *
   * @returns A user-facing status message; a hint when the context has no
   *   `postImage` callback or a capture loop is already running.
   */
  async start(ctx: StreamContext): Promise<string> {
    if (!ctx.postImage) return "스크린샷을 올릴 텍스트 채널 컨텍스트가 없습니다.";
    if (this.timer) return "이미 스크린샷 송출 중입니다.";

    const postImage = ctx.postImage;
    // Guard against NaN/zero/negative intervals, which would make the timer fire continuously.
    const configured = this.config.screenshotIntervalSec;
    const intervalSec = Number.isFinite(configured) && configured > 0 ? configured : 5;

    const tick = async (): Promise<void> => {
      if (this.busy) return; // previous capture still running
      this.busy = true;
      try {
        const png = await grabFrame(this.config.vncDisplay, this.config.vncResolution);
        // Stopped (or restarted) while capturing: drop the frame.
        if (this.timer !== handle) return;
        await postImage(png, "vnc.png");
      } catch (e) {
        console.error("[screenshot] grab failed:", e);
      } finally {
        this.busy = false;
      }
    };

    const handle = setInterval(() => void tick(), intervalSec * 1000);
    this.timer = handle;
    void tick();
    return `📸 ${intervalSec}초마다 VNC 스크린샷을 이 채널에 올립니다.`;
  }

  /** Stops the periodic capture; safe to call when not running. */
  async stop(): Promise<void> {
    if (this.timer) clearInterval(this.timer);
    this.timer = null;
  }
}

116
bot/src/stream/selfbot.ts Normal file
View File

@@ -0,0 +1,116 @@
/**
* Selfbot live-stream backend (default).
*
* Streams the VNC X display (:1) into the voice channel as a real Discord
* "Go Live" broadcast. Discord blocks video from *bot* accounts, so this path
* requires a USER account token (a "selfbot"), which violates Discord ToS and
* can get the account banned. Use a throwaway/burner account, never your main.
*
* Dependencies are optional (native): install with
* bun add discord.js-selfbot-v13 @dank074/discord-video-stream
* They are dynamically imported so the core bot installs/runs without them.
*
* Library API targets @dank074/discord-video-stream v6 (Streamer / prepareStream
* / playStream). If a different major is installed, the import guard below will
* point you at the docs rather than crash cryptically.
*/
import type { AppConfig } from "../config.ts";
import type { ScreenStreamer, StreamContext } from "./index.ts";
/**
 * Streamer that broadcasts the VNC display into a voice channel through a
 * user-account ("selfbot") client and the optional video-stream library.
 *
 * Lifecycle guarantees:
 * - a `start()` that fails part-way releases the client it created;
 * - a second `start()` while one is still starting is refused;
 * - completion handlers of an old stream never change the state of a newer one.
 */
export class SelfbotStreamer implements ScreenStreamer {
  readonly kind = "selfbot" as const;
  private config: AppConfig;
  private streamer: any = null;
  private controller: AbortController | null = null;
  private active = false;
  /** True while start() is between its first await and its return. */
  private starting = false;

  constructor(config: AppConfig) {
    this.config = config;
  }

  /** Whether a broadcast is currently running. */
  isActive() {
    return this.active;
  }

  /**
   * Dynamically imports the two optional dependencies.
   *
   * @throws Error with install instructions when either import fails, or when
   *   the video-stream module lacks Streamer/prepareStream/playStream.
   */
  private async loadLib() {
    let selfbot: any, videoStream: any;
    try {
      selfbot = await import("discord.js-selfbot-v13");
      // Optional native dep; resolved at runtime only. Version/name can vary by
      // upstream release, so we don't hard-bind its types at compile time.
      // @ts-ignore - optional dependency, may be absent until `bun add`ed
      videoStream = await import("@dank074/discord-video-stream");
    } catch (e) {
      throw new Error(
        "셀프봇 송출 의존성이 없습니다. 설치: bun add discord.js-selfbot-v13 @dank074/discord-video-stream\n" +
          `원본 오류: ${(e as Error).message}`,
      );
    }
    if (!videoStream.Streamer || !videoStream.prepareStream || !videoStream.playStream) {
      throw new Error(
        "@dank074/discord-video-stream v6 API(Streamer/prepareStream/playStream)를 찾지 못했습니다. " +
          "package.json 버전을 ^4.2.1(=v6 npm 태그)로 맞추거나 docs를 확인하세요.",
      );
    }
    return { selfbot, videoStream };
  }

  /** Best-effort teardown of one streamer; each step is attempted even if the other throws. */
  private static release(streamer: any): void {
    try {
      streamer?.leaveVoice?.();
    } catch (e) {
      console.warn("[selfbot] leaveVoice failed:", e);
    }
    try {
      streamer?.client?.destroy?.();
    } catch (e) {
      console.warn("[selfbot] client destroy failed:", e);
    }
  }

  /**
   * Logs in with the selfbot token, joins the voice channel and starts the
   * Go Live stream.
   *
   * @returns A user-facing status message; a hint when already streaming or
   *   when no token is configured.
   * @throws Error when the dependencies are missing or login/join/stream setup
   *   fails (resources created by the failed attempt are released first).
   */
  async start(ctx: StreamContext): Promise<string> {
    if (this.active || this.starting) return "이미 송출 중입니다.";
    if (!this.config.selfbotToken) {
      return "DISCORD_SELFBOT_TOKEN이 설정되지 않았습니다 (.env). 버너 계정 토큰을 넣어주세요.";
    }
    this.starting = true;
    try {
      return await this.begin(ctx);
    } finally {
      this.starting = false;
    }
  }

  /** Performs the actual start sequence; see start(). */
  private async begin(ctx: StreamContext): Promise<string> {
    const { selfbot, videoStream } = await this.loadLib();
    const { Streamer, prepareStream, playStream, Utils } = videoStream;

    // Local handles: later callbacks compare against these rather than the
    // fields, which stop() or a newer start() may have replaced.
    const streamer = new Streamer(new selfbot.Client());
    const controller = new AbortController();
    this.streamer = streamer;
    this.controller = controller;

    try {
      await streamer.client.login(this.config.selfbotToken);
      await streamer.joinVoice(ctx.guildId, ctx.voiceChannelId);
      if (controller.signal.aborted) throw new Error("송출이 시작 도중 중단되었습니다.");

      // Grab the VNC X display with ffmpeg's x11grab and let the library
      // encode/transport it. NVENC (RTX 5050) is used if available.
      const input = `x11grab:${this.config.vncDisplay}`;
      const [rawWidth, rawHeight] = this.config.vncResolution.split("x");
      const width = parseInt(rawWidth ?? "1920", 10);
      const height = parseInt(rawHeight ?? "1080", 10);
      const { command, output } = prepareStream(
        input,
        {
          // Fall back to 1920x1080 when the configured resolution is malformed.
          width: Number.isFinite(width) ? width : 1920,
          height: Number.isFinite(height) ? height : 1080,
          frameRate: this.config.vncFramerate,
          bitrateVideo: this.config.vncBitrateKbps,
          videoCodec: Utils?.normalizeVideoCodec ? Utils.normalizeVideoCodec("H264") : "H264",
          // x11grab needs to be set as the input format for ffmpeg
          customHeaders: undefined,
          inputFormat: "x11grab",
          inputSize: this.config.vncResolution,
        },
        controller.signal,
      );
      command.on("error", (err: Error) => {
        // Errors caused by our own abort are expected; only report the others.
        if (!controller.signal.aborted) console.error("[selfbot] ffmpeg error:", err);
      });

      this.active = true;
      // Fire-and-forget; resolves when the stream ends.
      playStream(output, streamer, { type: "go-live" })
        .catch((err: Error) => console.error("[selfbot] playStream:", err))
        .finally(() => {
          // Only clear the flag if this is still the current stream.
          if (this.streamer === streamer) this.active = false;
        });
    } catch (err) {
      // Release what this attempt created so a failed start leaves nothing logged in.
      controller.abort();
      SelfbotStreamer.release(streamer);
      if (this.streamer === streamer) {
        this.streamer = null;
        this.controller = null;
        this.active = false;
      }
      throw err;
    }
    return "🔴 셀프봇으로 VNC 화면을 음성채널에 실시간 송출 중입니다 (Go Live).";
  }

  /** Aborts the stream, leaves voice and destroys the selfbot client; safe when idle. */
  async stop(): Promise<void> {
    this.controller?.abort();
    this.controller = null;
    const streamer = this.streamer;
    this.streamer = null;
    this.active = false;
    SelfbotStreamer.release(streamer);
  }
}

169
bot/src/voice.ts Normal file
View File

@@ -0,0 +1,169 @@
/**
* Discord voice I/O.
*
* - Joins the caller's voice channel.
* - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
* forwards the utterance (as a WAV) to the brain bridge.
* - Plays the brain's spoken reply back into the channel.
*
* No AI logic here — capture in, audio out. The brain lives in bridge/.
*/
import { Readable } from "node:stream";
import {
joinVoiceChannel,
createAudioPlayer,
createAudioResource,
EndBehaviorType,
StreamType,
VoiceConnection,
VoiceConnectionStatus,
entersState,
type AudioPlayer,
} from "@discordjs/voice";
import prism from "prism-media";
import type { VoiceBasedChannel } from "discord.js";
import { converse, decodeWav } from "./bridge.ts";
import { config } from "./config.ts";
/** Sample rate (Hz) given to the Opus decoder and written into the WAV header sent to the bridge. */
const DISCORD_RATE = 48000;
/** Channel count given to the Opus decoder; its output is downmixed to mono before upload. */
const DISCORD_CHANNELS = 2;
/**
 * Wraps raw little-endian PCM16 mono samples in a minimal 44-byte WAV header.
 *
 * @param pcm - Raw sample bytes; copied unchanged after the header.
 * @param sampleRate - Sample rate in Hz written into the header.
 * @returns A new buffer holding header + samples.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const payloadBytes = pcm.length;

  const wav = Buffer.alloc(HEADER_BYTES + payloadBytes);
  // RIFF chunk descriptor
  wav.write("RIFF", 0);
  wav.writeUInt32LE(36 + payloadBytes, 4);
  wav.write("WAVE", 8);
  // "fmt " sub-chunk: 16-byte PCM format block
  wav.write("fmt ", 12);
  wav.writeUInt32LE(16, 16);
  wav.writeUInt16LE(1, 20); // audio format: PCM
  wav.writeUInt16LE(1, 22); // channels: mono
  wav.writeUInt32LE(sampleRate, 24);
  wav.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate
  wav.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  wav.writeUInt16LE(16, 34); // bits per sample
  // "data" sub-chunk
  wav.write("data", 36);
  wav.writeUInt32LE(payloadBytes, 40);
  pcm.copy(wav, HEADER_BYTES);
  return wav;
}
/**
 * Downmixes interleaved stereo PCM16 (little-endian) to mono PCM16 by
 * averaging the left and right sample of each frame.
 *
 * Only whole 4-byte frames are converted; trailing bytes that do not form a
 * complete frame are ignored rather than causing an out-of-range read.
 *
 * @param stereo - Interleaved L/R 16-bit samples.
 * @returns A new buffer with one 16-bit sample per input frame.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const BYTES_PER_FRAME = 4; // 2 ch * 2 bytes
  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const l = stereo.readInt16LE(i * BYTES_PER_FRAME);
    const r = stereo.readInt16LE(i * BYTES_PER_FRAME + 2);
    mono.writeInt16LE((l + r) >> 1, i * 2);
  }
  return mono;
}
/**
 * One voice connection to a guild's channel: captures each speaker's
 * utterances, forwards them to the bridge and plays the spoken reply.
 */
export class VoiceSession {
  readonly guildId: string;
  private connection: VoiceConnection;
  private player: AudioPlayer;
  /** User ids with a capture currently in progress (one capture per user at a time). */
  private listening = new Set<string>();
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /** Joins the channel (not deafened, not muted) and starts listening for speakers. */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    this.attachReceiver();
  }

  /** Resolves once the connection reaches Ready; rejects after 20 seconds otherwise. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Starts a capture whenever a user begins speaking and none is running for them. */
  private attachReceiver() {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      this.captureUtterance(userId)
        // A capture failure must not escape as an unhandled rejection.
        .catch((err) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Records one utterance from a user (until the configured silence), sends it
   * to the bridge and plays the reply audio, if any.
   *
   * The capture is abandoned (nothing is sent) when either stream errors or is
   * destroyed before ending, so this method always settles.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });

    const chunks: Buffer[] = [];
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    // true = the decoder ended normally; false = error or premature teardown.
    const completed = await new Promise<boolean>((resolve) => {
      pcmStream.once("end", () => resolve(true));
      // "close" after a normal "end" is a no-op here (the promise is already settled).
      pcmStream.once("close", () => resolve(false));
      pcmStream.once("error", (err: Error) => {
        console.error("[voice] opus decode failed:", err);
        resolve(false);
      });
      // pipe() does not forward errors or destruction, so tear the decoder down ourselves.
      opusStream.once("error", (err: Error) => {
        console.error("[voice] receive stream failed:", err);
        decoder.destroy();
      });
      opusStream.once("close", () => {
        if (!opusStream.readableEnded) decoder.destroy();
      });
    });
    if (!completed || !chunks.length) return;

    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;

    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Play a WAV buffer into the channel. */
  play(wav: Buffer) {
    const resource = createAudioResource(Readable.from(wav), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Stops playback and destroys the voice connection; safe to call more than once. */
  destroy() {
    try {
      // Force-stop so the player releases its current resource.
      this.player.stop(true);
    } catch (err) {
      console.warn("[voice] player stop failed:", err);
    }
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}
/** One session per guild. */
const sessions = new Map<string, VoiceSession>();

/**
 * Replaces the guild's session (if any) with a new one for the given channel
 * and waits until its connection is ready.
 *
 * @returns The ready session.
 * @throws Whatever `ready()` rejects with. The failed session is destroyed
 *   and removed first, so a timed-out join leaves no dead session registered.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;
  sessions.get(guildId)?.destroy();

  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only unregister if a newer join has not already replaced this session.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}
/**
 * Destroys and unregisters the guild's voice session.
 *
 * @returns true when a session existed, false otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const session = sessions.get(guildId);
  if (session) {
    session.destroy();
    sessions.delete(guildId);
  }
  return session !== undefined;
}
/** Looks up the guild's current voice session, if one is registered. */
export function getSession(guildId: string): VoiceSession | undefined {
  const current = sessions.get(guildId);
  return current;
}