/**
 * Discord voice I/O.
 *
 * - Joins the caller's voice channel.
 * - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
 *   forwards the utterance (as a WAV) to the brain bridge.
 * - Plays the brain's spoken reply back into the channel.
 *
 * No AI logic here — capture in, audio out. The brain lives in bridge/.
 */
import { Readable } from "node:stream";
import {
  joinVoiceChannel,
  createAudioPlayer,
  createAudioResource,
  EndBehaviorType,
  StreamType,
  AudioPlayerStatus,
  VoiceConnection,
  VoiceConnectionStatus,
  entersState,
  type AudioPlayer,
} from "@discordjs/voice";
import prism from "prism-media";
import type { VoiceBasedChannel } from "discord.js";
import { converse, decodeWav } from "./bridge.ts";
import { config } from "./config.ts";

/** Sample rate (Hz) the Opus decoder is configured to emit. */
const DISCORD_RATE = 48000;
/** Channel count the Opus decoder is configured to emit (interleaved stereo). */
const DISCORD_CHANNELS = 2;
/** Bytes per PCM16 sample. */
const BYTES_PER_SAMPLE = 2;
/** Mono utterances shorter than ~300ms are treated as noise / key clicks. */
const MIN_UTTERANCE_BYTES = DISCORD_RATE * 0.3 * BYTES_PER_SAMPLE;

/**
 * Build a minimal PCM16 mono WAV around raw little-endian samples.
 *
 * @param pcm        Raw PCM16 little-endian mono samples.
 * @param sampleRate Sample rate in Hz, written into the header.
 * @returns A 44-byte RIFF/WAVE header followed by `pcm`.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const header = Buffer.alloc(44);
  const dataLen = pcm.length;
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + dataLen, 4); // RIFF chunk size = rest of header + data
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // fmt chunk size
  header.writeUInt16LE(1, 20); // PCM
  header.writeUInt16LE(1, 22); // mono
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate (mono * 2 bytes)
  header.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  header.writeUInt16LE(16, 34); // bits per sample
  header.write("data", 36);
  header.writeUInt32LE(dataLen, 40);
  return Buffer.concat([header, pcm]);
}

/**
 * Downmix interleaved stereo PCM16 to mono PCM16 by averaging L and R.
 *
 * Trailing bytes that do not form a complete stereo frame (4 bytes) are
 * ignored, so a truncated buffer cannot cause an out-of-range read.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const frames = Math.floor(stereo.length / 4); // 2 ch * 2 bytes
  const mono = Buffer.alloc(frames * BYTES_PER_SAMPLE);
  for (let i = 0; i < frames; i++) {
    const l = stereo.readInt16LE(i * 4);
    const r = stereo.readInt16LE(i * 4 + 2);
    mono.writeInt16LE((l + r) >> 1, i * 2);
  }
  return mono;
}

/**
 * One bot presence in one guild's voice channel: owns the voice connection,
 * the audio player, per-speaker capture, and the reply playback queue.
 */
export class VoiceSession {
  readonly guildId: string;
  private connection: VoiceConnection;
  private player: AudioPlayer;
  /**
   * Users with a turn in flight. An entry is held until captureUtterance()
   * settles (which includes the bridge round-trip), so further "start" events
   * from the same user during that time are ignored.
   */
  private listening = new Set<string>();
  /** Pending reply clips. Played one at a time so concurrent speakers don't
   *  cut each other's replies off. */
  private playQueue: Buffer[] = [];
  /** Set by destroy(); late replies must not be queued after teardown. */
  private destroyed = false;

  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Joins `channel`, subscribes a fresh audio player to the connection and
   * starts listening for speakers. Call ready() before relying on the session.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    // Drain the queue when the current clip finishes.
    this.player.on(AudioPlayerStatus.Idle, () => this.pump());
    // Without a listener an "error" emit would throw out of the emitter.
    this.player.on("error", (err) => {
      console.error("[voice] playback failed:", err);
    });
    this.attachReceiver();
  }

  /** Resolves once the connection is Ready; rejects if that takes over 20s. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start one capture per user when they begin speaking. */
  private attachReceiver(): void {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      void this.captureUtterance(userId)
        .catch((err: unknown) => {
          console.error("[voice] capture failed:", err);
        })
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record one utterance from `userId` (until `config.silenceMs` of silence),
   * send it to the bridge as a mono WAV, report the turn via onTurn and queue
   * the spoken reply. Never rejects for stream or bridge failures; those are
   * logged and the utterance is dropped.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });

    const chunks: Buffer[] = [];
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    // Settle on every way the pipeline can finish, not just a clean "end":
    // pipe() does not forward errors or destruction, so waiting on "end" alone
    // could hang forever and leave the user stuck in `listening`.
    const completed = await new Promise<boolean>((resolve) => {
      pcmStream.once("end", () => resolve(true));
      pcmStream.once("error", (err: Error) => {
        console.error("[voice] opus decode failed:", err);
        opusStream.destroy(); // release the receiver subscription
        resolve(false);
      });
      opusStream.once("error", (err: Error) => {
        console.error("[voice] receive stream failed:", err);
        decoder.destroy();
        resolve(false);
      });
      opusStream.once("close", () => {
        // A close after a normal end is expected; the decoder will then emit
        // its own "end". Only a close without end is an aborted capture.
        if (!opusStream.readableEnded) {
          decoder.destroy();
          resolve(false);
        }
      });
    });

    if (!completed || !chunks.length) return;

    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < MIN_UTTERANCE_BYTES) return;

    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Queue a WAV buffer for playback (FIFO, one clip at a time). No-op after destroy(). */
  play(wav: Buffer): void {
    if (this.destroyed) return;
    this.playQueue.push(wav);
    this.pump();
  }

  /** Start the next queued clip if the player is idle. */
  private pump(): void {
    if (this.destroyed) return;
    if (this.player.state.status !== AudioPlayerStatus.Idle) return;
    const next = this.playQueue.shift();
    if (!next) return;
    // The clip is a WAV container rather than raw PCM/Opus, hence Arbitrary.
    const resource = createAudioResource(Readable.from(next), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Drop pending replies, stop playback and tear down the voice connection. */
  destroy(): void {
    this.destroyed = true;
    this.playQueue.length = 0;
    this.player.stop(true);
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}

/** One session per guild. */
const sessions = new Map<string, VoiceSession>();

/**
 * Join `channel`, replacing any existing session for its guild.
 *
 * @returns The new session once its connection is Ready.
 * @throws Whatever ready() rejects with; the failed session is destroyed and
 *         unregistered first so it does not linger in the session map.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;
  sessions.get(guildId)?.destroy();
  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only unregister if a concurrent join has not already replaced us.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}

/**
 * Destroy and forget the session for `guildId`.
 *
 * @returns `true` if a session existed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const s = sessions.get(guildId);
  if (!s) return false;
  s.destroy();
  sessions.delete(guildId);
  return true;
}

/** Look up the active session for `guildId`, if any. */
export function getSession(guildId: string): VoiceSession | undefined {
  return sessions.get(guildId);
}