// javis_bot/bot/src/voice.ts

/**
 * Discord voice I/O.
 *
 * - Joins the caller's voice channel.
 * - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
 *   forwards the utterance (as a WAV) to the brain bridge.
 * - Plays the brain's spoken reply back into the channel.
 *
 * No AI logic here — capture in, audio out. The brain lives in bridge/.
 */
import { Readable } from "node:stream";
import {
  joinVoiceChannel,
  createAudioPlayer,
  createAudioResource,
  EndBehaviorType,
  StreamType,
  AudioPlayerStatus,
  VoiceConnection,
  VoiceConnectionStatus,
  entersState,
  type AudioPlayer,
} from "@discordjs/voice";
import prism from "prism-media";
import type { VoiceBasedChannel } from "discord.js";
import { converse, decodeWav } from "./bridge.ts";
import { config } from "./config.ts";

// Sample rate in Hz: configured on the Opus decoder and written into the WAV
// header of every captured utterance.
const DISCORD_RATE = 48000;
// Channel count configured on the Opus decoder. Decoded PCM is interleaved
// stereo and is downmixed to mono before being wrapped as a WAV.
const DISCORD_CHANNELS = 2;

/**
 * Wrap raw little-endian PCM16 mono samples in a 44-byte RIFF/WAVE header.
 *
 * @param pcm        Raw sample bytes (16-bit signed, little-endian, one channel).
 * @param sampleRate Sample rate in Hz to record in the header.
 * @returns A new buffer holding the header followed by the untouched samples.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const channelCount = 1;
  const bytesPerSample = 2;
  const frameBytes = channelCount * bytesPerSample;
  const payloadSize = pcm.length;

  const riff = Buffer.alloc(44);

  // RIFF container descriptor: size covers everything after this field.
  riff.write("RIFF", 0);
  riff.writeUInt32LE(payloadSize + 36, 4);
  riff.write("WAVE", 8);

  // "fmt " sub-chunk: 16-byte body describing uncompressed PCM.
  riff.write("fmt ", 12);
  riff.writeUInt32LE(16, 16);
  riff.writeUInt16LE(1, 20); // audio format: PCM
  riff.writeUInt16LE(channelCount, 22);
  riff.writeUInt32LE(sampleRate, 24);
  riff.writeUInt32LE(sampleRate * frameBytes, 28); // byte rate
  riff.writeUInt16LE(frameBytes, 32); // block align
  riff.writeUInt16LE(bytesPerSample * 8, 34); // bits per sample

  // "data" sub-chunk header; the samples follow immediately.
  riff.write("data", 36);
  riff.writeUInt32LE(payloadSize, 40);

  return Buffer.concat([riff, pcm]);
}

/**
 * Downmix interleaved stereo PCM16 (little-endian) to mono PCM16.
 *
 * Each output sample is the floor-average of the left and right samples of
 * one frame. Only whole 4-byte frames are converted: a trailing partial frame
 * (input length not a multiple of 4) is dropped instead of triggering an
 * out-of-range read.
 *
 * @param stereo Interleaved L/R 16-bit samples.
 * @returns A new buffer with one 16-bit sample per complete input frame.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const BYTES_PER_FRAME = 4; // 2 ch * 2 bytes
  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const l = stereo.readInt16LE(i * BYTES_PER_FRAME);
    const r = stereo.readInt16LE(i * BYTES_PER_FRAME + 2);
    // Arithmetic shift keeps the sign and cannot overflow int16.
    mono.writeInt16LE((l + r) >> 1, i * 2);
  }
  return mono;
}

/**
 * One bot presence in one guild voice channel.
 *
 * Captures each speaker's utterance, sends it to the brain bridge via
 * `converse`, and plays the returned audio back through a FIFO queue.
 */
export class VoiceSession {
  readonly guildId: string;
  private connection: VoiceConnection;
  private player: AudioPlayer;
  /** Users with a capture currently in flight (one capture per user at a time). */
  private listening = new Set<string>();
  /** Pending reply clips. Played one at a time so concurrent speakers don't
   *  cut each other's replies off. */
  private playQueue: Buffer[] = [];
  /** Set by destroy(); in-flight captures check it so they stop producing work. */
  private destroyed = false;
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Join `channel`, wire up the audio player and start listening for speakers.
   * The connection is not necessarily usable yet; await `ready()` first.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    // Drain the queue when the current clip finishes.
    this.player.on(AudioPlayerStatus.Idle, () => this.pump());
    // Without a listener, a bad clip would surface as an unhandled 'error'
    // event and take the whole process down.
    this.player.on("error", (err: Error) => {
      console.error("[voice] playback failed:", err);
    });
    this.attachReceiver();
  }

  /** Resolves once the voice connection is Ready; rejects after 20 seconds. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start a capture whenever a user who isn't already being captured speaks. */
  private attachReceiver() {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      this.captureUtterance(userId)
        // Fire-and-forget: log instead of leaving an unhandled rejection.
        .catch((err: unknown) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record one utterance from `userId` (until `config.silenceMs` of silence),
   * send it to the bridge, report the turn via `onTurn` and queue the reply.
   * Stream and bridge failures are logged; the returned promise always settles
   * so the user can be captured again.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });
    const chunks: Buffer[] = [];
    decoder.on("data", (c: Buffer) => chunks.push(c));

    try {
      await new Promise<void>((resolve, reject) => {
        decoder.once("end", () => resolve());
        // 'close' without 'end' means the decoder was destroyed; don't hang.
        decoder.once("close", () => resolve());
        // pipe() does not forward errors, so watch both ends. `on` (not
        // `once`) keeps later errors from becoming unhandled 'error' events.
        decoder.on("error", reject);
        opusStream.on("error", reject);
        opusStream.once("close", () => {
          // Source torn down before a clean end: nothing will call
          // decoder.end() for us, so flush what we have.
          if (!opusStream.readableEnded && !decoder.destroyed) decoder.end();
        });
        opusStream.pipe(decoder);
      });
    } catch (err) {
      console.error("[voice] capture stream failed:", err);
      return;
    } finally {
      opusStream.destroy();
      decoder.destroy();
    }

    if (this.destroyed || !chunks.length) return;
    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
    const wav = pcm16MonoToWav(mono, DISCORD_RATE);

    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Queue a WAV buffer for playback (FIFO, one clip at a time). No-op after destroy(). */
  play(wav: Buffer) {
    if (this.destroyed) return;
    this.playQueue.push(wav);
    this.pump();
  }

  /** Start the next queued clip if the player is idle. */
  private pump() {
    if (this.player.state.status !== AudioPlayerStatus.Idle) return;
    const next = this.playQueue.shift();
    if (!next) return;
    const resource = createAudioResource(Readable.from(next), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Stop playback, drop queued clips and leave the voice channel. Safe to call twice. */
  destroy() {
    this.destroyed = true;
    this.playQueue.length = 0;
    this.player.stop(true);
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}

/**
 * Active voice sessions keyed by guild id. joinChannel destroys and replaces
 * any existing entry, so there is at most one session per guild.
 */
const sessions = new Map<string, VoiceSession>();

/**
 * Join `channel`, replacing any existing session for its guild.
 *
 * @returns The new session once its voice connection is ready.
 * @throws Whatever `session.ready()` rejects with. In that case the failed
 *         session is destroyed and unregistered, so no half-connected session
 *         is left behind in the registry.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;
  sessions.get(guildId)?.destroy();
  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // A concurrent join may already have replaced us; only remove our own entry.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}

/**
 * Tear down the voice session for `guildId`, if one exists.
 *
 * @returns `true` when a session was found and destroyed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const active = sessions.get(guildId);
  if (active) {
    active.destroy();
    sessions.delete(guildId);
    return true;
  }
  return false;
}

/** Look up the active voice session for `guildId`; `undefined` when there is none. */
export function getSession(guildId: string): VoiceSession | undefined {
  const session = sessions.get(guildId);
  return session;
}