Files
javis_bot/bot/src/voice.ts
javis-bot b56c9c7721
Some checks failed
Release / semantic-release (push) Successful in 22s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 9m56s
Release / build-linux (push) Failing after 7m15s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Address remaining review items (queue, selfbot v6 API, ldconfig, resample)
- voice.ts: reply playback is now a FIFO queue (AudioPlayerStatus.Idle drains
  it) so concurrent speakers no longer cut each other's replies off.
- selfbot.ts: rewritten against the REAL @dank074/discord-video-stream v6 API
  (verified from its d.ts): prepareStream(input, opts, signal)->{command,output},
  playStream(output, streamer, {type:"go-live"}, signal), Streamer.joinVoice.
  x11grab via customInputOptions; optional NVENC encode (RTX 5050) via exported
  `nvenc`. package.json pinned to ^6.0.0 (was a wrong ^4.2.1).
- Dockerfile: dropped the hardcoded python3.12 LD_LIBRARY_PATH. faster-whisper
  >=1.1 self-locates the pip CUDA libs; ldconfig (full path, glob) registers
  them as a robust fallback. Verified: ld.so cache lists libcublas/libcudnn and
  GPU whisper works with LD_LIBRARY_PATH empty.
- bridge: STT resample 48k->16k upgraded from nearest-neighbor to linear
  (np.interp).

Verified: tsc clean, image builds, GPU whisper OK via ldconfig, compose valid.
2026-06-09 18:47:25 +09:00

184 lines
5.7 KiB
TypeScript

/**
* Discord voice I/O.
*
* - Joins the caller's voice channel.
* - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
* forwards the utterance (as a WAV) to the brain bridge.
* - Plays the brain's spoken reply back into the channel.
*
* No AI logic here — capture in, audio out. The brain lives in bridge/.
*/
import { Readable } from "node:stream";
import {
joinVoiceChannel,
createAudioPlayer,
createAudioResource,
EndBehaviorType,
StreamType,
AudioPlayerStatus,
VoiceConnection,
VoiceConnectionStatus,
entersState,
type AudioPlayer,
} from "@discordjs/voice";
import prism from "prism-media";
import type { VoiceBasedChannel } from "discord.js";
import { converse, decodeWav } from "./bridge.ts";
import { config } from "./config.ts";
/** Sample rate in Hz: passed to the Opus decoder and written into the WAV header of each captured utterance. */
const DISCORD_RATE = 48000;
/** Channel count the Opus decoder is configured with (interleaved stereo, downmixed to mono before upload). */
const DISCORD_CHANNELS = 2;
/**
 * Wrap raw little-endian PCM16 mono samples in a 44-byte RIFF/WAVE header.
 *
 * @param pcm - Raw sample bytes (16-bit signed, little-endian, one channel).
 * @param sampleRate - Samples per second recorded in the header.
 * @returns A new buffer holding the header followed by `pcm`.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const payloadBytes = pcm.length;
  const riff = Buffer.alloc(44);

  // Fields are laid out back to back, so track a running write cursor
  // instead of spelling out each absolute offset.
  let cursor = 0;
  const tag = (fourCc: string): void => {
    riff.write(fourCc, cursor);
    cursor += 4;
  };
  const u32 = (value: number): void => {
    riff.writeUInt32LE(value, cursor);
    cursor += 4;
  };
  const u16 = (value: number): void => {
    riff.writeUInt16LE(value, cursor);
    cursor += 2;
  };

  // RIFF container: chunk size is everything after the first 8 bytes.
  tag("RIFF");
  u32(36 + payloadBytes);
  tag("WAVE");

  // "fmt " sub-chunk.
  tag("fmt ");
  u32(16); // sub-chunk size
  u16(1); // audio format: PCM
  u16(1); // channels: mono
  u32(sampleRate);
  u32(sampleRate * 2); // byte rate (mono * 2 bytes per sample)
  u16(2); // block align
  u16(16); // bits per sample

  // "data" sub-chunk header; the samples follow immediately.
  tag("data");
  u32(payloadBytes);

  return Buffer.concat([riff, pcm]);
}
/**
 * Downmix interleaved stereo PCM16 (little-endian) to mono PCM16.
 *
 * Each output sample is the floor-average of the left and right samples of one
 * stereo frame. Only whole 4-byte frames are converted: trailing bytes that do
 * not form a complete frame are ignored rather than read past the end of the
 * buffer.
 *
 * @param stereo - Interleaved L/R samples, 2 bytes each.
 * @returns A new buffer with one 16-bit sample per complete input frame.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const bytesPerFrame = 4; // 2 ch * 2 bytes
  // Floor so a truncated final frame cannot push the reads out of range.
  const frames = Math.floor(stereo.length / bytesPerFrame);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const left = stereo.readInt16LE(i * bytesPerFrame);
    const right = stereo.readInt16LE(i * bytesPerFrame + 2);
    // Sum fits comfortably in a JS number; >> 1 halves with floor rounding.
    mono.writeInt16LE((left + right) >> 1, i * 2);
  }
  return mono;
}
/**
 * One voice-channel presence for a guild.
 *
 * Joins the channel, captures each speaker's utterance, sends it to the brain
 * bridge via `converse`, and plays the returned audio back through a FIFO
 * queue so replies never overlap.
 */
export class VoiceSession {
  /** Id of the guild whose voice channel this session joined. */
  readonly guildId: string;
  private connection: VoiceConnection;
  private player: AudioPlayer;
  /** User ids with a capture currently in flight (one capture per user at a time). */
  private listening = new Set<string>();
  /** Pending reply clips. Played one at a time so concurrent speakers don't
   * cut each other's replies off. */
  private playQueue: Buffer[] = [];
  /** Set by destroy(); late captures and replies are dropped once true. */
  private destroyed = false;
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Join `channel`, subscribe an audio player to the connection and start
   * listening for speakers. Call {@link ready} to wait for the connection.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    // Drain the queue when the current clip finishes.
    this.player.on(AudioPlayerStatus.Idle, () => this.pump());
    // An "error" event with no listener is thrown by EventEmitter; log it
    // instead so one bad clip cannot take the process down.
    this.player.on("error", (err) => {
      console.error("[voice] playback failed:", err);
    });
    this.attachReceiver();
  }

  /** Resolve once the voice connection is Ready; rejects after 20 seconds. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start one capture per user whenever they begin speaking. */
  private attachReceiver() {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.destroyed || this.listening.has(userId)) return;
      this.listening.add(userId);
      this.captureUtterance(userId)
        // Never leave the capture promise floating: a rejection here would
        // otherwise surface as an unhandled rejection.
        .catch((err: unknown) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record one utterance from `userId` (until `config.silenceMs` of silence),
   * send it to the bridge and queue the spoken reply.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });
    const chunks: Buffer[] = [];
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    // Wait for the decoded stream to finish. pipe() forwards neither errors
    // nor destruction from the source, so every terminal event is handled
    // explicitly; otherwise a stream error would be thrown as uncaught, or
    // this promise would stay pending and the user would remain in
    // `listening` forever.
    await new Promise<void>((resolve) => {
      const settle = () => resolve();
      const fail = (err: Error) => {
        console.error("[voice] capture stream error:", err);
        opusStream.destroy();
        decoder.destroy();
        resolve();
      };
      pcmStream.once("end", settle);
      pcmStream.once("close", settle);
      pcmStream.on("error", fail);
      opusStream.on("error", fail);
      opusStream.once("close", () => {
        // Source torn down without ending (e.g. connection destroyed):
        // end the decoder ourselves so it flushes and finishes.
        if (!opusStream.readableEnded && !decoder.destroyed) decoder.end();
      });
    });

    if (this.destroyed) return;
    if (!chunks.length) return;
    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Queue a WAV buffer for playback (FIFO, one clip at a time). No-op after destroy(). */
  play(wav: Buffer) {
    if (this.destroyed) return;
    this.playQueue.push(wav);
    this.pump();
  }

  /** Start the next queued clip if the player is idle. */
  private pump() {
    if (this.player.state.status !== AudioPlayerStatus.Idle) return;
    const next = this.playQueue.shift();
    if (!next) return;
    const resource = createAudioResource(Readable.from(next), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Stop playback, drop queued replies and tear down the voice connection. Safe to call twice. */
  destroy() {
    this.destroyed = true;
    // Empty the queue first: stopping the player moves it to Idle, which
    // would otherwise start the next clip.
    this.playQueue.length = 0;
    this.player.stop(true);
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}
/**
 * Live voice sessions keyed by guild id. One session per guild: joinChannel
 * destroys any existing entry for the guild before storing the new one.
 */
const sessions = new Map<string, VoiceSession>();
/**
 * Join `channel`, replacing any existing session for the same guild.
 *
 * @param channel - Voice channel to join.
 * @returns The new session once its connection is ready.
 * @throws Whatever `session.ready()` rejects with (e.g. the readiness
 *   timeout). In that case the half-open session is destroyed and removed
 *   from the registry, so no dead session is left behind.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;
  sessions.get(guildId)?.destroy();
  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only unregister our own entry: a concurrent joinChannel call may have
    // replaced it while we were waiting.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}
/**
 * Tear down and unregister the voice session for a guild.
 *
 * @param guildId - Guild whose session should end.
 * @returns `true` if a session existed and was destroyed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const active = sessions.get(guildId);
  if (active) {
    active.destroy();
    sessions.delete(guildId);
    return true;
  }
  return false;
}
/**
 * Look up the voice session registered for a guild.
 *
 * @param guildId - Guild to look up.
 * @returns The registered session, or `undefined` when there is none.
 */
export function getSession(guildId: string): VoiceSession | undefined {
  const found = sessions.get(guildId);
  return found;
}