Some checks failed
Release / semantic-release (push) Successful in 22s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 9m56s
Release / build-linux (push) Failing after 7m15s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
- voice.ts: reply playback is now a FIFO queue (AudioPlayerStatus.Idle drains
it) so concurrent speakers no longer cut each other's replies off.
- selfbot.ts: rewritten against the REAL @dank074/discord-video-stream v6 API
(verified from its d.ts): prepareStream(input, opts, signal)->{command,output},
playStream(output, streamer, {type:"go-live"}, signal), Streamer.joinVoice.
x11grab via customInputOptions; optional NVENC encode (RTX 5050) via exported
`nvenc`. package.json pinned to ^6.0.0 (was a wrong ^4.2.1).
- Dockerfile: dropped the hardcoded python3.12 LD_LIBRARY_PATH. faster-whisper
>=1.1 self-locates the pip CUDA libs; ldconfig (full path, glob) registers
them as a robust fallback. Verified: ld.so cache lists libcublas/libcudnn and
GPU whisper works with LD_LIBRARY_PATH empty.
- bridge: STT resample 48k->16k upgraded from nearest-neighbor to linear
(np.interp).
Verified: tsc clean, image builds, GPU whisper OK via ldconfig, compose valid.
184 lines
5.7 KiB
TypeScript
184 lines
5.7 KiB
TypeScript
/**
 * Discord voice I/O.
 *
 * - Joins the caller's voice channel.
 * - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
 *   forwards the utterance (as a WAV) to the brain bridge.
 * - Plays the brain's spoken reply back into the channel.
 *
 * No AI logic here — capture in, audio out. The brain lives in bridge/.
 */
|
|
import { Readable } from "node:stream";
|
|
import {
|
|
joinVoiceChannel,
|
|
createAudioPlayer,
|
|
createAudioResource,
|
|
EndBehaviorType,
|
|
StreamType,
|
|
AudioPlayerStatus,
|
|
VoiceConnection,
|
|
VoiceConnectionStatus,
|
|
entersState,
|
|
type AudioPlayer,
|
|
} from "@discordjs/voice";
|
|
import prism from "prism-media";
|
|
import type { VoiceBasedChannel } from "discord.js";
|
|
import { converse, decodeWav } from "./bridge.ts";
|
|
import { config } from "./config.ts";
|
|
|
|
// Sample rate in Hz: handed to the Opus decoder, written into the outgoing
// WAV header, and used to size the minimum-utterance threshold.
const DISCORD_RATE = 48000;
|
|
// Channel count handed to the Opus decoder; the decoded PCM is downmixed to
// mono before it is wrapped in a WAV.
const DISCORD_CHANNELS = 2;
|
|
|
|
/**
 * Build a minimal PCM16 mono WAV around raw little-endian samples.
 *
 * Writes the canonical 44-byte RIFF/WAVE header (format tag 1 = PCM, one
 * channel, 16 bits per sample) followed by the sample data.
 *
 * @param pcm Raw 16-bit little-endian mono samples. A dangling odd byte at the
 *   end (half a sample) is dropped so the data chunk stays a whole number of
 *   2-byte blocks.
 * @param sampleRate Sample rate in Hz to record in the header.
 * @returns A new buffer holding header + sample data.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;

  // Only whole samples go into the data chunk; an odd trailing byte would make
  // the chunk size disagree with the block align declared below.
  const dataLen = pcm.length - (pcm.length % BYTES_PER_SAMPLE);

  const header = Buffer.alloc(HEADER_BYTES);
  header.write("RIFF", 0);
  header.writeUInt32LE(HEADER_BYTES - 8 + dataLen, 4); // RIFF size excludes "RIFF" + this field
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // fmt chunk length
  header.writeUInt16LE(1, 20); // PCM
  header.writeUInt16LE(1, 22); // mono
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate (mono * 2 bytes)
  header.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  header.writeUInt16LE(16, 34); // bits per sample
  header.write("data", 36);
  header.writeUInt32LE(dataLen, 40);

  return Buffer.concat([header, pcm.subarray(0, dataLen)]);
}
|
|
|
|
/**
 * Downmix interleaved stereo PCM16 to mono PCM16.
 *
 * Input is read as little-endian 16-bit left/right pairs (4 bytes per frame);
 * each output sample is `(left + right) >> 1`.
 *
 * @param stereo Interleaved stereo samples. A trailing partial frame (fewer
 *   than 4 bytes) is ignored instead of being read past the end of the buffer.
 * @returns A new buffer with one 16-bit sample per complete input frame.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const BYTES_PER_FRAME = 4; // 2 ch * 2 bytes
  // Floor so a ragged tail cannot push the loop into an out-of-range read.
  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const offset = i * BYTES_PER_FRAME;
    const l = stereo.readInt16LE(offset);
    const r = stereo.readInt16LE(offset + 2);
    mono.writeInt16LE((l + r) >> 1, i * 2);
  }
  return mono;
}
|
|
|
|
/**
 * A voice connection to one guild's channel.
 *
 * Captures each speaker's utterance, sends it to the brain bridge via
 * `converse`, and plays the returned audio back through a single audio player.
 */
export class VoiceSession {
  readonly guildId: string;
  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  /** User ids with a capture currently in flight (one capture per user at a time). */
  private readonly listening = new Set<string>();
  /** Pending reply clips. Played one at a time so concurrent speakers don't
   *  cut each other's replies off. */
  private readonly playQueue: Buffer[] = [];
  /** Set once `destroy()` has run; late captures and replies are then dropped. */
  private destroyed = false;
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Join `channel` (undeafened, unmuted), subscribe an audio player to the
   * connection, and start listening for speakers.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    // Drain the queue when the current clip finishes.
    this.player.on(AudioPlayerStatus.Idle, () => this.pump());
    // Without a listener an emitted "error" event is thrown by the emitter.
    // NOTE(review): the player is expected to fall back to Idle after an
    // error, which lets the next queued clip play — confirm against the
    // @discordjs/voice version in use.
    this.player.on("error", (err: unknown) => {
      console.error("[voice] playback failed:", err);
    });
    this.attachReceiver();
  }

  /** Resolve once the connection reaches Ready; rejects if that takes over 20 s. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start one capture per user whenever they begin speaking. */
  private attachReceiver() {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      // Fire-and-forget: log failures so they never become unhandled
      // rejections, and always release the per-user guard.
      void this.captureUtterance(userId)
        .catch((err: unknown) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record one utterance from `userId` until the configured silence window
   * elapses, then forward it to the brain and queue the spoken reply.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });
    const chunks: Buffer[] = [];
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    // Settle on every terminal outcome. Waiting for "end" alone would hang
    // forever (and keep the user in `listening`) if a stream errors or is
    // torn down early. `pipe()` does not forward errors, so both ends are
    // watched.
    const completed = await new Promise<boolean>((resolve) => {
      const abort = (err: unknown) => {
        console.error("[voice] capture stream failed:", err);
        opusStream.destroy();
        decoder.destroy();
        resolve(false);
      };
      opusStream.on("error", abort);
      pcmStream.on("error", abort);
      pcmStream.once("end", () => resolve(true));
      // "close" without a prior "end" means the stream was cut short.
      pcmStream.once("close", () => resolve(false));
    });

    if (!completed || this.destroyed) return;
    if (!chunks.length) return;
    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
    const wav = pcm16MonoToWav(mono, DISCORD_RATE);

    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Queue a WAV buffer for playback (FIFO, one clip at a time). */
  play(wav: Buffer) {
    if (this.destroyed) return;
    this.playQueue.push(wav);
    this.pump();
  }

  /** Start the next queued clip if the player is idle; otherwise do nothing. */
  private pump() {
    if (this.destroyed) return;
    if (this.player.state.status !== AudioPlayerStatus.Idle) return;
    const next = this.playQueue.shift();
    if (!next) return;
    const resource = createAudioResource(Readable.from(next), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Drop pending clips, stop playback, and tear down the voice connection. */
  destroy() {
    this.destroyed = true;
    this.playQueue.length = 0;
    this.player.stop(true);
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}
|
|
|
|
/** One session per guild. */
|
|
// Keyed by guild id. joinChannel() replaces the entry for a guild and
// leaveGuild() removes it.
const sessions = new Map<string, VoiceSession>();
|
|
|
|
/**
 * Join `channel`, replacing any existing session for the same guild.
 *
 * @param channel Voice channel to join.
 * @returns The new session once its connection is ready.
 * @throws Whatever `session.ready()` rejects with. In that case the
 *   half-open session is destroyed and removed from the registry before the
 *   error is rethrown, so no dead entry is left behind.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;
  sessions.get(guildId)?.destroy();
  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only remove our own entry; a concurrent join may already have replaced it.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}
|
|
|
|
/**
 * Tear down the voice session for a guild, if there is one.
 *
 * @param guildId Guild whose session should be closed.
 * @returns `true` when a session existed and was destroyed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const active = sessions.get(guildId);
  if (active) {
    active.destroy();
    sessions.delete(guildId);
    return true;
  }
  return false;
}
|
|
|
|
/**
 * Look up the voice session registered for a guild.
 *
 * @param guildId Guild to look up.
 * @returns The registered session, or `undefined` when the guild has none.
 */
export function getSession(guildId: string): VoiceSession | undefined {
  const found = sessions.get(guildId);
  return found;
}
|