Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.
- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md
Language decision: hybrid (Python brain + Node/bun Discord layer), because Discord blocks bot video and native screen broadcast only works via a Node selfbot library.
This commit is contained in:
169
bot/src/voice.ts
Normal file
169
bot/src/voice.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
/**
|
||||
* Discord voice I/O.
|
||||
*
|
||||
* - Joins the caller's voice channel.
|
||||
* - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
|
||||
* forwards the utterance (as a WAV) to the brain bridge.
|
||||
* - Plays the brain's spoken reply back into the channel.
|
||||
*
|
||||
* No AI logic here — capture in, audio out. The brain lives in bridge/.
|
||||
*/
|
||||
import { Readable } from "node:stream";
|
||||
import {
|
||||
joinVoiceChannel,
|
||||
createAudioPlayer,
|
||||
createAudioResource,
|
||||
EndBehaviorType,
|
||||
StreamType,
|
||||
VoiceConnection,
|
||||
VoiceConnectionStatus,
|
||||
entersState,
|
||||
type AudioPlayer,
|
||||
} from "@discordjs/voice";
|
||||
import prism from "prism-media";
|
||||
import type { VoiceBasedChannel } from "discord.js";
|
||||
import { converse, decodeWav } from "./bridge.ts";
|
||||
import { config } from "./config.ts";
|
||||
|
||||
/** Sample rate (Hz) passed to the Opus decoder and written into the WAV header. */
const DISCORD_RATE = 48_000;

/** Channel count passed to the Opus decoder; decoded PCM is treated as interleaved stereo. */
const DISCORD_CHANNELS = 2;
|
||||
|
||||
/**
 * Wrap raw little-endian PCM16 mono samples in a canonical 44-byte WAV header.
 *
 * @param pcm Raw sample bytes; copied verbatim after the header.
 * @param sampleRate Sample rate in Hz recorded in the `fmt ` chunk.
 * @returns A new buffer containing the header followed by `pcm`.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const HEADER_BYTES = 44;
  const CHANNELS = 1;
  const BYTES_PER_SAMPLE = 2;

  const payloadBytes = pcm.length;
  const blockAlign = CHANNELS * BYTES_PER_SAMPLE;
  const wav = Buffer.alloc(HEADER_BYTES + payloadBytes);

  // RIFF chunk descriptor: the size field counts everything after itself.
  wav.write("RIFF", 0);
  wav.writeUInt32LE(HEADER_BYTES - 8 + payloadBytes, 4);
  wav.write("WAVE", 8);

  // "fmt " sub-chunk: 16-byte body describing uncompressed PCM.
  wav.write("fmt ", 12);
  wav.writeUInt32LE(16, 16);
  wav.writeUInt16LE(1, 20); // audio format 1 = PCM
  wav.writeUInt16LE(CHANNELS, 22);
  wav.writeUInt32LE(sampleRate, 24);
  wav.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  wav.writeUInt16LE(blockAlign, 32);
  wav.writeUInt16LE(BYTES_PER_SAMPLE * 8, 34); // bits per sample

  // "data" sub-chunk: length field followed by the samples themselves.
  wav.write("data", 36);
  wav.writeUInt32LE(payloadBytes, 40);
  pcm.copy(wav, HEADER_BYTES);

  return wav;
}
|
||||
|
||||
/**
 * Downmix interleaved stereo PCM16 (little-endian) to mono PCM16.
 *
 * Each output sample is the arithmetic mean of the left and right samples,
 * computed with an arithmetic shift so it rounds toward negative infinity.
 *
 * @param stereo Interleaved L/R samples, 4 bytes per stereo frame.
 * @returns A new buffer with one 16-bit sample per complete input frame.
 *   Trailing bytes that do not form a complete frame are ignored, and an
 *   empty input yields an empty buffer.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const BYTES_PER_FRAME = 4; // 2 channels * 2 bytes
  // Floor the frame count: a fractional count would make the loop read past
  // the end of the buffer and throw ERR_OUT_OF_RANGE on a truncated input.
  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const offset = i * BYTES_PER_FRAME;
    const left = stereo.readInt16LE(offset);
    const right = stereo.readInt16LE(offset + 2);
    mono.writeInt16LE((left + right) >> 1, i * 2);
  }
  return mono;
}
|
||||
|
||||
/**
 * A voice connection to one guild's voice channel, plus the audio player used
 * to speak replies into it.
 *
 * On construction the session joins the channel, subscribes a player to the
 * connection and starts listening for "speaking start" events. Each utterance
 * is captured, sent to the bridge via `converse`, and the returned audio (if
 * any) is played back.
 */
export class VoiceSession {
  readonly guildId: string;
  private connection: VoiceConnection;
  private player: AudioPlayer;
  /** Ids of users whose utterance is currently being captured or processed. */
  private listening = new Set<string>();
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Join `channel` and begin listening immediately.
   * The connection may not be usable yet; await {@link ready} before relying on it.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    this.attachReceiver();
  }

  /**
   * Resolve once the connection reaches the Ready state.
   * Rejects if that does not happen within 20 seconds.
   */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start one capture per user whenever that user begins speaking. */
  private attachReceiver() {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      // One capture at a time per user; further "start" events are ignored
      // until the current utterance has been fully handled.
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      this.captureUtterance(userId)
        // Fire-and-forget: log instead of leaving an unhandled rejection.
        .catch((err: unknown) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record one utterance from `userId`, send it to the bridge and play the reply.
   *
   * Capture ends when the receive stream ends (after `config.silenceMs` of
   * silence) or when either stream fails or is torn down. Audio decoded before
   * a failure is still processed if it is long enough.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });
    const chunks: Buffer[] = [];
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    await new Promise<void>((resolve) => {
      // Normal completion.
      pcmStream.once("end", () => resolve());
      // Destroyed before "end" (e.g. after an error); resolving twice is harmless.
      pcmStream.once("close", () => resolve());
      // Without "error" listeners a stream failure is an unhandled "error"
      // event, and the promise would never settle.
      pcmStream.once("error", (err: Error) => {
        console.error("[voice] decode failed:", err);
        resolve();
      });
      opusStream.once("error", (err: Error) => {
        console.error("[voice] receive failed:", err);
        // pipe() does not forward source errors, so stop the decoder explicitly.
        decoder.destroy();
        resolve();
      });
    });
    // Make sure the receive stream is released on every exit path so a later
    // capture for this user does not pick up a stale stream. No-op if already closed.
    opusStream.destroy();

    if (!chunks.length) return;
    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
    const wav = pcm16MonoToWav(mono, DISCORD_RATE);

    try {
      const result = await converse(wav);
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      // Skip playback if the session was torn down while the bridge was working.
      if (audio && this.connection.state.status !== VoiceConnectionStatus.Destroyed) {
        this.play(audio);
      }
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Play a WAV buffer into the channel, replacing anything currently playing. */
  play(wav: Buffer) {
    const resource = createAudioResource(Readable.from(wav), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /** Stop playback and tear down the voice connection. Safe to call more than once. */
  destroy() {
    try {
      this.player.stop(true);
    } catch {
      /* nothing playing */
    }
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}
|
||||
|
||||
/** Active voice sessions keyed by guild id; at most one session per guild. */
const sessions: Map<string, VoiceSession> = new Map();
|
||||
|
||||
/**
 * Join `channel`, replacing any existing session for the same guild.
 *
 * @param channel Voice channel to join.
 * @returns The new session once its connection is ready.
 * @throws Whatever `VoiceSession#ready` rejects with if the connection does
 *   not become ready. In that case the half-open session is destroyed and
 *   removed from the registry before the error is rethrown.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;

  const previous = sessions.get(guildId);
  if (previous) {
    previous.destroy();
    // Drop the entry right away so a destroyed session is never left
    // registered if creating the replacement fails.
    sessions.delete(guildId);
  }

  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only unregister our own entry; a concurrent join may have replaced it.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}
|
||||
|
||||
/**
 * Tear down and forget the voice session registered for a guild.
 *
 * @param guildId Id of the guild to leave.
 * @returns `true` if a session existed and was destroyed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const existing = sessions.get(guildId);
  if (existing === undefined) {
    return false;
  }

  existing.destroy();
  sessions.delete(guildId);
  return true;
}
|
||||
|
||||
/**
 * Look up the voice session registered for a guild.
 *
 * @param guildId Id of the guild to look up.
 * @returns The registered session, or `undefined` if there is none.
 */
export function getSession(guildId: string): VoiceSession | undefined {
  const active = sessions.get(guildId);
  return active;
}
|
||||
Reference in New Issue
Block a user