Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

169
bot/src/voice.ts Normal file
View File

@@ -0,0 +1,169 @@
/**
* Discord voice I/O.
*
* - Joins the caller's voice channel.
* - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
* forwards the utterance (as a WAV) to the brain bridge.
* - Plays the brain's spoken reply back into the channel.
*
* No AI logic here — capture in, audio out. The brain lives in bridge/.
*/
import { Readable } from "node:stream";
import {
joinVoiceChannel,
createAudioPlayer,
createAudioResource,
EndBehaviorType,
StreamType,
VoiceConnection,
VoiceConnectionStatus,
entersState,
type AudioPlayer,
} from "@discordjs/voice";
import prism from "prism-media";
import type { VoiceBasedChannel } from "discord.js";
import { converse, decodeWav } from "./bridge.ts";
import { config } from "./config.ts";
/** Sample rate in Hz used for the Opus decoder output and for the WAV handed to the bridge. */
const DISCORD_RATE = 48000;
/** Channel count requested from the Opus decoder (interleaved stereo). */
const DISCORD_CHANNELS = 2;
/**
 * Wrap raw little-endian PCM16 mono samples in a minimal 44-byte RIFF/WAVE
 * header.
 *
 * @param pcm - Raw sample bytes; copied verbatim into the `data` chunk.
 * @param sampleRate - Sample rate in Hz recorded in the `fmt ` chunk.
 * @returns A new buffer containing the header followed by `pcm`.
 */
function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2; // 16-bit, single channel
  const payloadBytes = pcm.length;
  const wav = Buffer.alloc(HEADER_BYTES + payloadBytes);

  // RIFF container: the size field counts everything after the first 8 bytes.
  wav.write("RIFF", 0);
  wav.writeUInt32LE(HEADER_BYTES - 8 + payloadBytes, 4);
  wav.write("WAVE", 8);

  // "fmt " chunk: 16-byte body describing uncompressed PCM, one channel.
  wav.write("fmt ", 12);
  wav.writeUInt32LE(16, 16);
  wav.writeUInt16LE(1, 20); // audio format: PCM
  wav.writeUInt16LE(1, 22); // channel count: mono
  wav.writeUInt32LE(sampleRate, 24);
  wav.writeUInt32LE(sampleRate * BYTES_PER_SAMPLE, 28); // byte rate
  wav.writeUInt16LE(BYTES_PER_SAMPLE, 32); // block align
  wav.writeUInt16LE(BYTES_PER_SAMPLE * 8, 34); // bits per sample

  // "data" chunk: length followed by the samples themselves.
  wav.write("data", 36);
  wav.writeUInt32LE(payloadBytes, 40);
  pcm.copy(wav, HEADER_BYTES);

  return wav;
}
/**
 * Downmix interleaved stereo PCM16 (little-endian) to mono PCM16 by averaging
 * the left and right sample of every frame.
 *
 * Only whole 4-byte frames are converted. Trailing bytes that do not form a
 * complete left/right pair are ignored instead of triggering an out-of-range
 * read.
 *
 * @param stereo - Interleaved L/R 16-bit samples.
 * @returns A new buffer with one 16-bit sample per complete input frame.
 */
function stereoToMono(stereo: Buffer): Buffer {
  const BYTES_PER_FRAME = 4; // 2 channels * 2 bytes
  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
  const mono = Buffer.alloc(frames * 2);
  for (let i = 0; i < frames; i++) {
    const offset = i * BYTES_PER_FRAME;
    const left = stereo.readInt16LE(offset);
    const right = stereo.readInt16LE(offset + 2);
    // The sum of two int16 values fits in int32, so the arithmetic shift is a
    // floor-average that always stays within int16 range.
    mono.writeInt16LE((left + right) >> 1, i * 2);
  }
  return mono;
}
/**
 * A voice connection to one guild's voice channel, plus the audio player
 * subscribed to it.
 *
 * For every user that starts speaking, one utterance is captured (Opus to
 * stereo PCM to mono WAV), sent to the bridge via `converse`, and the returned
 * audio, if any, is played back into the channel.
 */
export class VoiceSession {
  readonly guildId: string;
  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  /** User ids with a capture currently in flight; limits each user to one capture at a time. */
  private readonly listening = new Set<string>();
  /** Set by `destroy()`; in-flight captures check it before contacting the bridge or playing audio. */
  private destroyed = false;
  /** Optional callback to surface transcripts/replies to a text channel. */
  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;

  /**
   * Join `channel`, subscribe a fresh audio player to the connection and start
   * listening for speakers. Await `ready()` before relying on the connection.
   */
  constructor(channel: VoiceBasedChannel) {
    this.guildId = channel.guild.id;
    this.connection = joinVoiceChannel({
      channelId: channel.id,
      guildId: channel.guild.id,
      adapterCreator: channel.guild.voiceAdapterCreator,
      selfDeaf: false, // we need to hear users
      selfMute: false,
    });
    this.player = createAudioPlayer();
    this.connection.subscribe(this.player);
    this.attachReceiver();
  }

  /** Wait up to 20 seconds for the connection to reach the Ready state. */
  async ready(): Promise<void> {
    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
  }

  /** Start one capture per speaker whenever a user begins talking. */
  private attachReceiver(): void {
    const receiver = this.connection.receiver;
    receiver.speaking.on("start", (userId: string) => {
      if (this.listening.has(userId)) return;
      this.listening.add(userId);
      // Fire-and-forget: log failures here so a rejected capture never becomes
      // an unhandled rejection, and always release the per-user guard.
      void this.captureUtterance(userId)
        .catch((err: unknown) => console.error("[voice] capture failed:", err))
        .finally(() => this.listening.delete(userId));
    });
  }

  /**
   * Record a single utterance from `userId`, send it to the bridge, and play
   * the reply.
   *
   * The utterance is dropped when a stream fails, when the session has been
   * destroyed, or when the audio is shorter than roughly 300 ms.
   */
  private async captureUtterance(userId: string): Promise<void> {
    const opusStream = this.connection.receiver.subscribe(userId, {
      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
    });
    const decoder = new prism.opus.Decoder({
      frameSize: 960,
      channels: DISCORD_CHANNELS,
      rate: DISCORD_RATE,
    });
    const chunks: Buffer[] = [];
    let failed = false;
    const pcmStream = opusStream.pipe(decoder);
    pcmStream.on("data", (c: Buffer) => chunks.push(c));

    // Settle on end, close, or error. Waiting for "end" alone would hang
    // forever if either stream failed, leaving the user stuck in `listening`.
    await new Promise<void>((resolve) => {
      const settle = () => resolve();
      pcmStream.once("end", settle);
      pcmStream.once("close", settle);
      pcmStream.once("error", (err: unknown) => {
        failed = true;
        console.error("[voice] opus decode failed:", err);
        opusStream.destroy();
        settle();
      });
      // pipe() does not forward source errors to the destination, so the
      // receive stream needs its own listener.
      opusStream.once("error", (err: unknown) => {
        failed = true;
        console.error("[voice] receive stream failed:", err);
        decoder.destroy();
        settle();
      });
    });

    if (failed || this.destroyed || !chunks.length) return;
    const mono = stereoToMono(Buffer.concat(chunks));
    // Ignore blips shorter than ~300ms (likely noise / key clicks).
    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
    try {
      const result = await converse(wav);
      // The session may have been torn down while the bridge was working.
      if (this.destroyed) return;
      if (result.transcript) {
        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
      }
      const audio = decodeWav(result.audio_b64);
      if (audio) this.play(audio);
    } catch (err) {
      console.error("[voice] converse failed:", err);
    }
  }

  /** Play a WAV buffer into the channel. */
  play(wav: Buffer): void {
    const resource = createAudioResource(Readable.from(wav), {
      inputType: StreamType.Arbitrary,
    });
    this.player.play(resource);
  }

  /**
   * Stop playback and tear down the voice connection. Safe to call more than
   * once; in-flight captures notice the teardown and stop quietly.
   */
  destroy(): void {
    this.destroyed = true;
    this.player.stop(true);
    try {
      this.connection.destroy();
    } catch {
      /* already gone */
    }
  }
}
/**
 * One session per guild, keyed by guild id. joinChannel replaces the entry for
 * a guild and leaveGuild removes it.
 */
const sessions = new Map<string, VoiceSession>();
/**
 * Join `channel`, replacing any session already registered for its guild.
 *
 * If the new connection fails to become ready, it is destroyed and removed
 * from the registry before the error is rethrown, so a failed join never
 * leaves a half-connected session behind.
 *
 * @param channel - Voice channel to join.
 * @returns The ready session for the channel's guild.
 * @throws Whatever `session.ready()` rejects with when the connection does not become ready.
 */
export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
  const guildId = channel.guild.id;

  const previous = sessions.get(guildId);
  if (previous) {
    previous.destroy();
    sessions.delete(guildId);
  }

  const session = new VoiceSession(channel);
  sessions.set(guildId, session);
  try {
    await session.ready();
  } catch (err) {
    session.destroy();
    // Only unregister our own entry; a concurrent join may have replaced it.
    if (sessions.get(guildId) === session) sessions.delete(guildId);
    throw err;
  }
  return session;
}
/**
 * Tear down the voice session registered for a guild, if there is one.
 *
 * @param guildId - Guild whose session should be destroyed and forgotten.
 * @returns `true` when a session existed and was removed, `false` otherwise.
 */
export function leaveGuild(guildId: string): boolean {
  const active = sessions.get(guildId);
  if (active) {
    active.destroy();
    sessions.delete(guildId);
    return true;
  }
  return false;
}
/**
 * Look up the voice session currently registered for a guild.
 *
 * @param guildId - Guild to look up.
 * @returns The registered session, or `undefined` when there is none.
 */
export function getSession(guildId: string): VoiceSession | undefined {
  const current = sessions.get(guildId);
  return current;
}