Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/bot/src/voice.ts
+++ b/bot/src/voice.ts
@@ -0,0 +1,169 @@
+/**
+ * Discord voice I/O.
+ *
+ * - Joins the caller's voice channel.
+ * - Receives each speaker's Opus stream, decodes to PCM, and on end-of-speech
+ *   forwards the utterance (as a WAV) to the brain bridge.
+ * - Plays the brain's spoken reply back into the channel.
+ *
+ * No AI logic here — capture in, audio out. The brain lives in bridge/.
+ */
+import { Readable } from "node:stream";
+import {
+  joinVoiceChannel,
+  createAudioPlayer,
+  createAudioResource,
+  EndBehaviorType,
+  StreamType,
+  VoiceConnection,
+  VoiceConnectionStatus,
+  entersState,
+  type AudioPlayer,
+} from "@discordjs/voice";
+import prism from "prism-media";
+import type { VoiceBasedChannel } from "discord.js";
+import { converse, decodeWav } from "./bridge.ts";
+import { config } from "./config.ts";
+
+/** Sample rate (Hz) requested from the Opus decoder and written into outgoing WAV headers. */
+const DISCORD_RATE = 48_000;
+/** Channel count requested from the Opus decoder (interleaved stereo PCM). */
+const DISCORD_CHANNELS = 2;
+
+/**
+ * Wrap raw little-endian PCM16 mono samples in a canonical 44-byte WAV header.
+ *
+ * @param pcm - Raw sample bytes (2 bytes per sample, single channel).
+ * @param sampleRate - Sample rate in Hz recorded in the header.
+ * @returns A new buffer holding the header followed by the untouched samples.
+ */
+function pcm16MonoToWav(pcm: Buffer, sampleRate: number): Buffer {
+  const payloadBytes = pcm.length;
+  const riff = Buffer.alloc(44);
+
+  // Fields are laid out back-to-back, so a running cursor replaces explicit offsets.
+  let cursor = 0;
+  const tag = (fourCc: string): void => {
+    riff.write(fourCc, cursor);
+    cursor += 4;
+  };
+  const u32 = (value: number): void => {
+    riff.writeUInt32LE(value, cursor);
+    cursor += 4;
+  };
+  const u16 = (value: number): void => {
+    riff.writeUInt16LE(value, cursor);
+    cursor += 2;
+  };
+
+  tag("RIFF");
+  u32(36 + payloadBytes); // RIFF chunk size: rest of header + data
+  tag("WAVE");
+  tag("fmt ");
+  u32(16); // fmt chunk size
+  u16(1); // audio format: PCM
+  u16(1); // channels: mono
+  u32(sampleRate);
+  u32(sampleRate * 2); // byte rate (mono * 2 bytes per sample)
+  u16(2); // block align
+  u16(16); // bits per sample
+  tag("data");
+  u32(payloadBytes);
+
+  return Buffer.concat([riff, pcm]);
+}
+
+/**
+ * Downmix interleaved stereo PCM16 (little-endian, L/R pairs) to mono PCM16.
+ *
+ * Each output sample is the floor-average of the left and right samples.
+ * Only whole 4-byte stereo frames are converted: trailing bytes that do not
+ * form a complete frame are dropped rather than read past the end of the buffer.
+ *
+ * @param stereo - Interleaved stereo samples, 4 bytes per frame.
+ * @returns A new buffer with one 2-byte sample per complete input frame.
+ */
+function stereoToMono(stereo: Buffer): Buffer {
+  const BYTES_PER_FRAME = 4; // 2 ch * 2 bytes
+  // Floor so a truncated final frame cannot push the reads out of range.
+  const frames = Math.floor(stereo.length / BYTES_PER_FRAME);
+  const mono = Buffer.alloc(frames * 2);
+  for (let i = 0; i < frames; i++) {
+    const l = stereo.readInt16LE(i * BYTES_PER_FRAME);
+    const r = stereo.readInt16LE(i * BYTES_PER_FRAME + 2);
+    mono.writeInt16LE((l + r) >> 1, i * 2);
+  }
+  return mono;
+}
+
+/**
+ * One voice connection to a guild: captures each speaker's utterances,
+ * forwards them to the brain bridge, and plays the spoken reply back.
+ */
+export class VoiceSession {
+  readonly guildId: string;
+  private readonly connection: VoiceConnection;
+  private readonly player: AudioPlayer;
+  /** User IDs with a capture currently in flight (at most one per user). */
+  private readonly listening = new Set<string>();
+  /** Optional callback to surface transcripts/replies to a text channel. */
+  onTurn?: (info: { user: string; transcript: string; reply: string }) => void;
+
+  /**
+   * Join `channel`, subscribe an audio player to the connection, and start
+   * listening for speakers. Call {@link ready} to wait for the connection.
+   */
+  constructor(channel: VoiceBasedChannel) {
+    this.guildId = channel.guild.id;
+    this.connection = joinVoiceChannel({
+      channelId: channel.id,
+      guildId: channel.guild.id,
+      adapterCreator: channel.guild.voiceAdapterCreator,
+      selfDeaf: false, // we need to hear users
+      selfMute: false,
+    });
+    this.player = createAudioPlayer();
+    this.connection.subscribe(this.player);
+    this.attachReceiver();
+  }
+
+  /** Resolve once the connection reaches Ready; rejects if that takes longer than 20 s. */
+  async ready(): Promise<void> {
+    await entersState(this.connection, VoiceConnectionStatus.Ready, 20_000);
+  }
+
+  /** Start one capture per user each time they begin speaking. */
+  private attachReceiver(): void {
+    const receiver = this.connection.receiver;
+    receiver.speaking.on("start", (userId: string) => {
+      if (this.listening.has(userId)) return;
+      this.listening.add(userId);
+      // Fire-and-forget: log failures so they never surface as unhandled
+      // rejections, and always release the per-user guard afterwards.
+      void this.captureUtterance(userId)
+        .catch((err: unknown) => console.error("[voice] capture failed:", err))
+        .finally(() => this.listening.delete(userId));
+    });
+  }
+
+  /**
+   * Record one utterance from `userId` until silence, send it to the bridge,
+   * report the turn via `onTurn`, and play the reply audio if there is any.
+   *
+   * Stream failures are logged and the utterance is dropped; this method
+   * always settles so the speaker can be captured again.
+   */
+  private async captureUtterance(userId: string): Promise<void> {
+    const opusStream = this.connection.receiver.subscribe(userId, {
+      end: { behavior: EndBehaviorType.AfterSilence, duration: config.silenceMs },
+    });
+    const decoder = new prism.opus.Decoder({
+      frameSize: 960,
+      channels: DISCORD_CHANNELS,
+      rate: DISCORD_RATE,
+    });
+    const chunks: Buffer[] = [];
+    const pcmStream = opusStream.pipe(decoder);
+    pcmStream.on("data", (c: Buffer) => chunks.push(c));
+
+    // Settle on every terminal outcome, not just "end": an errored or
+    // destroyed stream never emits "end", which would otherwise leave this
+    // capture pending forever and the user permanently stuck in `listening`.
+    const completed = await new Promise<boolean>((resolve) => {
+      const fail = (err: unknown): void => {
+        console.error("[voice] capture stream error:", err);
+        resolve(false);
+      };
+      pcmStream.once("end", () => resolve(true));
+      pcmStream.once("close", () => resolve(true));
+      pcmStream.on("error", fail);
+      // pipe() does not forward source errors, so listen on the source too.
+      opusStream.on("error", fail);
+      // If the source closes without ending, pipe() never ends the decoder;
+      // end it here so it flushes and emits "end"/"close".
+      opusStream.once("close", () => {
+        if (!decoder.writableEnded && !decoder.destroyed) decoder.end();
+      });
+    });
+
+    if (!completed || !chunks.length) return;
+    const mono = stereoToMono(Buffer.concat(chunks));
+    // Ignore blips shorter than ~300ms (likely noise / key clicks).
+    if (mono.length < DISCORD_RATE * 0.3 * 2) return;
+    const wav = pcm16MonoToWav(mono, DISCORD_RATE);
+
+    try {
+      const result = await converse(wav);
+      if (result.transcript) {
+        this.onTurn?.({ user: userId, transcript: result.transcript, reply: result.reply });
+      }
+      const audio = decodeWav(result.audio_b64);
+      if (audio) this.play(audio);
+    } catch (err) {
+      console.error("[voice] converse failed:", err);
+    }
+  }
+
+  /**
+   * Play a WAV buffer into the channel.
+   *
+   * NOTE(review): per the @discordjs/voice docs, starting a new resource
+   * replaces whatever the player is currently playing — confirm that cutting
+   * off an in-progress reply is acceptable.
+   */
+  play(wav: Buffer): void {
+    const resource = createAudioResource(Readable.from(wav), {
+      inputType: StreamType.Arbitrary,
+    });
+    this.player.play(resource);
+  }
+
+  /** Stop playback and tear down the voice connection. Safe to call more than once. */
+  destroy(): void {
+    // Release the current audio resource; the player is not reused after this.
+    this.player.stop(true);
+    try {
+      this.connection.destroy();
+    } catch {
+      /* already gone */
+    }
+  }
+}
+
+/** Registry of live voice sessions, keyed by guild ID (one session per guild). */
+const sessions: Map<string, VoiceSession> = new Map();
+
+/**
+ * Join `channel`, replacing any existing session for the same guild.
+ *
+ * @param channel - Voice channel to connect to.
+ * @returns The new session once its connection is ready.
+ * @throws Whatever `session.ready()` rejects with (e.g. the connection not
+ *   becoming ready in time). In that case the half-open session is destroyed
+ *   and removed from the registry before the error propagates.
+ */
+export async function joinChannel(channel: VoiceBasedChannel): Promise<VoiceSession> {
+  const guildId = channel.guild.id;
+
+  // Drop the previous session first so a destroyed one is never left registered.
+  sessions.get(guildId)?.destroy();
+  sessions.delete(guildId);
+
+  const session = new VoiceSession(channel);
+  sessions.set(guildId, session);
+  try {
+    await session.ready();
+  } catch (err) {
+    session.destroy();
+    // A newer join for this guild may already have replaced the entry.
+    if (sessions.get(guildId) === session) sessions.delete(guildId);
+    throw err;
+  }
+  return session;
+}
+
+/**
+ * Destroy and unregister the voice session for a guild.
+ *
+ * @param guildId - Guild whose session should be torn down.
+ * @returns `true` if a session existed and was removed, `false` otherwise.
+ */
+export function leaveGuild(guildId: string): boolean {
+  const active = sessions.get(guildId);
+  if (active) {
+    active.destroy();
+    sessions.delete(guildId);
+  }
+  return Boolean(active);
+}
+
+/** Look up the voice session registered for a guild, or `undefined` when there is none. */
+export function getSession(guildId: string): VoiceSession | undefined {
+  const active = sessions.get(guildId);
+  return active;
+}