Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
275 lines
10 KiB
Python
275 lines
10 KiB
Python
"""
|
|
Jarvis Brain Bridge
|
|
===================
|
|
|
|
A thin local HTTP service that exposes the existing Jarvis "brain"
|
|
(speech-to-text + reply engine + text-to-speech) to the Node/bun Discord bot.
|
|
|
|
The Discord layer (``bot/``) is responsible for everything Discord-specific:
|
|
joining voice channels, capturing user audio, playing audio back, slash
|
|
commands, and streaming the VNC screen. It does NOT contain any AI logic.
|
|
Instead it calls this bridge:
|
|
|
|
POST /converse (multipart wav) -> { transcript, reply, audio_b64 }
|
|
POST /text (json {text}) -> { reply, audio_b64 }
|
|
POST /stt (multipart wav) -> { text, language }
|
|
POST /tts (json {text}) -> { audio_b64 }
|
|
GET /health -> { ok, brain_enabled, brain_ready, brain_error, tts_enabled }
|
|
|
|
This keeps the mature ~39k-line Python brain intact while letting Node own the
|
|
Discord/voice/video integration (which is only feasible in the Node ecosystem).
|
|
|
|
Run:
|
|
python -m bridge.server # from repo root
|
|
# or
|
|
BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 python bridge/server.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import io
|
|
import os
|
|
import sys
|
|
import threading
|
|
import wave
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# The jarvis package lives in <repo-root>/src/jarvis, so <repo-root>/src has
# to be on sys.path before any ``jarvis.*`` import is attempted. It is put at
# the front of the search path, and only if it is not already listed.
_REPO_ROOT = Path(__file__).resolve().parent.parent
_SRC = _REPO_ROOT.joinpath("src")
if sys.path.count(str(_SRC)) == 0:
    sys.path.insert(0, str(_SRC))
|
|
|
|
from flask import Flask, request, jsonify
|
|
|
|
# Flask application object; the @app.get / @app.post decorators in this module
# register the bridge's HTTP endpoints on it, and main() runs it.
app = Flask(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------
# Spellings (compared case-insensitively, surrounding whitespace ignored) that
# switch a feature flag off. Anything else, including an unset variable,
# leaves the feature enabled.
_FALSY_ENV_VALUES = frozenset({"0", "false", "no", "off"})


def _env_flag(name: str, default: str = "1") -> bool:
    """Return the boolean value of the environment flag *name*.

    The flag is False only when the variable holds one of the recognised
    "off" spellings ("0", "false", "no", "off" in any letter case); every
    other value, and an unset variable, yields True when *default* is "1".
    """
    return os.environ.get(name, default).strip().lower() not in _FALSY_ENV_VALUES


# Address the HTTP server binds to (loopback by default).
BRIDGE_HOST = os.environ.get("BRIDGE_HOST", "127.0.0.1")
BRIDGE_PORT = int(os.environ.get("BRIDGE_PORT", "8765"))
# Feature switches; both default to enabled.
BRAIN_ENABLED = _env_flag("JARVIS_BRAIN_ENABLED")
TTS_ENABLED = _env_flag("JARVIS_TTS_ENABLED")
|
|
|
|
# ---------------------------------------------------------------------------
# Lazily-created shared state. Nothing is loaded at import time: the first
# request that needs a component pays its load cost and later requests reuse
# the warm instance. One lock serialises initialisation so that concurrent
# Discord events cannot load Whisper twice.
# ---------------------------------------------------------------------------
_init_lock = threading.Lock()

# Filled in together by _ensure_brain(): settings, database, dialogue memory
# and the Whisper model.
_cfg = _db = _dialogue_memory = _whisper = None

# Filled in by _ensure_piper().
_piper_voice = None

# Description of the brain initialisation failure, if one happened.
_brain_error: Optional[str] = None
|
|
|
|
|
|
def _ensure_brain():
    """Initialize cfg, db, dialogue memory, and Whisper once.

    On success the module-level ``_cfg``, ``_db``, ``_dialogue_memory`` and
    ``_whisper`` are populated. If anything raises, ``_brain_error`` is set to
    a ``"<ExceptionType>: <message>"`` string instead, and later calls return
    immediately without retrying.

    The Whisper device and compute type come from the ``WHISPER_DEVICE`` and
    ``WHISPER_COMPUTE_TYPE`` environment variables (both default to "auto").
    """
    global _cfg, _db, _dialogue_memory, _whisper, _brain_error
    # Lock-free fast path: initialization already succeeded or already failed.
    if _cfg is not None or _brain_error is not None:
        return
    with _init_lock:
        # Re-check under the lock: another thread may have finished meanwhile.
        if _cfg is not None or _brain_error is not None:
            return
        try:
            # Imported lazily so the module can be imported (and /health
            # served) even when the heavy dependencies are missing.
            from jarvis.config import load_settings
            from jarvis.memory.db import Database
            from jarvis.memory.conversation import DialogueMemory
            from faster_whisper import WhisperModel

            cfg = load_settings()
            db = Database(cfg.db_path, cfg.sqlite_vss_path)
            dialogue_memory = DialogueMemory(
                inactivity_timeout=getattr(cfg, "dialogue_memory_timeout", 300.0),
                max_interactions=20,
            )
            device = os.environ.get("WHISPER_DEVICE", "auto")
            compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto")
            whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute)

            # Format the log line before publishing anything, so a failure
            # here is reported as an init error with no state half-published.
            ready_msg = (
                f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})"
            )

            # The fast path above keys on _cfg, so _cfg must be assigned
            # LAST: a thread that sees _cfg set must also see the rest.
            _db, _dialogue_memory, _whisper = db, dialogue_memory, whisper
            _cfg = cfg
            print(ready_msg, flush=True)
        except Exception as e:  # pragma: no cover - depends on local models
            _brain_error = f"{type(e).__name__}: {e}"
            print(f"[bridge] brain init FAILED: {_brain_error}", flush=True)
|
|
|
|
|
|
def _ensure_piper():
    """Load the Piper TTS voice on first use (independent of the brain).

    Does nothing when TTS is disabled or a voice is already loaded. The model
    path comes from ``TTS_PIPER_MODEL_PATH`` and otherwise from jarvis'
    default piper model location. A failure is only logged: ``_piper_voice``
    stays ``None`` and the exception is not propagated.
    """
    global _piper_voice
    if not TTS_ENABLED or _piper_voice is not None:
        return
    with _init_lock:
        # Another thread may have loaded the voice while we waited.
        if _piper_voice is not None:
            return
        try:
            from piper import PiperVoice  # piper-tts package

            model_path = os.environ.get("TTS_PIPER_MODEL_PATH")
            if not model_path:
                # Fall back to jarvis' default piper model location.
                from jarvis.output.tts import _get_default_piper_model_path  # type: ignore

                model_path = _get_default_piper_model_path()
            if not (model_path and Path(model_path).exists()):
                raise FileNotFoundError(
                    f"Piper voice model not found at '{model_path}'. "
                    f"Set TTS_PIPER_MODEL_PATH in .env or run scripts/setup_models.sh"
                )
            loaded_voice = PiperVoice.load(model_path)
            _piper_voice = loaded_voice
            print(f"[bridge] piper TTS ready ({model_path})", flush=True)
        except Exception as exc:  # pragma: no cover
            print(f"[bridge] piper init failed (TTS disabled): {exc}", flush=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core operations
|
|
# ---------------------------------------------------------------------------
|
|
def _read_wav_pcm(raw: bytes) -> tuple[bytes, int]:
    """Decode an incoming WAV blob to mono 16-bit PCM at its own sample rate.

    Multi-channel audio is down-mixed by averaging the channels of each
    frame, and 8/24/32-bit integer samples are converted to 16-bit, so the
    returned bytes can always be read as a flat int16 array. Mono 16-bit
    input is returned unchanged.

    Args:
        raw: Complete contents of a WAV file.

    Returns:
        ``(pcm, sample_rate)`` where ``pcm`` is mono 16-bit PCM.

    Raises:
        wave.Error, EOFError: ``raw`` is not a readable WAV file.
        ValueError: the sample width is not 1, 2, 3 or 4 bytes.
    """
    import numpy as np

    with wave.open(io.BytesIO(raw), "rb") as wf:
        sr = wf.getframerate()
        channels = wf.getnchannels()
        width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    # Common case: already mono 16-bit, nothing to convert.
    if channels == 1 and width == 2:
        return frames, sr

    if width not in (1, 2, 3, 4):
        raise ValueError(f"unsupported WAV sample width: {width} bytes")

    # A truncated file can end mid-frame; keep whole frames only.
    frame_size = channels * width
    usable = len(frames) - len(frames) % frame_size
    if usable == 0:
        return b"", sr

    data = np.frombuffer(frames[:usable], dtype=np.uint8)
    if width == 1:
        # 8-bit WAV samples are unsigned with the zero level at 128.
        samples = (data.astype(np.int32) - 128) * 256
    else:
        # WAV samples are little-endian, so the two most significant bytes
        # of each sample are its last two; they form the 16-bit value.
        top_two = np.ascontiguousarray(data.reshape(-1, width)[:, width - 2:])
        samples = top_two.view("<i2").reshape(-1).astype(np.int32)

    # Samples are interleaved per frame; average the channels of each frame.
    mono = samples.reshape(-1, channels).sum(axis=1) // channels
    return mono.astype(np.int16).tobytes(), sr
|
|
|
|
|
|
def transcribe(wav_bytes: bytes) -> dict:
    """Transcribe a WAV blob with the shared Whisper model.

    Args:
        wav_bytes: Complete contents of a WAV file.

    Returns:
        A dict with ``"text"`` (stripped transcript, possibly empty) and
        ``"language"`` (taken from Whisper's info object, or None). An
        ``"error"`` key is added when speech-to-text is unavailable or the
        input cannot be decoded; ``"text"`` is then empty.
    """
    _ensure_brain()
    if _whisper is None:
        return {"text": "", "language": None, "error": _brain_error or "stt unavailable"}
    import numpy as np

    # Malformed uploads are reported like every other failure in this module
    # (an "error" entry) rather than escaping as an unhandled exception.
    try:
        pcm, sr = _read_wav_pcm(wav_bytes)
    except (wave.Error, EOFError, ValueError) as e:
        return {"text": "", "language": None, "error": f"invalid WAV: {type(e).__name__}: {e}"}
    if sr <= 0:
        return {"text": "", "language": None, "error": f"invalid WAV: bad sample rate {sr}"}

    # Drop a dangling odd byte so the buffer is a whole number of int16s,
    # then scale to float32 in [-1.0, 1.0).
    pcm = pcm[: len(pcm) - len(pcm) % 2]
    audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0

    # faster-whisper expects 16kHz mono float32; resample if needed.
    if sr != 16000 and audio.size:
        ratio = 16000 / sr
        # Nearest-earlier-sample resampling: output sample i reads input
        # sample floor(i / ratio), clamped to the valid index range.
        idx = (np.arange(int(audio.size * ratio)) / ratio).astype(np.int64)
        idx = np.clip(idx, 0, audio.size - 1)
        audio = audio[idx]

    # Nothing to recognise; skip the model call entirely.
    if not audio.size:
        return {"text": "", "language": None}

    segments, info = _whisper.transcribe(audio, beam_size=1)
    text = "".join(seg.text for seg in segments).strip()
    return {"text": text, "language": getattr(info, "language", None)}
|
|
|
|
|
|
def think(text: str, language: Optional[str] = None) -> dict:
    """Produce a reply to *text* with the Jarvis reply engine.

    Args:
        text: The user's utterance.
        language: Optional language hint forwarded to the reply engine.

    Returns:
        ``{"reply": ...}`` on success. When the brain is disabled the input
        text is echoed back with an ``"error"`` note; when the brain is
        unavailable or the engine raises, ``"reply"`` is empty and
        ``"error"`` describes the problem.
    """
    if not BRAIN_ENABLED:
        return {"reply": text, "error": "brain disabled (JARVIS_BRAIN_ENABLED=0)"}

    _ensure_brain()
    if _cfg is None:
        return {"reply": "", "error": _brain_error or "brain unavailable"}

    try:
        from jarvis.reply.engine import run_reply_engine

        # The third argument (tts) is None on purpose: speech is synthesized
        # separately for Discord, so the engine must not try to speak through
        # a local speaker that doesn't exist in this process.
        engine_output = run_reply_engine(
            _db, _cfg, None, text, _dialogue_memory, language=language
        )
        answer = engine_output.strip() if engine_output else ""
        # Only non-empty replies are recorded in the dialogue memory.
        if answer:
            _dialogue_memory.add_interaction(text, answer)
    except Exception as exc:  # pragma: no cover
        return {"reply": "", "error": f"{type(exc).__name__}: {exc}"}
    return {"reply": answer}
|
|
|
|
|
|
def synthesize(text: str) -> Optional[bytes]:
    """Synthesize *text* to an in-memory WAV file using Piper.

    Args:
        text: Text to speak.

    Returns:
        The WAV file contents, or None when TTS is disabled, *text* is blank,
        the Piper voice could not be loaded, or synthesis itself failed.
        Failures are logged and reported as None so that callers can still
        deliver the text reply without audio.
    """
    if not TTS_ENABLED or not text.strip():
        return None
    _ensure_piper()
    voice = _piper_voice
    if voice is None:
        return None

    # Piper's Python API differs between releases: newer ones write a WAV via
    # synthesize_wav(text, wav_file), older ones via synthesize(text, wav_file).
    # Prefer the explicit WAV writer when the loaded voice has one.
    # NOTE(review): confirm against the piper-tts version pinned for this repo.
    write_wav = getattr(voice, "synthesize_wav", None) or voice.synthesize

    buf = io.BytesIO()
    try:
        with wave.open(buf, "wb") as wf:
            write_wav(text, wf)
    except Exception as e:  # pragma: no cover - depends on local models
        print(f"[bridge] piper synthesis failed: {type(e).__name__}: {e}", flush=True)
        return None
    return buf.getvalue()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP endpoints
|
|
# ---------------------------------------------------------------------------
|
|
@app.get("/health")
def health():
    """Report the bridge's current state as JSON.

    Only reads the module-level flags and singletons; it does not call the
    lazy initializers itself.
    """
    status = {
        "ok": True,
        "brain_enabled": BRAIN_ENABLED,
        "brain_ready": _cfg is not None,
        "brain_error": _brain_error,
        "tts_enabled": TTS_ENABLED,
    }
    return jsonify(status)
|
|
|
|
|
|
@app.post("/stt")
def http_stt():
    """Speech-to-text: WAV in, ``transcribe()`` result out as JSON.

    The WAV may be sent either as the raw request body or as the first file
    of a multipart/form-data upload. Responds 400 when no audio was sent.
    """
    # Read (and cache) the raw body first; for a multipart request the first
    # uploaded file part replaces it.
    raw = request.get_data()
    upload = next(iter(request.files.values()), None)
    if upload is not None:
        raw = upload.read()
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400
    return jsonify(transcribe(raw))
|
|
|
|
|
|
@app.post("/text")
def http_text():
    """Text turn: JSON ``{text, language?}`` in, reply (plus audio) out.

    Responds 400 when the body is not a JSON object or has no non-blank
    string ``text``. Otherwise returns the ``think()`` result, with
    ``audio_b64`` (base64 WAV) added when speech synthesis produced audio.
    """
    payload = request.get_json(silent=True)
    # Valid JSON that is not an object (a list, a number, ...) has no
    # ``text`` field; treat it like a missing body instead of crashing.
    if not isinstance(payload, dict):
        payload = {}
    text = payload.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    result = think(text, payload.get("language"))
    audio = synthesize(result.get("reply", ""))
    if audio:
        result["audio_b64"] = base64.b64encode(audio).decode("ascii")
    return jsonify(result)
|
|
|
|
|
|
@app.post("/tts")
def http_tts():
    """Text-to-speech: JSON ``{text}`` in, ``{audio_b64}`` (base64 WAV) out.

    Responds 400 when the body is not a JSON object or has no non-blank
    string ``text``, and 503 when no audio could be produced.
    """
    payload = request.get_json(silent=True)
    # Valid JSON that is not an object (a list, a number, ...) has no
    # ``text`` field; treat it like a missing body instead of crashing.
    if not isinstance(payload, dict):
        payload = {}
    text = payload.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    audio = synthesize(text)
    if not audio:
        return jsonify({"error": "tts unavailable"}), 503
    return jsonify({"audio_b64": base64.b64encode(audio).decode("ascii")})
|
|
|
|
|
|
@app.post("/converse")
def http_converse():
    """Full turn: speech in -> transcript -> reply -> speech out.

    The WAV may be sent either as the raw request body or as the first file
    of a multipart/form-data upload. Responds 400 when no audio was sent.

    The JSON response always carries ``transcript``, ``reply``, ``error`` and
    ``audio_b64`` (base64 WAV or null); ``language`` is included when a
    transcript was produced. When nothing was transcribed, ``error`` carries
    the speech-to-text error (if any), so the caller can tell silence from a
    broken STT backend.
    """
    # Read (and cache) the raw body first; for a multipart request the first
    # uploaded file part replaces it.
    raw = request.get_data()
    upload = next(iter(request.files.values()), None)
    if upload is not None:
        raw = upload.read()
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400

    stt = transcribe(raw)
    transcript = stt.get("text", "")
    if not transcript:
        return jsonify(
            {
                "transcript": "",
                "reply": "",
                "error": stt.get("error"),
                "audio_b64": None,
            }
        )

    result = think(transcript, stt.get("language"))
    audio = synthesize(result.get("reply", ""))
    return jsonify(
        {
            "transcript": transcript,
            "language": stt.get("language"),
            "reply": result.get("reply", ""),
            "error": result.get("error"),
            "audio_b64": base64.b64encode(audio).decode("ascii") if audio else None,
        }
    )
|
|
|
|
|
|
def main():
    """Log the listen address, then start the Flask server on it."""
    print(f"[bridge] listening on http://{BRIDGE_HOST}:{BRIDGE_PORT}", flush=True)
    # Serve each request on its own thread so that a slow STT call on one
    # request doesn't block /health and the other endpoints.
    server_options = {"host": BRIDGE_HOST, "port": BRIDGE_PORT, "threaded": True}
    app.run(**server_options)
|
|
|
|
|
|
# Script entry point: start the server when this file is executed directly
# (e.g. ``python bridge/server.py`` or ``python -m bridge.server``).
if __name__ == "__main__":
    main()
|