Files
javis_bot/bridge/server.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

275 lines
10 KiB
Python

"""
Jarvis Brain Bridge
===================
A thin local HTTP service that exposes the existing Jarvis "brain"
(speech-to-text + reply engine + text-to-speech) to the Node/bun Discord bot.
The Discord layer (``bot/``) is responsible for everything Discord-specific:
joining voice channels, capturing user audio, playing audio back, slash
commands, and streaming the VNC screen. It does NOT contain any AI logic.
Instead it calls this bridge:
POST /converse (WAV bytes as the request body) -> { transcript, language, reply, error, audio_b64 }
POST /text (json {text, language?}) -> { reply, error?, audio_b64? }
POST /stt (WAV bytes as the request body) -> { text, language, error? }
POST /tts (json {text}) -> { audio_b64 }
GET /health -> { ok, brain_enabled, brain_ready, brain_error, tts_enabled }
This keeps the mature ~39k-line Python brain intact while letting Node own the
Discord/voice/video integration (which is only feasible in the Node ecosystem).
Run:
python -m bridge.server # from repo root
# or
BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 python bridge/server.py
"""
from __future__ import annotations
import base64
import io
import os
import sys
import threading
import wave
from pathlib import Path
from typing import Optional
# The jarvis package is imported from <parent-of-this-package>/src, so that
# directory is placed at the front of sys.path (once) before any jarvis import.
_REPO_ROOT = Path(__file__).resolve().parents[1]
_SRC = _REPO_ROOT.joinpath("src")
if not any(entry == str(_SRC) for entry in sys.path):
    sys.path.insert(0, str(_SRC))
from flask import Flask, request, jsonify

# Flask application object; the @app.get / @app.post handlers in this module
# register their routes on it, and main() runs it.
app = Flask(__name__)
# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------
# Spellings (compared case-insensitively, surrounding whitespace ignored) that
# switch a boolean feature flag off.
_FALSY_ENV_VALUES = frozenset({"0", "false", "no", "off"})


def _env_flag(name: str, default: bool = True) -> bool:
    """Read a boolean feature flag from the environment.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable is not set at all.

    Returns:
        ``False`` when the variable holds one of ``0``/``false``/``no``/``off``
        in any letter case; ``True`` for every other value (including the
        empty string, which keeps the previous "anything else means on"
        behaviour).
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in _FALSY_ENV_VALUES


BRIDGE_HOST = os.environ.get("BRIDGE_HOST", "127.0.0.1")
# int() raises ValueError at import time if BRIDGE_PORT is not numeric.
BRIDGE_PORT = int(os.environ.get("BRIDGE_PORT", "8765"))
BRAIN_ENABLED = _env_flag("JARVIS_BRAIN_ENABLED")
TTS_ENABLED = _env_flag("JARVIS_TTS_ENABLED")
# ---------------------------------------------------------------------------
# Lazy singletons. The first request pays the model-load cost; afterwards the
# brain stays warm. A lock guards initialization so concurrent Discord events
# don't double-load Whisper.
# ---------------------------------------------------------------------------
# One lock is shared by _ensure_brain() and _ensure_piper().
_init_lock = threading.Lock()
# Settings object returned by jarvis.config.load_settings(); None until loaded.
_cfg = None
# jarvis.memory.db.Database instance handed to the reply engine.
_db = None
# jarvis.memory.conversation.DialogueMemory instance handed to the reply engine.
_dialogue_memory = None
# faster_whisper.WhisperModel used by transcribe().
_whisper = None
# piper.PiperVoice used by synthesize(); stays None when loading fails.
_piper_voice = None
# "<ExceptionType>: <message>" of a failed brain initialization, else None.
_brain_error: Optional[str] = None
def _ensure_brain():
    """Initialize cfg, db, dialogue memory, and Whisper once.

    On success the module-level ``_db``, ``_dialogue_memory``, ``_whisper``
    and ``_cfg`` are populated. On failure the exception is recorded in
    ``_brain_error`` and initialization is not attempted again; callers detect
    that case by finding the singletons still ``None``.
    """
    global _cfg, _db, _dialogue_memory, _whisper, _brain_error

    # Unlocked fast path: already loaded, or a previous attempt already failed.
    if _cfg is not None or _brain_error is not None:
        return

    with _init_lock:
        # Re-check under the lock: another thread may have finished meanwhile.
        if _cfg is not None or _brain_error is not None:
            return
        try:
            from jarvis.config import load_settings
            from jarvis.memory.db import Database
            from jarvis.memory.conversation import DialogueMemory
            from faster_whisper import WhisperModel

            cfg = load_settings()
            db = Database(cfg.db_path, cfg.sqlite_vss_path)
            dialogue_memory = DialogueMemory(
                inactivity_timeout=getattr(cfg, "dialogue_memory_timeout", 300.0),
                max_interactions=20,
            )
            device = os.environ.get("WHISPER_DEVICE", "auto")
            compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto")
            whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute)

            # _cfg is the "ready" sentinel read by the unlocked fast path above
            # and by think(), so it must be published LAST: a thread that sees
            # _cfg set is then guaranteed to also see the other singletons.
            _db = db
            _dialogue_memory = dialogue_memory
            _whisper = whisper
            _cfg = cfg
            print(f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})", flush=True)
        except Exception as e:  # pragma: no cover - depends on local models
            _brain_error = f"{type(e).__name__}: {e}"
            print(f"[bridge] brain init FAILED: {_brain_error}", flush=True)
# "<ExceptionType>: <message>" of a failed Piper load. Once set, loading is not
# retried, mirroring how _ensure_brain() treats _brain_error.
_piper_error: Optional[str] = None


def _ensure_piper():
    """Initialize the Piper TTS voice once (independent of the brain).

    The model path comes from ``TTS_PIPER_MODEL_PATH`` or, when that is unset,
    from jarvis' own default-path helper. If loading fails, the failure is
    logged once and remembered in ``_piper_error`` so later requests return
    immediately instead of repeating the import/load under the shared lock;
    ``_piper_voice`` then stays ``None``.
    """
    global _piper_voice, _piper_error

    if _piper_voice is not None or _piper_error is not None or not TTS_ENABLED:
        return

    with _init_lock:
        # Re-check under the lock: another thread may have finished meanwhile.
        if _piper_voice is not None or _piper_error is not None:
            return
        try:
            from piper import PiperVoice  # piper-tts package

            model_path = os.environ.get("TTS_PIPER_MODEL_PATH")
            if not model_path:
                # Fall back to jarvis' default piper model location.
                from jarvis.output.tts import _get_default_piper_model_path  # type: ignore

                model_path = _get_default_piper_model_path()
            if not model_path or not Path(model_path).exists():
                raise FileNotFoundError(
                    f"Piper voice model not found at '{model_path}'. "
                    f"Set TTS_PIPER_MODEL_PATH in .env or run scripts/setup_models.sh"
                )
            _piper_voice = PiperVoice.load(model_path)
            print(f"[bridge] piper TTS ready ({model_path})", flush=True)
        except Exception as e:  # pragma: no cover
            _piper_error = f"{type(e).__name__}: {e}"
            print(f"[bridge] piper init failed (TTS disabled): {e}", flush=True)
# ---------------------------------------------------------------------------
# Core operations
# ---------------------------------------------------------------------------
def _read_wav_pcm(raw: bytes) -> tuple[bytes, int]:
"""Decode an incoming WAV blob to mono 16-bit PCM @ its sample rate."""
with wave.open(io.BytesIO(raw), "rb") as wf:
sr = wf.getframerate()
frames = wf.readframes(wf.getnframes())
return frames, sr
def transcribe(wav_bytes: bytes) -> dict:
    """Transcribe a WAV blob with the shared Whisper model.

    Args:
        wav_bytes: Complete WAV file contents; samples are decoded as 16-bit PCM.

    Returns:
        ``{"text", "language"}`` on success. When STT is unavailable or the
        audio cannot be decoded, ``text`` is empty, ``language`` is ``None``
        and an ``"error"`` key describes the problem. Audio with no samples
        yields empty text without invoking the model.
    """
    _ensure_brain()
    if _whisper is None:
        return {"text": "", "language": None, "error": _brain_error or "stt unavailable"}

    import numpy as np

    try:
        pcm, sr = _read_wav_pcm(wav_bytes)
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    except (wave.Error, EOFError, ValueError) as e:
        # Malformed upload: report it in the same shape as other STT failures
        # rather than letting the exception surface as an HTTP 500.
        return {"text": "", "language": None, "error": f"invalid WAV audio: {e}"}
    if sr <= 0:
        return {"text": "", "language": None, "error": f"invalid WAV audio: sample rate {sr}"}

    # faster-whisper expects 16kHz mono float32; resample if needed.
    # This picks the source sample at each target position (no interpolation
    # or low-pass filtering), which is cheap but not alias-free.
    if sr != 16000 and audio.size:
        ratio = 16000 / sr
        idx = (np.arange(int(audio.size * ratio)) / ratio).astype(np.int64)
        idx = np.clip(idx, 0, audio.size - 1)
        audio = audio[idx]

    if not audio.size:
        # Nothing to transcribe (empty or sub-sample-length clip).
        return {"text": "", "language": None}

    segments, info = _whisper.transcribe(audio, beam_size=1)
    text = "".join(seg.text for seg in segments).strip()
    return {"text": text, "language": getattr(info, "language", None)}
def think(text: str, language: Optional[str] = None) -> dict:
    """Produce a reply to ``text`` using the Jarvis reply engine.

    Returns a dict with a ``"reply"`` string. An ``"error"`` key is added when
    the brain is disabled (the input is echoed back as the reply), when the
    brain failed to initialize, or when the engine raised (reply is empty in
    the latter two cases). A non-empty reply is also recorded in the shared
    dialogue memory.
    """
    if not BRAIN_ENABLED:
        return {"reply": text, "error": "brain disabled (JARVIS_BRAIN_ENABLED=0)"}

    _ensure_brain()
    if _cfg is None:
        return {"reply": "", "error": _brain_error or "brain unavailable"}

    try:
        from jarvis.reply.engine import run_reply_engine

        # The third argument (tts) is None on purpose: speech is synthesized
        # separately for Discord, so the engine must not attempt to use a
        # local speaker that this process does not have.
        answer = run_reply_engine(_db, _cfg, None, text, _dialogue_memory, language=language)
        answer = answer.strip() if answer else ""
        if answer:
            _dialogue_memory.add_interaction(text, answer)
    except Exception as e:  # pragma: no cover
        return {"reply": "", "error": f"{type(e).__name__}: {e}"}
    return {"reply": answer}
def synthesize(text: str) -> Optional[bytes]:
    """Synthesize text to a 16-bit PCM WAV using Piper.

    Returns:
        The WAV file bytes, or ``None`` when TTS is disabled, ``text`` is
        blank, the Piper voice could not be loaded, or synthesis itself
        failed (the failure is logged). Returning ``None`` instead of raising
        lets the HTTP handlers still deliver the text reply without audio.
    """
    if not TTS_ENABLED or not (text or "").strip():
        return None
    _ensure_piper()
    if _piper_voice is None:
        return None

    # Newer piper-tts releases write a WAV through synthesize_wav(text, wav_file);
    # older ones use synthesize(text, wav_file). Prefer the former when present.
    write_wav = getattr(_piper_voice, "synthesize_wav", None) or _piper_voice.synthesize

    buf = io.BytesIO()
    try:
        with wave.open(buf, "wb") as wf:
            write_wav(text, wf)
    except Exception as e:  # pragma: no cover - depends on the local Piper install
        print(f"[bridge] piper synthesis failed: {type(e).__name__}: {e}", flush=True)
        return None
    return buf.getvalue()
# ---------------------------------------------------------------------------
# HTTP endpoints
# ---------------------------------------------------------------------------
@app.get("/health")
def health():
    """Report bridge status without triggering any model loading."""
    status = {
        "ok": True,
        "brain_enabled": BRAIN_ENABLED,
        # The brain counts as ready once its settings object has been loaded.
        "brain_ready": _cfg is not None,
        "brain_error": _brain_error,
        "tts_enabled": TTS_ENABLED,
    }
    return jsonify(status)
@app.post("/stt")
def http_stt():
    """Speech-to-text: WAV in -> ``{text, language}`` (plus ``error`` on failure).

    The WAV may be sent either as the raw request body or as the first file
    part of a ``multipart/form-data`` upload. Responds 400 when no audio
    bytes were received.
    """
    # Read (and cache) the body first so a raw upload is never consumed by
    # form parsing, whatever Content-Type the client happened to send.
    raw = request.get_data()
    if raw and request.mimetype == "multipart/form-data":
        # Multipart envelope: hand the decoder the uploaded file, not the envelope.
        upload = next(iter(request.files.values()), None)
        raw = upload.read() if upload is not None else b""
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400
    return jsonify(transcribe(raw))
@app.post("/text")
def http_text():
    """Text turn: ``{text, language?}`` JSON in -> reply (and audio when available).

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string. Otherwise returns the ``think()`` result with an
    ``audio_b64`` key (base64 WAV) added when synthesis produced audio.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, number, ...) has no fields.
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400

    result = think(text, data.get("language"))
    audio = synthesize(result.get("reply", ""))
    if audio:
        result["audio_b64"] = base64.b64encode(audio).decode("ascii")
    return jsonify(result)
@app.post("/tts")
def http_tts():
    """Text-to-speech: ``{text}`` JSON in -> ``{audio_b64}`` (base64 WAV).

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string, and 503 when no audio could be produced.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, number, ...) has no fields.
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400

    audio = synthesize(text)
    if not audio:
        return jsonify({"error": "tts unavailable"}), 503
    return jsonify({"audio_b64": base64.b64encode(audio).decode("ascii")})
@app.post("/converse")
def http_converse():
    """Full turn: speech in -> transcript -> reply -> speech out.

    The WAV may be sent either as the raw request body or as the first file
    part of a ``multipart/form-data`` upload. Responds 400 when no audio
    bytes were received. The JSON response always carries ``transcript``,
    ``language``, ``reply``, ``error`` and ``audio_b64``; when nothing was
    transcribed, ``reply`` is empty, ``audio_b64`` is ``None`` and ``error``
    carries any STT failure.
    """
    # Read (and cache) the body first so a raw upload is never consumed by
    # form parsing, whatever Content-Type the client happened to send.
    raw = request.get_data()
    if raw and request.mimetype == "multipart/form-data":
        # Multipart envelope: hand the decoder the uploaded file, not the envelope.
        upload = next(iter(request.files.values()), None)
        raw = upload.read() if upload is not None else b""
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400

    stt = transcribe(raw)
    transcript = stt.get("text", "")
    if not transcript:
        # Silence or an STT failure: skip the reply engine, but do not hide
        # the reason from the caller.
        return jsonify(
            {
                "transcript": "",
                "language": stt.get("language"),
                "reply": "",
                "error": stt.get("error"),
                "audio_b64": None,
            }
        )

    result = think(transcript, stt.get("language"))
    audio = synthesize(result.get("reply", ""))
    return jsonify(
        {
            "transcript": transcript,
            "language": stt.get("language"),
            "reply": result.get("reply", ""),
            "error": result.get("error"),
            "audio_b64": base64.b64encode(audio).decode("ascii") if audio else None,
        }
    )
def main():
    """Log the bind address, then run the Flask server (blocks until it exits)."""
    address = f"http://{BRIDGE_HOST}:{BRIDGE_PORT}"
    print(f"[bridge] listening on {address}", flush=True)
    # Threaded mode: a slow STT request must not stall quick endpoints such
    # as /health.
    app.run(host=BRIDGE_HOST, port=BRIDGE_PORT, threaded=True)


if __name__ == "__main__":
    main()