""" Jarvis Brain Bridge =================== A thin local HTTP service that exposes the existing Jarvis "brain" (speech-to-text + reply engine + text-to-speech) to the Node/bun Discord bot. The Discord layer (``bot/``) is responsible for everything Discord-specific: joining voice channels, capturing user audio, playing audio back, slash commands, and streaming the VNC screen. It does NOT contain any AI logic. Instead it calls this bridge: POST /converse (multipart wav) -> { transcript, reply, audio_b64 } POST /text (json {text}) -> { reply, audio_b64 } POST /stt (multipart wav) -> { text, language } POST /tts (json {text}) -> { audio_b64 } GET /health -> { ok, brain, stt, tts } This keeps the mature ~39k-line Python brain intact while letting Node own the Discord/voice/video integration (which is only feasible in the Node ecosystem). Run: python -m bridge.server # from repo root # or BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 python bridge/server.py """ from __future__ import annotations import base64 import io import os import sys import threading import wave from pathlib import Path from typing import Optional # Ensure repo-root/src is importable (jarvis package lives in src/jarvis) _REPO_ROOT = Path(__file__).resolve().parent.parent _SRC = _REPO_ROOT / "src" if str(_SRC) not in sys.path: sys.path.insert(0, str(_SRC)) from flask import Flask, request, jsonify app = Flask(__name__) # --------------------------------------------------------------------------- # Configuration (env-driven; see .env.example) # --------------------------------------------------------------------------- BRIDGE_HOST = os.environ.get("BRIDGE_HOST", "127.0.0.1") BRIDGE_PORT = int(os.environ.get("BRIDGE_PORT", "8765")) BRAIN_ENABLED = os.environ.get("JARVIS_BRAIN_ENABLED", "1") not in ("0", "false", "False") TTS_ENABLED = os.environ.get("JARVIS_TTS_ENABLED", "1") not in ("0", "false", "False") # --------------------------------------------------------------------------- # Lazy singletons. The first request pays the model-load cost; afterwards the # brain stays warm. A lock guards initialization so concurrent Discord events # don't double-load Whisper. # --------------------------------------------------------------------------- _init_lock = threading.Lock() _cfg = None _db = None _dialogue_memory = None _whisper = None _piper_voice = None _brain_error: Optional[str] = None def _ensure_brain(): """Initialize cfg, db, dialogue memory, and Whisper once.""" global _cfg, _db, _dialogue_memory, _whisper, _brain_error if _cfg is not None or _brain_error is not None: return with _init_lock: if _cfg is not None or _brain_error is not None: return try: from jarvis.config import load_settings from jarvis.memory.db import Database from jarvis.memory.conversation import DialogueMemory from faster_whisper import WhisperModel cfg = load_settings() db = Database(cfg.db_path, cfg.sqlite_vss_path) dialogue_memory = DialogueMemory( inactivity_timeout=getattr(cfg, "dialogue_memory_timeout", 300.0), max_interactions=20, ) device = os.environ.get("WHISPER_DEVICE", "auto") compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto") try: whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute) except Exception as ge: # GPU not available / unsupported -> fall back to CPU so the # bridge still works without a GPU passed to the container. if device != "cpu": print(f"[bridge] whisper device='{device}' failed ({ge}); falling back to CPU", flush=True) whisper = WhisperModel(cfg.whisper_model, device="cpu", compute_type="int8") else: raise _cfg, _db, _dialogue_memory, _whisper = cfg, db, dialogue_memory, whisper print(f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})", flush=True) except Exception as e: # pragma: no cover - depends on local models _brain_error = f"{type(e).__name__}: {e}" print(f"[bridge] brain init FAILED: {_brain_error}", flush=True) def _ensure_piper(): """Initialize the Piper TTS voice once (independent of the brain).""" global _piper_voice if _piper_voice is not None or not TTS_ENABLED: return with _init_lock: if _piper_voice is not None: return try: from piper import PiperVoice # piper-tts package model_path = os.environ.get("TTS_PIPER_MODEL_PATH") if not model_path: # Fall back to jarvis' default piper model location. from jarvis.output.tts import _get_default_piper_model_path # type: ignore model_path = _get_default_piper_model_path() if not model_path or not Path(model_path).exists(): raise FileNotFoundError( f"Piper voice model not found at '{model_path}'. " f"Set TTS_PIPER_MODEL_PATH in .env or run scripts/setup_models.sh" ) _piper_voice = PiperVoice.load(model_path) print(f"[bridge] piper TTS ready ({model_path})", flush=True) except Exception as e: # pragma: no cover print(f"[bridge] piper init failed (TTS disabled): {e}", flush=True) # --------------------------------------------------------------------------- # Core operations # --------------------------------------------------------------------------- def _read_wav_pcm(raw: bytes) -> tuple[bytes, int]: """Decode an incoming WAV blob to mono 16-bit PCM @ its sample rate.""" with wave.open(io.BytesIO(raw), "rb") as wf: sr = wf.getframerate() frames = wf.readframes(wf.getnframes()) return frames, sr def transcribe(wav_bytes: bytes) -> dict: _ensure_brain() if _whisper is None: return {"text": "", "language": None, "error": _brain_error or "stt unavailable"} import numpy as np pcm, sr = _read_wav_pcm(wav_bytes) audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0 # faster-whisper expects 16kHz mono float32; linearly resample if needed. if sr != 16000 and audio.size: n_out = int(round(audio.size * 16000 / sr)) if n_out > 0: x_old = np.linspace(0.0, 1.0, num=audio.size, endpoint=False) x_new = np.linspace(0.0, 1.0, num=n_out, endpoint=False) audio = np.interp(x_new, x_old, audio).astype(np.float32) segments, info = _whisper.transcribe(audio, beam_size=1) text = "".join(seg.text for seg in segments).strip() return {"text": text, "language": getattr(info, "language", None)} def think(text: str, language: Optional[str] = None) -> dict: """Run the Jarvis reply engine on a piece of text.""" if not BRAIN_ENABLED: return {"reply": text, "error": "brain disabled (JARVIS_BRAIN_ENABLED=0)"} _ensure_brain() if _cfg is None: return {"reply": "", "error": _brain_error or "brain unavailable"} try: from jarvis.reply.engine import run_reply_engine # tts=None: we do our own Discord-side synthesis, the engine must not # try to speak to a local speaker that doesn't exist in this process. reply = run_reply_engine( _db, _cfg, None, text, _dialogue_memory, language=language ) reply = (reply or "").strip() if reply: _dialogue_memory.add_interaction(text, reply) return {"reply": reply} except Exception as e: # pragma: no cover return {"reply": "", "error": f"{type(e).__name__}: {e}"} def synthesize(text: str) -> Optional[bytes]: """Synthesize text to a 16-bit PCM WAV using Piper. Returns None if TTS off.""" if not TTS_ENABLED or not text.strip(): return None _ensure_piper() if _piper_voice is None: return None buf = io.BytesIO() with wave.open(buf, "wb") as wf: # piper-tts API: synthesize_wav(text, wav_file) writes a full WAV; # plain synthesize() returns AudioChunks and takes a SynthesisConfig # (NOT a wav file) as its 2nd arg. _piper_voice.synthesize_wav(text, wf) return buf.getvalue() # --------------------------------------------------------------------------- # HTTP endpoints # --------------------------------------------------------------------------- @app.get("/health") def health(): return jsonify( { "ok": True, "brain_enabled": BRAIN_ENABLED, "brain_ready": _cfg is not None, "brain_error": _brain_error, "tts_enabled": TTS_ENABLED, } ) @app.post("/stt") def http_stt(): raw = request.get_data() if not raw: return jsonify({"error": "empty body; send a WAV blob"}), 400 return jsonify(transcribe(raw)) @app.post("/text") def http_text(): data = request.get_json(silent=True) or {} text = (data.get("text") or "").strip() if not text: return jsonify({"error": "missing 'text'"}), 400 result = think(text, data.get("language")) audio = synthesize(result.get("reply", "")) if audio: result["audio_b64"] = base64.b64encode(audio).decode("ascii") return jsonify(result) @app.post("/tts") def http_tts(): data = request.get_json(silent=True) or {} text = (data.get("text") or "").strip() if not text: return jsonify({"error": "missing 'text'"}), 400 audio = synthesize(text) if not audio: return jsonify({"error": "tts unavailable"}), 503 return jsonify({"audio_b64": base64.b64encode(audio).decode("ascii")}) @app.post("/converse") def http_converse(): """Full turn: speech in -> transcript -> reply -> speech out.""" raw = request.get_data() if not raw: return jsonify({"error": "empty body; send a WAV blob"}), 400 stt = transcribe(raw) transcript = stt.get("text", "") if not transcript: return jsonify({"transcript": "", "reply": "", "audio_b64": None}) result = think(transcript, stt.get("language")) audio = synthesize(result.get("reply", "")) return jsonify( { "transcript": transcript, "language": stt.get("language"), "reply": result.get("reply", ""), "error": result.get("error"), "audio_b64": base64.b64encode(audio).decode("ascii") if audio else None, } ) def main(): print(f"[bridge] listening on http://{BRIDGE_HOST}:{BRIDGE_PORT}", flush=True) # threaded=True so STT (slow) on one request doesn't block /health, etc. app.run(host=BRIDGE_HOST, port=BRIDGE_PORT, threaded=True) if __name__ == "__main__": main()