Some checks failed
Release / semantic-release (push) Successful in 22s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 9m56s
Release / build-linux (push) Failing after 7m15s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
- voice.ts: reply playback is now a FIFO queue (AudioPlayerStatus.Idle drains
it) so concurrent speakers no longer cut each other's replies off.
- selfbot.ts: rewritten against the REAL @dank074/discord-video-stream v6 API
(verified from its d.ts): prepareStream(input, opts, signal)->{command,output},
playStream(output, streamer, {type:"go-live"}, signal), Streamer.joinVoice.
x11grab via customInputOptions; optional NVENC encode (RTX 5050) via exported
`nvenc`. package.json pinned to ^6.0.0 (was a wrong ^4.2.1).
- Dockerfile: dropped the hardcoded python3.12 LD_LIBRARY_PATH. faster-whisper
>=1.1 self-locates the pip CUDA libs; ldconfig (full path, glob) registers
them as a robust fallback. Verified: ld.so cache lists libcublas/libcudnn and
GPU whisper works with LD_LIBRARY_PATH empty.
- bridge: STT resample 48k->16k upgraded from nearest-neighbor to linear
(np.interp).
Verified: tsc clean, image builds, GPU whisper OK via ldconfig, compose valid.
287 lines
11 KiB
Python
287 lines
11 KiB
Python
"""
|
|
Jarvis Brain Bridge
|
|
===================
|
|
|
|
A thin local HTTP service that exposes the existing Jarvis "brain"
|
|
(speech-to-text + reply engine + text-to-speech) to the Node/bun Discord bot.
|
|
|
|
The Discord layer (``bot/``) is responsible for everything Discord-specific:
|
|
joining voice channels, capturing user audio, playing audio back, slash
|
|
commands, and streaming the VNC screen. It does NOT contain any AI logic.
|
|
Instead it calls this bridge:
|
|
|
|
POST /converse (raw wav body) -> { transcript, language, reply, error, audio_b64 }
|
|
POST /text (json {text}) -> { reply, audio_b64 }
|
|
POST /stt (raw wav body) -> { text, language }
|
|
POST /tts (json {text}) -> { audio_b64 }
|
|
GET /health -> { ok, brain_enabled, brain_ready, brain_error, tts_enabled }
|
|
|
|
This keeps the mature ~39k-line Python brain intact while letting Node own the
|
|
Discord/voice/video integration (which is only feasible in the Node ecosystem).
|
|
|
|
Run:
|
|
python -m bridge.server # from repo root
|
|
# or
|
|
BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 python bridge/server.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import io
|
|
import os
|
|
import sys
|
|
import threading
|
|
import wave
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Ensure repo-root/src is importable (jarvis package lives in src/jarvis)
|
|
# This file sits one directory below the repo root; the jarvis package is
# imported from <repo>/src, so that directory is put at the front of sys.path
# unless it is already listed.
_REPO_ROOT = Path(__file__).resolve().parent.parent
_SRC = _REPO_ROOT.joinpath("src")
if not any(entry == str(_SRC) for entry in sys.path):
    sys.path.insert(0, str(_SRC))
|
|
|
|
from flask import Flask, request, jsonify
|
|
|
|
# Flask application object; the @app.get/@app.post decorators in this module
# register their handlers on it and main() runs it.
app = Flask(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration (env-driven; see .env.example)
|
|
# ---------------------------------------------------------------------------
|
|
def _env_flag(name: str, default: bool = True) -> bool:
    """Read a boolean feature flag from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or blank.

    Returns:
        False when the value is "0", "false", "no" or "off" (compared
        case-insensitively, surrounding whitespace ignored); True for any
        other non-blank value; ``default`` otherwise.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    return raw.strip().lower() not in ("0", "false", "no", "off")


# Address the HTTP server binds to.
BRIDGE_HOST = os.environ.get("BRIDGE_HOST", "127.0.0.1")
BRIDGE_PORT = int(os.environ.get("BRIDGE_PORT", "8765"))
# Feature switches; both default to enabled.
BRAIN_ENABLED = _env_flag("JARVIS_BRAIN_ENABLED")
TTS_ENABLED = _env_flag("JARVIS_TTS_ENABLED")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Lazy singletons. The first request pays the model-load cost; afterwards the
|
|
# brain stays warm. A lock guards initialization so concurrent Discord events
|
|
# don't double-load Whisper.
|
|
# ---------------------------------------------------------------------------
|
|
# One lock shared by _ensure_brain() and _ensure_piper() for first-time setup.
_init_lock = threading.Lock()

# Brain components; every one stays None until _ensure_brain() succeeds.
_cfg = _db = _dialogue_memory = _whisper = None

# Piper voice handle; stays None until _ensure_piper() loads it.
_piper_voice = None

# "<ExceptionType>: <message>" of a failed brain init, otherwise None.
_brain_error: Optional[str] = None
|
|
|
|
|
|
def _ensure_brain():
    """Initialize cfg, db, dialogue memory, and Whisper exactly once.

    The first caller pays the load cost under ``_init_lock``; later callers
    return immediately. A failure is stored in ``_brain_error`` as
    "<ExceptionType>: <message>" and is not retried, because the guards below
    return as soon as ``_brain_error`` is set.

    Whisper is created with the device and compute type from the
    ``WHISPER_DEVICE`` / ``WHISPER_COMPUTE_TYPE`` environment variables (both
    default to "auto"). If that fails and the device was not already "cpu",
    the model is loaded again on CPU with int8 compute.
    """
    global _cfg, _db, _dialogue_memory, _whisper, _brain_error
    # Lock-free fast path: already initialized, or already known to be broken.
    if _cfg is not None or _brain_error is not None:
        return
    with _init_lock:
        # Re-check under the lock; another thread may have finished meanwhile.
        if _cfg is not None or _brain_error is not None:
            return
        try:
            from jarvis.config import load_settings
            from jarvis.memory.db import Database
            from jarvis.memory.conversation import DialogueMemory
            from faster_whisper import WhisperModel

            cfg = load_settings()
            db = Database(cfg.db_path, cfg.sqlite_vss_path)
            dialogue_memory = DialogueMemory(
                inactivity_timeout=getattr(cfg, "dialogue_memory_timeout", 300.0),
                max_interactions=20,
            )
            device = os.environ.get("WHISPER_DEVICE", "auto")
            compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto")
            try:
                whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute)
            except Exception as ge:
                # Nothing left to fall back to when CPU itself was requested.
                if device == "cpu":
                    raise
                # GPU not available / unsupported -> fall back to CPU so the
                # bridge still works without a GPU passed to the container.
                print(f"[bridge] whisper device='{device}' failed ({ge}); falling back to CPU", flush=True)
                whisper = WhisperModel(cfg.whisper_model, device="cpu", compute_type="int8")

            # _cfg is the readiness sentinel read by the lock-free fast path,
            # so it must be stored LAST: a thread that sees _cfg set must also
            # see the db, dialogue memory and Whisper model.
            _db, _dialogue_memory, _whisper = db, dialogue_memory, whisper
            _cfg = cfg
            print(f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})", flush=True)
        except Exception as e:  # pragma: no cover - depends on local models
            _brain_error = f"{type(e).__name__}: {e}"
            print(f"[bridge] brain init FAILED: {_brain_error}", flush=True)
|
|
|
|
|
|
def _ensure_piper():
    """Load the Piper TTS voice on first use; does not touch the brain globals.

    Does nothing when TTS is disabled or a voice is already loaded. The model
    path comes from ``TTS_PIPER_MODEL_PATH``, falling back to jarvis' default
    location. A load failure is only logged, so ``_piper_voice`` stays None.
    """
    global _piper_voice
    if not TTS_ENABLED or _piper_voice is not None:
        return
    with _init_lock:
        # Another thread may have loaded the voice while we waited.
        if _piper_voice is not None:
            return
        try:
            from piper import PiperVoice  # piper-tts package

            voice_path = os.environ.get("TTS_PIPER_MODEL_PATH")
            if not voice_path:
                # No explicit path configured: use jarvis' default model location.
                from jarvis.output.tts import _get_default_piper_model_path  # type: ignore

                voice_path = _get_default_piper_model_path()
            model_missing = not voice_path or not Path(voice_path).exists()
            if model_missing:
                message = (
                    f"Piper voice model not found at '{voice_path}'. "
                    "Set TTS_PIPER_MODEL_PATH in .env or run scripts/setup_models.sh"
                )
                raise FileNotFoundError(message)
            _piper_voice = PiperVoice.load(voice_path)
            print(f"[bridge] piper TTS ready ({voice_path})", flush=True)
        except Exception as exc:  # pragma: no cover
            print(f"[bridge] piper init failed (TTS disabled): {exc}", flush=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core operations
|
|
# ---------------------------------------------------------------------------
|
|
def _read_wav_pcm(raw: bytes) -> tuple[bytes, int]:
    """Decode an incoming WAV blob to mono 16-bit PCM at its own sample rate.

    Mono 16-bit input is returned untouched. Other inputs are converted so the
    result really is mono int16: 8/24/32-bit samples are rescaled to 16 bits
    and multi-channel audio is down-mixed by averaging the channels.

    Args:
        raw: Complete WAV file contents.

    Returns:
        ``(pcm, sample_rate)`` where ``pcm`` is little-endian int16 mono data.

    Raises:
        wave.Error / EOFError: The blob is not a readable WAV file.
        ValueError: The sample width is not 1, 2, 3 or 4 bytes.
    """
    import numpy as np  # imported lazily, as transcribe() does

    with wave.open(io.BytesIO(raw), "rb") as wf:
        sr = wf.getframerate()
        channels = wf.getnchannels()
        width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    # Fast path: already in the target format, so skip the numpy round trip.
    if channels == 1 and width == 2:
        return frames, sr

    # Drop a trailing partial frame (truncated upload) so reshaping is exact.
    frame_bytes = channels * width
    payload = frames[: len(frames) - (len(frames) % frame_bytes)]

    if width == 1:
        # 8-bit WAV samples are unsigned with a bias of 128.
        samples = (np.frombuffer(payload, dtype=np.uint8).astype(np.int32) - 128) * 256
    elif width == 2:
        samples = np.frombuffer(payload, dtype="<i2").astype(np.int32)
    elif width == 3:
        # Keep the top 16 bits: bytes 1-2 of each little-endian 24-bit sample
        # already form a little-endian int16.
        triplets = np.frombuffer(payload, dtype=np.uint8).reshape(-1, 3)
        samples = triplets[:, 1:].copy().view("<i2").reshape(-1).astype(np.int32)
    elif width == 4:
        samples = np.frombuffer(payload, dtype="<i4").astype(np.int32) >> 16
    else:
        raise ValueError(f"unsupported WAV sample width: {width} bytes")

    if channels > 1:
        # Samples are interleaved per frame; average each frame's channels.
        samples = samples.reshape(-1, channels).sum(axis=1) // channels

    return samples.astype("<i2").tobytes(), sr
|
|
|
|
|
|
def transcribe(wav_bytes: bytes) -> dict:
    """Run speech-to-text on a WAV blob.

    Args:
        wav_bytes: Complete WAV file contents.

    Returns:
        ``{"text", "language"}`` on success. When the brain/Whisper is not
        available or the blob cannot be decoded, ``text`` is empty and an
        ``"error"`` string is included. Empty audio yields empty text without
        invoking Whisper.
    """
    _ensure_brain()
    if _whisper is None:
        return {"text": "", "language": None, "error": _brain_error or "stt unavailable"}
    import numpy as np

    # The body comes straight from an HTTP client, so a malformed upload must
    # produce a structured error instead of an unhandled exception.
    try:
        pcm, sr = _read_wav_pcm(wav_bytes)
    except (wave.Error, EOFError, ValueError) as e:
        return {"text": "", "language": None, "error": f"invalid WAV: {type(e).__name__}: {e}"}
    if sr <= 0:
        return {"text": "", "language": None, "error": f"invalid WAV: bad sample rate {sr}"}

    # Ignore a dangling odd byte so the int16 view below always lines up.
    pcm = pcm[: len(pcm) - (len(pcm) % 2)]
    audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    if not audio.size:
        return {"text": "", "language": None}

    # faster-whisper expects 16kHz mono float32; linearly resample if needed.
    if sr != 16000:
        n_out = int(round(audio.size * 16000 / sr))
        if n_out > 0:
            # Both grids span [0, 1) so np.interp maps new positions onto old.
            x_old = np.linspace(0.0, 1.0, num=audio.size, endpoint=False)
            x_new = np.linspace(0.0, 1.0, num=n_out, endpoint=False)
            audio = np.interp(x_new, x_old, audio).astype(np.float32)

    segments, info = _whisper.transcribe(audio, beam_size=1)
    text = "".join(seg.text for seg in segments).strip()
    return {"text": text, "language": getattr(info, "language", None)}
|
|
|
|
|
|
def think(text: str, language: Optional[str] = None) -> dict:
    """Produce a reply to ``text`` using the Jarvis reply engine.

    Returns ``{"reply": ...}``, with an added ``"error"`` string when the brain
    is disabled (the input is echoed back as the reply), unavailable, or the
    engine raised (reply is empty in the last two cases).
    """
    if not BRAIN_ENABLED:
        return {"reply": text, "error": "brain disabled (JARVIS_BRAIN_ENABLED=0)"}

    _ensure_brain()
    if _cfg is None:
        return {"reply": "", "error": _brain_error or "brain unavailable"}

    try:
        from jarvis.reply.engine import run_reply_engine

        # The third argument (tts) is None: we do our own Discord-side
        # synthesis, the engine must not try to speak to a local speaker that
        # doesn't exist in this process.
        raw_reply = run_reply_engine(
            _db, _cfg, None, text, _dialogue_memory, language=language
        )
        answer = raw_reply.strip() if raw_reply else ""
        if answer:
            # Only non-empty replies are stored in the dialogue memory.
            _dialogue_memory.add_interaction(text, answer)
    except Exception as exc:  # pragma: no cover
        return {"reply": "", "error": f"{type(exc).__name__}: {exc}"}
    return {"reply": answer}
|
|
|
|
|
|
def synthesize(text: str) -> Optional[bytes]:
    """Synthesize text to a WAV blob using Piper.

    Args:
        text: Text to speak.

    Returns:
        The WAV file bytes, or None when TTS is disabled, ``text`` is blank,
        the Piper voice could not be loaded, or synthesis itself failed.
        Failures are logged rather than raised so callers that already hold a
        text reply can still return it without audio.
    """
    if not TTS_ENABLED or not text.strip():
        return None
    _ensure_piper()
    if _piper_voice is None:
        return None
    buf = io.BytesIO()
    try:
        with wave.open(buf, "wb") as wf:
            # piper-tts API: synthesize_wav(text, wav_file) writes a full WAV;
            # plain synthesize() returns AudioChunks and takes a SynthesisConfig
            # (NOT a wav file) as its 2nd arg.
            _piper_voice.synthesize_wav(text, wf)
    except Exception as e:  # pragma: no cover - depends on the local voice model
        print(f"[bridge] piper synthesis failed: {type(e).__name__}: {e}", flush=True)
        return None
    return buf.getvalue()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP endpoints
|
|
# ---------------------------------------------------------------------------
|
|
@app.get("/health")
def health():
    """Report bridge status without triggering any model load."""
    status = {
        "ok": True,
        "brain_enabled": BRAIN_ENABLED,
        # True once the brain globals have been populated.
        "brain_ready": _cfg is not None,
        "brain_error": _brain_error,
        "tts_enabled": TTS_ENABLED,
    }
    return jsonify(status)
|
|
|
|
|
|
@app.post("/stt")
def http_stt():
    """Transcribe the WAV blob sent as the raw request body."""
    body = request.get_data()
    if body:
        return jsonify(transcribe(body))
    return jsonify({"error": "empty body; send a WAV blob"}), 400
|
|
|
|
|
|
@app.post("/text")
def http_text():
    """Reply to a JSON ``{"text": ...}`` request, attaching audio when available.

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string. Otherwise returns the dict from ``think`` plus
    ``audio_b64`` when synthesis produced audio.
    """
    data = request.get_json(silent=True)
    # Valid JSON can also be a list, string or number; only an object has keys.
    if not isinstance(data, dict):
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    result = think(text, data.get("language"))
    audio = synthesize(result.get("reply", ""))
    if audio:
        result["audio_b64"] = base64.b64encode(audio).decode("ascii")
    return jsonify(result)
|
|
|
|
|
|
@app.post("/tts")
def http_tts():
    """Synthesize a JSON ``{"text": ...}`` request to base64-encoded WAV audio.

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string, and 503 when no audio could be produced.
    """
    data = request.get_json(silent=True)
    # Valid JSON can also be a list, string or number; only an object has keys.
    if not isinstance(data, dict):
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    audio = synthesize(text)
    if not audio:
        return jsonify({"error": "tts unavailable"}), 503
    return jsonify({"audio_b64": base64.b64encode(audio).decode("ascii")})
|
|
|
|
|
|
@app.post("/converse")
def http_converse():
    """Full turn: speech in -> transcript -> reply -> speech out.

    The request body is the raw WAV blob. Every 200 response carries the same
    keys (``transcript``, ``language``, ``reply``, ``error``, ``audio_b64``);
    an empty body yields 400.
    """
    raw = request.get_data()
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400
    stt = transcribe(raw)
    transcript = stt.get("text", "")
    if not transcript:
        # Nothing was recognized. Pass the STT error through so the caller can
        # tell "silence" apart from "speech-to-text is unavailable".
        return jsonify(
            {
                "transcript": "",
                "language": stt.get("language"),
                "reply": "",
                "error": stt.get("error"),
                "audio_b64": None,
            }
        )
    result = think(transcript, stt.get("language"))
    audio = synthesize(result.get("reply", ""))
    return jsonify(
        {
            "transcript": transcript,
            "language": stt.get("language"),
            "reply": result.get("reply", ""),
            "error": result.get("error"),
            "audio_b64": base64.b64encode(audio).decode("ascii") if audio else None,
        }
    )
|
|
|
|
|
|
def main():
    """Announce the listen address and run the Flask app on it."""
    address = f"http://{BRIDGE_HOST}:{BRIDGE_PORT}"
    print(f"[bridge] listening on {address}", flush=True)
    # threaded=True so STT (slow) on one request doesn't block /health, etc.
    app.run(threaded=True, host=BRIDGE_HOST, port=BRIDGE_PORT)
|
|
|
|
|
|
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|