# javis_bot/bridge/server.py

"""
Jarvis Brain Bridge
===================

A thin local HTTP service that exposes the existing Jarvis "brain"
(speech-to-text + reply engine + text-to-speech) to the Node/bun Discord bot.

The Discord layer (``bot/``) is responsible for everything Discord-specific:
joining voice channels, capturing user audio, playing audio back, slash
commands, and streaming the VNC screen. It does NOT contain any AI logic.
Instead it calls this bridge:

    POST /converse        (raw wav body)   -> { transcript, language, reply, error, audio_b64 }
    POST /text            (json {text})    -> { reply, audio_b64 }
    POST /stt             (raw wav body)   -> { text, language }
    POST /tts             (json {text})    -> { audio_b64 }
    GET  /health                            -> { ok, brain_enabled, brain_ready, brain_error, tts_enabled }

This keeps the mature ~39k-line Python brain intact while letting Node own the
Discord/voice/video integration (which is only feasible in the Node ecosystem).

Run:
    python -m bridge.server          # from repo root
    # or
    BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 python bridge/server.py
"""

from __future__ import annotations

import base64
import io
import os
import sys
import threading
import wave
from pathlib import Path
from typing import Optional

# Ensure repo-root/src is importable (jarvis package lives in src/jarvis)
# Two levels up from this file is the repository root; its "src" directory
# is what has to be on the import search path.
_REPO_ROOT = Path(__file__).resolve().parent.parent
_SRC = _REPO_ROOT.joinpath("src")
# Put it first on sys.path, but only if it is not already listed.
if sys.path.count(str(_SRC)) == 0:
    sys.path.insert(0, str(_SRC))

from flask import Flask, request, jsonify

# Flask application object; the @app.get / @app.post handlers further down
# register their routes on it, and main() runs it.
app = Flask(__name__)

# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------
def _env_flag(name: str, default: str = "1") -> bool:
    """Read a boolean switch from the environment.

    The flag is considered OFF only when the variable (or *default* when it
    is unset) is one of ``0``, ``false``, ``no`` or ``off``. The comparison
    ignores case and surrounding whitespace, so ``FALSE`` or ``" 0 "`` turn
    the feature off as well. Any other value, including the empty string,
    leaves it ON.
    """
    value = os.environ.get(name, default)
    return value.strip().lower() not in ("0", "false", "no", "off")


# Address and port the HTTP server binds to.
BRIDGE_HOST = os.environ.get("BRIDGE_HOST", "127.0.0.1")
BRIDGE_PORT = int(os.environ.get("BRIDGE_PORT", "8765"))
# Feature switches; both default to enabled.
BRAIN_ENABLED = _env_flag("JARVIS_BRAIN_ENABLED")
TTS_ENABLED = _env_flag("JARVIS_TTS_ENABLED")

# ---------------------------------------------------------------------------
# Lazy singletons. The first request pays the model-load cost; afterwards the
# brain stays warm. A lock guards initialization so concurrent Discord events
# don't double-load Whisper.
# ---------------------------------------------------------------------------
# One lock shared by _ensure_brain() and _ensure_piper(), so brain and TTS
# initialization are serialized against each other as well as against
# themselves.
_init_lock = threading.Lock()
# The next four are filled in together by _ensure_brain(); they stay None
# until that initialization succeeds.
_cfg = None  # settings object returned by jarvis.config.load_settings()
_db = None  # jarvis.memory.db.Database instance
_dialogue_memory = None  # jarvis.memory.conversation.DialogueMemory instance
_whisper = None  # faster_whisper.WhisperModel used by transcribe()
# Filled in by _ensure_piper(); stays None while TTS is disabled or the voice
# failed to load.
_piper_voice = None
# "<ExceptionType>: <message>" of a failed brain initialization. Once set,
# _ensure_brain() returns immediately and does not retry.
_brain_error: Optional[str] = None


def _ensure_brain():
    """Initialize cfg, db, dialogue memory, and Whisper exactly once.

    Cheap to call on every request: when the brain is already loaded (or a
    previous attempt failed) it returns without taking the lock. Otherwise
    the first caller, under ``_init_lock``:

    * loads settings with ``load_settings()``,
    * opens ``Database(cfg.db_path, cfg.sqlite_vss_path)``,
    * creates a ``DialogueMemory`` (timeout from ``cfg.dialogue_memory_timeout``
      when present, else 300.0; at most 20 interactions),
    * loads ``WhisperModel`` on the device / compute type given by the
      ``WHISPER_DEVICE`` / ``WHISPER_COMPUTE_TYPE`` environment variables
      (both default to ``"auto"``), retrying on CPU with ``int8`` if that
      fails.

    Nothing is raised to the caller. On failure the error text is stored in
    ``_brain_error`` and later calls return immediately without retrying.
    """
    global _cfg, _db, _dialogue_memory, _whisper, _brain_error
    if _cfg is not None or _brain_error is not None:
        return
    with _init_lock:
        # Re-check under the lock: another thread may have finished (or
        # failed) while this one was waiting.
        if _cfg is not None or _brain_error is not None:
            return
        try:
            from jarvis.config import load_settings
            from jarvis.memory.db import Database
            from jarvis.memory.conversation import DialogueMemory
            from faster_whisper import WhisperModel

            cfg = load_settings()
            db = Database(cfg.db_path, cfg.sqlite_vss_path)
            dialogue_memory = DialogueMemory(
                inactivity_timeout=getattr(cfg, "dialogue_memory_timeout", 300.0),
                max_interactions=20,
            )
            device = os.environ.get("WHISPER_DEVICE", "auto")
            compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto")
            try:
                whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute)
            except Exception as ge:
                # GPU not available / unsupported -> fall back to CPU so the
                # bridge still works without a GPU passed to the container.
                if device != "cpu":
                    print(f"[bridge] whisper device='{device}' failed ({ge}); falling back to CPU", flush=True)
                    whisper = WhisperModel(cfg.whisper_model, device="cpu", compute_type="int8")
                else:
                    raise

            # Publish the dependants first and ``_cfg`` LAST. The lock-free
            # fast path above treats a non-None ``_cfg`` as "everything is
            # ready", so a concurrent caller must never observe ``_cfg`` set
            # while ``_whisper`` / ``_db`` / ``_dialogue_memory`` are still
            # None.
            _db = db
            _dialogue_memory = dialogue_memory
            _whisper = whisper
            _cfg = cfg
            print(f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})", flush=True)
        except Exception as e:  # pragma: no cover - depends on local models
            _brain_error = f"{type(e).__name__}: {e}"
            print(f"[bridge] brain init FAILED: {_brain_error}", flush=True)


def _ensure_piper():
    """Load the Piper TTS voice on first use (does not touch the brain).

    Does nothing when TTS is disabled or a voice is already loaded. The model
    path comes from the ``TTS_PIPER_MODEL_PATH`` environment variable, or
    from jarvis' default location when that is unset. Any failure is printed
    and swallowed, leaving ``_piper_voice`` as None.
    """
    global _piper_voice
    if not TTS_ENABLED or _piper_voice is not None:
        return
    with _init_lock:
        # Only load if no other thread did so while this one waited.
        if _piper_voice is None:
            try:
                from piper import PiperVoice  # piper-tts package

                model_path = os.environ.get("TTS_PIPER_MODEL_PATH")
                if not model_path:
                    # No explicit path: ask jarvis for its default model.
                    from jarvis.output.tts import _get_default_piper_model_path  # type: ignore

                    model_path = _get_default_piper_model_path()
                model_present = bool(model_path) and Path(model_path).exists()
                if not model_present:
                    raise FileNotFoundError(
                        f"Piper voice model not found at '{model_path}'. "
                        f"Set TTS_PIPER_MODEL_PATH in .env or run scripts/setup_models.sh"
                    )
                _piper_voice = PiperVoice.load(model_path)
                print(f"[bridge] piper TTS ready ({model_path})", flush=True)
            except Exception as e:  # pragma: no cover
                print(f"[bridge] piper init failed (TTS disabled): {e}", flush=True)


# ---------------------------------------------------------------------------
# Core operations
# ---------------------------------------------------------------------------
def _read_wav_pcm(raw: bytes) -> tuple[bytes, int]:
    """Decode an incoming WAV blob to mono 16-bit PCM at its own sample rate.

    The WAV header is honoured rather than assumed: 8/24/32-bit samples are
    converted to 16-bit and multi-channel audio is downmixed by averaging the
    channels of each frame. Without this, a stereo or non-16-bit upload would
    be misread by callers that interpret the result as mono int16.

    Args:
        raw: Complete WAV file contents.

    Returns:
        ``(pcm, sample_rate)`` where ``pcm`` is little-endian signed 16-bit
        mono PCM and ``sample_rate`` is taken from the WAV header (no
        resampling happens here).

    Raises:
        wave.Error: ``raw`` is not a WAV file the ``wave`` module can read,
            or its sample width is not 1, 2, 3 or 4 bytes.
        EOFError: ``raw`` is too short to contain a WAV header.
    """
    with wave.open(io.BytesIO(raw), "rb") as wf:
        sr = wf.getframerate()
        channels = wf.getnchannels()
        width = wf.getsampwidth()
        frames = wf.readframes(wf.getnframes())

    # A truncated upload can end mid-frame; drop the partial frame so the
    # byte count is an exact multiple of the frame size.
    frame_bytes = width * channels
    if frame_bytes:
        frames = frames[: len(frames) - len(frames) % frame_bytes]

    # Already in the target format: nothing to convert.
    if width == 2 and channels == 1:
        return frames, sr

    import numpy as np

    if width == 1:
        # 8-bit WAV samples are unsigned with a 128 bias.
        samples = (np.frombuffer(frames, dtype=np.uint8).astype(np.int32) - 128) * 256
    elif width == 2:
        samples = np.frombuffer(frames, dtype="<i2").astype(np.int32)
    elif width == 3:
        # Keep the two most significant bytes of each little-endian 24-bit
        # sample, then sign-extend the resulting 16-bit value.
        triplets = np.frombuffer(frames, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
        samples = triplets[:, 1] | (triplets[:, 2] << 8)
        samples = (samples ^ 0x8000) - 0x8000
    elif width == 4:
        samples = (np.frombuffer(frames, dtype="<i4") >> 16).astype(np.int32)
    else:
        raise wave.Error(f"unsupported WAV sample width: {width} bytes")

    if channels > 1:
        # Samples are interleaved per frame; average each frame's channels.
        samples = samples.reshape(-1, channels).sum(axis=1) // channels

    return samples.astype("<i2").tobytes(), sr


def transcribe(wav_bytes: bytes) -> dict:
    """Run speech-to-text on a WAV blob.

    Args:
        wav_bytes: Complete WAV file contents.

    Returns:
        ``{"text": ..., "language": ...}`` on success. When STT is
        unavailable or the audio cannot be decoded, ``text`` is empty,
        ``language`` is None and an ``"error"`` key explains why. Audio that
        decodes to zero samples yields empty text without an error.
    """
    _ensure_brain()
    if _whisper is None:
        return {"text": "", "language": None, "error": _brain_error or "stt unavailable"}
    import numpy as np

    # Bad uploads are a client problem, not a server crash: report them in
    # the same error-dict shape used above instead of letting them escape.
    try:
        pcm, sr = _read_wav_pcm(wav_bytes)
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    except (wave.Error, EOFError, ValueError) as e:
        return {"text": "", "language": None, "error": f"invalid WAV: {type(e).__name__}: {e}"}
    if sr <= 0:
        # The resampling ratio below divides by the sample rate.
        return {"text": "", "language": None, "error": f"invalid WAV: bad sample rate {sr}"}
    if not audio.size:
        # Nothing to transcribe; skip the model call entirely.
        return {"text": "", "language": None}

    # faster-whisper expects 16kHz mono float32; linearly resample if needed.
    if sr != 16000:
        n_out = int(round(audio.size * 16000 / sr))
        if n_out > 0:
            x_old = np.linspace(0.0, 1.0, num=audio.size, endpoint=False)
            x_new = np.linspace(0.0, 1.0, num=n_out, endpoint=False)
            audio = np.interp(x_new, x_old, audio).astype(np.float32)
    segments, info = _whisper.transcribe(audio, beam_size=1)
    text = "".join(seg.text for seg in segments).strip()
    return {"text": text, "language": getattr(info, "language", None)}


def think(text: str, language: Optional[str] = None) -> dict:
    """Produce a reply to *text* with the Jarvis reply engine.

    Returns a dict that always has a ``"reply"`` string. An ``"error"`` key
    is added when the brain is disabled (the input is echoed back as the
    reply), when it failed to initialize, or when the engine raised.
    """
    if not BRAIN_ENABLED:
        return {"reply": text, "error": "brain disabled (JARVIS_BRAIN_ENABLED=0)"}

    _ensure_brain()
    if _cfg is None:
        return {"reply": "", "error": _brain_error or "brain unavailable"}

    try:
        from jarvis.reply.engine import run_reply_engine

        # The third argument is the engine's tts handle. It is None because
        # audio is synthesized separately for Discord; this process has no
        # local speaker for the engine to talk to.
        engine_output = run_reply_engine(
            _db, _cfg, None, text, _dialogue_memory, language=language
        )
        answer = engine_output.strip() if engine_output else ""
        if answer:
            # Only non-empty replies are recorded in the dialogue memory.
            _dialogue_memory.add_interaction(text, answer)
    except Exception as e:  # pragma: no cover
        return {"reply": "", "error": f"{type(e).__name__}: {e}"}
    return {"reply": answer}


def synthesize(text: str) -> Optional[bytes]:
    """Synthesize *text* to WAV bytes using Piper.

    Returns:
        The complete WAV file as bytes, or None when there is nothing to
        speak (empty / whitespace-only / None text), TTS is disabled, the
        Piper voice could not be loaded, or synthesis itself failed.

    Speech is best-effort: a synthesis failure is printed and reported as
    None rather than raised, so callers that also carry a textual reply can
    still return it.
    """
    if not TTS_ENABLED or not text or not text.strip():
        return None
    _ensure_piper()
    if _piper_voice is None:
        return None
    buf = io.BytesIO()
    try:
        with wave.open(buf, "wb") as wf:
            # piper-tts API: synthesize_wav(text, wav_file) writes a full WAV;
            # plain synthesize() returns AudioChunks and takes a SynthesisConfig
            # (NOT a wav file) as its 2nd arg.
            _piper_voice.synthesize_wav(text, wf)
    except Exception as e:  # pragma: no cover - depends on the local voice
        print(f"[bridge] piper synthesis failed: {type(e).__name__}: {e}", flush=True)
        return None
    return buf.getvalue()


# ---------------------------------------------------------------------------
# HTTP endpoints
# ---------------------------------------------------------------------------
@app.get("/health")
def health():
    """Report bridge status without triggering any model loading.

    ``brain_ready`` is true once the brain has been initialized;
    ``brain_error`` carries the stored initialization error, if any.
    """
    status = {
        "ok": True,
        "brain_enabled": BRAIN_ENABLED,
        "brain_ready": _cfg is not None,
        "brain_error": _brain_error,
        "tts_enabled": TTS_ENABLED,
    }
    return jsonify(status)


@app.post("/stt")
def http_stt():
    """Speech-to-text endpoint.

    Accepts the WAV either as the raw request body or as the first file part
    of a ``multipart/form-data`` upload. Responds 400 when no audio bytes
    were sent; otherwise returns the ``transcribe()`` result as JSON.
    """
    # Only touch request.files for genuine multipart requests: parsing form
    # data consumes the stream, which would leave nothing for get_data() on
    # a raw upload.
    if request.mimetype == "multipart/form-data":
        upload = next(iter(request.files.values()), None)
        raw = upload.read() if upload is not None else b""
    else:
        raw = request.get_data()
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400
    return jsonify(transcribe(raw))


@app.post("/text")
def http_text():
    """Text-in endpoint: JSON ``{"text": ..., "language"?: ...}`` -> reply.

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string. Otherwise returns the ``think()`` result, with
    ``audio_b64`` (base64 WAV) added when speech could be synthesized.
    """
    data = request.get_json(silent=True)
    # A JSON array / number / string body has no .get(); treat it like a
    # missing field instead of crashing with a 500.
    if not isinstance(data, dict):
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    result = think(text, data.get("language"))
    audio = synthesize(result.get("reply", ""))
    if audio:
        result["audio_b64"] = base64.b64encode(audio).decode("ascii")
    return jsonify(result)


@app.post("/tts")
def http_tts():
    """Text-to-speech endpoint: JSON ``{"text": ...}`` -> ``{"audio_b64"}``.

    Responds 400 when the body is not a JSON object or ``text`` is missing,
    blank, or not a string, and 503 when no audio could be produced.
    """
    data = request.get_json(silent=True)
    # A JSON array / number / string body has no .get(); treat it like a
    # missing field instead of crashing with a 500.
    if not isinstance(data, dict):
        data = {}
    text = data.get("text")
    text = text.strip() if isinstance(text, str) else ""
    if not text:
        return jsonify({"error": "missing 'text'"}), 400
    audio = synthesize(text)
    if not audio:
        return jsonify({"error": "tts unavailable"}), 503
    return jsonify({"audio_b64": base64.b64encode(audio).decode("ascii")})


@app.post("/converse")
def http_converse():
    """Full turn: speech in -> transcript -> reply -> speech out.

    Accepts the WAV either as the raw request body or as the first file part
    of a ``multipart/form-data`` upload. Responds 400 when no audio bytes
    were sent. The JSON response always has ``transcript``, ``language``,
    ``reply``, ``error`` and ``audio_b64``; ``error`` and ``audio_b64`` are
    null when not applicable.
    """
    # Only touch request.files for genuine multipart requests: parsing form
    # data consumes the stream, which would leave nothing for get_data() on
    # a raw upload.
    if request.mimetype == "multipart/form-data":
        upload = next(iter(request.files.values()), None)
        raw = upload.read() if upload is not None else b""
    else:
        raw = request.get_data()
    if not raw:
        return jsonify({"error": "empty body; send a WAV blob"}), 400
    stt = transcribe(raw)
    transcript = stt.get("text", "")
    if not transcript:
        # Nothing was heard, or STT failed. Pass the STT error along so the
        # client can tell the two cases apart.
        return jsonify(
            {
                "transcript": "",
                "language": stt.get("language"),
                "reply": "",
                "error": stt.get("error"),
                "audio_b64": None,
            }
        )
    result = think(transcript, stt.get("language"))
    audio = synthesize(result.get("reply", ""))
    return jsonify(
        {
            "transcript": transcript,
            "language": stt.get("language"),
            "reply": result.get("reply", ""),
            "error": result.get("error"),
            "audio_b64": base64.b64encode(audio).decode("ascii") if audio else None,
        }
    )


def main():
    """Announce the listen address and run the Flask server (blocking)."""
    host, port = BRIDGE_HOST, BRIDGE_PORT
    print(f"[bridge] listening on http://{host}:{port}", flush=True)
    # Threaded mode keeps a slow request (e.g. STT) from blocking quick ones
    # such as /health.
    app.run(host=host, port=port, threaded=True)


# Start the server only when executed as a script / with ``python -m``.
if __name__ == "__main__":
    main()