Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/tests/performance/README.md
+++ b/tests/performance/README.md
@@ -0,0 +1,43 @@
+# Performance tests
+
+Per-context timings for the reply pipeline. Excluded from the default pytest run
+(see `pytest.ini`'s `addopts = -m "not performance"`).
+
+## Running
+
+```bash
+pytest tests/performance/ -v -m performance -s
+```
+
+The `-s` flag lets the report table print to stdout. Tests auto-skip when Ollama
+is unreachable, so the harness is safe to leave in the repo.
+
+## Env vars
+
+| Var | Default | Description |
+|-----|---------|-------------|
+| `JARVIS_PERF_OLLAMA_URL` | `http://localhost:11434` | Ollama endpoint |
+| `JARVIS_PERF_MODEL` | `gemma4:e2b` | Model pulled in Ollama for the run |
+| `JARVIS_PERF_RUNS` | `3` | Runs per query (bump for tighter p95) |
+| `JARVIS_PERF_REPORT_DIR` | `tests/performance/reports/` | JSON report output |
+
+`JARVIS_PERF_RUNS=3` is a fast-iteration default. For stable p95 numbers when
+benchmarking a change, use `JARVIS_PERF_RUNS=10` or higher.
+
+## What it measures
+
+- **`test_micro_benchmark_tiny_prompt`** — one warmup + N tiny round-trips.
+  Hardware baseline: the floor for every context's per-call cost.
+- **`test_pipeline_timings_by_context`** — three representative queries × N runs
+  of `run_reply_engine`, with per-context timings bucketed via stack-frame
+  inspection in [`timing_recorder.py`](timing_recorder.py).
+
+Shape invariants (not absolute numbers):
+- Evaluator p50 ≤ main chat turn p50 × 1.5.
+- Tool router p50 ≤ main chat turn p50 × 1.5.
+- Enrichment extractor shares the router model chain.
+
+Unmapped callers print as `other:<qualname>` — that's a signal to update the
+`_CALLER_TO_CONTEXT` map in `timing_recorder.py` alongside `docs/llm_contexts.md`.
+
+Reports are written to `reports/` and git-ignored.
--- a/tests/performance/__init__.py
+++ b/tests/performance/__init__.py
--- a/tests/performance/test_pipeline_timings.py
+++ b/tests/performance/test_pipeline_timings.py
@@ -0,0 +1,236 @@
+"""⏱️ Performance: time each LLM context in the reply pipeline.
+
+Runs ``run_reply_engine`` N times against a live Ollama with a fixed tiny
+prompt, records per-context timings via the monkey-patching recorder, and
+asserts a few relative-shape invariants so the test fails when the pipeline
+shape drifts (e.g. the evaluator becomes more expensive than the main turn).
+
+Also includes a micro-benchmark that calls the configured model
+(``JARVIS_PERF_MODEL``) with a tiny fixed prompt, giving a hardware baseline
+to diff against.
+
+Run manually:
+    pytest tests/performance/ -v -m performance -s
+
+Requires:
+    - Ollama reachable at http://localhost:11434
+    - ``gemma4:e2b`` pulled (or override via env var)
+
+The test is skipped automatically if Ollama is unreachable, so it's safe to
+leave in the repo. Use ``-s`` to see the report table.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import time
+from pathlib import Path
+
+import pytest
+import requests
+
+from tests.performance.timing_recorder import TimingRecorder
+
+
+# Harness settings, each overridable through a JARVIS_PERF_* environment
+# variable (defaults are the second argument of every lookup).
+OLLAMA_URL = os.getenv("JARVIS_PERF_OLLAMA_URL", "http://localhost:11434")
+PERF_MODEL = os.getenv("JARVIS_PERF_MODEL", "gemma4:e2b")
+PERF_RUNS = int(os.getenv("JARVIS_PERF_RUNS", "3"))
+PERF_REPORT_DIR = Path(
+    os.getenv(
+        "JARVIS_PERF_REPORT_DIR",
+        # Default: a "reports" directory next to this test module.
+        str(Path(__file__).with_name("reports")),
+    )
+)
+
+# Deliberately tiny prompts: the baseline is meant to capture per-call
+# overhead and model warmup cost, not the effect of prompt length.
+TINY_SYSTEM = "Reply with the single word OK."
+TINY_USER = "ping"
+
+# Small, shape-diverse queries that are fed through the full reply pipeline.
+PIPELINE_QUERIES = [
+    "hello",                      # pure chat, no tools needed
+    "what's 2 plus 3?",           # math, one-shot
+    "what time is it in Tokyo?",  # likely triggers a tool
+]
+
+
+def _ollama_reachable() -> bool:
+    """Return True when Ollama responds and ``PERF_MODEL`` is actually pulled.
+
+    Queries ``{OLLAMA_URL}/api/tags`` with a 2-second timeout and requires an
+    exact match on the model name *including its tag*. Matching only the part
+    before the colon would let e.g. a pulled ``gemma4:e4b`` satisfy a request
+    for ``gemma4:e2b``; the tests would then run and error out instead of
+    skipping. A model name given without a tag is compared as ``name:latest``.
+
+    Returns:
+        ``True`` if the endpoint answered 200 and lists the requested model,
+        ``False`` on any other status, on a missing model, or on any error.
+    """
+    # Only the last path segment can carry a tag ("host:port/model" must not
+    # be mistaken for a tagged name).
+    has_tag = ":" in PERF_MODEL.rsplit("/", 1)[-1]
+    wanted = PERF_MODEL if has_tag else f"{PERF_MODEL}:latest"
+    try:
+        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
+        if resp.status_code != 200:
+            return False
+        pulled = set()
+        for entry in resp.json().get("models", []):
+            pulled.add(entry.get("name", ""))
+            pulled.add(entry.get("model", ""))
+        return wanted in pulled
+    except Exception:
+        # Deliberately broad: this probe feeds a module-level skipif, so any
+        # failure here (network, bad JSON, unexpected payload shape) must
+        # turn into "skip" rather than break test collection.
+        return False
+
+
+# Module-wide pytest markers, applied to every test in this file:
+#   * ``performance`` tags the tests with the performance marker.
+#   * ``skipif`` skips them when the Ollama probe fails. Note that
+#     ``_ollama_reachable()`` is called eagerly while this list is built,
+#     i.e. once when the module is imported, not once per test.
+pytestmark = [
+    pytest.mark.performance,
+    pytest.mark.skipif(
+        not _ollama_reachable(),
+        reason=f"Ollama at {OLLAMA_URL} with {PERF_MODEL} not available",
+    ),
+]
+
+
+def _make_cfg():
+    """Build the ``MockConfig`` used by the pipeline benchmark.
+
+    Starts from ``evals.helpers.MockConfig`` and overrides the attributes
+    listed below, in order, so every model slot points at ``PERF_MODEL`` on
+    ``OLLAMA_URL``.
+    """
+    from evals.helpers import MockConfig
+
+    overrides = {
+        "ollama_base_url": OLLAMA_URL,
+        "ollama_chat_model": PERF_MODEL,
+        "intent_judge_model": PERF_MODEL,
+        # None lets the size-aware defaults kick in (evaluator + digests ON
+        # for small models).
+        "evaluator_enabled": None,
+        "memory_digest_enabled": None,
+        "tool_result_digest_enabled": None,
+        # Force the LLM-based router so its timing shows up in the report.
+        # MockConfig doesn't set this attribute, and the engine's default
+        # varies.
+        "tool_selection_strategy": "llm",
+        # Empty strings fall through the router chain.
+        "tool_router_model": "",
+        "evaluator_model": "",
+    }
+
+    cfg = MockConfig()
+    for attr, value in overrides.items():
+        setattr(cfg, attr, value)
+    return cfg
+
+
+def _write_report(rec: TimingRecorder, name: str) -> Path:
+    """Dump the recorder's summary and raw calls to a timestamped JSON file.
+
+    Args:
+        rec: Recorder whose ``calls`` and ``to_dict()`` summary are written.
+        name: Report name; used in the payload and as the filename prefix.
+
+    Returns:
+        Path of the written ``<name>-<YYYYmmdd-HHMMSS>.json`` file inside
+        ``PERF_REPORT_DIR`` (created if missing).
+    """
+    PERF_REPORT_DIR.mkdir(parents=True, exist_ok=True)
+    stamp = time.strftime("%Y%m%d-%H%M%S")
+
+    raw_rows = []
+    for call in rec.calls:
+        raw_rows.append({
+            "context": call.context,
+            "duration_sec": round(call.duration_sec, 4),
+            "model": call.model,
+            "prompt_chars": call.prompt_chars,
+            "response_chars": call.response_chars,
+        })
+
+    payload = {
+        "name": name,
+        "timestamp": stamp,
+        "model": PERF_MODEL,
+        "runs": PERF_RUNS,
+        "summary": rec.to_dict(),
+        "raw": raw_rows,
+    }
+
+    report_path = PERF_REPORT_DIR / f"{name}-{stamp}.json"
+    report_path.write_text(json.dumps(payload, indent=2))
+    return report_path
+
+
+# =============================================================================
+# Micro-benchmark: tiny fixed prompt per configured model
+# =============================================================================
+
+
+@pytest.mark.performance
+def test_micro_benchmark_tiny_prompt():
+    """Baseline: time a single tiny round-trip to Ollama.
+
+    One warmup call followed by ``PERF_RUNS`` further calls, all with the
+    same tiny prompt. This is the floor for every context's per-call cost;
+    it is reported separately from the pipeline test so hardware drift is
+    obvious in the numbers. The warmup call is recorded too, so the report
+    covers ``PERF_RUNS + 1`` calls.
+    """
+    # Import the module (not the function) so the attribute lookup happens at
+    # call time and sees the recorder's patch on jarvis.llm.
+    from jarvis import llm as _llm
+
+    tiny_call = {
+        "base_url": OLLAMA_URL,
+        "chat_model": PERF_MODEL,
+        "system_prompt": TINY_SYSTEM,
+        "user_content": TINY_USER,
+        "timeout_sec": 30.0,
+    }
+
+    with TimingRecorder() as rec:
+        # Warmup: the first call pays the weight-loading cost.
+        _llm.call_llm_direct(**tiny_call)
+        # Measured runs.
+        for _ in range(PERF_RUNS):
+            _llm.call_llm_direct(**tiny_call)
+
+    rec.print_report(title=f"Micro-benchmark — tiny prompt × {PERF_RUNS + 1} on {PERF_MODEL}")
+    report_path = _write_report(rec, "micro")
+    print(f"   📄 saved: {report_path}")
+
+    # Warm-vs-cold speed is too noisy to assert on; only check that every
+    # call (warmup included) was recorded.
+    assert len(rec.calls) == PERF_RUNS + 1
+
+
+# =============================================================================
+# Full pipeline: run_reply_engine × N, per-context timings
+# =============================================================================
+
+
+@pytest.mark.performance
+def test_pipeline_timings_by_context():
+    """Run the full reply pipeline and check relative-shape invariants.
+
+    Every query in ``PIPELINE_QUERIES`` gets a fresh in-memory database and
+    dialogue memory and is run ``PERF_RUNS`` times through
+    ``run_reply_engine``. The invariants are relative, not absolute:
+
+      1. If the evaluator fires, its p50 must not exceed the main chat
+         turn's p50 × 1.5 — otherwise we're paying more for the decision
+         than for the answer.
+      2. If the tool router fires, its p50 must not exceed the main chat
+         turn's p50 × 1.5 — it's a classification call.
+      3. If both the enrichment extractor and the tool router fire, they
+         must have used the same set of models (the extractor runs on the
+         router chain).
+
+    Invariants 1 and 2 are only checked when a main chat turn was recorded.
+    """
+    from jarvis.memory.db import Database
+    from jarvis.memory.conversation import DialogueMemory
+    from jarvis.reply.engine import run_reply_engine
+
+    cfg = _make_cfg()
+
+    with TimingRecorder() as rec:
+        for query in PIPELINE_QUERIES:
+            db = Database(":memory:", sqlite_vss_path=None)
+            dlg = DialogueMemory(inactivity_timeout=300, max_interactions=20)
+            try:
+                for _ in range(PERF_RUNS):
+                    run_reply_engine(db, cfg, None, query, dlg)
+            finally:
+                db.close()
+
+    rec.print_report(title=f"Pipeline timings — {len(PIPELINE_QUERIES)} queries × {PERF_RUNS} runs on {PERF_MODEL}")
+    report_path = _write_report(rec, "pipeline")
+    print(f"   📄 saved: {report_path}")
+
+    assert rec.calls, "no LLM calls recorded — pipeline did not invoke the LLM"
+
+    # Surface unmapped callers so new contexts show up in review.
+    unmapped = sorted({c.context for c in rec.calls if c.context.startswith("other:")})
+    if unmapped:
+        print(f"   ⚠️  unmapped callers (add to _CALLER_TO_CONTEXT): {unmapped}")
+
+    # Shape invariants 1 and 2: (context bucket, long label, short label).
+    cheaper_than_main = (
+        ("evaluator", "evaluator", "evaluator"),
+        ("tool_router", "tool router", "router"),
+    )
+    main_p50 = rec.p50("main_chat_turn")
+    if main_p50 > 0:
+        for ctx, label, short in cheaper_than_main:
+            ctx_p50 = rec.p50(ctx)
+            if ctx_p50 <= 0:
+                # Context never fired; nothing to compare.
+                continue
+            assert ctx_p50 <= main_p50 * 1.5, (
+                f"{label} p50 ({ctx_p50:.2f}s) exceeds main chat turn p50 "
+                f"({main_p50:.2f}s) by >50% — {short} should be cheaper"
+            )
+
+    # Invariant 3: enrichment extractor must use the router chain.
+    enrich_models = {c.model for c in rec.calls if c.context == "enrichment_extract"}
+    router_models = {c.model for c in rec.calls if c.context == "tool_router"}
+    if enrich_models and router_models:
+        assert enrich_models == router_models, (
+            f"enrichment extractor should share the router model chain "
+            f"(enrichment={enrich_models}, router={router_models})"
+        )
--- a/tests/performance/timing_recorder.py
+++ b/tests/performance/timing_recorder.py
@@ -0,0 +1,270 @@
+"""⏱️ LLM call timing recorder.
+
+Monkey-patches the three entry points in ``jarvis.llm`` (``call_llm_direct``,
+``call_llm_streaming``, ``chat_with_messages``) to record per-call timings
+grouped by the context that issued the call (evaluator, intent judge, tool
+router, etc.). The context is inferred from the caller's ``__qualname__`` on
+the Python call stack, so no instrumentation is needed at the call site.
+
+Usage:
+    with TimingRecorder() as rec:
+        run_reply_engine(...)
+    rec.print_report()
+    assert rec.p95("evaluator") < rec.p95("main_chat_turn")  # shape check
+"""
+
+from __future__ import annotations
+
+import sys
+import time
+import statistics
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import Callable, Optional
+
+from jarvis import llm as _llm_module
+
+
+# Map caller __qualname__ (or bare function name) → graph context name.
+# Matches the 13 contexts in docs/llm_contexts.md. A call whose stack
+# contains none of these names is bucketed as "other:<qualname>" so new call
+# sites that drift in without a doc update are visible in the report.
+#
+# ⚠️  This mapping mirrors docs/llm_contexts.md. When you add, remove, or
+# rename an LLM context per the CLAUDE.md rule, update both in the same PR
+# — the perf harness silently buckets unknown callers into "other:<qualname>"
+# so drift here is visible but not loud.
+_CALLER_TO_CONTEXT: dict[str, str] = {
+    # Context 1 — main chat loop uses chat_with_messages
+    "run_reply_engine": "main_chat_turn",
+    # Context 2 — intent judge (calls via internal helper)
+    "IntentJudge.evaluate": "intent_judge",
+    "IntentJudge._call_llm": "intent_judge",
+    # Context 3 — evaluator
+    "evaluate_turn": "evaluator",
+    # Context 4 — memory enrichment extractor
+    "extract_search_params_for_memory": "enrichment_extract",
+    # Context 5 — memory digest (per batch)
+    "_distil_batch": "memory_digest",
+    "digest_memory_for_query": "memory_digest",
+    # Context 6 — tool-result digest (per batch)
+    "_distil_tool_batch": "tool_result_digest",
+    "digest_tool_result_for_query": "tool_result_digest",
+    "_maybe_digest_tool_result": "tool_result_digest",
+    # Context 7 — max-turn loop digest
+    "digest_loop_for_max_turns": "max_turn_digest",
+    # Context 8 — tool router
+    # (Context 9 — tool searcher — reuses select_tools_with_llm so it falls
+    # under the same bucket; that's intentional per docs/llm_contexts.md.)
+    "select_tools_with_llm": "tool_router",
+    # Context 10 — conversation summariser
+    "generate_conversation_summary": "summariser",
+    # Context 11 — graph fact extraction
+    "extract_graph_memories": "graph_extract",
+    # Context 12 — graph best-child picker
+    "_llm_pick_best_child": "graph_best_child",
+    # Context 13 — tool-specific LLM calls
+    "_extract_place_from_user_text": "tool_weather",
+    "extract_and_log_meal": "tool_nutrition",
+    "generate_followups_for_meal": "tool_nutrition",
+}
+
+
+# One recorded LLM call, as appended by the recorder's wrapper.
+@dataclass
+class _Call:
+    # Context bucket inferred from the call stack (a _CALLER_TO_CONTEXT value
+    # or "other:<qualname>").
+    context: str
+    # Wall-clock seconds spent inside the original entry point, measured
+    # with time.perf_counter().
+    duration_sec: float
+    # The chat_model argument of the call, stringified ("" when absent).
+    model: str
+    # Characters sent: summed message contents for chat_with_messages,
+    # system prompt + user content for the other entry points.
+    prompt_chars: int
+    # Characters returned: len() of a str result, len of the "content" value
+    # of a dict result, 0 for anything else.
+    response_chars: int
+
+
+@dataclass
+class TimingRecorder:
+    """Record wall-clock timings of LLM calls, bucketed by calling context.
+
+    Used as a context manager. Entering patches the three ``jarvis.llm``
+    entry points — on ``jarvis.llm`` itself and on every already-imported
+    ``jarvis``/``tests``/``evals`` module that bound them by name. Exiting
+    restores every binding. Each call that returns normally is appended to
+    ``calls``; a call that raises is not recorded.
+    """
+
+    # One entry per completed LLM call, in completion order.
+    calls: list[_Call] = field(default_factory=list)
+    # Patch bookkeeping: ``_patch`` stores (module, name, original) tuples
+    # under the "_sites" key so ``_unpatch`` can restore them.
+    _originals: dict = field(default_factory=dict)
+
+    def __enter__(self) -> "TimingRecorder":
+        self._patch()
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        # Returns None, so an exception raised inside the block propagates.
+        self._unpatch()
+
+    # ── context inference ────────────────────────────────────────────────
+    @staticmethod
+    def _candidate_names(frame, qual: str):
+        """Yield the names under which ``frame`` may appear in the context map.
+
+        Order: the full qualname, the bare function name, then
+        ``ClassName.function`` for every class in the MRO of the frame's
+        ``self`` (or ``cls``). The last form makes ``"Class.method"`` keys
+        match even when ``qual`` is only a bare name (Python < 3.11 has no
+        ``co_qualname``) or carries a ``<locals>`` prefix.
+        """
+        yield qual
+        # qualname can be e.g. ClassName.method — strip the class part.
+        bare = qual.rsplit(".", 1)[-1]
+        yield bare
+        owner = frame.f_locals.get("self")
+        klass = type(owner) if owner is not None else frame.f_locals.get("cls")
+        if isinstance(klass, type):
+            for base in klass.__mro__:
+                yield f"{base.__name__}.{bare}"
+
+    @staticmethod
+    def _infer_context(skip_frames: int = 2) -> str:
+        """Return the context name of the nearest known caller on the stack.
+
+        Walks outward from the frame ``skip_frames`` levels above this
+        function (2 steps over this function and the wrapper that calls it)
+        and returns the mapped context of the first frame that matches
+        ``_CALLER_TO_CONTEXT``. Falls back to ``"other:<qualname>"`` — using
+        the nearest frame that is not a ``<...>`` pseudo-frame — so drift
+        shows up in the report; ``"other:unknown"`` if there is no such frame.
+        """
+        frame = sys._getframe(skip_frames)
+        first_unknown: Optional[str] = None
+        while frame is not None:
+            code = frame.f_code
+            # co_qualname exists from Python 3.11; older versions only have
+            # the bare co_name.
+            qual = getattr(code, "co_qualname", code.co_name)
+            for candidate in TimingRecorder._candidate_names(frame, qual):
+                if candidate in _CALLER_TO_CONTEXT:
+                    return _CALLER_TO_CONTEXT[candidate]
+            if first_unknown is None and not qual.startswith(("<", "_patch", "_unpatch")):
+                first_unknown = qual
+            frame = frame.f_back
+        return f"other:{first_unknown or 'unknown'}"
+
+    # ── patching ─────────────────────────────────────────────────────────
+    def _wrap(self, name: str, original: Callable) -> Callable:
+        """Return a timing wrapper around ``original``.
+
+        Args:
+            name: Entry-point name; selects how the prompt size is computed.
+            original: The unpatched function to delegate to.
+
+        The wrapper forwards all arguments unchanged, returns the original's
+        result, and appends one ``_Call`` per successful call.
+        """
+        def wrapped(*args, **kwargs):
+            ctx = self._infer_context(skip_frames=2)
+            # Extract model + prompt sizes from args heuristically — all three
+            # entry points take (base_url, chat_model, ...).
+            model = kwargs.get("chat_model") or (args[1] if len(args) > 1 else "")
+            prompt_chars = 0
+            if name == "chat_with_messages":
+                # chat_with_messages takes a messages list as third argument.
+                msgs = kwargs.get("messages") or (args[2] if len(args) > 2 else [])
+                if isinstance(msgs, list):
+                    # Skip non-dict entries: the harness must never crash
+                    # the call it is measuring over an unexpected message.
+                    prompt_chars = sum(
+                        len(str(m.get("content", "")))
+                        for m in msgs
+                        if isinstance(m, dict)
+                    )
+            else:
+                sys_p = kwargs.get("system_prompt") or (args[2] if len(args) > 2 else "")
+                user_c = kwargs.get("user_content") or (args[3] if len(args) > 3 else "")
+                prompt_chars = len(str(sys_p)) + len(str(user_c))
+
+            t0 = time.perf_counter()
+            result = original(*args, **kwargs)
+            elapsed = time.perf_counter() - t0
+
+            # response size: str for direct/streaming, dict for chat_with_messages
+            if isinstance(result, str):
+                response_chars = len(result)
+            elif isinstance(result, dict):
+                response_chars = len(str(result.get("content", "")))
+            else:
+                response_chars = 0
+
+            self.calls.append(_Call(
+                context=ctx,
+                duration_sec=elapsed,
+                model=str(model),
+                prompt_chars=prompt_chars,
+                response_chars=response_chars,
+            ))
+            return result
+
+        return wrapped
+
+    def _patch(self) -> None:
+        """Patch every module that has already imported one of the LLM entry
+        points via ``from ..llm import X``. Those bindings were resolved at
+        import time and do NOT see a setattr on ``jarvis.llm`` itself, so we
+        have to replace the attribute on each importer.
+        """
+        names = ("call_llm_direct", "call_llm_streaming", "chat_with_messages")
+        # Capture the originals from the llm module once.
+        originals = {n: getattr(_llm_module, n) for n in names}
+        # "_sites" stores (module, name, original_fn) tuples so _unpatch
+        # can put each binding back exactly where it came from.
+        sites = self._originals["_sites"] = []
+        # Snapshot the module table: it must not change size mid-iteration.
+        for mod in list(sys.modules.values()):
+            if mod is None or mod is _llm_module:
+                continue
+            mod_name = getattr(mod, "__name__", "")
+            if not mod_name.startswith(("jarvis", "tests", "evals")):
+                continue
+            for name in names:
+                # Only rebind names that still point at the original function.
+                if getattr(mod, name, None) is originals[name]:
+                    setattr(mod, name, self._wrap(name, originals[name]))
+                    sites.append((mod, name, originals[name]))
+        # Also patch the canonical module so any late `from jarvis.llm import X`
+        # after we enter the context sees the wrapper.
+        for name in names:
+            setattr(_llm_module, name, self._wrap(name, originals[name]))
+            sites.append((_llm_module, name, originals[name]))
+
+    def _unpatch(self) -> None:
+        """Restore every binding replaced by ``_patch`` and forget the sites."""
+        for mod, name, original in self._originals.get("_sites", []):
+            setattr(mod, name, original)
+        self._originals.clear()
+
+    # ── queries ──────────────────────────────────────────────────────────
+    def by_context(self) -> dict[str, list[_Call]]:
+        """Group recorded calls by context, preserving recording order."""
+        out: dict[str, list[_Call]] = {}
+        for c in self.calls:
+            out.setdefault(c.context, []).append(c)
+        return out
+
+    def durations(self, context: str) -> list[float]:
+        """Return the durations (seconds) of every call in ``context``."""
+        return [c.duration_sec for c in self.calls if c.context == context]
+
+    def p50(self, context: str) -> float:
+        """Return the median duration for ``context``; 0.0 if it never fired."""
+        ds = self.durations(context)
+        return statistics.median(ds) if ds else 0.0
+
+    def p95(self, context: str) -> float:
+        """Return the 95th-percentile duration for ``context``.
+
+        Picks the sorted sample at index ``round(0.95 * (n - 1))`` (no
+        interpolation). Returns 0.0 when the context never fired.
+        """
+        ds = self.durations(context)
+        if not ds:
+            return 0.0
+        if len(ds) == 1:
+            return ds[0]
+        ds_sorted = sorted(ds)
+        idx = max(0, int(round(0.95 * (len(ds_sorted) - 1))))
+        return ds_sorted[idx]
+
+    def total(self, context: Optional[str] = None) -> float:
+        """Return summed duration for ``context``, or for all calls if None."""
+        if context is None:
+            return sum(c.duration_sec for c in self.calls)
+        return sum(c.duration_sec for c in self.calls if c.context == context)
+
+    # ── reporting ────────────────────────────────────────────────────────
+    def to_dict(self) -> dict:
+        """Return a JSON-serialisable summary: totals plus per-context stats."""
+        buckets = self.by_context()
+        return {
+            "total_calls": len(self.calls),
+            "total_sec": round(self.total(), 3),
+            "contexts": {
+                ctx: {
+                    "calls": len(calls),
+                    "total_sec": round(sum(c.duration_sec for c in calls), 3),
+                    "p50_sec": round(self.p50(ctx), 3),
+                    "p95_sec": round(self.p95(ctx), 3),
+                    "avg_prompt_chars": int(statistics.mean(c.prompt_chars for c in calls)) if calls else 0,
+                    "avg_response_chars": int(statistics.mean(c.response_chars for c in calls)) if calls else 0,
+                    "models": sorted({c.model for c in calls if c.model}),
+                }
+                for ctx, calls in buckets.items()
+            },
+        }
+
+    def print_report(self, title: str = "LLM pipeline timings") -> None:
+        """Print a per-context table to stdout, most expensive context first."""
+        print(f"\n⏱️  {title}")
+        print(f"   total calls: {len(self.calls)}   total wall time: {self.total():.2f}s")
+        rows = sorted(
+            self.by_context().items(),
+            key=lambda kv: -sum(c.duration_sec for c in kv[1]),
+        )
+        header = f"   {'context':<22} {'n':>3}  {'total':>7}  {'p50':>6}  {'p95':>6}  {'prompt':>7}  model"
+        print(header)
+        print("   " + "-" * (len(header) - 3))
+        for ctx, calls in rows:
+            total = sum(c.duration_sec for c in calls)
+            print(
+                f"   {ctx:<22} {len(calls):>3}  "
+                f"{total:>6.2f}s  {self.p50(ctx):>5.2f}s  {self.p95(ctx):>5.2f}s  "
+                f"{int(statistics.mean(c.prompt_chars for c in calls)):>7}  "
+                f"{','.join(sorted({c.model for c in calls if c.model}))}"
+            )
+
+
+@contextmanager
+def record_timings():
+    """Yield a fresh ``TimingRecorder`` that is active for the ``with`` body."""
+    with TimingRecorder() as rec:
+        yield rec