Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

View File

@@ -0,0 +1,43 @@
# Performance tests
Per-context timings for the reply pipeline. Excluded from the default pytest run
(see `pytest.ini`'s `addopts = -m "not performance"`).
## Running
```bash
pytest tests/performance/ -v -m performance -s
```
The `-s` flag lets the report table print to stdout. Tests auto-skip when Ollama
is unreachable, so the harness is safe to leave in the repo.
## Env vars
| Var | Default | Description |
|-----|---------|-------------|
| `JARVIS_PERF_OLLAMA_URL` | `http://localhost:11434` | Ollama endpoint |
| `JARVIS_PERF_MODEL` | `gemma4:e2b` | Model pulled in Ollama for the run |
| `JARVIS_PERF_RUNS` | `3` | Runs per query (bump for tighter p95) |
| `JARVIS_PERF_REPORT_DIR` | `tests/performance/reports/` | JSON report output |
`JARVIS_PERF_RUNS=3` is a fast-iteration default. For stable p95 numbers when
benchmarking a change, use `JARVIS_PERF_RUNS=10` or higher.
## What it measures
- **`test_micro_benchmark_tiny_prompt`** — one warmup + N tiny round-trips.
Hardware baseline: the floor for every context's per-call cost.
- **`test_pipeline_timings_by_context`** — three representative queries × N runs
of `run_reply_engine`, with per-context timings bucketed via stack-frame
inspection in [`timing_recorder.py`](timing_recorder.py).
Shape invariants (not absolute numbers):
- Evaluator p50 ≤ main chat turn p50 × 1.5.
- Tool router p50 ≤ main chat turn p50 × 1.5.
- Enrichment extractor shares the router model chain.
Unmapped callers print as `other:<qualname>` — that's a signal to update the
`_CALLER_TO_CONTEXT` map in `timing_recorder.py` alongside `docs/llm_contexts.md`.
Reports are written to `reports/` and git-ignored.

View File

View File

@@ -0,0 +1,236 @@
"""⏱️ Performance: time each LLM context in the reply pipeline.
Runs ``run_reply_engine`` N times per query against a live Ollama, using a
small fixed set of representative queries, records per-context timings via the
monkey-patching recorder, and asserts a few relative-shape invariants so the
test fails when the pipeline shape drifts (e.g. the evaluator becomes more
expensive than the main turn).
Also includes a micro-benchmark that calls the configured model
(``JARVIS_PERF_MODEL``) with a tiny fixed prompt, giving a hardware baseline
to diff against.
Run manually:
pytest tests/performance/ -v -m performance -s
Requires:
- Ollama reachable at http://localhost:11434
- ``gemma4:e2b`` pulled (or override via env var)
The test is skipped automatically if Ollama is unreachable, so it's safe to
leave in the repo. Use ``-s`` to see the report table.
"""
from __future__ import annotations
import json
import os
import time
from pathlib import Path
import pytest
import requests
from tests.performance.timing_recorder import TimingRecorder
# Runtime knobs. Each one can be overridden through a JARVIS_PERF_* env var.
OLLAMA_URL = os.getenv("JARVIS_PERF_OLLAMA_URL", "http://localhost:11434")
PERF_MODEL = os.getenv("JARVIS_PERF_MODEL", "gemma4:e2b")
PERF_RUNS = int(os.getenv("JARVIS_PERF_RUNS", "3"))
# JSON reports default to a "reports" directory next to this file.
PERF_REPORT_DIR = Path(
    os.getenv("JARVIS_PERF_REPORT_DIR", str(Path(__file__).parent / "reports"))
)

# The baseline exists to measure per-call overhead and model warmup cost,
# not prompt-length effects, so the prompts are deliberately tiny and fixed.
TINY_SYSTEM = "Reply with the single word OK."
TINY_USER = "ping"

# Queries fed to the full reply pipeline. Small, and chosen to differ in shape.
PIPELINE_QUERIES = [
    "hello",  # pure chat, no tools needed
    "what's 2 plus 3?",  # math, one-shot
    "what time is it in Tokyo?",  # likely triggers a tool
]
def _ollama_reachable() -> bool:
    """Return True when Ollama answers and ``PERF_MODEL`` itself is pulled.

    Queries ``{OLLAMA_URL}/api/tags`` and looks for the exact ``name:tag`` of
    ``PERF_MODEL`` in the listed models. A ``PERF_MODEL`` without a tag is
    treated as ``<name>:latest``. Matching only the family prefix would let a
    different size or variant of the same family satisfy the check, and the
    tests would then fail on a missing model instead of being skipped.

    Returns:
        False on any failure (connection error, non-200 status, malformed
        payload) or when the model is not listed; True otherwise.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
        if resp.status_code != 200:
            return False
        installed = {m.get("name", "") for m in resp.json().get("models", [])}
    except Exception:
        # Deliberately broad: this runs while the module is being imported to
        # compute a skip condition, so any probe failure must mean "skip"
        # rather than a collection error.
        return False
    wanted = PERF_MODEL if ":" in PERF_MODEL else f"{PERF_MODEL}:latest"
    return wanted in installed
# Module-level marks (pytest reads the ``pytestmark`` attribute of a test
# module). The ``skipif`` condition is an ordinary call expression, so the
# Ollama probe runs once, at the moment this list is built during import.
pytestmark = [
    pytest.mark.performance,
    pytest.mark.skipif(
        not _ollama_reachable(),
        reason=f"Ollama at {OLLAMA_URL} with {PERF_MODEL} not available",
    ),
]
def _make_cfg():
    """Build a ``MockConfig`` aimed at the perf Ollama endpoint and model.

    Returns:
        A fresh ``MockConfig`` with the overrides below applied in order.
    """
    from evals.helpers import MockConfig

    overrides = {
        "ollama_base_url": OLLAMA_URL,
        "ollama_chat_model": PERF_MODEL,
        "intent_judge_model": PERF_MODEL,
        # None lets the size-aware defaults decide (evaluator + digests ON
        # for small models).
        "evaluator_enabled": None,
        "memory_digest_enabled": None,
        "tool_result_digest_enabled": None,
        # Force the LLM-based router so its timing appears in the report;
        # MockConfig doesn't set this attribute and the engine default varies.
        "tool_selection_strategy": "llm",
        # Empty strings fall through the router chain.
        "tool_router_model": "",
        "evaluator_model": "",
    }
    cfg = MockConfig()
    for attr, value in overrides.items():
        setattr(cfg, attr, value)
    return cfg
def _write_report(rec: TimingRecorder, name: str) -> Path:
    """Dump the recorder's summary and raw calls to a timestamped JSON file.

    Args:
        rec: Recorder whose ``calls`` and ``to_dict()`` summary are written.
        name: Report name; used as the file-name prefix and stored in the
            payload.

    Returns:
        Path of the written file, ``PERF_REPORT_DIR / "<name>-<stamp>.json"``.
    """
    PERF_REPORT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y%m%d-%H%M%S")

    summary = rec.to_dict()
    raw_calls = []
    for call in rec.calls:
        raw_calls.append({
            "context": call.context,
            "duration_sec": round(call.duration_sec, 4),
            "model": call.model,
            "prompt_chars": call.prompt_chars,
            "response_chars": call.response_chars,
        })

    report = {
        "name": name,
        "timestamp": stamp,
        "model": PERF_MODEL,
        "runs": PERF_RUNS,
        "summary": summary,
        "raw": raw_calls,
    }
    target = PERF_REPORT_DIR / f"{name}-{stamp}.json"
    target.write_text(json.dumps(report, indent=2))
    return target
# =============================================================================
# Micro-benchmark: tiny fixed prompt per configured model
# =============================================================================
@pytest.mark.performance
def test_micro_benchmark_tiny_prompt():
    """Baseline: time single tiny round-trips to Ollama.

    One warmup call followed by ``PERF_RUNS`` identical calls. This is the
    floor for every context's per-call cost, reported separately from the
    pipeline test so hardware drift is obvious in the numbers.
    """
    # Go through the module object (not a from-import of the function) so the
    # attribute lookup at call time sees the recorder's patch on jarvis.llm.
    from jarvis import llm as _llm

    request = dict(
        base_url=OLLAMA_URL,
        chat_model=PERF_MODEL,
        system_prompt=TINY_SYSTEM,
        user_content=TINY_USER,
        timeout_sec=30.0,
    )

    with TimingRecorder() as rec:
        # Warmup: the first call pays the weight-loading cost.
        _llm.call_llm_direct(**request)
        # Measured runs.
        for _ in range(PERF_RUNS):
            _llm.call_llm_direct(**request)

    rec.print_report(title=f"Micro-benchmark — tiny prompt × {PERF_RUNS + 1} on {PERF_MODEL}")
    report_path = _write_report(rec, "micro")
    print(f" 📄 saved: {report_path}")

    # Warm-vs-cold speed is too noisy to assert on; only check that every
    # call (warmup included) was recorded.
    expected_calls = PERF_RUNS + 1
    assert len(rec.calls) == expected_calls
# =============================================================================
# Full pipeline: run_reply_engine × N, per-context timings
# =============================================================================
@pytest.mark.performance
def test_pipeline_timings_by_context():
    """Run the full reply pipeline and check per-context timing shape.

    Each query in ``PIPELINE_QUERIES`` gets a fresh in-memory database and
    dialogue memory and is run ``PERF_RUNS`` times. Relative-shape checks
    (never absolute numbers), each applied only when the context fired:

    1. Evaluator p50 must not exceed 1.5 × the main chat turn p50.
    2. Tool router p50 must not exceed 1.5 × the main chat turn p50.
    3. Enrichment-extractor calls must use exactly the same set of models as
       the tool-router calls.
    """
    from jarvis.memory.db import Database
    from jarvis.memory.conversation import DialogueMemory
    from jarvis.reply.engine import run_reply_engine

    cfg = _make_cfg()
    with TimingRecorder() as rec:
        for query in PIPELINE_QUERIES:
            db = Database(":memory:", sqlite_vss_path=None)
            dlg = DialogueMemory(inactivity_timeout=300, max_interactions=20)
            try:
                for _ in range(PERF_RUNS):
                    run_reply_engine(db, cfg, None, query, dlg)
            finally:
                db.close()

    rec.print_report(title=f"Pipeline timings — {len(PIPELINE_QUERIES)} queries × {PERF_RUNS} runs on {PERF_MODEL}")
    path = _write_report(rec, "pipeline")
    print(f" 📄 saved: {path}")

    assert rec.calls, "no LLM calls recorded — pipeline did not invoke the LLM"

    # Surface callers the recorder could not map, so new contexts show up in
    # review.
    unmapped = sorted({c.context for c in rec.calls if c.context.startswith("other:")})
    if unmapped:
        print(f" ⚠️ unmapped callers (add to _CALLER_TO_CONTEXT): {unmapped}")

    # Shape invariants, relative to the main chat turn.
    main_p50 = rec.p50("main_chat_turn")
    if main_p50 > 0:
        ceiling = main_p50 * 1.5

        ev_p50 = rec.p50("evaluator")
        if ev_p50 > 0:
            assert ev_p50 <= ceiling, (
                f"evaluator p50 ({ev_p50:.2f}s) exceeds main chat turn p50 "
                f"({main_p50:.2f}s) by >50% — evaluator should be cheaper"
            )

        router_p50 = rec.p50("tool_router")
        if router_p50 > 0:
            assert router_p50 <= ceiling, (
                f"tool router p50 ({router_p50:.2f}s) exceeds main chat turn p50 "
                f"({main_p50:.2f}s) by >50% — router should be cheaper"
            )

    # Enrichment extractor must run on the same models as the router.
    enrich_models = {c.model for c in rec.calls if c.context == "enrichment_extract"}
    router_models = {c.model for c in rec.calls if c.context == "tool_router"}
    if enrich_models and router_models:
        assert enrich_models == router_models, (
            f"enrichment extractor should share the router model chain "
            f"(enrichment={enrich_models}, router={router_models})"
        )

View File

@@ -0,0 +1,270 @@
"""⏱️ LLM call timing recorder.
Monkey-patches the three entry points in ``jarvis.llm`` (``call_llm_direct``,
``call_llm_streaming``, ``chat_with_messages``) to record per-call timings
grouped by the context that issued the call (evaluator, intent judge, tool
router, etc.). The context is inferred from the caller's ``__qualname__`` on
the Python call stack, so no instrumentation is needed at the call site.
Usage:
with TimingRecorder() as rec:
run_reply_engine(...)
rec.print_report()
assert rec.p95("evaluator") < rec.p95("main_chat_turn") # shape check
"""
from __future__ import annotations
import sys
import time
import statistics
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Callable, Optional
from jarvis import llm as _llm_module
# Map caller __qualname__ → graph context name. Matches the 13 contexts in
# docs/llm_contexts.md. Anything not listed gets lumped into "other" so we
# notice new call sites drift in without us updating the doc.
#
# Lookup (see TimingRecorder._infer_context): each stack frame's qualname is
# tried against these keys first, then its last dotted segment. A dotted key
# such as "IntentJudge.evaluate" therefore only matches the full qualname,
# while a bare key also matches a method of that name on any class.
#
# ⚠️ This mapping mirrors docs/llm_contexts.md. When you add, remove, or
# rename an LLM context per the CLAUDE.md rule, update both in the same PR
# — the perf harness silently buckets unknown callers into "other:<qualname>"
# so drift here is visible but not loud.
_CALLER_TO_CONTEXT: dict[str, str] = {
    # Context 1 — main chat loop uses chat_with_messages
    "run_reply_engine": "main_chat_turn",
    # Context 2 — intent judge (calls via internal helper)
    "IntentJudge.evaluate": "intent_judge",
    "IntentJudge._call_llm": "intent_judge",
    # Context 3 — evaluator
    "evaluate_turn": "evaluator",
    # Context 4 — memory enrichment extractor
    "extract_search_params_for_memory": "enrichment_extract",
    # Context 5 — memory digest (per batch)
    "_distil_batch": "memory_digest",
    "digest_memory_for_query": "memory_digest",
    # Context 6 — tool-result digest (per batch)
    "_distil_tool_batch": "tool_result_digest",
    "digest_tool_result_for_query": "tool_result_digest",
    "_maybe_digest_tool_result": "tool_result_digest",
    # Context 7 — max-turn loop digest
    "digest_loop_for_max_turns": "max_turn_digest",
    # Context 8 — tool router
    # (Context 9 — tool searcher — reuses select_tools_with_llm so it falls
    # under the same bucket; that's intentional per docs/llm_contexts.md.)
    "select_tools_with_llm": "tool_router",
    # Context 10 — conversation summariser
    "generate_conversation_summary": "summariser",
    # Context 11 — graph fact extraction
    "extract_graph_memories": "graph_extract",
    # Context 12 — graph best-child picker
    "_llm_pick_best_child": "graph_best_child",
    # Context 13 — tool-specific LLM calls
    "_extract_place_from_user_text": "tool_weather",
    "extract_and_log_meal": "tool_nutrition",
    "generate_followups_for_meal": "tool_nutrition",
}
@dataclass
class _Call:
    """One recorded LLM call, as captured by ``TimingRecorder``'s wrappers."""

    # Context bucket inferred from the call stack, or "other:<qualname>".
    context: str
    # Wall-clock seconds spent inside the wrapped entry point.
    duration_sec: float
    # Model name taken from the ``chat_model`` argument ("" when absent).
    model: str
    # Character count of the prompt text (system + user, or message contents).
    prompt_chars: int
    # Character count of the returned text; 0 for unrecognised result types.
    response_chars: int
@dataclass
class TimingRecorder:
    """Context manager that times every call into the ``jarvis.llm`` entry points.

    Entering the recorder replaces ``call_llm_direct``, ``call_llm_streaming``
    and ``chat_with_messages`` with timing wrappers, both on ``jarvis.llm`` and
    on every already-imported ``jarvis``/``tests``/``evals`` module that holds
    a direct reference to one of them. Leaving it restores every patched
    binding. Each call that returns is appended to ``calls``.
    """

    # Recorded calls, appended as each wrapped call returns.
    calls: "list[_Call]" = field(default_factory=list)
    # Patch bookkeeping: holds a "_sites" list of (module, name, original)
    # while the recorder is active and is empty otherwise.
    _originals: dict = field(default_factory=dict)

    def __enter__(self) -> "TimingRecorder":
        """Install the wrappers and return the recorder itself."""
        self._patch()
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Restore the original functions; exceptions are never suppressed."""
        self._unpatch()

    # ── context inference ────────────────────────────────────────────────
    @staticmethod
    def _infer_context(skip_frames: int = 2) -> str:
        """Name the context that issued the current LLM call.

        Walks the stack outward from ``skip_frames`` frames above this
        function and returns the mapped context of the nearest frame whose
        qualname (or last dotted segment of it) is a key of
        ``_CALLER_TO_CONTEXT``.

        Args:
            skip_frames: Frames to skip so the walk starts at the wrapper's
                caller (2 = this function + the wrapper).

        Returns:
            The mapped context name, or ``"other:<qualname>"`` built from the
            nearest unmapped frame when no known caller is on the stack —
            visible in the report so drift shows up.
        """
        frame = sys._getframe(skip_frames)
        first_unknown: Optional[str] = None
        while frame is not None:
            code = frame.f_code
            # co_qualname only exists on newer interpreters; fall back to the
            # bare function name elsewhere.
            qual = getattr(code, "co_qualname", code.co_name)
            if qual in _CALLER_TO_CONTEXT:
                return _CALLER_TO_CONTEXT[qual]
            # Also match by the bare function name (qualname can be e.g.
            # ClassName.method — strip the class part).
            bare = qual.rsplit(".", 1)[-1]
            if bare in _CALLER_TO_CONTEXT:
                return _CALLER_TO_CONTEXT[bare]
            if first_unknown is None and not qual.startswith(("<", "_patch", "_unpatch")):
                first_unknown = qual
            frame = frame.f_back
        return f"other:{first_unknown or 'unknown'}"

    # ── patching ─────────────────────────────────────────────────────────
    def _wrap(self, name: str, original: Callable) -> Callable:
        """Return a wrapper around ``original`` that records one ``_Call``.

        Args:
            name: Entry-point name; selects how prompt size is extracted.
            original: The unpatched function to delegate to.

        Returns:
            A callable with the same call signature and return value as
            ``original``. While the recorder is inactive the wrapper is a
            plain pass-through.
        """
        def wrapped(*args, **kwargs):
            # A wrapper can outlive the ``with`` block: a module that runs
            # ``from jarvis.llm import X`` while the recorder is active binds
            # the wrapper, and that binding is not in ``_sites`` so it is
            # never restored. Once the recorder is inactive such a wrapper
            # must not keep appending to a finished recording.
            if "_sites" not in self._originals:
                return original(*args, **kwargs)

            ctx = self._infer_context(skip_frames=2)
            # Extract model + prompt sizes from args heuristically — all three
            # entry points take (base_url, chat_model, ...). chat_with_messages
            # takes a messages list.
            model = kwargs.get("chat_model") or (args[1] if len(args) > 1 else "")
            prompt_chars = 0
            if name == "chat_with_messages":
                msgs = kwargs.get("messages") or (args[2] if len(args) > 2 else [])
                if isinstance(msgs, list):
                    # Skip entries that are not dicts: the instrumentation
                    # must never make the instrumented call fail.
                    prompt_chars = sum(
                        len(str(m.get("content", "")))
                        for m in msgs
                        if isinstance(m, dict)
                    )
            else:
                sys_p = kwargs.get("system_prompt") or (args[2] if len(args) > 2 else "")
                user_c = kwargs.get("user_content") or (args[3] if len(args) > 3 else "")
                prompt_chars = len(str(sys_p)) + len(str(user_c))

            t0 = time.perf_counter()
            result = original(*args, **kwargs)
            elapsed = time.perf_counter() - t0

            # response size: str for direct/streaming, dict for chat_with_messages
            if isinstance(result, str):
                response_chars = len(result)
            elif isinstance(result, dict):
                response_chars = len(str(result.get("content", "")))
            else:
                response_chars = 0

            self.calls.append(_Call(
                context=ctx,
                duration_sec=elapsed,
                model=str(model),
                prompt_chars=prompt_chars,
                response_chars=response_chars,
            ))
            return result
        return wrapped

    def _patch(self) -> None:
        """Install timing wrappers at every binding of the LLM entry points.

        Modules that already did ``from ..llm import X`` resolved the name at
        import time and do NOT see a setattr on ``jarvis.llm`` itself, so the
        attribute is replaced on each such importer as well as on the
        canonical module.

        Raises:
            RuntimeError: If the recorder is already active. Patching twice
                would capture the wrappers as "originals" and discard the
                real ones, leaving the patch in place permanently.
        """
        if "_sites" in self._originals:
            raise RuntimeError("TimingRecorder is already active")

        names = ("call_llm_direct", "call_llm_streaming", "chat_with_messages")
        # Capture the originals from the llm module once.
        originals = {n: getattr(_llm_module, n) for n in names}
        # "_sites" stores (module, name, original_fn) so _unpatch can put each
        # binding back exactly where it came from.
        sites: list = []
        self._originals["_sites"] = sites

        # Snapshot the module table: it may change while we iterate.
        for mod in list(sys.modules.values()):
            if mod is None or mod is _llm_module:
                continue
            mod_name = getattr(mod, "__name__", "")
            if not mod_name.startswith(("jarvis", "tests", "evals")):
                continue
            for name in names:
                # Identity check: only rebind names that really are the llm
                # entry point, not an unrelated attribute of the same name.
                if getattr(mod, name, None) is originals[name]:
                    setattr(mod, name, self._wrap(name, originals[name]))
                    sites.append((mod, name, originals[name]))

        # Also patch the canonical module so any late `from jarvis.llm import X`
        # after we enter the context sees the wrapper.
        for name in names:
            setattr(_llm_module, name, self._wrap(name, originals[name]))
            sites.append((_llm_module, name, originals[name]))

    def _unpatch(self) -> None:
        """Restore every patched binding and mark the recorder inactive."""
        for mod, name, original in self._originals.get("_sites", []):
            setattr(mod, name, original)
        self._originals.clear()

    # ── queries ──────────────────────────────────────────────────────────
    def by_context(self) -> "dict[str, list[_Call]]":
        """Group the recorded calls by context, in first-seen order."""
        out: dict = {}
        for c in self.calls:
            out.setdefault(c.context, []).append(c)
        return out

    def durations(self, context: str) -> list[float]:
        """Return the durations (seconds) of all calls in ``context``."""
        return [c.duration_sec for c in self.calls if c.context == context]

    def p50(self, context: str) -> float:
        """Return the median duration for ``context``, or 0.0 with no calls."""
        ds = self.durations(context)
        return statistics.median(ds) if ds else 0.0

    def p95(self, context: str) -> float:
        """Return the nearest-rank 95th percentile duration for ``context``.

        Returns 0.0 with no calls and the single value with one call.
        """
        ds = self.durations(context)
        if not ds:
            return 0.0
        if len(ds) == 1:
            return ds[0]
        ds_sorted = sorted(ds)
        idx = max(0, int(round(0.95 * (len(ds_sorted) - 1))))
        return ds_sorted[idx]

    def total(self, context: Optional[str] = None) -> float:
        """Sum call durations, for one context or for all when ``None``."""
        if context is None:
            return sum(c.duration_sec for c in self.calls)
        return sum(c.duration_sec for c in self.calls if c.context == context)

    # ── reporting ────────────────────────────────────────────────────────
    def to_dict(self) -> dict:
        """Return a JSON-serialisable summary keyed by context."""
        buckets = self.by_context()
        return {
            "total_calls": len(self.calls),
            "total_sec": round(self.total(), 3),
            "contexts": {
                ctx: {
                    "calls": len(calls),
                    "total_sec": round(sum(c.duration_sec for c in calls), 3),
                    "p50_sec": round(self.p50(ctx), 3),
                    "p95_sec": round(self.p95(ctx), 3),
                    "avg_prompt_chars": int(statistics.mean(c.prompt_chars for c in calls)) if calls else 0,
                    "avg_response_chars": int(statistics.mean(c.response_chars for c in calls)) if calls else 0,
                    "models": sorted({c.model for c in calls if c.model}),
                }
                for ctx, calls in buckets.items()
            },
        }

    def print_report(self, title: str = "LLM pipeline timings") -> None:
        """Print a per-context table to stdout, most expensive context first."""
        print(f"\n⏱️ {title}")
        print(f" total calls: {len(self.calls)} total wall time: {self.total():.2f}s")
        rows = sorted(
            self.by_context().items(),
            key=lambda kv: -sum(c.duration_sec for c in kv[1]),
        )
        header = f" {'context':<22} {'n':>3} {'total':>7} {'p50':>6} {'p95':>6} {'prompt':>7} model"
        print(header)
        print(" " + "-" * (len(header) - 3))
        for ctx, calls in rows:
            total = sum(c.duration_sec for c in calls)
            print(
                f" {ctx:<22} {len(calls):>3} "
                f"{total:>6.2f}s {self.p50(ctx):>5.2f}s {self.p95(ctx):>5.2f}s "
                f"{int(statistics.mean(c.prompt_chars for c in calls)):>7} "
                f"{','.join(sorted({c.model for c in calls if c.model}))}"
            )
@contextmanager
def record_timings():
    """Yield an active ``TimingRecorder``; patches are removed on exit."""
    with TimingRecorder() as recorder:
        yield recorder