Files
javis_bot/tests/performance/test_pipeline_timings.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

237 lines
8.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""⏱️ Performance: time each LLM context in the reply pipeline.
Runs ``run_reply_engine`` N times against a live Ollama with a fixed tiny
prompt, records per-context timings via the monkey-patching recorder, and
asserts a few relative-shape invariants so the test fails when the pipeline
shape drifts (e.g. the evaluator becomes more expensive than the main turn).
Also includes a micro-benchmark that calls each configured model with a
tiny fixed prompt, giving a hardware baseline to diff against.
Run manually:
pytest tests/performance/ -v -m performance -s
Requires:
- Ollama reachable at http://localhost:11434
- ``gemma4:e2b`` pulled (or override via env var)
The test is skipped automatically if Ollama is unreachable, so it's safe to
leave in the repo. Use ``-s`` to see the report table.
"""
from __future__ import annotations
import json
import os
import time
from pathlib import Path
import pytest
import requests
from tests.performance.timing_recorder import TimingRecorder
# Runtime knobs. Each one can be overridden with the matching JARVIS_PERF_*
# environment variable; the defaults target a local Ollama install.
OLLAMA_URL = os.getenv("JARVIS_PERF_OLLAMA_URL", "http://localhost:11434")
PERF_MODEL = os.getenv("JARVIS_PERF_MODEL", "gemma4:e2b")
PERF_RUNS = int(os.getenv("JARVIS_PERF_RUNS", "3"))
# JSON reports default to a "reports" directory that sits beside this file.
PERF_REPORT_DIR = Path(
    os.getenv("JARVIS_PERF_REPORT_DIR", str(Path(__file__).with_name("reports")))
)

# Deliberately tiny prompts: the baseline is meant to capture per-call
# overhead and model warmup cost, not the effect of prompt length.
TINY_SYSTEM = "Reply with the single word OK."
TINY_USER = "ping"

# Queries fed through the full reply pipeline. Small, but each one exercises
# a different shape of request.
PIPELINE_QUERIES = [
    "hello",                      # pure chat, no tools needed
    "what's 2 plus 3?",           # math, one-shot
    "what time is it in Tokyo?",  # likely triggers a tool
]
def _ollama_reachable() -> bool:
    """Return True when Ollama responds and ``PERF_MODEL`` is installed.

    Issues a short ``GET {OLLAMA_URL}/api/tags`` and looks for the configured
    model among the reported model names. The match is on the full
    ``name:tag`` string, so a different tag or variant of the same model
    family does not count as available. The previous substring match on the
    base name let the module run, and then fail, when only e.g. another tag
    was pulled.

    Returns:
        ``True`` if the endpoint answered with HTTP 200 and lists the model;
        ``False`` on any non-200 status, network error, or malformed payload.
    """
    # NOTE(review): Ollama is expected to list untagged pulls as
    # "<name>:latest"; accept that spelling when PERF_MODEL carries no tag.
    wanted = {PERF_MODEL}
    if ":" not in PERF_MODEL:
        wanted.add(f"{PERF_MODEL}:latest")
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
        if resp.status_code != 200:
            return False
        installed = {m.get("name", "") for m in resp.json().get("models", [])}
    except Exception:
        # Deliberately broad: this probe runs at collection time and must
        # degrade to "skip the module", never abort collection.
        return False
    return not wanted.isdisjoint(installed)
# Module-wide marks: tag every test here as ``performance`` and skip the
# whole module when the Ollama probe (evaluated once, at import/collection
# time) reports that the server or model is unavailable.
_performance_mark = pytest.mark.performance
_skip_without_ollama = pytest.mark.skipif(
    not _ollama_reachable(),
    reason=f"Ollama at {OLLAMA_URL} with {PERF_MODEL} not available",
)
pytestmark = [_performance_mark, _skip_without_ollama]
def _make_cfg():
    """Build the ``MockConfig`` used by the pipeline timing test.

    Starts from a default ``MockConfig`` and overrides the Ollama endpoint,
    the chat / intent-judge models, the evaluator and digest switches, and
    the tool-selection settings.

    Returns:
        The configured ``MockConfig`` instance.
    """
    from evals.helpers import MockConfig

    overrides = {
        "ollama_base_url": OLLAMA_URL,
        "ollama_chat_model": PERF_MODEL,
        "intent_judge_model": PERF_MODEL,
        # None lets size-aware defaults kick in (evaluator + digests ON for
        # small models).
        "evaluator_enabled": None,
        "memory_digest_enabled": None,
        "tool_result_digest_enabled": None,
        # Force the LLM-based router so its timing shows up in the report.
        # MockConfig doesn't set this attribute, and the engine's default
        # varies.
        "tool_selection_strategy": "llm",
        # Empty strings: fall through the router chain.
        "tool_router_model": "",
        "evaluator_model": "",
    }

    cfg = MockConfig()
    for attr_name, attr_value in overrides.items():
        setattr(cfg, attr_name, attr_value)
    return cfg
def _write_report(rec: TimingRecorder, name: str) -> Path:
    """Dump the recorder's results to a timestamped JSON file.

    The file is written to ``PERF_REPORT_DIR`` (created on demand) as
    ``<name>-<YYYYmmdd-HHMMSS>.json`` and holds the recorder summary plus one
    row per recorded call, with durations rounded to four decimals.

    Args:
        rec: Recorder whose ``to_dict()`` summary and ``calls`` are exported.
        name: Report name; used as the filename prefix and stored in the file.

    Returns:
        Path of the JSON file that was written.
    """
    PERF_REPORT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y%m%d-%H%M%S")
    report_path = PERF_REPORT_DIR / f"{name}-{stamp}.json"

    summary = rec.to_dict()
    rows = []
    for call in rec.calls:
        rows.append(
            {
                "context": call.context,
                "duration_sec": round(call.duration_sec, 4),
                "model": call.model,
                "prompt_chars": call.prompt_chars,
                "response_chars": call.response_chars,
            }
        )

    # Key order is kept stable so successive reports diff cleanly.
    payload = {
        "name": name,
        "timestamp": stamp,
        "model": PERF_MODEL,
        "runs": PERF_RUNS,
        "summary": summary,
        "raw": rows,
    }
    serialized = json.dumps(payload, indent=2)
    report_path.write_text(serialized)
    return report_path
# =============================================================================
# Micro-benchmark: tiny fixed prompt per configured model
# =============================================================================
@pytest.mark.performance
def test_micro_benchmark_tiny_prompt():
    """Measure the floor cost of one tiny round-trip to Ollama.

    Every pipeline context pays at least this much per call, so a shift here
    moves every other number with it. It is reported separately from the
    pipeline test so that hardware drift is easy to spot.

    One warmup call plus ``PERF_RUNS`` measured calls are issued; all of them
    are captured by the recorder and appear in the report.
    """
    # Import the module (not the function) so the recorder's patch on
    # jarvis.llm is visible at call time.
    from jarvis import llm as _llm

    def _ping():
        # Resolve call_llm_direct on every invocation so the patched
        # attribute is the one that gets called.
        return _llm.call_llm_direct(
            base_url=OLLAMA_URL,
            chat_model=PERF_MODEL,
            system_prompt=TINY_SYSTEM,
            user_content=TINY_USER,
            timeout_sec=30.0,
        )

    with TimingRecorder() as rec:
        _ping()  # warmup: the first call pays the weight-loading cost
        for _ in range(PERF_RUNS):
            _ping()  # measured runs

    rec.print_report(title=f"Micro-benchmark — tiny prompt × {PERF_RUNS + 1} on {PERF_MODEL}")
    report_path = _write_report(rec, "micro")
    print(f" 📄 saved: {report_path}")

    # Warm calls should be noticeably faster than the cold one, but that is
    # too noisy to assert on; only verify that every call was recorded.
    expected_calls = PERF_RUNS + 1
    assert len(rec.calls) == expected_calls
# =============================================================================
# Full pipeline: run_reply_engine × N, per-context timings
# =============================================================================
@pytest.mark.performance
def test_pipeline_timings_by_context():
    """Run the full reply pipeline and check per-context timing shape.

    Each query in ``PIPELINE_QUERIES`` is run ``PERF_RUNS`` times against a
    fresh in-memory database and dialogue memory. The assertions are
    relative, never absolute numbers:

    1. If the evaluator fires, its p50 must not exceed 1.5x the main chat
       turn's p50 — otherwise the decision costs more than the answer.
    2. If the tool router fires, its p50 must not exceed 1.5x the main chat
       turn's p50 — it is meant to be a cheap classification call.
    3. If both the enrichment extractor and the router fire, they must have
       used the same set of models. This locks in the demotion of the
       extractor onto the router chain.

    Invariants 1 and 2 are only evaluated when a main chat turn was recorded.
    """
    from jarvis.memory.db import Database
    from jarvis.memory.conversation import DialogueMemory
    from jarvis.reply.engine import run_reply_engine

    cfg = _make_cfg()
    with TimingRecorder() as rec:
        for query in PIPELINE_QUERIES:
            database = Database(":memory:", sqlite_vss_path=None)
            dialogue = DialogueMemory(inactivity_timeout=300, max_interactions=20)
            try:
                for _ in range(PERF_RUNS):
                    run_reply_engine(database, cfg, None, query, dialogue)
            finally:
                database.close()

    rec.print_report(
        title=f"Pipeline timings — {len(PIPELINE_QUERIES)} queries × {PERF_RUNS} runs on {PERF_MODEL}"
    )
    report_path = _write_report(rec, "pipeline")
    print(f" 📄 saved: {report_path}")

    assert rec.calls, "no LLM calls recorded — pipeline did not invoke the LLM"

    # Surface unmapped callers so new contexts show up in review.
    unmapped = sorted(
        {call.context for call in rec.calls if call.context.startswith("other:")}
    )
    if unmapped:
        print(f" ⚠️ unmapped callers (add to _CALLER_TO_CONTEXT): {unmapped}")

    # Shape invariants: auxiliary contexts stay within 1.5x of the main turn.
    main_p50 = rec.p50("main_chat_turn")
    if main_p50 > 0:
        cheap_contexts = (
            ("evaluator", "evaluator", "evaluator"),
            ("tool_router", "tool router", "router"),
        )
        for context, label, short_label in cheap_contexts:
            context_p50 = rec.p50(context)
            if context_p50 > 0:
                assert context_p50 <= main_p50 * 1.5, (
                    f"{label} p50 ({context_p50:.2f}s) exceeds main chat turn p50 "
                    f"({main_p50:.2f}s) by >50% — {short_label} should be cheaper"
                )

    # Locking in the demotion: enrichment extractor must use the router chain.
    enrich_models = {
        call.model for call in rec.calls if call.context == "enrichment_extract"
    }
    router_models = {
        call.model for call in rec.calls if call.context == "tool_router"
    }
    if enrich_models and router_models:
        assert enrich_models == router_models, (
            f"enrichment extractor should share the router model chain "
            f"(enrichment={enrich_models}, router={router_models})"
        )