# javis_bot/tests/performance/test_pipeline_timings.py

"""⏱️ Performance: time each LLM context in the reply pipeline.

Runs ``run_reply_engine`` N times against a live Ollama with a fixed tiny
prompt, records per-context timings via the monkey-patching recorder, and
asserts a few relative-shape invariants so the test fails when the pipeline
shape drifts (e.g. the evaluator becomes more expensive than the main turn).

Also includes a micro-benchmark that calls each configured model with a
tiny fixed prompt, giving a hardware baseline to diff against.

Run manually:
    pytest tests/performance/ -v -m performance -s

Requires:
    - Ollama reachable at http://localhost:11434 (override with the
      ``JARVIS_PERF_OLLAMA_URL`` env var)
    - ``gemma4:e2b`` pulled (override with the ``JARVIS_PERF_MODEL`` env var)

The tests in this module are skipped automatically if Ollama is unreachable
or the configured model is not available, so they are safe to leave in the
repo. Use ``-s`` to see the report table.
"""

from __future__ import annotations

import json
import os
import time
from pathlib import Path

import pytest
import requests

from tests.performance.timing_recorder import TimingRecorder


# Environment-driven knobs. Every value has a default and can be overridden
# with the matching JARVIS_PERF_* environment variable.
OLLAMA_URL = os.getenv("JARVIS_PERF_OLLAMA_URL", "http://localhost:11434")
PERF_MODEL = os.getenv("JARVIS_PERF_MODEL", "gemma4:e2b")
PERF_RUNS = int(os.getenv("JARVIS_PERF_RUNS", "3"))

# JSON reports land next to this file unless JARVIS_PERF_REPORT_DIR says
# otherwise.
_DEFAULT_REPORT_DIR = Path(__file__).parent / "reports"
PERF_REPORT_DIR = Path(
    os.getenv("JARVIS_PERF_REPORT_DIR", str(_DEFAULT_REPORT_DIR))
)

# The baseline prompts are deliberately tiny: the micro-benchmark is meant to
# expose per-call overhead and model warm-up cost, not prompt-length effects.
TINY_SYSTEM = "Reply with the single word OK."
TINY_USER = "ping"

# Queries fed through the reply pipeline. Small, and each a different shape.
PIPELINE_QUERIES = [
    # pure chat, no tools needed
    "hello",
    # math, one-shot
    "what's 2 plus 3?",
    # likely triggers a tool
    "what time is it in Tokyo?",
]


def _ollama_reachable() -> bool:
    """Return True only if Ollama answers and lists exactly ``PERF_MODEL``.

    Probes ``{OLLAMA_URL}/api/tags`` with a 2-second timeout and compares the
    reported model names against the configured model. The comparison is an
    exact ``name:tag`` match: a substring match on the base name would accept
    a different tag of the same family (or an unrelated model whose name
    merely contains the base name), letting the tests run and then fail on a
    missing model instead of being skipped.

    Returns:
        False when the request fails, the status is not 200, the payload is
        malformed, or the model is not listed; True otherwise.
    """
    # Ollama resolves an untagged model name to its ``latest`` tag, and that
    # is how the name appears in the tag listing. Only look for the tag
    # separator after the last "/" so a registry "host:port/" prefix is not
    # mistaken for a tag.
    has_tag = ":" in PERF_MODEL.rsplit("/", 1)[-1]
    wanted = PERF_MODEL if has_tag else f"{PERF_MODEL}:latest"

    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
        if resp.status_code != 200:
            return False
        pulled = {m.get("name", "") for m in resp.json().get("models", [])}
    except Exception:
        # Deliberately broad: this runs at collection time to decide whether
        # to skip, so any failure (network, bad JSON, unexpected payload
        # shape) must mean "not available" rather than a collection error.
        return False
    return wanted in pulled


# Module-wide markers: every test here is tagged ``performance`` and is
# skipped when the Ollama probe (evaluated once, at import time) fails.
_requires_ollama = pytest.mark.skipif(
    not _ollama_reachable(),
    reason=f"Ollama at {OLLAMA_URL} with {PERF_MODEL} not available",
)
pytestmark = [pytest.mark.performance, _requires_ollama]


def _make_cfg():
    """Build the ``MockConfig`` used by the pipeline timing test.

    Starts from a default ``MockConfig`` and points the chat and intent-judge
    models at ``PERF_MODEL`` on ``OLLAMA_URL``, then applies the overrides
    listed below in order.

    Returns:
        The configured ``MockConfig`` instance.
    """
    from evals.helpers import MockConfig

    overrides = {
        "ollama_base_url": OLLAMA_URL,
        "ollama_chat_model": PERF_MODEL,
        "intent_judge_model": PERF_MODEL,
        # None leaves these to the size-aware defaults (evaluator + digests
        # ON for small models).
        "evaluator_enabled": None,
        "memory_digest_enabled": None,
        "tool_result_digest_enabled": None,
        # MockConfig doesn't define a selection strategy and the engine's
        # default varies, so pin the LLM-based router to make sure its
        # timing appears in the report.
        "tool_selection_strategy": "llm",
        # Empty string: fall through the router chain.
        "tool_router_model": "",
        "evaluator_model": "",
    }

    cfg = MockConfig()
    for attr, value in overrides.items():
        setattr(cfg, attr, value)
    return cfg


def _write_report(rec: TimingRecorder, name: str) -> Path:
    """Save a recorder's results as a timestamped JSON file.

    The file is written to ``PERF_REPORT_DIR`` (created if missing) as
    ``<name>-<YYYYmmdd-HHMMSS>.json`` and contains the run metadata, the
    recorder's summary, and one raw entry per recorded call.

    Args:
        rec: Recorder whose ``to_dict()`` summary and ``calls`` are dumped.
        name: Report name; used in the payload and as the filename prefix.

    Returns:
        Path of the file that was written.
    """

    def as_row(call) -> dict:
        # One raw entry per call; durations are rounded to 4 decimals.
        return {
            "context": call.context,
            "duration_sec": round(call.duration_sec, 4),
            "model": call.model,
            "prompt_chars": call.prompt_chars,
            "response_chars": call.response_chars,
        }

    PERF_REPORT_DIR.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y%m%d-%H%M%S")

    payload = {
        "name": name,
        "timestamp": stamp,
        "model": PERF_MODEL,
        "runs": PERF_RUNS,
        "summary": rec.to_dict(),
    }
    payload["raw"] = list(map(as_row, rec.calls))

    out_path = PERF_REPORT_DIR / f"{name}-{stamp}.json"
    out_path.write_text(json.dumps(payload, indent=2))
    return out_path


# =============================================================================
# Micro-benchmark: tiny fixed prompt per configured model
# =============================================================================


@pytest.mark.performance
def test_micro_benchmark_tiny_prompt():
    """Measure the cost of a single tiny round-trip to Ollama.

    One warm-up call plus ``PERF_RUNS`` further calls are made with the same
    fixed tiny prompt while a ``TimingRecorder`` is active. That per-call
    floor underlies every context in the pipeline, so it is reported (and
    saved) on its own to make hardware drift visible independently of the
    pipeline test.
    """
    # Go through the module attribute, not a directly imported function, so
    # the recorder's patch on jarvis.llm is what actually gets called.
    from jarvis import llm as _llm

    tiny_request = dict(
        base_url=OLLAMA_URL,
        chat_model=PERF_MODEL,
        system_prompt=TINY_SYSTEM,
        user_content=TINY_USER,
        timeout_sec=30.0,
    )
    expected_calls = PERF_RUNS + 1

    with TimingRecorder() as rec:
        # The first call is the warm-up: it pays the weight-loading cost.
        _llm.call_llm_direct(**tiny_request)
        # The remaining calls are the measured runs.
        for _ in range(PERF_RUNS):
            _llm.call_llm_direct(**tiny_request)

    rec.print_report(title=f"Micro-benchmark — tiny prompt × {PERF_RUNS + 1} on {PERF_MODEL}")
    saved_to = _write_report(rec, "micro")
    print(f"   📄 saved: {saved_to}")

    # Warm calls should be noticeably faster than the cold one, but that is
    # too noisy to assert on; only check that every call was recorded.
    assert len(rec.calls) == expected_calls


# =============================================================================
# Full pipeline: run_reply_engine × N, per-context timings
# =============================================================================


@pytest.mark.performance
def test_pipeline_timings_by_context():
    """Drive the full reply pipeline and check the relative shape of timings.

    Each query in ``PIPELINE_QUERIES`` gets a fresh in-memory database and
    dialogue memory and is sent through ``run_reply_engine`` ``PERF_RUNS``
    times while a ``TimingRecorder`` is active. The per-context report is
    printed and saved as JSON.

    The checks are relative, never absolute, and each one applies only when
    the p50 values (or call lists) it compares are non-empty/positive:
      1. Evaluator p50 must not exceed 1.5x the main chat turn p50;
         otherwise the decision costs more than the answer, defeating the
         point of running the evaluator on a small model.
      2. Tool router p50 must not exceed 1.5x the main chat turn p50; it is
         a classification call on the warm small model.
      3. Enrichment extractor calls must use exactly the same set of models
         as tool router calls, locking in the extractor's move onto the
         router chain.
    """
    from jarvis.memory.db import Database
    from jarvis.memory.conversation import DialogueMemory
    from jarvis.reply.engine import run_reply_engine

    cfg = _make_cfg()

    with TimingRecorder() as rec:
        for prompt in PIPELINE_QUERIES:
            database = Database(":memory:", sqlite_vss_path=None)
            dialogue = DialogueMemory(inactivity_timeout=300, max_interactions=20)
            try:
                for _ in range(PERF_RUNS):
                    run_reply_engine(database, cfg, None, prompt, dialogue)
            finally:
                # Always release the database, even if a run blows up.
                database.close()

    rec.print_report(title=f"Pipeline timings — {len(PIPELINE_QUERIES)} queries × {PERF_RUNS} runs on {PERF_MODEL}")
    saved_to = _write_report(rec, "pipeline")
    print(f"   📄 saved: {saved_to}")

    assert rec.calls, "no LLM calls recorded — pipeline did not invoke the LLM"

    # Report (but don't fail on) callers the recorder could not map to a
    # known context, so new contexts get noticed in review.
    unmapped = sorted(
        {c.context for c in rec.calls if c.context.startswith("other:")}
    )
    if unmapped:
        print(f"   ⚠️  unmapped callers (add to _CALLER_TO_CONTEXT): {unmapped}")

    # Shape invariants, all relative to the main chat turn.
    baseline_p50 = rec.p50("main_chat_turn")
    if baseline_p50 > 0:
        ceiling = baseline_p50 * 1.5

        evaluator_p50 = rec.p50("evaluator")
        if evaluator_p50 > 0:
            assert evaluator_p50 <= ceiling, (
                f"evaluator p50 ({evaluator_p50:.2f}s) exceeds main chat turn p50 "
                f"({baseline_p50:.2f}s) by >50% — evaluator should be cheaper"
            )

        tool_router_p50 = rec.p50("tool_router")
        if tool_router_p50 > 0:
            assert tool_router_p50 <= ceiling, (
                f"tool router p50 ({tool_router_p50:.2f}s) exceeds main chat turn p50 "
                f"({baseline_p50:.2f}s) by >50% — router should be cheaper"
            )

    # Enrichment extractor must stay on the router chain: gather the models
    # each of the two contexts used, in recording order.
    models_seen = {"enrichment_extract": set(), "tool_router": set()}
    for call in rec.calls:
        if call.context in models_seen:
            models_seen[call.context].add(call.model)
    enrich_models = models_seen["enrichment_extract"]
    router_models = models_seen["tool_router"]
    if enrich_models and router_models:
        assert enrich_models == router_models, (
            f"enrichment extractor should share the router model chain "
            f"(enrichment={enrich_models}, router={router_models})"
        )