javis_bot/evals/test_graph_supplies_missing_tool_arg.py

"""
End-to-end eval — single-turn flow where the user's location lives in the
User branch of the knowledge graph (warm profile). The warm profile is
always-loaded into the system prompt, so the chat model and planner can
ground ``getWeather`` on it without a ``searchMemory`` step.

This stresses the warm-profile-injection path. It complements:
  - ``evals/test_followup_supplies_missing_tool_arg.py`` (hot-window
    carry-over, two-turn).
  - ``evals/test_diary_supplies_missing_tool_arg.py`` (diary recall via
    planner-emitted ``searchMemory``).

Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh graph_supplies_missing_tool_arg
"""

from unittest.mock import patch

import pytest

from conftest import requires_judge_llm
from helpers import (
    ToolCallCapture,
    assert_not_fallback_reply,
    JUDGE_MODEL,
)


_EDINBURGH_FORECAST = (
    "Weather for Edinburgh, UK:\n"
    "Today: 11°C, partly cloudy. High 13°C, low 7°C.\n"
    "Tomorrow: 12°C, light rain, high 14°C, low 8°C."
)


def _make_runner(capture: ToolCallCapture):
    from jarvis.tools.types import ToolExecutionResult

    def _runner(db, cfg, tool_name, tool_args, **kwargs):
        capture.record(tool_name, tool_args or {})
        if tool_name == "getWeather":
            location = ((tool_args or {}).get("location") or "").strip()
            if not location:
                return ToolExecutionResult(
                    success=False,
                    reply_text=(
                        "I couldn't auto-detect your location. Please "
                        "tell me which city to check the weather for."
                    ),
                )
            return ToolExecutionResult(
                success=True,
                reply_text=_EDINBURGH_FORECAST,
            )
        return ToolExecutionResult(success=True, reply_text="OK")

    return _runner


@pytest.mark.eval
@requires_judge_llm
class TestGraphSuppliesMissingToolArg:
    """Warm-profile injection path: a User-branch fact ("lives in
    Edinburgh") is always loaded into the system prompt, so the chat
    model can supply it as the location argument without an extra
    memory search."""

    def test_warm_profile_user_fact_grounds_get_weather_call(
        self, mock_config, eval_db, eval_dialogue_memory,
    ):
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        # Geoip disabled — the only way the model gets a location is from
        # the warm profile loaded out of the graph.
        mock_config.location_enabled = False

        capture = ToolCallCapture()

        # Inject a User-branch fact directly into the warm-profile builder
        # rather than seeding the SQLite-backed graph store. The warm-
        # profile path the engine relies on is `build_warm_profile` →
        # `format_warm_profile_block`; seeding via the public API replays
        # the production shape without depending on graph-mutation
        # listeners or branch-root bootstrapping in the test DB.
        warm_profile = {
            "user": "The user lives in Edinburgh.",
            "directives": "",
        }

        with patch(
            "jarvis.memory.graph_ops.build_warm_profile",
            return_value=warm_profile,
        ), patch(
            "jarvis.reply.engine.run_tool_with_retries",
            side_effect=_make_runner(capture),
        ):
            response = run_reply_engine(
                db=eval_db, cfg=mock_config, tts=None,
                text="how's the weather, Jarvis?",
                dialogue_memory=eval_dialogue_memory,
            )

        print(f"\n  Graph Supplies Missing Tool Arg ({JUDGE_MODEL}):")
        print(f"  Tools called: {capture.tool_names()}")
        for c in capture.calls:
            print(f"    - {c['name']}({c['args']})")
        print(f"  Response: {(response or '')[:300]}")

        assert_not_fallback_reply(response, context="warm-profile")

        weather_calls = [c for c in capture.calls if c["name"] == "getWeather"]
        edinburgh_calls = [
            c for c in weather_calls
            if "edinburgh" in (c["args"].get("location") or "").lower()
        ]
        assert edinburgh_calls, (
            "getWeather was not invoked with location='Edinburgh' even "
            "though the warm profile names Edinburgh as the user's home. "
            "The chat model must use always-loaded user facts as tool "
            "arguments without an explicit prompt to do so. "
            f"All getWeather calls: {[c['args'] for c in weather_calls]}. "
            f"Tools observed: {capture.tool_names()}. "
            f"Response: {(response or '')[:400]}"
        )

        response_lower = (response or "").lower()
        assert "edinburgh" in response_lower, (
            "Reply does not mention Edinburgh despite the warm profile "
            f"naming it as the user's location. Response: {(response or '')[:400]}"
        )

        assert "hackney" not in response_lower, (
            "Reply mentions Hackney — the warm profile clearly states "
            "Edinburgh, and geoip is disabled in this test. The model "
            f"leaked a hardcoded default. Response: {(response or '')[:400]}"
        )