# javis_bot/evals/test_diary_supplies_missing_tool_arg.py

"""
End-to-end eval — single-turn flow where the user's location lives only
in the diary from a past conversation. The planner must emit
``searchMemory``, the diary must surface "Manchester", and ``getWeather``
must then be invoked with ``location='Manchester'``.

This stresses the diary-recall path. It complements the carry-over
guard's hot-window path (covered by
``evals/test_followup_supplies_missing_tool_arg.py``) by exercising the
slower long-term-memory path: the user said "I live in Manchester" days
ago, the conversation has lapsed, and now the user asks "how's the
weather, Jarvis?" with no live geoip and nothing in the hot window.

Memory-recall reliability on small models is itself an open failure
mode separate from the tool carry-over guard. If gemma4:e2b consistently
deflects rather than grounding the search, this eval is best read as an
upper-bound regression guard: a green run on a reliable judge model
proves the wiring works, while a red run on a small model is expected
until follow-up memory work lands.

Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh diary_supplies_missing_tool_arg
"""

from unittest.mock import patch

import pytest

from conftest import requires_judge_llm
from helpers import (
    ToolCallCapture,
    assert_not_fallback_reply,
    seed_diary_summaries,
    JUDGE_MODEL,
)


# Diary fixture: (date, summary) pairs seeded into the eval database. The
# single entry is the only place the user's home city is recorded.
_DIARY_MANCHESTER = [
    (
        "2026-04-26",
        "The user mentioned they live in Manchester "
        "and prefer celsius for weather queries.",
    ),
]


# Canned getWeather output returned by the fake tool runner whenever a
# non-empty location is supplied. Three lines, no trailing newline.
_MANCHESTER_FORECAST = "\n".join(
    (
        "Weather for Manchester, UK:",
        "Today: 12°C, overcast. High 14°C, low 8°C.",
        "Tomorrow: 13°C, light rain, high 15°C, low 9°C.",
    )
)


def _make_runner(capture: ToolCallCapture):
    """Build a fake tool runner that logs every invocation to *capture*.

    The returned callable is used as the ``side_effect`` when patching
    ``jarvis.reply.engine.run_tool_with_retries``. Its behaviour:

    * every call is recorded as ``(tool_name, tool_args or {})``;
    * ``getWeather`` with a non-blank ``location`` succeeds with the canned
      Manchester forecast;
    * ``getWeather`` with a missing or blank ``location`` fails with a
      "couldn't auto-detect your location" message;
    * any other tool succeeds with the reply text ``"OK"``.
    """
    # Imported lazily so the module can be collected without jarvis loaded.
    from jarvis.tools.types import ToolExecutionResult

    def _runner(db, cfg, tool_name, tool_args, **kwargs):
        # Normalise once: a missing/None argument dict is treated as empty.
        args = tool_args or {}
        capture.record(tool_name, args)

        if tool_name != "getWeather":
            return ToolExecutionResult(success=True, reply_text="OK")

        requested = (args.get("location") or "").strip()
        if requested:
            return ToolExecutionResult(
                success=True,
                reply_text=_MANCHESTER_FORECAST,
            )

        # No usable location was supplied with the call.
        return ToolExecutionResult(
            success=False,
            reply_text=(
                "I couldn't auto-detect your location. Please "
                "tell me which city to check the weather for."
            ),
        )

    return _runner


@pytest.mark.eval
@requires_judge_llm
class TestDiarySuppliesMissingToolArg:
    """Eval for the diary-recall path.

    The user's city exists only in a seeded diary summary. The reply engine
    is expected to call ``getWeather`` with that city even though nothing
    in the current turn states it.
    """

    def test_diary_location_grounds_get_weather_call(
        self, mock_config, eval_db, eval_dialogue_memory,
    ):
        """Ask for the weather with no location and check Manchester is used.

        Asserts, in order: the reply is not a fallback, at least one
        ``getWeather`` call carried a location containing "manchester",
        the reply text mentions Manchester, and the reply text does not
        mention Hackney.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        # With geoip switched off, diary recall is the sole source of a
        # location for the model.
        mock_config.location_enabled = False
        mock_config.memory_enrichment_source = "diary"

        seed_diary_summaries(eval_db, _DIARY_MANCHESTER)

        capture = ToolCallCapture()
        fake_runner = _make_runner(capture)

        # Replace the real tool executor so calls are recorded, not run.
        with patch(
            "jarvis.reply.engine.run_tool_with_retries",
            side_effect=fake_runner,
        ):
            response = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="how's the weather, Jarvis?",
                dialogue_memory=eval_dialogue_memory,
            )

        # Diagnostic dump for whoever reads the eval log.
        print(f"\n  Diary Supplies Missing Tool Arg ({JUDGE_MODEL}):")
        print(f"  Tools called: {capture.tool_names()}")
        for c in capture.calls:
            print(f"    - {c['name']}({c['args']})")
        print(f"  Response: {(response or '')[:300]}")

        assert_not_fallback_reply(response, context="diary-recall")

        # The recalled city has to show up in the tool arguments first...
        weather_calls = [
            call for call in capture.calls if call["name"] == "getWeather"
        ]
        grounded_in_manchester = any(
            "manchester" in (call["args"].get("location") or "").lower()
            for call in weather_calls
        )
        assert grounded_in_manchester, (
            "getWeather was not invoked with location='Manchester' even "
            "though the diary contains the user's stated location. The "
            "memory enrichment → tool argument grounding path is broken. "
            f"All getWeather calls: {[c['args'] for c in weather_calls]}. "
            f"Tools observed: {capture.tool_names()}. "
            f"Response: {(response or '')[:400]}"
        )

        # ...and then in the text the user actually sees.
        reply_text = response or ""
        response_lower = reply_text.lower()
        assert "manchester" in response_lower, (
            "Reply does not mention Manchester despite the diary stating "
            f"the user lives there. Response: {(response or '')[:400]}"
        )

        # Hackney is the fixture's geoip default; with geoip disabled it
        # can only appear if a hardcoded default leaked into the reply.
        assert "hackney" not in response_lower, (
            "Reply mentions Hackney — the diary clearly states Manchester, "
            "and geoip is disabled in this test. The model leaked a "
            f"hardcoded default. Response: {(response or '')[:400]}"
        )