""" Evaluator-Driven Agentic Loop Evaluations Covers the evaluator's end-to-end behaviour against a real small model (gemma4:e2b by default): the per-turn terminal/continue decision, nudge injection, nudge cap enforcement, max-turn digest fallback, the toolSearchTool escape hatch, and multi-turn multi-tool complexity. These evals complement the mock-LLM unit tests in ``tests/test_evaluator.py`` and ``tests/test_engine_tool_search_loop.py`` by observing what a live small model actually does when looped through the evaluator. Tool *implementations* are mocked for determinism; the chat model and the evaluator model run for real. Run: ./scripts/run_evals.sh """ from __future__ import annotations import pytest from unittest.mock import patch from conftest import requires_judge_llm from helpers import ( JUDGE_MODEL, ToolCallCapture, assert_not_fallback_reply, assert_not_max_turns_digest, ) # ============================================================================= # Canned tool payloads — short, deterministic, keyword-rich so the chat model # has something concrete to talk about after the evaluator forces the call. # ============================================================================= MOCK_WEATHER_PARIS = ( "Current weather in Paris, France:\n" "Conditions: Partly cloudy\n" "Temperature: 14.2C\n" "Feels like: 12C\n" "Humidity: 68%\n" "Wind: 10 km/h from the south-west\n" ) MOCK_WEATHER_LONDON = ( "Current weather in London, United Kingdom:\n" "Conditions: Light rain\n" "Temperature: 9.1C\n" "Feels like: 7C\n" "Humidity: 82%\n" "Wind: 18 km/h from the west\n" ) MOCK_NAV_SUCCESS = '{"status": "ok", "url": "https://youtube.com"}' MOCK_TOOLSEARCH_NAV = ( "chrome-devtools__navigate_page: Navigate the active browser tab to a URL.\n" "stop: Explicit end-of-turn sentinel." ) MOCK_TOOLSEARCH_EMPTY = "No additional tools were found for this query." MOCK_POSSESSOR_SEARCH = ( "Web search results for 'Possessor film director':\n" "Possessor is a 2020 sci-fi horror film directed by Brandon Cronenberg, " "son of David Cronenberg. It stars Andrea Riseborough and Christopher " "Abbott.\n" ) MOCK_CRONENBERG_FILMOGRAPHY = ( "Web search results for 'Brandon Cronenberg filmography':\n" "Brandon Cronenberg's films include Antiviral (2012), Possessor (2020), " "and Infinity Pool (2023).\n" ) MOCK_HARRY_STYLES_BIO = ( "Web search results for 'Harry Styles':\n" "Harry Styles is an English singer-songwriter, born 1 February 1994. " "Former member of One Direction; solo albums include Fine Line (2019) " "and Harry's House (2022).\n" ) MOCK_HARRY_STYLES_SONGS = ( "Web search results for 'Harry Styles famous songs':\n" "Notable songs: 'Watermelon Sugar' (2019), 'As It Was' (2022), " "'Sign of the Times' (2017), 'Adore You' (2019).\n" ) MOCK_MADRID_STALE = ( "Web search results for 'Real Madrid':\n" "Real Madrid CF is a Spanish football club founded in 1902. " "The club plays at the Santiago Bernabeu stadium.\n" ) MOCK_MADRID_LIVE = ( "Web search results for 'Real Madrid match live score':\n" "Real Madrid 2 - 1 Getafe (78'). Goals by Vinicius Jr and Bellingham.\n" ) # ============================================================================= # Helpers # ============================================================================= def _configure(mock_config): """Pin the eval to the live small model with the evaluator enabled.""" mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL # Evaluator on (default None for SMALL already enables it, but be explicit # so failures are unambiguous if the model-size detection changes). mock_config.evaluator_enabled = True mock_config.evaluator_nudge_max = 2 mock_config.tool_search_max_calls = 3 return mock_config def _make_router_stub(tools): """Return a ``select_tools`` replacement that always returns the given list.""" def _stub(*_args, **_kwargs): return list(tools) return _stub def _make_tool_runner(capture: ToolCallCapture, responder): """Wrap a responder that maps (name, args) -> reply_text into a ``run_tool_with_retries`` replacement.""" from jarvis.tools.types import ToolExecutionResult def _runner(db, cfg, tool_name, tool_args, **kwargs): args = tool_args or {} capture.record(tool_name, args) reply = responder(tool_name, args) if reply is None: reply = "OK" return ToolExecutionResult(success=True, reply_text=reply) return _runner # ============================================================================= # 1. Premature-prose nudge: router says "just call the tool" but turn-1 is prose # ============================================================================= class TestPrematureProseNudge: """The evaluator must nudge the agent back into a tool call when the router's pre-seeded tool could directly perform the action but the model opened with prose.""" @pytest.mark.eval @requires_judge_llm @pytest.mark.xfail( reason=( "Plumbing verified in unit tests (tests/test_engine_tool_search_loop.py, " "tests/test_evaluator.py). Live behaviour on gemma4:e2b is flaky: " "the small model sometimes refuses in prose despite the nudge. " "Tracked for iterative prompt tuning; architecture ships as-is." ), strict=False, ) def test_navigate_prose_gets_nudged_into_tool_call( self, mock_config, eval_db, eval_dialogue_memory ): from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "chrome-devtools__navigate_page": return MOCK_NAV_SUCCESS if name == "toolSearchTool": return MOCK_TOOLSEARCH_NAV return "OK" router = _make_router_stub(["chrome-devtools__navigate_page", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: Kensington, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Open the YouTube homepage.", dialogue_memory=eval_dialogue_memory, ) names = capture.tool_names() print(f"\nšŸ“Š Premature-prose nudge:") print(f" tool calls: {names}") print(f" reply: {(reply or '')[:160]}...") assert "chrome-devtools__navigate_page" in names, ( "Evaluator should have nudged the model into calling " "chrome-devtools__navigate_page. " f"Tools actually called: {names}. Reply: {(reply or '')[:200]!r}" ) # ============================================================================= # 2. Terminal-on-success: one tool call, no thrashing # ============================================================================= class TestTerminalOnSuccessfulToolUse: """When the agent uses the correct tool and summarises the result, the evaluator must mark terminal; a single call should be enough.""" @pytest.mark.eval @requires_judge_llm def test_single_weather_call_terminates( self, mock_config, eval_db, eval_dialogue_memory ): from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "getWeather": return MOCK_WEATHER_PARIS return "OK" router = _make_router_stub(["getWeather", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: Paris, France", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="What's the weather in Paris?", dialogue_memory=eval_dialogue_memory, ) weather_calls = [c for c in capture.calls if c["name"] == "getWeather"] print(f"\nšŸ“Š Terminal-on-success — Paris weather:") print(f" getWeather calls: {len(weather_calls)}") print(f" all tool calls: {capture.tool_names()}") print(f" reply: {(reply or '')[:200]}...") # Guard against the two shields that used to mask evaluator failures # here: the malformed-output fallback and the max-turns digest # caveat. Either means the loop did not terminate cleanly on the # first grounded tool summary, even when the surrounding content # reads correctly. assert_not_fallback_reply(reply, context="single-weather-terminal") assert_not_max_turns_digest(reply, context="single-weather-terminal") assert len(weather_calls) == 1, ( f"Expected exactly one getWeather call (evaluator should terminate " f"after the first successful summary). Got {len(weather_calls)}: " f"{capture.tool_names()}" ) assert reply, "Reply should be non-empty" lower = reply.lower() assert "paris" in lower, f"Reply should mention Paris. Got: {reply[:200]!r}" weather_terms = ["weather", "cloud", "temperat", "14", "c ", "°c"] assert any(t in lower for t in weather_terms), ( f"Reply should reference weather facts from the tool payload. " f"Got: {reply[:200]!r}" ) # ============================================================================= # 3. Terminal on honest "can't do": no action tool available # ============================================================================= class TestTerminalOnHonestCantDo: """When no tool in the allow-list can perform the action and toolSearchTool turns up nothing, the agent should honestly decline and the evaluator must mark terminal — no infinite continuation, no confabulated success.""" @pytest.mark.eval @requires_judge_llm def test_no_email_tool_declines_honestly( self, mock_config, eval_db, eval_dialogue_memory ): from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "toolSearchTool": return MOCK_TOOLSEARCH_EMPTY if name == "getWeather": return MOCK_WEATHER_LONDON return "OK" # No email-capable tool in the allow-list. router = _make_router_stub(["getWeather", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Send an email to my mum saying I'll be late.", dialogue_memory=eval_dialogue_memory, ) print(f"\nšŸ“Š Honest can't-do:") print(f" tool calls: {capture.tool_names()}") print(f" reply: {(reply or '')[:240]}...") assert reply and reply.strip(), "Reply must not be empty" # The reply must NOT claim the email was sent. Keyword-based rather # than full NL check, so flakes are diagnosable. lower = reply.lower() forbidden = [ "email has been sent", "i have sent", "i've sent", "i sent the email", "email sent successfully", ] claimed_success = any(p in lower for p in forbidden) assert not claimed_success, ( f"āŒ Reply falsely claims to have sent the email (no email tool " f"was available). Reply: {reply[:300]!r}" ) # ============================================================================= # 4. Nudge-cap enforcement: pathological loop is capped cleanly # ============================================================================= class TestNudgeCapEnforcement: """When the evaluator keeps wanting to nudge but the model won't comply, the nudge cap must stop the loop before agentic_max_turns and the reply must still be non-empty.""" @pytest.mark.eval @requires_judge_llm def test_nudge_cap_stops_loop(self, mock_config, eval_db, eval_dialogue_memory): from jarvis.reply.engine import run_reply_engine _configure(mock_config) mock_config.evaluator_nudge_max = 1 # tight cap so the test is fast mock_config.agentic_max_turns = 4 capture = ToolCallCapture() def _respond(name, args): if name == "getWeather": return MOCK_WEATHER_LONDON if name == "toolSearchTool": return MOCK_TOOLSEARCH_EMPTY return "OK" # An action-inappropriate tool is pre-seeded; the evaluator may try to # nudge toward it, but the cap must stop the ping-pong. router = _make_router_stub(["getWeather", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Tell me a long poem about the sea.", dialogue_memory=eval_dialogue_memory, ) print(f"\nšŸ“Š Nudge-cap enforcement:") print(f" tool calls: {capture.tool_names()}") print(f" reply length: {len(reply or '')}") print(f" reply: {(reply or '')[:240]}...") assert reply and reply.strip(), ( "Reply must be non-empty even when the evaluator keeps wanting " "to nudge — the cap backstop must still deliver a reply." ) # ============================================================================= # 5. Max-turn digest caveat: the loop never terminates, digest fires # ============================================================================= class TestMaxTurnDigestCaveat: """Behaviour: when the agentic loop exhausts ``agentic_max_turns`` without ever emitting a natural-language reply (a pathological pure- tool-call loop), the engine must still deliver a non-empty reply by running the digest backstop. Evaluator-driven coverage was removed when the evaluator was retired in favour of the planner. The behaviour the user cares about — "you must never be left with an empty reply, even if the loop misbehaves" — is asserted here without coupling to deprecated internals.""" @pytest.mark.eval @requires_judge_llm def test_max_turn_triggers_digest( self, mock_config, eval_db, eval_dialogue_memory ): from jarvis.reply.engine import run_reply_engine _configure(mock_config) mock_config.agentic_max_turns = 3 capture = ToolCallCapture() def _respond(name, args): if name == "getWeather": return MOCK_WEATHER_LONDON return "OK" router = _make_router_stub(["getWeather", "stop"]) runner = _make_tool_runner(capture, _respond) digest_spy_calls: list[dict] = [] def _spy_digest(*, user_query, loop_messages, cfg, **_kwargs): digest_spy_calls.append( {"user_query": user_query, "loop_messages_len": len(loop_messages)} ) return ( "(Heads up, I couldn't finish this one) Based on what I " "gathered so far, I don't have a complete answer." ) # Force the chat model into an infinite tool-call loop: every turn # returns a structured tool_call instead of natural-language content, # so the loop never sees a terminal text reply and runs out of turns. def _always_tool_call(*_args, **_kwargs): return { "message": { "role": "assistant", "content": "", "tool_calls": [ { "function": { "name": "getWeather", "arguments": {"location": "London"}, } } ], } } with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ), \ patch("jarvis.reply.engine.chat_with_messages", side_effect=_always_tool_call), \ patch("jarvis.reply.engine.digest_loop_for_max_turns", side_effect=_spy_digest): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Write me a very long essay about abstract algebra.", dialogue_memory=eval_dialogue_memory, ) print(f"\nšŸ“Š Max-turn digest caveat:") print(f" digest invocations: {len(digest_spy_calls)}") print(f" tool calls: {capture.tool_names()}") print(f" reply: {(reply or '')[:240]}...") assert digest_spy_calls, ( "digest_loop_for_max_turns must fire when the loop exhausts " "agentic_max_turns without producing a text reply." ) assert digest_spy_calls[0]["loop_messages_len"] > 0, ( "Digest must receive the loop's accumulated messages, not an empty " "list. Got len=0." ) assert reply and reply.strip(), "Reply must be non-empty after digest" # ============================================================================= # 6. toolSearchTool escape hatch: widen allow-list mid-loop, then act # ============================================================================= class TestToolSearchToolEscapeHatch: """When the initial router pick is too narrow, the model should invoke ``toolSearchTool`` to widen the allow-list, then call the newly-surfaced tool. Order matters: navigate must come AFTER toolSearchTool.""" @pytest.mark.eval @requires_judge_llm @pytest.mark.xfail( reason=( "Plumbing verified in unit tests (tests/test_tool_search_tool.py, " "tests/test_engine_tool_search_loop.py). Live behaviour on " "gemma4:e2b is flaky: the small model often falls back to " "webSearch rather than invoking toolSearchTool. Tracked for " "iterative prompt tuning; architecture ships as-is." ), strict=False, ) def test_toolsearchtool_widens_then_navigate( self, mock_config, eval_db, eval_dialogue_memory ): from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "toolSearchTool": return MOCK_TOOLSEARCH_NAV if name == "chrome-devtools__navigate_page": return MOCK_NAV_SUCCESS if name == "webSearch": return "Web search results: YouTube is a video-sharing site.\n" return "OK" # Narrow router pick: only webSearch. Escape-hatch must surface the # navigation tool. router = _make_router_stub(["webSearch", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: Kensington, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text=( "Open YouTube and tell me the title of the first trending " "video." ), dialogue_memory=eval_dialogue_memory, ) names = capture.tool_names() print(f"\nšŸ“Š toolSearchTool escape hatch:") print(f" tool calls: {names}") print(f" reply: {(reply or '')[:240]}...") assert "toolSearchTool" in names, ( f"Model must invoke toolSearchTool when the pre-seeded allow-list " f"has no navigation tool. Tools called: {names}" ) assert "chrome-devtools__navigate_page" in names, ( f"Navigation tool should have been invoked after toolSearchTool " f"widened the allow-list. Tools called: {names}" ) ts_idx = names.index("toolSearchTool") nav_idx = names.index("chrome-devtools__navigate_page") assert nav_idx > ts_idx, ( f"chrome-devtools__navigate_page must be invoked AFTER " f"toolSearchTool. Sequence: {names}" ) # ============================================================================= # 7. Complex multi-turn / multi-tool scenarios # ============================================================================= class TestComplexMultiTurnMultiTool: """Flavours of end-to-end complexity that stress the evaluator loop: chained research, parallel comparisons, cross-turn pronoun resolution, nudge-driven query refinement, and an escape-hatch follow-up.""" # ---- 7a --------------------------------------------------------------- @pytest.mark.eval @requires_judge_llm def test_chained_research_possessor_director( self, mock_config, eval_db, eval_dialogue_memory ): """Two distinct webSearch calls: entity lookup then filmography.""" from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "webSearch": arg_str = " ".join( str(v) for v in (args or {}).values() if isinstance(v, str) ).lower() if "cronenberg" in arg_str or "filmograph" in arg_str or \ "directed" in arg_str or "brandon" in arg_str: return MOCK_CRONENBERG_FILMOGRAPHY return MOCK_POSSESSOR_SEARCH return "OK" router = _make_router_stub(["webSearch", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Who directed Possessor and what else have they directed?", dialogue_memory=eval_dialogue_memory, ) searches = [c for c in capture.calls if c["name"] == "webSearch"] print(f"\nšŸ“Š Chained research — Possessor + filmography:") print(f" webSearch count: {len(searches)}") for c in searches: print(f" args: {c['args']}") print(f" reply: {(reply or '')[:240]}...") assert len(searches) >= 2, ( f"Expected at least two webSearch calls (entity, then " f"filmography). Got {len(searches)}: " f"{[c['args'] for c in searches]}" ) # The two calls should have distinct argument strings. arg_fingerprints = { " ".join( str(v) for v in (c["args"] or {}).values() if isinstance(v, str) ).lower() for c in searches } assert len(arg_fingerprints) >= 2, ( f"Both webSearch calls had identical args — chain was not " f"progressed. Args: {arg_fingerprints}" ) # ---- 7b --------------------------------------------------------------- @pytest.mark.eval @requires_judge_llm def test_parallel_comparison_paris_vs_london( self, mock_config, eval_db, eval_dialogue_memory ): """Two getWeather calls, different locations, reply mentions both.""" from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "getWeather": loc = " ".join( str(v) for v in (args or {}).values() if isinstance(v, str) ).lower() if "london" in loc: return MOCK_WEATHER_LONDON return MOCK_WEATHER_PARIS return "OK" router = _make_router_stub(["getWeather", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Compare the weather in Paris and London right now.", dialogue_memory=eval_dialogue_memory, ) weather_calls = [c for c in capture.calls if c["name"] == "getWeather"] locs = { " ".join( str(v) for v in (c["args"] or {}).values() if isinstance(v, str) ).lower() for c in weather_calls } print(f"\nšŸ“Š Parallel comparison — Paris vs London:") print(f" getWeather calls: {len(weather_calls)}") print(f" distinct location args: {locs}") print(f" reply: {(reply or '')[:240]}...") assert len(weather_calls) >= 2, ( f"Expected at least two getWeather calls (one per city). Got " f"{len(weather_calls)}: {[c['args'] for c in weather_calls]}" ) has_paris = any("paris" in loc for loc in locs) has_london = any("london" in loc for loc in locs) assert has_paris and has_london, ( f"getWeather must have been called for BOTH Paris and London. " f"Got location args: {locs}" ) if reply: lower = reply.lower() assert "paris" in lower and "london" in lower, ( f"Reply should mention both Paris and London. Got: " f"{reply[:300]!r}" ) # ---- 7c --------------------------------------------------------------- @pytest.mark.eval @requires_judge_llm def test_cross_turn_pronoun_resolution( self, mock_config, eval_db, eval_dialogue_memory ): """Turn 2 resolves 'his' to the entity established in turn 1.""" from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "webSearch": arg_str = " ".join( str(v) for v in (args or {}).values() if isinstance(v, str) ).lower() if "song" in arg_str or "music" in arg_str or "album" in arg_str: return MOCK_HARRY_STYLES_SONGS return MOCK_HARRY_STYLES_BIO return "OK" router = _make_router_stub(["webSearch", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): # Turn 1: establish entity capture.clear() run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Who is Harry Styles?", dialogue_memory=eval_dialogue_memory, ) turn1 = list(capture.calls) # Turn 2: pronoun capture.clear() reply2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="What are his most famous songs?", dialogue_memory=eval_dialogue_memory, ) turn2 = list(capture.calls) print(f"\nšŸ“Š Cross-turn pronoun resolution:") print(f" Turn 1 calls: {[c['name'] for c in turn1]}") print(f" Turn 2 calls: {turn2}") print(f" Turn 2 reply: {(reply2 or '')[:200]}...") turn2_searches = [c for c in turn2 if c["name"] == "webSearch"] assert turn2_searches, ( f"Turn 2 must trigger a webSearch to answer the follow-up. " f"Got: {[c['name'] for c in turn2]}" ) # At least one search arg must name the entity. resolved = False for c in turn2_searches: arg_str = " ".join( str(v) for v in (c["args"] or {}).values() if isinstance(v, str) ).lower() if "harry" in arg_str or "styles" in arg_str: resolved = True break assert resolved, ( f"Turn 2 webSearch arg did not resolve 'his' to the entity " f"established in turn 1. Args: {[c['args'] for c in turn2_searches]}" ) if reply2: lower = reply2.lower() mentions_song = any( k in lower for k in ("song", "watermelon", "as it was", "sign", "adore") ) assert mentions_song, ( f"Turn 2 reply should address the songs question. " f"Got: {reply2[:300]!r}" ) # ---- 7d --------------------------------------------------------------- @pytest.mark.eval @requires_judge_llm def test_correction_loop_accepts_single_or_retry( self, mock_config, eval_db, eval_dialogue_memory ): """At least one webSearch must happen; a nudge-driven retry is acceptable, zero searches is not.""" from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "webSearch": # First call returns stale; subsequent calls return live. n = sum(1 for c in capture.calls if c["name"] == "webSearch") # n is already incremented by this point (capture.record ran first) return MOCK_MADRID_LIVE if n > 1 else MOCK_MADRID_STALE return "OK" router = _make_router_stub(["webSearch", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): reply = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="What's the score in the Real Madrid game?", dialogue_memory=eval_dialogue_memory, ) searches = [c for c in capture.calls if c["name"] == "webSearch"] print(f"\nšŸ“Š Correction loop — Real Madrid score:") print(f" webSearch count: {len(searches)}") print(f" reply: {(reply or '')[:240]}...") assert len(searches) >= 1, ( f"At least one webSearch must fire for a live-score query. " f"Tools called: {capture.tool_names()}" ) # ---- 7e --------------------------------------------------------------- @pytest.mark.eval @requires_judge_llm @pytest.mark.xfail( reason=( "Plumbing verified in unit tests. Live behaviour on gemma4:e2b " "is flaky on multi-turn escape-hatch flows: the small model " "sometimes refuses turn 1 in prose despite the nudge. Tracked " "for iterative prompt tuning; architecture ships as-is." ), strict=False, ) def test_escape_hatch_then_follow_up_action( self, mock_config, eval_db, eval_dialogue_memory ): """Turn 1: narrow router → toolSearchTool → navigate. Turn 2: a new action whose argument must be self-contained ('lo-fi').""" from jarvis.reply.engine import run_reply_engine _configure(mock_config) capture = ToolCallCapture() def _respond(name, args): if name == "toolSearchTool": return MOCK_TOOLSEARCH_NAV if name == "chrome-devtools__navigate_page": return MOCK_NAV_SUCCESS if name == "webSearch": return ( "Web search results for 'lo-fi beats':\n" "Top results: Lofi Girl's YouTube radio, Chillhop Music, " "and Nujabes playlists.\n" ) return "OK" # Narrow initial pick so the escape hatch is needed. router = _make_router_stub(["webSearch", "stop"]) runner = _make_tool_runner(capture, _respond) with patch("jarvis.reply.engine.select_tools", side_effect=router), \ patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \ patch( "jarvis.reply.engine.get_location_context_with_timezone", return_value=("Location: London, UK", None), ): capture.clear() run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Open YouTube.", dialogue_memory=eval_dialogue_memory, ) turn1 = list(capture.calls) capture.clear() reply2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Now search for lo-fi beats.", dialogue_memory=eval_dialogue_memory, ) turn2 = list(capture.calls) print(f"\nšŸ“Š Escape hatch + follow-up:") print(f" Turn 1 calls: {[c['name'] for c in turn1]}") print(f" Turn 2 calls: {turn2}") print(f" Turn 2 reply: {(reply2 or '')[:200]}...") assert turn1, "Turn 1 should have at least one tool call" assert turn2, "Turn 2 should have at least one tool call" # Turn 2's tool call arg must contain the self-contained keyword. found_lofi = False for c in turn2: arg_str = " ".join( str(v) for v in (c["args"] or {}).values() if isinstance(v, str) ).lower() if "lo-fi" in arg_str or "lofi" in arg_str or "lo fi" in arg_str or "beats" in arg_str: found_lofi = True break assert found_lofi, ( f"Turn 2 tool arg must contain the self-contained keyword " f"'lo-fi' (or a reasonable paraphrase). Calls: {turn2}" ) # ============================================================================= # 8. Structured tool_call emission — the evaluator must not only nudge # textually, it must emit a structured {name, arguments} that the engine can # execute directly. This is the recovery path for small chat models that # routinely ignore textual nudges. # ============================================================================= class TestStructuredToolCallEmission: """The evaluator prompt now asks for a structured ``tool_call`` field alongside the textual nudge. Verify that a live small-model evaluator actually populates it when the intent is unambiguous.""" @pytest.mark.eval @requires_judge_llm @pytest.mark.xfail( reason=( "Prompt compliance depends on the live small evaluator model. " "Deterministic coverage lives in tests/test_evaluator.py " "(parse) and tests/test_engine_tool_search_loop.py (direct-exec). " "Tracked for iterative prompt tuning; architecture ships as-is." ), strict=False, ) def test_evaluator_emits_structured_tool_call_for_obvious_search( self, mock_config ): from jarvis.reply.evaluator import evaluate_turn _configure(mock_config) result = evaluate_turn( user_query="Give me an overview of China.", assistant_response_summary=( "I can look that up for you. Would you like me to search the " "web for an overview of China?" ), available_tools=[ ("webSearch", "Search the web and return ranked results."), ("stop", "Explicit end-of-turn sentinel."), ], turns_used=1, cfg=mock_config, ) print(f"\nšŸ“Š Structured tool_call emission:") print(f" terminal: {result.terminal}") print(f" nudge: {result.nudge!r}") print(f" tool_call: {result.tool_call!r}") assert result.terminal is False, ( "Evaluator should continue: the agent offered prose instead of " "calling webSearch. " f"Got terminal={result.terminal}, reason={result.reason!r}." ) assert isinstance(result.tool_call, dict), ( "Evaluator should emit a structured tool_call so the engine can " "run the search directly without relying on the chat model to " f"parse the textual nudge. Got tool_call={result.tool_call!r}." ) assert result.tool_call.get("name") == "webSearch", ( f"Structured tool_call.name should be 'webSearch'. " f"Got {result.tool_call!r}." ) args = result.tool_call.get("arguments") or {} assert isinstance(args, dict) and args, ( "Structured tool_call.arguments should be a non-empty dict with " f"the intended query. Got {result.tool_call!r}." ) arg_blob = " ".join( str(v).lower() for v in args.values() if isinstance(v, str) ) assert "china" in arg_blob, ( f"Structured tool_call.arguments should mention 'china'. " f"Got {result.tool_call!r}." )