"""Engine-level tool carry-over guard. Field trace (2026-05-03, gemma4:e2b): Turn 1 user: "how's the weather tomorrow Jarvis?" → no location set → assistant invokes ``getWeather``, tool returns ``success=False`` ("I couldn't auto-detect your location, please tell me a city"), assistant relays the request. Turn 2 user: "I'm in London" → small-model router picks ``webSearch`` instead of ``getWeather``, planner falls back to a web search for "weather in london tomorrow", DDG fails, Wikipedia matches the 2014 film "Edge of Tomorrow", and the assistant parrots the film summary as the weather answer. Fix: when the previous assistant turn invoked a tool that reported ``success=False`` on its ``ToolExecutionResult``, union the previous turn's tool name into the allow-list. The ``tool_failed`` flag stamped onto each recorded tool result is the truth source. Gating on failure (rather than recency or query length) means a successful chain followed by a genuine new short ask ("play some music") correctly does NOT carry over the prior tool. The carry-over is an engine-side per-turn overlay: the router cache stores only the raw router output, so future identical queries are unaffected. """ from unittest.mock import Mock, patch import pytest from src.jarvis.memory.conversation import DialogueMemory from src.jarvis.reply.engine import run_reply_engine def _mock_cfg(): cfg = Mock() cfg.ollama_base_url = "http://localhost:11434" cfg.ollama_chat_model = "test-large" cfg.voice_debug = False cfg.llm_tools_timeout_sec = 8.0 cfg.llm_embed_timeout_sec = 10.0 cfg.llm_chat_timeout_sec = 45.0 cfg.llm_digest_timeout_sec = 8.0 cfg.memory_enrichment_max_results = 5 cfg.memory_enrichment_source = "diary" cfg.memory_digest_enabled = False cfg.tool_result_digest_enabled = False cfg.location_ip_address = None cfg.location_auto_detect = False cfg.location_enabled = False cfg.agentic_max_turns = 8 cfg.tool_search_max_calls = 3 cfg.tool_selection_strategy = "all" cfg.tool_carryover_max_turns = 2 cfg.tool_carryover_per_entry_chars = 1200 cfg.mcps = {} cfg.llm_thinking_enabled = False cfg.tts_engine = "none" cfg.ollama_embed_model = "test-embed" cfg.db_path = ":memory:" return cfg def _tool_names_from_chat_call(call) -> set[str]: """Pull function names out of the OpenAI-style tools schema passed to chat_with_messages. """ schema = call.kwargs.get("tools") or [] names: set[str] = set() for entry in schema: if not isinstance(entry, dict): continue fn = entry.get("function") or {} nm = fn.get("name") if isinstance(nm, str): names.add(nm) return names def _failed_tool_turn(tool_name: str, tool_call_id: str = "c1") -> list[dict]: """Plant a previous-turn tool turn where the tool was invoked and reported failure. Mirrors the message shape the engine records for a native tool call whose ``ToolExecutionResult.success`` was False. """ return [ {"role": "assistant", "content": "", "tool_calls": [{ "id": tool_call_id, "type": "function", "function": {"name": tool_name, "arguments": {}}, }]}, {"role": "tool", "tool_call_id": tool_call_id, "tool_name": tool_name, "content": "I couldn't auto-detect your location.", "tool_failed": True}, ] def _succeeded_tool_turn(tool_name: str, tool_call_id: str = "c1") -> list[dict]: """Plant a previous-turn tool turn where the tool succeeded.""" return [ {"role": "assistant", "content": "", "tool_calls": [{ "id": tool_call_id, "type": "function", "function": {"name": tool_name, "arguments": {"location": "London"}}, }]}, {"role": "tool", "tool_call_id": tool_call_id, "tool_name": tool_name, "content": "London: 15°C and partly cloudy.", "tool_failed": False}, ] def _failed_text_tool_turn(tool_name: str) -> list[dict]: """Plant a previous-turn tool turn in the text-tool fallback shape (small models). Tool error is appended as a ``role=user`` message tagged with both ``tool_name`` and ``tool_failed=True``. """ return [ {"role": "assistant", "content": ( "```tool_call\n" '{"name": "' + tool_name + '", "arguments": {}}\n' "```" )}, {"role": "user", "content": ( "[Tool error: " + tool_name + "] I couldn't auto-detect " "your location." ), "tool_name": tool_name, "tool_failed": True}, ] @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_followup_carries_over_failed_previous_tool( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """Previous turn invoked ``getWeather`` and the tool reported failure; this turn's router only picked ``webSearch``. The engine must union ``getWeather`` back in so the chat model can re-call it with the location the user just supplied. """ mock_chat.side_effect = [ {"message": {"content": "Weather in London is 15°C."}}, ] mock_extract.side_effect = ["Weather in London is 15°C."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "how's the weather tomorrow Jarvis") dm.record_tool_turn(_failed_tool_turn("getWeather")) dm.add_message("assistant", "I do not have a location set.") with patch( "src.jarvis.reply.engine.select_tools", return_value=["webSearch"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="I'm in London", dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "webSearch" in tool_names, ( f"router pick must remain visible; saw {sorted(tool_names)}" ) assert "getWeather" in tool_names, ( "previous-turn failed tool must be carried over; " f"saw {sorted(tool_names)}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_successful_previous_tool_does_not_trigger_carryover( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """A successful prior tool call means the chain completed. A genuine new short ask ("log my breakfast") must NOT inherit the prior tool — that would noisily widen the allow-list for unrelated turns and risks small models replaying the previous tool. The router pick stands on its own. """ mock_chat.side_effect = [ {"message": {"content": "Logged."}}, ] mock_extract.side_effect = ["Logged."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "how's the weather in London") dm.record_tool_turn(_succeeded_tool_turn("getWeather")) dm.add_message("assistant", "It's 15°C and partly cloudy in London.") with patch( "src.jarvis.reply.engine.select_tools", return_value=["logMeal"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="log my breakfast", dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "logMeal" in tool_names assert "getWeather" not in tool_names, ( "successful prior tool must not be carried over; " f"saw {sorted(tool_names)}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_cold_start_does_not_trigger_carryover( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """Empty dialogue memory — the carry-over path must be a no-op.""" mock_chat.side_effect = [ {"message": {"content": "Hello."}}, ] mock_extract.side_effect = ["Hello."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() # cold start — no prior turns with patch( "src.jarvis.reply.engine.select_tools", return_value=["webSearch"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="hi", dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "webSearch" in tool_names assert "getWeather" not in tool_names @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_carryover_does_not_pollute_router_cache( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """The router cache stores the raw router output. Carry-over is a per-turn overlay layered on top — it must NOT be written back to the cache, otherwise every replay of the same query inherits a contaminated tool list. """ mock_chat.side_effect = [ {"message": {"content": "Weather in London is 15°C."}}, ] mock_extract.side_effect = ["Weather in London is 15°C."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "how's the weather tomorrow Jarvis") dm.record_tool_turn(_failed_tool_turn("getWeather")) dm.add_message("assistant", "I do not have a location set.") with patch( "src.jarvis.reply.engine.select_tools", return_value=["webSearch"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="I'm in London", dialogue_memory=dm) cached_router_entries = [ (k, v) for k, v in dm._hot_cache.items() if k.startswith("router:") ] assert cached_router_entries, "router output should have been cached" for key, (_ts, value) in cached_router_entries: assert value == ["webSearch"], ( f"router cache for {key!r} should hold raw router output " f"['webSearch']; got {value!r}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_long_followup_still_carries_over_when_previous_failed( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """Failure-gated carry-over does NOT depend on query length. A long follow-up that supplies the missing arg ("Right, sorry — I'm in Edinburgh, please try the lookup again for tomorrow") must still carry over the failed tool. The earlier char-length heuristic was dropped because it false-negatived this shape; the failure flag is the right signal. """ mock_chat.side_effect = [ {"message": {"content": "Edinburgh weather: 12°C."}}, ] mock_extract.side_effect = ["Edinburgh weather: 12°C."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "how's the weather tomorrow Jarvis") dm.record_tool_turn(_failed_tool_turn("getWeather")) dm.add_message("assistant", "I do not have a location set.") long_followup = ( "Right, sorry — I'm in Edinburgh, please try the lookup again for " "tomorrow morning if you would." ) assert len(long_followup) > 80 with patch( "src.jarvis.reply.engine.select_tools", return_value=["webSearch"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text=long_followup, dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "getWeather" in tool_names, ( f"long follow-up to a failed tool must still carry over; " f"saw {sorted(tool_names)}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_text_tool_fallback_failure_carries_over( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """Small-model path: the previous turn's tool error was stored as a ``role=user`` message tagged with ``tool_name`` and ``tool_failed=True``. The walker must collect the name from this shape too, not only from native ``assistant.tool_calls`` + ``role=tool`` pairs. """ mock_chat.side_effect = [ {"message": {"content": "Weather in Berlin is 9°C."}}, ] mock_extract.side_effect = ["Weather in Berlin is 9°C."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "how's the weather") dm.record_tool_turn(_failed_text_tool_turn("getWeather")) dm.add_message("assistant", "I couldn't auto-detect your location.") with patch( "src.jarvis.reply.engine.select_tools", return_value=["webSearch"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="I'm in Berlin", dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "getWeather" in tool_names, ( "text-tool fallback failure shape must be carried over; " f"saw {sorted(tool_names)}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.plan_query", return_value=[]) @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") def test_multi_tool_call_only_failed_sibling_carries_over( mock_chat, mock_extract, _mock_extract_mem, _mock_plan, _mock_graph, _mock_warm, _mock_fmt, ): """When an assistant message carries multiple tool_calls but only one of them failed, only the failed name must be carried over. The successful sibling stays the chat model's responsibility through its own routing. """ mock_chat.side_effect = [ {"message": {"content": "Sure."}}, ] mock_extract.side_effect = ["Sure."] db = Mock() cfg = _mock_cfg() dm = DialogueMemory() dm.add_message("user", "weather and search Pushkin") dm.record_tool_turn([ {"role": "assistant", "content": "", "tool_calls": [ {"id": "c-w", "type": "function", "function": {"name": "getWeather", "arguments": {}}}, {"id": "c-s", "type": "function", "function": {"name": "webSearch", "arguments": {"query": "Pushkin"}}}, ]}, # getWeather failed (no location), webSearch succeeded. {"role": "tool", "tool_call_id": "c-w", "tool_name": "getWeather", "content": "I couldn't auto-detect your location.", "tool_failed": True}, {"role": "tool", "tool_call_id": "c-s", "tool_name": "webSearch", "content": "Pushkin was a Russian poet (1799-1837).", "tool_failed": False}, ]) dm.add_message("assistant", "Pushkin was a Russian poet. I couldn't auto-detect " "your location for the weather lookup.") with patch( "src.jarvis.reply.engine.select_tools", return_value=["fetchWebPage"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="I'm in Paris", dialogue_memory=dm) tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1]) assert "getWeather" in tool_names, ( "failed sibling tool_call must be carried over; " f"saw {sorted(tool_names)}" ) assert "webSearch" not in tool_names, ( "successful sibling tool_call must NOT be carried over; " f"saw {sorted(tool_names)}" ) @pytest.mark.unit @patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="") @patch("src.jarvis.memory.graph_ops.build_warm_profile", return_value={"user": "", "directives": ""}) @patch("src.jarvis.memory.graph.GraphMemoryStore") @patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={}) @patch("src.jarvis.reply.engine.extract_text_from_response") @patch("src.jarvis.reply.engine.chat_with_messages") @patch("src.jarvis.reply.engine.run_tool_with_retries") def test_planner_direct_exec_stamps_tool_failed( mock_tool, mock_chat, mock_extract, _mock_extract_mem, _mock_graph, _mock_warm, _mock_fmt, ): """The planner's direct-exec path (text-tool mode + concrete plan step) appends tool results without going through the chat-model loop. Verify that path stamps ``tool_failed`` so the next turn's walker can see prior failures planted by direct-exec. """ from src.jarvis.tools.types import ToolExecutionResult cfg = _mock_cfg() cfg.ollama_chat_model = "gemma4:e2b" # triggers SMALL/text-tool path # First reply: planner emits a getWeather step, direct-exec runs the # tool which returns success=False (no location), then the chat # model produces a final text reply. mock_tool.return_value = ToolExecutionResult( success=False, reply_text="I couldn't auto-detect your location.", ) mock_chat.side_effect = [ {"message": {"content": "Tell me which city."}}, ] mock_extract.side_effect = ["Tell me which city."] db = Mock() dm = DialogueMemory() # Concrete plan step the resolver fast-path can parse without an LLM. with patch( "src.jarvis.reply.engine.plan_query", return_value=["getWeather", "Reply to the user."], ), patch( "src.jarvis.reply.engine.select_tools", return_value=["getWeather"], ): run_reply_engine(db=db, cfg=cfg, tts=None, text="how's the weather", dialogue_memory=dm) # The direct-exec path should have recorded a tool turn with the # failure flag set so a follow-up turn can carry over getWeather. assert dm._tool_turns, ( "planner direct-exec path must record a tool turn into " "dialogue memory carryover" ) stored_msgs = [m for _ts, msgs in dm._tool_turns for m in msgs] failed_entries = [ m for m in stored_msgs if m.get("tool_failed") and m.get("tool_name") == "getWeather" ] assert failed_entries, ( "direct-exec failure must stamp tool_failed=True; " f"stored messages: {stored_msgs}" ) @pytest.mark.unit def test_walker_logs_orphan_assistant_tool_call(caplog): """When an assistant tool_call has no matching role=tool result in the recent window (e.g. truncation, scrub, eviction), the walker should fail-open and log a diagnostic — never crash, never silently widen the allow-list with the orphan name. """ from src.jarvis.reply.engine import _previous_turn_failed_tool_names recent = [ {"role": "user", "content": "weather please"}, {"role": "assistant", "content": "", "tool_calls": [ {"id": "c-orphan", "type": "function", "function": {"name": "getWeather", "arguments": {}}}, ]}, # No matching role=tool result for c-orphan. {"role": "assistant", "content": "I couldn't auto-detect."}, ] names = _previous_turn_failed_tool_names(recent) # No failed tool result was seen, so nothing carries over even # though an assistant tool_call exists. assert names == [], ( f"orphan tool_call must not be carried over; got {names}" )