""" Multi-Turn Context Evaluations Tests the agent's ability to handle multi-turn conversations correctly: 1. Topic Switching - Selecting correct tool when conversation topic changes 2. Context Anchoring - Not getting "stuck" on previous turn's tool 3. Follow-up Handling - Using context from previous turns when relevant These evals are critical for catching regressions where the model might: - Call the wrong tool after a topic change (e.g., getWeather for store hours) - Ignore context from previous turns - Fail to follow up on established conversation context Run: ./scripts/run_evals.sh """ import pytest from unittest.mock import patch from conftest import requires_judge_llm from helpers import ( MockConfig, ToolCallCapture, create_mock_tool_run, JUDGE_MODEL, ) # ============================================================================= # Test Data - Consistent tool responses for reproducibility # ============================================================================= MOCK_WEATHER_RESPONSE = """Current weather in Kensington, Royal Kensington and Chelsea, United Kingdom: Conditions: Overcast Temperature: 7.8°C Feels like: 5°C Humidity: 75% Wind: 12 km/h from the west """ MOCK_STORE_HOURS_SEARCH = """Web search results for 'CEX store hours Kensington': **Content from top result:** CEX Kensington High Street Opening Hours: Monday - Saturday: 10:00 AM - 6:00 PM Sunday: 11:00 AM - 5:00 PM **Other search results:** 1. **CEX Kensington - Store Info** - https://uk.webuy.com/store/kensington 2. **CEX Store Locator** - https://uk.webuy.com/stores """ MOCK_NEWS_SEARCH = """Web search results for 'tech news today': **Content from top result:** Today's Tech Headlines: - Apple announces new M4 chip - OpenAI releases GPT-5 - SpaceX Starship completes orbital test **Other search results:** 1. **TechCrunch** - https://techcrunch.com 2. **The Verge** - https://theverge.com """ # ============================================================================= # Topic Switching Evaluations (Live LLM) # ============================================================================= class TestTopicSwitching: """ Tests that the agent selects the correct tool when the conversation topic changes between turns. Uses real LLM inference to test actual model behavior. Tool execution is mocked for consistent responses. """ @pytest.mark.eval @requires_judge_llm def test_weather_then_store_hours(self, mock_config, eval_db, eval_dialogue_memory): """ After weather query, asking about store hours should use webSearch. Scenario: - Turn 1: "How's the weather?" -> getWeather (correct) - Turn 2: "Can you check when CEX closes?" -> webSearch (NOT getWeather!) This tests the exact bug scenario where llama3.2:3b called getWeather for a store hours query because it got anchored on the previous tool. """ from jarvis.reply.engine import run_reply_engine mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL capture = ToolCallCapture() mock_tool_run = create_mock_tool_run(capture, { "getWeather": MOCK_WEATHER_RESPONSE, "webSearch": MOCK_STORE_HOURS_SEARCH, }) with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \ patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, Royal Kensington and Chelsea, United Kingdom", None)): # Turn 1: Weather query capture.clear() response1 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="How's the weather today?", dialogue_memory=eval_dialogue_memory ) turn1_tools = capture.tool_sequence() # Turn 2: Store hours query (topic change) capture.clear() response2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Yeah, I could do but can you check how long CEX is open for?", dialogue_memory=eval_dialogue_memory ) turn2_tools = capture.tool_sequence() print(f"\nšŸ“Š Topic Switching - Weather → Store Hours:") print(f" Turn 1 query: 'How's the weather today?'") print(f" Turn 1 tools: {turn1_tools}") print(f" Turn 1 response: {response1[:100] if response1 else 'None'}...") print(f" Turn 2 query: 'can you check how long CEX is open for?'") print(f" Turn 2 tools: {turn2_tools}") print(f" Turn 2 response: {response2[:100] if response2 else 'None'}...") # Turn 1 should use getWeather assert "getWeather" in turn1_tools, \ f"Turn 1 should use getWeather for weather query. Used: {turn1_tools}" # Turn 2 MUST use webSearch, NOT getWeather # This is the critical assertion - the model should recognize topic change used_wrong_tool = "getWeather" in turn2_tools and "webSearch" not in turn2_tools if used_wrong_tool: pytest.fail( f"āŒ CONTEXT ANCHORING BUG: Model used getWeather for store hours!\n" f" Turn 2 tools: {turn2_tools}\n" f" Expected: webSearch\n" f" The model got 'stuck' on the previous turn's tool.\n" f" Response: {response2[:200] if response2 else 'None'}" ) assert "webSearch" in turn2_tools, \ f"Turn 2 should use webSearch for store hours. Used: {turn2_tools}" print(f" āœ… Correctly switched from getWeather to webSearch") @pytest.mark.eval @requires_judge_llm def test_search_then_weather(self, mock_config, eval_db, eval_dialogue_memory): """ After a web search, asking about weather should use getWeather. Tests the reverse direction - ensuring the model doesn't stay stuck on webSearch when weather is asked. """ from jarvis.reply.engine import run_reply_engine mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL capture = ToolCallCapture() mock_tool_run = create_mock_tool_run(capture, { "getWeather": MOCK_WEATHER_RESPONSE, "webSearch": MOCK_NEWS_SEARCH, }) with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \ patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)): # Turn 1: News search capture.clear() run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="What's the latest tech news?", dialogue_memory=eval_dialogue_memory ) turn1_tools = capture.tool_sequence() # Turn 2: Weather capture.clear() response2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="How's the weather outside?", dialogue_memory=eval_dialogue_memory ) turn2_tools = capture.tool_sequence() print(f"\nšŸ“Š Topic Switching - News → Weather:") print(f" Turn 1 tools: {turn1_tools}") print(f" Turn 2 tools: {turn2_tools}") assert "webSearch" in turn1_tools, \ f"Turn 1 should use webSearch for news. Used: {turn1_tools}" # Check for reverse anchoring if "webSearch" in turn2_tools and "getWeather" not in turn2_tools: pytest.fail( f"āŒ CONTEXT ANCHORING BUG: Model used webSearch for weather query!\n" f" Turn 2 tools: {turn2_tools}\n" f" Response: {response2[:200] if response2 else 'None'}" ) assert "getWeather" in turn2_tools, \ f"Turn 2 should use getWeather for weather query. Used: {turn2_tools}" print(f" āœ… Correctly switched from webSearch to getWeather") # ============================================================================= # Follow-Up Context Evaluations (Live LLM) # ============================================================================= class TestFollowUpContext: """ Tests that the agent maintains context from previous turns when handling follow-up questions. """ @pytest.mark.eval @requires_judge_llm def test_follow_up_references_previous_context(self, mock_config, eval_db, eval_dialogue_memory): """ Follow-up questions should reference information from previous turns. Scenario: - Turn 1: "How's the weather?" -> (gets weather data showing overcast, 7.8°C) - Turn 2: "Should I bring an umbrella?" -> Response should reference weather The model should use the weather context to inform the umbrella advice. """ from jarvis.reply.engine import run_reply_engine mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL capture = ToolCallCapture() mock_tool_run = create_mock_tool_run(capture, {"getWeather": MOCK_WEATHER_RESPONSE}) with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \ patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)): # Turn 1: Weather query capture.clear() response1 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="How's the weather today?", dialogue_memory=eval_dialogue_memory ) turn1_tools = capture.tool_sequence() # Turn 2: Follow-up about umbrella capture.clear() response2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Should I bring an umbrella?", dialogue_memory=eval_dialogue_memory ) turn2_tools = capture.tool_sequence() print(f"\nšŸ“Š Follow-Up Context - Weather → Umbrella:") print(f" Turn 1 tools: {turn1_tools}") print(f" Turn 1 response: {response1[:80] if response1 else 'None'}...") print(f" Turn 2 tools: {turn2_tools}") print(f" Turn 2 response: {response2[:120] if response2 else 'None'}...") # Turn 1 should fetch weather assert "getWeather" in turn1_tools, "Turn 1 should fetch weather" # Turn 2: Check if response references weather context # (It may or may not call getWeather again - both are acceptable) if response2: weather_terms = ["overcast", "cloud", "rain", "weather", "chilly", "cold", "7", "8"] references_weather = any(term in response2.lower() for term in weather_terms) print(f" References weather context: {references_weather}") # The response should acknowledge or use the weather context # Not a hard fail if it doesn't, but we log it if not references_weather: print(f" āš ļø Response doesn't seem to reference weather context") # ============================================================================= # Self-Contained Tool Argument Evaluations (Live LLM) # ============================================================================= MOCK_HARRY_STYLES_SEARCH = """Web search results for 'Harry Styles': **Content from top result:** Harry Styles is an English singer and songwriter, born 1 February 1994. He rose to fame as a member of the boy band One Direction and has since released several solo albums including Fine Line (2019) and Harry's House (2022). **Other search results:** 1. **Harry Styles - Wikipedia** - https://en.wikipedia.org/wiki/Harry_Styles """ MOCK_HARRY_STYLES_SONGS_SEARCH = """Web search results for 'Harry Styles most famous songs': **Content from top result:** Harry Styles' most famous songs include: - "Watermelon Sugar" (2019) - "As It Was" (2022) - "Sign of the Times" (2017) - "Adore You" (2019) **Other search results:** 1. **Harry Styles Discography** - https://en.wikipedia.org/wiki/Harry_Styles_discography """ class TestSelfContainedToolArguments: """ Tests that follow-up queries with unresolved pronouns produce tool calls whose arguments resolve the referent from conversation history. A tool does not see prior turns — if the model passes "what are his most famous songs?" to webSearch, the search will miss the entity and return irrelevant results. The model must rewrite the argument to something like "Harry Styles most famous songs". """ @pytest.mark.eval @requires_judge_llm def test_follow_up_resolves_pronoun_in_search_query( self, mock_config, eval_db, eval_dialogue_memory ): """ Scenario: - Turn 1: "Who is Harry Styles?" -> webSearch("Harry Styles ...") - Turn 2: "What are his most famous songs?" -> webSearch argument MUST contain "Harry Styles" (pronoun resolved from context). """ from jarvis.reply.engine import run_reply_engine mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL capture = ToolCallCapture() def mock_tool_run(db, cfg, tool_name, tool_args, **kwargs): from jarvis.tools.types import ToolExecutionResult capture.record(tool_name, tool_args or {}) if tool_name == "webSearch": args_str = str(tool_args).lower() if tool_args else "" if "song" in args_str or "music" in args_str or "album" in args_str: return ToolExecutionResult(success=True, reply_text=MOCK_HARRY_STYLES_SONGS_SEARCH) return ToolExecutionResult(success=True, reply_text=MOCK_HARRY_STYLES_SEARCH) return ToolExecutionResult(success=True, reply_text="OK") with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \ patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)): # Turn 1: establish entity capture.clear() run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="Who is Harry Styles?", dialogue_memory=eval_dialogue_memory ) turn1_calls = list(capture.calls) # Turn 2: follow-up with pronoun capture.clear() response2 = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text="What are his most famous songs?", dialogue_memory=eval_dialogue_memory ) turn2_calls = list(capture.calls) print(f"\nšŸ“Š Self-contained tool arguments — Harry Styles follow-up:") print(f" Turn 1 calls: {turn1_calls}") print(f" Turn 2 calls: {turn2_calls}") print(f" Turn 2 response: {(response2 or '')[:120]}...") # Turn 2 must call a search-capable tool search_calls = [c for c in turn2_calls if c["name"] == "webSearch"] assert search_calls, ( f"Turn 2 should call webSearch to answer the follow-up. " f"Got: {[c['name'] for c in turn2_calls]}" ) # Every search call's string argument must name the entity for call in search_calls: args = call["args"] or {} arg_values = " ".join( str(v) for v in args.values() if isinstance(v, str) ).lower() assert "harry" in arg_values or "styles" in arg_values, ( f"āŒ PRONOUN-RESOLUTION BUG: webSearch argument did not include " f"the entity from the previous turn.\n" f" Args: {args}\n" f" Expected the string to contain 'Harry' or 'Styles' — the " f"tool has no access to conversation history, so 'his' must be " f"resolved by the model before the tool call." ) print(f" āœ… webSearch argument resolved the pronoun correctly") # ============================================================================= # Extended Multi-Turn Evaluations (Live LLM) # ============================================================================= class TestMultiTurnExtended: """ Extended multi-turn scenarios testing longer conversations and more complex topic changes. """ @pytest.mark.eval @requires_judge_llm def test_three_turn_topic_changes(self, mock_config, eval_db, eval_dialogue_memory): """ Three-turn conversation with multiple topic changes. Turn 1: Weather query Turn 2: Store hours query (topic change from weather) Turn 3: News query (topic change from store) Each turn should select the appropriate tool. """ from jarvis.reply.engine import run_reply_engine mock_config.ollama_base_url = "http://localhost:11434" mock_config.ollama_chat_model = JUDGE_MODEL capture = ToolCallCapture() all_turns = [] def mock_tool_run(db, cfg, tool_name, tool_args, **kwargs): from jarvis.tools.types import ToolExecutionResult capture.record(tool_name, tool_args or {}) if tool_name == "getWeather": return ToolExecutionResult(success=True, reply_text=MOCK_WEATHER_RESPONSE) elif tool_name == "webSearch": # Return appropriate content based on query args_str = str(tool_args).lower() if tool_args else "" if "cex" in args_str or "store" in args_str or "hour" in args_str: return ToolExecutionResult(success=True, reply_text=MOCK_STORE_HOURS_SEARCH) else: return ToolExecutionResult(success=True, reply_text=MOCK_NEWS_SEARCH) return ToolExecutionResult(success=True, reply_text="OK") with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \ patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)): queries = [ ("How's the weather today?", "getWeather"), ("What time does CEX close?", "webSearch"), ("What's happening in tech news?", "webSearch"), ] for query, expected_tool in queries: capture.clear() response = run_reply_engine( db=eval_db, cfg=mock_config, tts=None, text=query, dialogue_memory=eval_dialogue_memory ) all_turns.append({ "query": query, "expected": expected_tool, "tools": capture.tool_sequence().copy(), "response": response }) print(f"\nšŸ“Š Three-Turn Topic Changes:") failures = [] for i, turn in enumerate(all_turns, 1): tools = turn["tools"] expected = turn["expected"] has_expected = expected in tools status = "āœ…" if has_expected else "āŒ" print(f" Turn {i}: '{turn['query'][:35]}...'") print(f" Expected: {expected}, Got: {tools} {status}") if not has_expected: # Check for context anchoring specifically if i > 1 and all_turns[i-2]["expected"] in tools: failures.append( f"Turn {i}: Context anchoring bug - used {tools} (previous turn's tool) " f"instead of {expected}" ) else: failures.append(f"Turn {i}: Expected {expected}, got {tools}") if failures: pytest.fail( f"āŒ Multi-turn tool selection failures:\n" + "\n".join(f" - {f}" for f in failures) ) print(f" āœ… All turns selected correct tools")