""" Tool Router — Context-Aware Selection (Live) Guards that the LLM tool router, when handed a compact summary of what the main assistant can already see at reply time (current local time, resolved location, recent dialogue), correctly returns 'none' for queries fully answerable from that context — instead of embed-matching an adjacent tool. Motivating field incident (2026-04-20): User asked "what time is it, Jarvis?". The router, having no view of the assistant's live context, picked `getWeather` as the closest temporal tool on the catalogue. With only `getWeather, stop` in the allowed list, the main model dutifully called getWeather and the reply parroted the weather back as if it had answered the time question. The fix is upstream: pass the router the same compact context hint the memory extractor already uses, and let it judge for itself whether the query is answerable from context. Location may not always resolve, so the hint degrades gracefully — the router falls back to content-based selection when context is missing or partial, and should not over-commit to 'none' for queries whose answer was NOT visible in the hint. Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_tool_router_context_aware.py -v """ import pytest from conftest import requires_judge_llm from helpers import JUDGE_BASE_URL, JUDGE_MODEL _TIME_LOCATION_HINT = ( "Current local time: Sunday, 2026-04-20 17:42 (Europe/London). " "Location: Hackney, Hackney, United Kingdom." ) # Deliberately omits location — exercises the graceful-degradation path. _TIME_ONLY_HINT = "Current local time: Sunday, 2026-04-20 17:42 UTC." 
def _real_tools(selected):
    """Return the entries of *selected* other than the ``stop`` sentinel.

    The tests in this module read "no entries besides ``stop``" as the router
    having answered 'none', so every 'none' check goes through this helper
    instead of comparing against one exact list shape.
    """
    return [tool for tool in selected if tool != "stop"]


def _route(query: str, context_hint):
    """Invoke the real LLM router with the builtin tool catalogue.

    Args:
        query: The user utterance to route.
        context_hint: Compact summary of what the assistant can already see
            (time, location, recent dialogue), or ``None`` to route with no
            context at all.

    Returns:
        Whatever ``select_tools`` returns; the tests below treat it as a
        collection of tool names.
    """
    # Imported inside the function rather than at module level.
    # NOTE(review): presumably so that collecting this module does not need
    # the jarvis package when the eval is skipped — confirm.
    from jarvis.tools.registry import BUILTIN_TOOLS
    from jarvis.tools.selection import select_tools, ToolSelectionStrategy

    return select_tools(
        query=query,
        builtin_tools=BUILTIN_TOOLS,
        mcp_tools={},
        strategy=ToolSelectionStrategy.LLM,
        llm_base_url=JUDGE_BASE_URL,
        llm_model=JUDGE_MODEL,
        llm_timeout_sec=30.0,
        context_hint=context_hint,
    )


@pytest.mark.eval
@requires_judge_llm
class TestRouterReturnsNoneWhenContextAnswers:
    """Router must opt out when the answer is already visible in context."""

    def test_time_query_with_time_in_context_returns_none(self):
        """A time query with the time in the hint should pick no real tool."""
        selected = _route("what time is it, Jarvis?", _TIME_LOCATION_HINT)
        real = _real_tools(selected)
        print(f"\n Selected: {selected}")
        if real:
            # pytest.xfail raises immediately, so the assert below is only
            # reached when no real tool was picked. It is kept so that
            # deleting this branch leaves a hard assertion behind.
            pytest.xfail(
                f"Small router model {JUDGE_MODEL} still picked real tools "
                f"({real}) for a query fully answerable from context."
            )
        assert not real, f"Router should opt out, got: {selected}"

    def test_date_query_with_date_in_context_returns_none(self):
        """A date query with the date in the hint should pick no real tool."""
        selected = _route("what's today's date?", _TIME_LOCATION_HINT)
        real = _real_tools(selected)
        print(f"\n Selected: {selected}")
        if real:
            # Recorded as xfail rather than a failure; see the assert note in
            # the time-query test above for why the assert stays.
            pytest.xfail(
                f"Router picked real tools ({real}) for a date query "
                "answerable from context."
            )
        assert not real

    def test_location_query_with_location_in_context_returns_none(self):
        """A location query with the location in the hint should pick no real tool."""
        selected = _route("where am I right now?", _TIME_LOCATION_HINT)
        real = _real_tools(selected)
        print(f"\n Selected: {selected}")
        if real:
            pytest.xfail(
                f"Router picked real tools ({real}) for a location query "
                "answerable from context."
            )
        assert not real


@pytest.mark.eval
@requires_judge_llm
class TestRouterPicksToolsWhenContextDoesNotAnswer:
    """Regression guard: router must not over-commit to 'none'."""

    def test_weather_query_still_picks_getWeather(self):
        """Context has time+location, but weather itself is not in context —
        the router must still pick getWeather."""
        selected = _route("what's the weather like?", _TIME_LOCATION_HINT)
        print(f"\n Selected: {selected}")
        assert "getWeather" in selected, (
            "Router dropped getWeather for an explicit weather query. "
            f"Got: {selected}"
        )

    def test_location_query_with_partial_hint_still_routes_sensibly(self):
        """KNOWN LIMITATION on small router models (gemma4:e2b).

        When location failed to resolve (hint lacks it), a location query
        should not be silenced as 'none' — it must either route to a tool
        that can surface location or accept the fallback, but must not
        confidently claim the answer is in context when it isn't.

        Observed behaviour on gemma4:e2b: the mere presence of an ALREADY IN
        CONTEXT block primes the router to return 'none' for context-shaped
        queries even when the specific fact is absent from the block.
        Attempts to fix this purely at prompt level (adding "the block is NOT
        exhaustive" wording) regress the positive cases (time/date queries
        stop routing to 'none').

        The practical impact is bounded: when location genuinely fails to
        resolve, the follow-up layers (main model + memory recall) still have
        a chance to produce a sensible answer, and this only fires on the
        narrow path where the hint is partial.

        Parked as xfail rather than deleted so that a future router model (or
        prompt iteration) will surface the improvement as an unexpected pass.
        If fixed, delete the xfail branch and assert unconditionally that
        `_real_tools(selected)` is non-empty.
        """
        selected = _route("where am I right now?", _TIME_ONLY_HINT)
        print(f"\n Selected: {selected}")
        # Detect 'none' the same way the opt-out tests do (nothing selected
        # besides "stop"), so an empty selection cannot slip through as a
        # silent pass the way an exact `== ["stop"]` comparison would allow.
        if not _real_tools(selected):
            pytest.xfail(
                "Router returned 'none' for a location query whose answer "
                "was NOT in the partial hint. Known small-model limit — "
                "see test docstring."
            )

    def test_followup_naming_place_routes_to_getWeather(self):
        """Field capture 2026-04-20: assistant asked "Which city should I
        check the weather for?" and the user replied "I'm in London". The
        router saw only "I'm in London" as the query and returned 'none' —
        reading it as idle chatter instead of a continuation. With the
        split-hint prompt (KNOWN FACTS + RECENT DIALOGUE), the router must
        merge intent across turns and route to getWeather."""
        # Weekday matches the ISO date (2026-04-20 is a Monday) so the hint
        # is internally consistent.
        hint = (
            "Current local time: Monday, 2026-04-20 17:42 UTC.\n\n"
            "Recent dialogue (short-term memory):\n"
            "- user: what's the weather like?\n"
            "- assistant: Which city should I check the weather for?"
        )
        selected = _route("I'm in London", hint)
        print(f"\n Selected: {selected}")
        if "getWeather" not in selected:
            pytest.xfail(
                "Router did not resolve follow-up 'I'm in London' after the "
                f"assistant asked for a city. Got: {selected}. Known small-"
                "model limit — the prompt change lands first, the eval "
                "tracks the improvement."
            )

    def test_no_hint_at_all_still_routes_sensibly(self):
        """With context_hint=None (e.g. first turn, location lookup failed
        entirely), the router must still work — selecting content-relevant
        tools. This guards the graceful-degradation path."""
        selected = _route("what's the weather like?", None)
        print(f"\n Selected: {selected}")
        assert "getWeather" in selected, (
            f"Router broke when context_hint was None. Got: {selected}"
        )