"""Unit tests for the task-list planner. These tests verify behaviours, not implementation: the parser cleans up messy LLM output, trivial single-reply plans don't leak out, the fail-open paths return an empty list, and the progress_nudge reflects accurate step progression. """ from __future__ import annotations from types import SimpleNamespace from unittest.mock import patch import pytest from jarvis.reply import planner as planner_mod from jarvis.reply.planner import ( MAX_STEPS, SEARCH_MEMORY_DIRECTIVE, _is_trivial_plan, _parse_plan, format_plan_block, is_search_memory_step, memory_topic_of, plan_query, plan_requires_memory, progress_nudge, resolve_next_tool_call, resolve_planner_model, strip_memory_directives, plan_has_unresolved_tool_steps, tool_names_in_plan, tool_steps_of, ) def _cfg(**overrides): base = { "ollama_base_url": "http://localhost:11434", "ollama_chat_model": "gemma4:e2b", "planner_model": "", "tool_router_model": "", "intent_judge_model": "", "planner_enabled": True, "planner_timeout_sec": 6.0, } base.update(overrides) return SimpleNamespace(**base) class TestParsePlan: def test_strips_numbering(self): raw = "1. webSearch query='foo'\n2. Reply to user" assert _parse_plan(raw) == ["webSearch query='foo'", "Reply to user"] def test_strips_bullet_prefixes(self): raw = "- step one\n* step two\n• step three" assert _parse_plan(raw) == ["step one", "step two", "step three"] def test_strips_wrapping_quotes(self): raw = '"step one"\n`step two`' assert _parse_plan(raw) == ["step one", "step two"] def test_ignores_json_fences_and_blank_lines(self): raw = "```\nstep one\n\n```\nstep two" assert _parse_plan(raw) == ["step one", "step two"] def test_caps_at_max_steps(self): raw = "\n".join(f"step {i}" for i in range(MAX_STEPS + 3)) assert len(_parse_plan(raw)) == MAX_STEPS def test_truncates_overlong_step(self): long = "a" * 500 parsed = _parse_plan(long) assert len(parsed) == 1 assert parsed[0].endswith("…") assert len(parsed[0]) <= 201 class TestIsTrivialPlan: def test_empty_is_trivial(self): assert _is_trivial_plan([]) is True def test_single_step_is_trivial_regardless_of_language(self): # Purely structural: any 1-step plan is trivial. Language-agnostic. assert _is_trivial_plan(["Reply to the user."]) is True assert _is_trivial_plan(["Répondre à l'utilisateur."]) is True assert _is_trivial_plan(["ユーザーに返信する"]) is True assert _is_trivial_plan(["webSearch query='x'"]) is True def test_multi_step_is_not_trivial(self): assert _is_trivial_plan(["webSearch ...", "Reply to user"]) is False assert _is_trivial_plan(["a", "b", "c"]) is False class TestResolvePlannerModel: def test_prefers_explicit_planner_model(self): cfg = _cfg(planner_model="gemma-plan", ollama_chat_model="chat") assert resolve_planner_model(cfg) == "gemma-plan" def test_tracks_chat_model_by_default(self): cfg = _cfg(ollama_chat_model="gemma4:e2b") assert resolve_planner_model(cfg) == "gemma4:e2b" def test_ignores_tool_router_model(self): # Planner must track the chat model — not the router. Upgrading # the chat model through setup must upgrade the planner too. cfg = _cfg(tool_router_model="router-x", ollama_chat_model="chat-y") assert resolve_planner_model(cfg) == "chat-y" def test_upgrading_chat_model_upgrades_planner(self): cfg = _cfg(ollama_chat_model="gpt-oss:20b") assert resolve_planner_model(cfg) == "gpt-oss:20b" def test_returns_empty_when_no_candidates(self): cfg = _cfg(ollama_chat_model="") assert resolve_planner_model(cfg) == "" class TestPlanQuery: def test_short_query_returns_empty(self): cfg = _cfg() assert plan_query(cfg, "hi", "", []) == [] def test_disabled_returns_empty(self): cfg = _cfg(planner_enabled=False) long = "what films did the director of Possessor make?" assert plan_query(cfg, long, "", []) == [] def test_missing_model_returns_empty(self): cfg = _cfg(ollama_chat_model="") long = "what films did the director of Possessor make?" assert plan_query(cfg, long, "", []) == [] def test_returns_parsed_steps(self): cfg = _cfg() raw_plan = ( "webSearch query='Possessor 2020 director'\n" "webSearch query='films by '\n" "Reply to the user with the combined findings." ) with patch.object(planner_mod, "call_llm_direct", return_value=raw_plan): steps = plan_query( cfg, "what films did the director of Possessor make?", "", [("webSearch", "Search the web.")], ) assert len(steps) == 3 assert "Possessor" in steps[0] assert steps[-1].lower().startswith("reply") def test_single_reply_plan_is_preserved(self): """A 1-step reply-only plan is the planner's POSITIVE "no memory, no tools needed" signal. It must NOT be filtered to [] — the engine distinguishes [] (fail-open) from ["Reply ..."] (explicit skip-everything decision) and uses the latter to skip the memory extractor and tool router entirely. """ cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."): steps = plan_query( cfg, "tell me a joke about cats please", "", [], ) assert steps == ["Reply to user."] def test_llm_failure_returns_empty(self): cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value=None): steps = plan_query( cfg, "what films did the director of Possessor make?", "", [("webSearch", "Search the web.")], ) assert steps == [] def test_memory_context_arg_still_accepted_for_back_compat(self): """Old callers pass `memory_context=` as a positional or keyword argument. Planner now ignores it (the planner runs before memory search), but the signature must still accept it so downstream code doesn't break.""" cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."): steps = plan_query( cfg, "tell me a joke about cats please", "", [], memory_context="some old memory text", ) assert steps == ["Reply to user."] def test_prompt_warns_against_fabricating_optional_arguments(self): """The planner prompt must explicitly tell the model to omit optional arguments when the user didn't supply a value, and warn against grabbing unrelated words from the utterance as fake values. 2026-04-24 field regression: gemma4:e2b responded to "how's the weather going to be today" with a plan step of ``getWeather location='today'``. The temporal qualifier "today" was geocoded to a village called "Todaya" in the Philippines — because the small model was trained by our prompt to always give a concrete argument, even when the user's utterance had none to give. This content-assertion guards the fix so the rule can't be silently reverted during future prompt edits without a test failure pointing the editor at the behavioural consequence. """ prompt = planner_mod._PROMPT_TEMPLATE.lower() assert "omit" in prompt, ( "Planner prompt must tell the model to OMIT optional args " "when no value was provided." ) # The guidance must name the exact failure mode so the model # doesn't pattern-match on generic 'omit' without knowing why. assert "fabricate" in prompt or "do not fabricate" in prompt, ( "Planner prompt must warn against fabricating argument values " "from unrelated words in the utterance." ) class TestFormatPlanBlock: def test_empty_returns_empty_string(self): assert format_plan_block([]) == "" def test_numbers_the_steps(self): block = format_plan_block(["step a", "step b"]) assert "1. step a" in block assert "2. step b" in block assert "ACTION PLAN" in block class TestProgressNudge: def test_empty_plan_returns_empty(self): assert progress_nudge([], 0) == "" def test_single_reply_step_returns_empty(self): """A 1-step reply-only plan has no tool steps, so there is nothing to nudge. The empty string tells the engine to skip injecting a progress reminder after the (non-existent) tool result.""" assert progress_nudge(["Reply to user"], 0) == "" def test_points_at_next_step(self): steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"] msg = progress_nudge(steps, 0) assert "foo" in msg assert "0/2" in msg msg2 = progress_nudge(steps, 1) assert "bar" in msg2 assert "1/2" in msg2 def test_all_steps_done_prompts_synthesis(self): steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"] msg = progress_nudge(steps, 2) assert "all tool steps executed" in msg.lower() or "synthes" in msg.lower() class TestResolveNextToolCall: def _schema(self): return [ { "type": "function", "function": { "name": "webSearch", "description": "Search the web.", "parameters": { "type": "object", "properties": {"query": {"type": "string"}}, }, }, } ] def test_returns_tool_and_args(self): cfg = _cfg() raw = '{"name": "webSearch", "arguments": {"query": "weather in Paris"}}' with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "webSearch query='weather in Paris'", [], self._schema() ) assert result == ("webSearch", {"query": "weather in Paris"}) def test_rejects_unknown_tool(self): cfg = _cfg() raw = '{"name": "mysteryTool", "arguments": {}}' with patch.object(planner_mod, "call_llm_direct", return_value=raw): assert resolve_next_tool_call( cfg, "do the thing", [], self._schema() ) is None def test_null_means_synthesis(self): cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="null"): assert resolve_next_tool_call( cfg, "Reply to user", [], self._schema() ) is None def test_peels_markdown_fences(self): cfg = _cfg() raw = '```json\n{"name": "webSearch", "arguments": {"query": "x"}}\n```' with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "search for x", [], self._schema() ) assert result == ("webSearch", {"query": "x"}) def test_invalid_json_returns_none(self): cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="not json"): assert resolve_next_tool_call( cfg, "do the thing", [], self._schema() ) is None def test_missing_schema_returns_none(self): cfg = _cfg() assert resolve_next_tool_call(cfg, "do the thing", [], []) is None def test_drops_unknown_argument_keys(self): cfg = _cfg() raw = ( '{"name": "webSearch", "arguments": ' '{"query": "weather", "evil_key": "shell"}}' ) with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "search weather", [], self._schema() ) assert result == ("webSearch", {"query": "weather"}) def test_deterministic_parse_skips_llm_for_concrete_step(self): """A fully concrete plan step (tool name + `key='value'` args, no ````) must be parsed deterministically without calling the LLM resolver at all. Motivation (2026-04-24 field trace): a follow-up query produced the plan `webSearch query='Justin Bieber most famous songs'` — trivially concrete — but the LLM resolver flaked (returned ``null`` or garbage) and the engine fell back to the chat model, which then refused. Parsing concrete steps deterministically removes the LLM call as a failure surface for the common case. """ cfg = _cfg() call_count = [0] def _spy(*args, **kwargs): call_count[0] += 1 return "null" with patch.object(planner_mod, "call_llm_direct", side_effect=_spy): result = resolve_next_tool_call( cfg, "webSearch query='Justin Bieber most famous songs'", [], self._schema(), ) assert result == ( "webSearch", {"query": "Justin Bieber most famous songs"}, ) assert call_count[0] == 0, ( f"LLM should not be called for a concrete step; was called {call_count[0]}×" ) def test_deterministic_parse_still_rejects_unknown_tool(self): """The fast path must still honour the allow-list — a concrete step naming a tool not in the schema falls through to ``None``, not to an unfiltered dispatch.""" cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="null"): assert resolve_next_tool_call( cfg, "mysteryTool query='anything'", [], self._schema(), ) is None def test_falls_back_to_llm_when_step_has_placeholder(self): """Steps containing an ```` placeholder need entity substitution from prior results — that requires the LLM resolver, so the fast path must decline and defer.""" cfg = _cfg() raw = ( '{"name": "webSearch", "arguments": ' '{"query": "films directed by Brandon Cronenberg"}}' ) with patch.object( planner_mod, "call_llm_direct", return_value=raw, ) as spy: result = resolve_next_tool_call( cfg, "webSearch query='films directed by '", [("webSearch", '{"query": "Possessor director"}', "Possessor directed by Brandon Cronenberg.")], self._schema(), ) assert result == ( "webSearch", {"query": "films directed by Brandon Cronenberg"}, ) assert spy.called, "Placeholder substitution must go through the LLM" def test_deterministic_parse_accepts_bare_tool_name_as_empty_args(self): """A plan step naming the tool with no trailing args must parse to ``(name, {})`` without an LLM call. This is the shape the planner emits when it follows the "omit optional arguments" rule — e.g. a weather query with no named place plans as ``getWeather`` (no args), and the tool auto-derives location from the user's geoip context. """ cfg = _cfg() schema = [ { "type": "function", "function": { "name": "getWeather", "description": "Weather.", "parameters": { "type": "object", "properties": {"location": {"type": "string"}}, "required": [], }, }, } ] with patch.object(planner_mod, "call_llm_direct") as spy: result = resolve_next_tool_call(cfg, "getWeather", [], schema) assert result == ("getWeather", {}) assert not spy.called, ( "Bare tool name must not trigger an LLM round-trip" ) def test_deterministic_parse_handles_double_quoted_values(self): """Planner output occasionally uses double quotes — parse both.""" cfg = _cfg() with patch.object(planner_mod, "call_llm_direct") as spy: result = resolve_next_tool_call( cfg, 'webSearch query="weather in Paris"', [], self._schema(), ) assert result == ("webSearch", {"query": "weather in Paris"}) assert not spy.called def test_deterministic_parse_handles_hyphenated_mcp_tool_name(self): """MCP tool names like ``chrome-devtools__navigate_page`` contain hyphens. The fast-path parser must accept them and produce a clean ``(name, args)`` without an LLM round-trip — otherwise the engine falls through to the chat model, which on small models flakes into the empty-reply fallback.""" cfg = _cfg() schema = [ { "type": "function", "function": { "name": "chrome-devtools__navigate_page", "description": "Navigate the browser to a URL.", "parameters": { "type": "object", "properties": {"url": {"type": "string"}}, }, }, } ] with patch.object(planner_mod, "call_llm_direct") as spy: result = resolve_next_tool_call( cfg, "chrome-devtools__navigate_page url='https://youtube.com'", [], schema, ) assert result == ( "chrome-devtools__navigate_page", {"url": "https://youtube.com"}, ) assert not spy.called, ( "Hyphenated MCP tool name must parse without an LLM round-trip" ) def test_keeps_args_as_is_when_schema_has_no_properties(self): cfg = _cfg() schema = [ { "type": "function", "function": { "name": "freeform", "description": "freeform", "parameters": {"type": "object"}, }, } ] raw = '{"name": "freeform", "arguments": {"anything": "goes"}}' with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call(cfg, "do it", [], schema) assert result == ("freeform", {"anything": "goes"}) class TestUrlArgNormalisation: """The resolver must hand chrome/browser MCP tools a fully-qualified URL. Field trace (2026-05): the planner emitted ``page='[youtube.com](http://youtube.com)'`` for the user query "navigate to youtube.com". The slow-path resolver remapped the key to ``url`` (the schema's actual property) but preserved the markdown wrapper as the value, so chrome-devtools-mcp received ``{"url": "[youtube.com](http://youtube.com)"}`` and Puppeteer's Page.navigate rejected with "Cannot navigate to invalid URL". A scheme-less bare ``youtube.com`` value fails the same way. The fix is generic: any URL-keyed string value gets markdown-stripped and scheme-prepended before it leaves the planner. """ def _navigate_schema(self): return [ { "type": "function", "function": { "name": "chrome-devtools__navigate_page", "description": "Navigate to a URL.", "parameters": { "type": "object", "properties": { "url": {"type": "string"}, }, }, }, } ] def test_strips_markdown_link_wrapper_in_slow_path(self): cfg = _cfg() raw = ( '{"name": "chrome-devtools__navigate_page", "arguments": ' '{"url": "[youtube.com](http://youtube.com)"}}' ) with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "chrome-devtools__navigate_page page='[youtube.com](http://youtube.com)'", [], self._navigate_schema(), ) assert result == ( "chrome-devtools__navigate_page", {"url": "http://youtube.com"}, ) def test_prepends_scheme_to_bare_domain_in_slow_path(self): cfg = _cfg() raw = ( '{"name": "chrome-devtools__navigate_page", "arguments": ' '{"url": "youtube.com"}}' ) with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "chrome-devtools__navigate_page page='youtube.com'", [], self._navigate_schema(), ) assert result == ( "chrome-devtools__navigate_page", {"url": "https://youtube.com"}, ) def test_prepends_scheme_to_bare_domain_in_fast_path(self): """Fast path parses ``url='youtube.com'`` deterministically; the normalisation must apply there too so we don't regress on the common case where the planner uses the right key name.""" cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="null") as spy: result = resolve_next_tool_call( cfg, "chrome-devtools__navigate_page url='youtube.com'", [], self._navigate_schema(), ) assert result == ( "chrome-devtools__navigate_page", {"url": "https://youtube.com"}, ) assert spy.call_count == 0, "fast path must not call the LLM resolver" def test_preserves_already_qualified_url(self): cfg = _cfg() with patch.object(planner_mod, "call_llm_direct", return_value="null"): result = resolve_next_tool_call( cfg, "chrome-devtools__navigate_page url='https://youtube.com/feed/trending'", [], self._navigate_schema(), ) assert result == ( "chrome-devtools__navigate_page", {"url": "https://youtube.com/feed/trending"}, ) def test_does_not_touch_unrelated_string_args(self): """A ``query='youtube.com tutorials'`` arg on webSearch must stay literal — we only normalise values keyed as URLs.""" cfg = _cfg() schema = [ { "type": "function", "function": { "name": "webSearch", "description": "Search.", "parameters": { "type": "object", "properties": {"query": {"type": "string"}}, }, }, } ] raw = ( '{"name": "webSearch", "arguments": ' '{"query": "youtube.com tutorials"}}' ) with patch.object(planner_mod, "call_llm_direct", return_value=raw): result = resolve_next_tool_call( cfg, "webSearch find tutorials", [], schema, ) assert result == ("webSearch", {"query": "youtube.com tutorials"}) class TestToolStepsOf: def test_multi_step_drops_final_synthesis_step(self): assert tool_steps_of(["a", "b", "reply"]) == ["a", "b"] def test_single_step_has_no_tool_steps(self): """A 1-step plan is reply-only by contract (rule 9), so it contributes no tool steps. Engine uses this to skip the direct-exec path and the progress nudge for pure-reply plans.""" assert tool_steps_of(["only"]) == [] def test_empty_plan(self): assert tool_steps_of([]) == [] def test_strips_search_memory_directive(self): plan = [ "searchMemory topic='user preferences'", "webSearch query='foo'", "Reply to the user.", ] assert tool_steps_of(plan) == ["webSearch query='foo'"] class TestIsSearchMemoryStep: def test_detects_directive(self): assert is_search_memory_step("searchMemory topic='x'") is True assert is_search_memory_step(" SEARCHMEMORY topic='x'") is True def test_rejects_other_steps(self): assert is_search_memory_step("webSearch query='foo'") is False assert is_search_memory_step("Reply to the user.") is False class TestMemoryTopicOf: def test_single_quoted(self): assert memory_topic_of("searchMemory topic='pets'") == "pets" def test_double_quoted(self): assert memory_topic_of('searchMemory topic="favourite films"') == "favourite films" def test_bare_value(self): assert memory_topic_of("searchMemory topic=preferences") == "preferences" def test_missing_topic_returns_empty(self): assert memory_topic_of("searchMemory") == "" class TestPlanRequiresMemory: def test_true_when_directive_present(self): assert plan_requires_memory([ "searchMemory topic='pets'", "Reply to user", ]) is True def test_false_when_only_tools_and_reply(self): assert plan_requires_memory([ "webSearch query='foo'", "Reply to the user.", ]) is False def test_false_for_empty(self): assert plan_requires_memory([]) is False class TestStripMemoryDirectives: def test_removes_directive(self): plan = [ "searchMemory topic='pets'", "Reply to user", ] assert strip_memory_directives(plan) == ["Reply to user"] def test_leaves_tool_only_plan_untouched(self): plan = ["webSearch query='foo'", "Reply"] assert strip_memory_directives(plan) == plan class TestToolNamesInPlan: def test_extracts_known_names_in_order(self): plan = [ "webSearch query='a'", "getWeather", "webSearch query='b'", # duplicate should dedup "Reply to the user.", ] names = tool_names_in_plan(plan, ["webSearch", "getWeather", "stop"]) assert names == ["webSearch", "getWeather"] def test_filters_unknown_names(self): plan = ["hallucinatedTool x='y'", "webSearch query='q'", "Reply"] assert tool_names_in_plan(plan, ["webSearch"]) == ["webSearch"] def test_ignores_search_memory_directive(self): plan = ["searchMemory topic='t'", "webSearch query='q'", "Reply"] assert tool_names_in_plan(plan, ["webSearch", "searchMemory"]) == ["webSearch"] def test_empty_plan(self): assert tool_names_in_plan([], ["webSearch"]) == [] def test_extracts_hyphenated_mcp_tool_name(self): """MCP tool names embed the server in the prefix and use hyphens (e.g. ``chrome-devtools__navigate_page``). The head regex must accept hyphens so the planner-driven allow-list union and the ``_plan_under_specified`` guard don't misclassify a perfectly valid plan step as paraphrased prose. Field trace (2026-05-03): user said "navigate to youtube.com". Planner emitted ``chrome-devtools__navigate_page page='...'`` correctly, but the hyphen-stripping regex extracted only ``chrome``, which wasn't a known tool — so direct-exec was skipped and the small chat model flaked into the empty-reply fallback. """ plan = [ "chrome-devtools__navigate_page page='https://youtube.com'", "Reply to the user.", ] names = tool_names_in_plan(plan, ["chrome-devtools__navigate_page"]) assert names == ["chrome-devtools__navigate_page"] class TestPlanHasUnresolvedToolSteps: def test_true_when_step_paraphrases_tool(self): plan = ["get the weather", "Reply to the user."] assert plan_has_unresolved_tool_steps(plan, ["getWeather", "stop"]) is True def test_false_when_step_names_tool(self): plan = ["getWeather", "Reply to the user."] assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False def test_false_for_reply_only_plan(self): # No tool steps at all — the planner explicitly decided no tools. assert plan_has_unresolved_tool_steps( ["Reply to the user."], ["getWeather"] ) is False def test_false_for_empty_plan(self): assert plan_has_unresolved_tool_steps([], ["getWeather"]) is False def test_false_when_search_memory_only_and_reply(self): # searchMemory is a directive, not a tool — but there's also no # real tool step paraphrased either. plan = ["searchMemory topic='t'", "Reply to the user."] assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False def test_false_for_hyphenated_mcp_tool_step(self): """A concrete plan step naming a hyphenated MCP tool must NOT be treated as under-specified — otherwise the engine skips direct-exec and forces the chat model to take the turn instead.""" plan = [ "chrome-devtools__navigate_page page='https://youtube.com'", "Reply to the user.", ] assert plan_has_unresolved_tool_steps( plan, ["chrome-devtools__navigate_page"] ) is False