Files
javis_bot/tests/test_planner.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

791 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Unit tests for the task-list planner.
These tests verify behaviours, not implementation: the parser cleans up
messy LLM output, trivial single-reply plans don't leak out, the
fail-open paths return an empty list, and the progress_nudge reflects
accurate step progression.
"""
from __future__ import annotations
from types import SimpleNamespace
from unittest.mock import patch
import pytest
from jarvis.reply import planner as planner_mod
from jarvis.reply.planner import (
MAX_STEPS,
SEARCH_MEMORY_DIRECTIVE,
_is_trivial_plan,
_parse_plan,
format_plan_block,
is_search_memory_step,
memory_topic_of,
plan_query,
plan_requires_memory,
progress_nudge,
resolve_next_tool_call,
resolve_planner_model,
strip_memory_directives,
plan_has_unresolved_tool_steps,
tool_names_in_plan,
tool_steps_of,
)
def _cfg(**overrides):
base = {
"ollama_base_url": "http://localhost:11434",
"ollama_chat_model": "gemma4:e2b",
"planner_model": "",
"tool_router_model": "",
"intent_judge_model": "",
"planner_enabled": True,
"planner_timeout_sec": 6.0,
}
base.update(overrides)
return SimpleNamespace(**base)
class TestParsePlan:
def test_strips_numbering(self):
raw = "1. webSearch query='foo'\n2. Reply to user"
assert _parse_plan(raw) == ["webSearch query='foo'", "Reply to user"]
def test_strips_bullet_prefixes(self):
raw = "- step one\n* step two\n• step three"
assert _parse_plan(raw) == ["step one", "step two", "step three"]
def test_strips_wrapping_quotes(self):
raw = '"step one"\n`step two`'
assert _parse_plan(raw) == ["step one", "step two"]
def test_ignores_json_fences_and_blank_lines(self):
raw = "```\nstep one\n\n```\nstep two"
assert _parse_plan(raw) == ["step one", "step two"]
def test_caps_at_max_steps(self):
raw = "\n".join(f"step {i}" for i in range(MAX_STEPS + 3))
assert len(_parse_plan(raw)) == MAX_STEPS
def test_truncates_overlong_step(self):
long = "a" * 500
parsed = _parse_plan(long)
assert len(parsed) == 1
assert parsed[0].endswith("")
assert len(parsed[0]) <= 201
class TestIsTrivialPlan:
def test_empty_is_trivial(self):
assert _is_trivial_plan([]) is True
def test_single_step_is_trivial_regardless_of_language(self):
# Purely structural: any 1-step plan is trivial. Language-agnostic.
assert _is_trivial_plan(["Reply to the user."]) is True
assert _is_trivial_plan(["Répondre à l'utilisateur."]) is True
assert _is_trivial_plan(["ユーザーに返信する"]) is True
assert _is_trivial_plan(["webSearch query='x'"]) is True
def test_multi_step_is_not_trivial(self):
assert _is_trivial_plan(["webSearch ...", "Reply to user"]) is False
assert _is_trivial_plan(["a", "b", "c"]) is False
class TestResolvePlannerModel:
def test_prefers_explicit_planner_model(self):
cfg = _cfg(planner_model="gemma-plan", ollama_chat_model="chat")
assert resolve_planner_model(cfg) == "gemma-plan"
def test_tracks_chat_model_by_default(self):
cfg = _cfg(ollama_chat_model="gemma4:e2b")
assert resolve_planner_model(cfg) == "gemma4:e2b"
def test_ignores_tool_router_model(self):
# Planner must track the chat model — not the router. Upgrading
# the chat model through setup must upgrade the planner too.
cfg = _cfg(tool_router_model="router-x", ollama_chat_model="chat-y")
assert resolve_planner_model(cfg) == "chat-y"
def test_upgrading_chat_model_upgrades_planner(self):
cfg = _cfg(ollama_chat_model="gpt-oss:20b")
assert resolve_planner_model(cfg) == "gpt-oss:20b"
def test_returns_empty_when_no_candidates(self):
cfg = _cfg(ollama_chat_model="")
assert resolve_planner_model(cfg) == ""
class TestPlanQuery:
def test_short_query_returns_empty(self):
cfg = _cfg()
assert plan_query(cfg, "hi", "", []) == []
def test_disabled_returns_empty(self):
cfg = _cfg(planner_enabled=False)
long = "what films did the director of Possessor make?"
assert plan_query(cfg, long, "", []) == []
def test_missing_model_returns_empty(self):
cfg = _cfg(ollama_chat_model="")
long = "what films did the director of Possessor make?"
assert plan_query(cfg, long, "", []) == []
def test_returns_parsed_steps(self):
cfg = _cfg()
raw_plan = (
"webSearch query='Possessor 2020 director'\n"
"webSearch query='films by <director name from step 1>'\n"
"Reply to the user with the combined findings."
)
with patch.object(planner_mod, "call_llm_direct", return_value=raw_plan):
steps = plan_query(
cfg,
"what films did the director of Possessor make?",
"",
[("webSearch", "Search the web.")],
)
assert len(steps) == 3
assert "Possessor" in steps[0]
assert steps[-1].lower().startswith("reply")
def test_single_reply_plan_is_preserved(self):
"""A 1-step reply-only plan is the planner's POSITIVE "no memory,
no tools needed" signal. It must NOT be filtered to [] — the
engine distinguishes [] (fail-open) from ["Reply ..."] (explicit
skip-everything decision) and uses the latter to skip the
memory extractor and tool router entirely.
"""
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
steps = plan_query(
cfg,
"tell me a joke about cats please",
"",
[],
)
assert steps == ["Reply to user."]
def test_llm_failure_returns_empty(self):
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value=None):
steps = plan_query(
cfg,
"what films did the director of Possessor make?",
"",
[("webSearch", "Search the web.")],
)
assert steps == []
def test_memory_context_arg_still_accepted_for_back_compat(self):
"""Old callers pass `memory_context=` as a positional or keyword
argument. Planner now ignores it (the planner runs before memory
search), but the signature must still accept it so downstream
code doesn't break."""
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
steps = plan_query(
cfg,
"tell me a joke about cats please",
"",
[],
memory_context="some old memory text",
)
assert steps == ["Reply to user."]
def test_prompt_warns_against_fabricating_optional_arguments(self):
"""The planner prompt must explicitly tell the model to omit
optional arguments when the user didn't supply a value, and warn
against grabbing unrelated words from the utterance as fake values.
2026-04-24 field regression: gemma4:e2b responded to "how's the
weather going to be today" with a plan step of
``getWeather location='today'``. The temporal qualifier "today"
was geocoded to a village called "Todaya" in the Philippines —
because the small model was trained by our prompt to always give
a concrete argument, even when the user's utterance had none to
give. This content-assertion guards the fix so the rule can't be
silently reverted during future prompt edits without a test
failure pointing the editor at the behavioural consequence.
"""
prompt = planner_mod._PROMPT_TEMPLATE.lower()
assert "omit" in prompt, (
"Planner prompt must tell the model to OMIT optional args "
"when no value was provided."
)
# The guidance must name the exact failure mode so the model
# doesn't pattern-match on generic 'omit' without knowing why.
assert "fabricate" in prompt or "do not fabricate" in prompt, (
"Planner prompt must warn against fabricating argument values "
"from unrelated words in the utterance."
)
class TestFormatPlanBlock:
def test_empty_returns_empty_string(self):
assert format_plan_block([]) == ""
def test_numbers_the_steps(self):
block = format_plan_block(["step a", "step b"])
assert "1. step a" in block
assert "2. step b" in block
assert "ACTION PLAN" in block
class TestProgressNudge:
def test_empty_plan_returns_empty(self):
assert progress_nudge([], 0) == ""
def test_single_reply_step_returns_empty(self):
"""A 1-step reply-only plan has no tool steps, so there is
nothing to nudge. The empty string tells the engine to skip
injecting a progress reminder after the (non-existent) tool
result."""
assert progress_nudge(["Reply to user"], 0) == ""
def test_points_at_next_step(self):
steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
msg = progress_nudge(steps, 0)
assert "foo" in msg
assert "0/2" in msg
msg2 = progress_nudge(steps, 1)
assert "bar" in msg2
assert "1/2" in msg2
def test_all_steps_done_prompts_synthesis(self):
steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
msg = progress_nudge(steps, 2)
assert "all tool steps executed" in msg.lower() or "synthes" in msg.lower()
class TestResolveNextToolCall:
def _schema(self):
return [
{
"type": "function",
"function": {
"name": "webSearch",
"description": "Search the web.",
"parameters": {
"type": "object",
"properties": {"query": {"type": "string"}},
},
},
}
]
def test_returns_tool_and_args(self):
cfg = _cfg()
raw = '{"name": "webSearch", "arguments": {"query": "weather in Paris"}}'
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg, "webSearch query='weather in Paris'", [], self._schema()
)
assert result == ("webSearch", {"query": "weather in Paris"})
def test_rejects_unknown_tool(self):
cfg = _cfg()
raw = '{"name": "mysteryTool", "arguments": {}}'
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
assert resolve_next_tool_call(
cfg, "do the thing", [], self._schema()
) is None
def test_null_means_synthesis(self):
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
assert resolve_next_tool_call(
cfg, "Reply to user", [], self._schema()
) is None
def test_peels_markdown_fences(self):
cfg = _cfg()
raw = '```json\n{"name": "webSearch", "arguments": {"query": "x"}}\n```'
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg, "search for x", [], self._schema()
)
assert result == ("webSearch", {"query": "x"})
def test_invalid_json_returns_none(self):
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="not json"):
assert resolve_next_tool_call(
cfg, "do the thing", [], self._schema()
) is None
def test_missing_schema_returns_none(self):
cfg = _cfg()
assert resolve_next_tool_call(cfg, "do the thing", [], []) is None
def test_drops_unknown_argument_keys(self):
cfg = _cfg()
raw = (
'{"name": "webSearch", "arguments": '
'{"query": "weather", "evil_key": "shell"}}'
)
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg, "search weather", [], self._schema()
)
assert result == ("webSearch", {"query": "weather"})
def test_deterministic_parse_skips_llm_for_concrete_step(self):
"""A fully concrete plan step (tool name + `key='value'` args, no
``<placeholder>``) must be parsed deterministically without calling
the LLM resolver at all.
Motivation (2026-04-24 field trace): a follow-up query produced the
plan `webSearch query='Justin Bieber most famous songs'` — trivially
concrete — but the LLM resolver flaked (returned ``null`` or
garbage) and the engine fell back to the chat model, which then
refused. Parsing concrete steps deterministically removes the LLM
call as a failure surface for the common case.
"""
cfg = _cfg()
call_count = [0]
def _spy(*args, **kwargs):
call_count[0] += 1
return "null"
with patch.object(planner_mod, "call_llm_direct", side_effect=_spy):
result = resolve_next_tool_call(
cfg,
"webSearch query='Justin Bieber most famous songs'",
[],
self._schema(),
)
assert result == (
"webSearch",
{"query": "Justin Bieber most famous songs"},
)
assert call_count[0] == 0, (
f"LLM should not be called for a concrete step; was called {call_count[0]}×"
)
def test_deterministic_parse_still_rejects_unknown_tool(self):
"""The fast path must still honour the allow-list — a concrete step
naming a tool not in the schema falls through to ``None``, not to an
unfiltered dispatch."""
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
assert resolve_next_tool_call(
cfg,
"mysteryTool query='anything'",
[],
self._schema(),
) is None
def test_falls_back_to_llm_when_step_has_placeholder(self):
"""Steps containing an ``<entity from step N>`` placeholder need
entity substitution from prior results — that requires the LLM
resolver, so the fast path must decline and defer."""
cfg = _cfg()
raw = (
'{"name": "webSearch", "arguments": '
'{"query": "films directed by Brandon Cronenberg"}}'
)
with patch.object(
planner_mod, "call_llm_direct", return_value=raw,
) as spy:
result = resolve_next_tool_call(
cfg,
"webSearch query='films directed by <director name from step 1>'",
[("webSearch", '{"query": "Possessor director"}',
"Possessor directed by Brandon Cronenberg.")],
self._schema(),
)
assert result == (
"webSearch",
{"query": "films directed by Brandon Cronenberg"},
)
assert spy.called, "Placeholder substitution must go through the LLM"
def test_deterministic_parse_accepts_bare_tool_name_as_empty_args(self):
"""A plan step naming the tool with no trailing args must parse to
``(name, {})`` without an LLM call.
This is the shape the planner emits when it follows the
"omit optional arguments" rule — e.g. a weather query with no
named place plans as ``getWeather`` (no args), and the tool
auto-derives location from the user's geoip context.
"""
cfg = _cfg()
schema = [
{
"type": "function",
"function": {
"name": "getWeather",
"description": "Weather.",
"parameters": {
"type": "object",
"properties": {"location": {"type": "string"}},
"required": [],
},
},
}
]
with patch.object(planner_mod, "call_llm_direct") as spy:
result = resolve_next_tool_call(cfg, "getWeather", [], schema)
assert result == ("getWeather", {})
assert not spy.called, (
"Bare tool name must not trigger an LLM round-trip"
)
def test_deterministic_parse_handles_double_quoted_values(self):
"""Planner output occasionally uses double quotes — parse both."""
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct") as spy:
result = resolve_next_tool_call(
cfg,
'webSearch query="weather in Paris"',
[],
self._schema(),
)
assert result == ("webSearch", {"query": "weather in Paris"})
assert not spy.called
def test_deterministic_parse_handles_hyphenated_mcp_tool_name(self):
"""MCP tool names like ``chrome-devtools__navigate_page`` contain
hyphens. The fast-path parser must accept them and produce a clean
``(name, args)`` without an LLM round-trip — otherwise the engine
falls through to the chat model, which on small models flakes into
the empty-reply fallback."""
cfg = _cfg()
schema = [
{
"type": "function",
"function": {
"name": "chrome-devtools__navigate_page",
"description": "Navigate the browser to a URL.",
"parameters": {
"type": "object",
"properties": {"url": {"type": "string"}},
},
},
}
]
with patch.object(planner_mod, "call_llm_direct") as spy:
result = resolve_next_tool_call(
cfg,
"chrome-devtools__navigate_page url='https://youtube.com'",
[],
schema,
)
assert result == (
"chrome-devtools__navigate_page",
{"url": "https://youtube.com"},
)
assert not spy.called, (
"Hyphenated MCP tool name must parse without an LLM round-trip"
)
def test_keeps_args_as_is_when_schema_has_no_properties(self):
cfg = _cfg()
schema = [
{
"type": "function",
"function": {
"name": "freeform",
"description": "freeform",
"parameters": {"type": "object"},
},
}
]
raw = '{"name": "freeform", "arguments": {"anything": "goes"}}'
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(cfg, "do it", [], schema)
assert result == ("freeform", {"anything": "goes"})
class TestUrlArgNormalisation:
"""The resolver must hand chrome/browser MCP tools a fully-qualified
URL.
Field trace (2026-05): the planner emitted
``page='[youtube.com](http://youtube.com)'`` for the user query
"navigate to youtube.com". The slow-path resolver remapped the key
to ``url`` (the schema's actual property) but preserved the markdown
wrapper as the value, so chrome-devtools-mcp received
``{"url": "[youtube.com](http://youtube.com)"}`` and Puppeteer's
Page.navigate rejected with "Cannot navigate to invalid URL".
A scheme-less bare ``youtube.com`` value fails the same way.
The fix is generic: any URL-keyed string value gets
markdown-stripped and scheme-prepended before it leaves the planner.
"""
def _navigate_schema(self):
return [
{
"type": "function",
"function": {
"name": "chrome-devtools__navigate_page",
"description": "Navigate to a URL.",
"parameters": {
"type": "object",
"properties": {
"url": {"type": "string"},
},
},
},
}
]
def test_strips_markdown_link_wrapper_in_slow_path(self):
cfg = _cfg()
raw = (
'{"name": "chrome-devtools__navigate_page", "arguments": '
'{"url": "[youtube.com](http://youtube.com)"}}'
)
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg,
"chrome-devtools__navigate_page page='[youtube.com](http://youtube.com)'",
[],
self._navigate_schema(),
)
assert result == (
"chrome-devtools__navigate_page",
{"url": "http://youtube.com"},
)
def test_prepends_scheme_to_bare_domain_in_slow_path(self):
cfg = _cfg()
raw = (
'{"name": "chrome-devtools__navigate_page", "arguments": '
'{"url": "youtube.com"}}'
)
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg,
"chrome-devtools__navigate_page page='youtube.com'",
[],
self._navigate_schema(),
)
assert result == (
"chrome-devtools__navigate_page",
{"url": "https://youtube.com"},
)
def test_prepends_scheme_to_bare_domain_in_fast_path(self):
"""Fast path parses ``url='youtube.com'`` deterministically; the
normalisation must apply there too so we don't regress on the
common case where the planner uses the right key name."""
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="null") as spy:
result = resolve_next_tool_call(
cfg,
"chrome-devtools__navigate_page url='youtube.com'",
[],
self._navigate_schema(),
)
assert result == (
"chrome-devtools__navigate_page",
{"url": "https://youtube.com"},
)
assert spy.call_count == 0, "fast path must not call the LLM resolver"
def test_preserves_already_qualified_url(self):
cfg = _cfg()
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
result = resolve_next_tool_call(
cfg,
"chrome-devtools__navigate_page url='https://youtube.com/feed/trending'",
[],
self._navigate_schema(),
)
assert result == (
"chrome-devtools__navigate_page",
{"url": "https://youtube.com/feed/trending"},
)
def test_does_not_touch_unrelated_string_args(self):
"""A ``query='youtube.com tutorials'`` arg on webSearch must stay
literal — we only normalise values keyed as URLs."""
cfg = _cfg()
schema = [
{
"type": "function",
"function": {
"name": "webSearch",
"description": "Search.",
"parameters": {
"type": "object",
"properties": {"query": {"type": "string"}},
},
},
}
]
raw = (
'{"name": "webSearch", "arguments": '
'{"query": "youtube.com tutorials"}}'
)
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
result = resolve_next_tool_call(
cfg,
"webSearch find tutorials",
[],
schema,
)
assert result == ("webSearch", {"query": "youtube.com tutorials"})
class TestToolStepsOf:
def test_multi_step_drops_final_synthesis_step(self):
assert tool_steps_of(["a", "b", "reply"]) == ["a", "b"]
def test_single_step_has_no_tool_steps(self):
"""A 1-step plan is reply-only by contract (rule 9), so it
contributes no tool steps. Engine uses this to skip the
direct-exec path and the progress nudge for pure-reply plans."""
assert tool_steps_of(["only"]) == []
def test_empty_plan(self):
assert tool_steps_of([]) == []
def test_strips_search_memory_directive(self):
plan = [
"searchMemory topic='user preferences'",
"webSearch query='foo'",
"Reply to the user.",
]
assert tool_steps_of(plan) == ["webSearch query='foo'"]
class TestIsSearchMemoryStep:
def test_detects_directive(self):
assert is_search_memory_step("searchMemory topic='x'") is True
assert is_search_memory_step(" SEARCHMEMORY topic='x'") is True
def test_rejects_other_steps(self):
assert is_search_memory_step("webSearch query='foo'") is False
assert is_search_memory_step("Reply to the user.") is False
class TestMemoryTopicOf:
def test_single_quoted(self):
assert memory_topic_of("searchMemory topic='pets'") == "pets"
def test_double_quoted(self):
assert memory_topic_of('searchMemory topic="favourite films"') == "favourite films"
def test_bare_value(self):
assert memory_topic_of("searchMemory topic=preferences") == "preferences"
def test_missing_topic_returns_empty(self):
assert memory_topic_of("searchMemory") == ""
class TestPlanRequiresMemory:
def test_true_when_directive_present(self):
assert plan_requires_memory([
"searchMemory topic='pets'",
"Reply to user",
]) is True
def test_false_when_only_tools_and_reply(self):
assert plan_requires_memory([
"webSearch query='foo'",
"Reply to the user.",
]) is False
def test_false_for_empty(self):
assert plan_requires_memory([]) is False
class TestStripMemoryDirectives:
def test_removes_directive(self):
plan = [
"searchMemory topic='pets'",
"Reply to user",
]
assert strip_memory_directives(plan) == ["Reply to user"]
def test_leaves_tool_only_plan_untouched(self):
plan = ["webSearch query='foo'", "Reply"]
assert strip_memory_directives(plan) == plan
class TestToolNamesInPlan:
def test_extracts_known_names_in_order(self):
plan = [
"webSearch query='a'",
"getWeather",
"webSearch query='b'", # duplicate should dedup
"Reply to the user.",
]
names = tool_names_in_plan(plan, ["webSearch", "getWeather", "stop"])
assert names == ["webSearch", "getWeather"]
def test_filters_unknown_names(self):
plan = ["hallucinatedTool x='y'", "webSearch query='q'", "Reply"]
assert tool_names_in_plan(plan, ["webSearch"]) == ["webSearch"]
def test_ignores_search_memory_directive(self):
plan = ["searchMemory topic='t'", "webSearch query='q'", "Reply"]
assert tool_names_in_plan(plan, ["webSearch", "searchMemory"]) == ["webSearch"]
def test_empty_plan(self):
assert tool_names_in_plan([], ["webSearch"]) == []
def test_extracts_hyphenated_mcp_tool_name(self):
"""MCP tool names embed the server in the prefix and use hyphens
(e.g. ``chrome-devtools__navigate_page``). The head regex must accept
hyphens so the planner-driven allow-list union and the
``_plan_under_specified`` guard don't misclassify a perfectly valid
plan step as paraphrased prose.
Field trace (2026-05-03): user said "navigate to youtube.com". Planner
emitted ``chrome-devtools__navigate_page page='...'`` correctly, but
the hyphen-stripping regex extracted only ``chrome``, which wasn't a
known tool — so direct-exec was skipped and the small chat model
flaked into the empty-reply fallback.
"""
plan = [
"chrome-devtools__navigate_page page='https://youtube.com'",
"Reply to the user.",
]
names = tool_names_in_plan(plan, ["chrome-devtools__navigate_page"])
assert names == ["chrome-devtools__navigate_page"]
class TestPlanHasUnresolvedToolSteps:
def test_true_when_step_paraphrases_tool(self):
plan = ["get the weather", "Reply to the user."]
assert plan_has_unresolved_tool_steps(plan, ["getWeather", "stop"]) is True
def test_false_when_step_names_tool(self):
plan = ["getWeather", "Reply to the user."]
assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
def test_false_for_reply_only_plan(self):
# No tool steps at all — the planner explicitly decided no tools.
assert plan_has_unresolved_tool_steps(
["Reply to the user."], ["getWeather"]
) is False
def test_false_for_empty_plan(self):
assert plan_has_unresolved_tool_steps([], ["getWeather"]) is False
def test_false_when_search_memory_only_and_reply(self):
# searchMemory is a directive, not a tool — but there's also no
# real tool step paraphrased either.
plan = ["searchMemory topic='t'", "Reply to the user."]
assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
def test_false_for_hyphenated_mcp_tool_step(self):
"""A concrete plan step naming a hyphenated MCP tool must NOT be
treated as under-specified — otherwise the engine skips direct-exec
and forces the chat model to take the turn instead."""
plan = [
"chrome-devtools__navigate_page page='https://youtube.com'",
"Reply to the user.",
]
assert plan_has_unresolved_tool_steps(
plan, ["chrome-devtools__navigate_page"]
) is False