Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

790
tests/test_planner.py Normal file
View File

@@ -0,0 +1,790 @@
"""Unit tests for the task-list planner.
These tests verify behaviours, not implementation: the parser cleans up
messy LLM output, trivial single-reply plans don't leak out, the
fail-open paths return an empty list, and the progress_nudge reflects
accurate step progression.
"""
from __future__ import annotations
from types import SimpleNamespace
from unittest.mock import patch
import pytest
from jarvis.reply import planner as planner_mod
from jarvis.reply.planner import (
MAX_STEPS,
SEARCH_MEMORY_DIRECTIVE,
_is_trivial_plan,
_parse_plan,
format_plan_block,
is_search_memory_step,
memory_topic_of,
plan_query,
plan_requires_memory,
progress_nudge,
resolve_next_tool_call,
resolve_planner_model,
strip_memory_directives,
plan_has_unresolved_tool_steps,
tool_names_in_plan,
tool_steps_of,
)
def _cfg(**overrides):
    """Return a stand-in config object for planner tests.

    The defaults describe an enabled planner with no dedicated planner,
    router or intent-judge model configured. Any keyword argument replaces
    the default of the same name, or adds a new attribute.
    """
    defaults = dict(
        ollama_base_url="http://localhost:11434",
        ollama_chat_model="gemma4:e2b",
        planner_model="",
        tool_router_model="",
        intent_judge_model="",
        planner_enabled=True,
        planner_timeout_sec=6.0,
    )
    # Later mappings win, so overrides take precedence over the defaults.
    return SimpleNamespace(**{**defaults, **overrides})
class TestParsePlan:
    """Behavioural checks for how ``_parse_plan`` tidies raw planner output."""

    def test_strips_numbering(self):
        """Leading ``1.`` / ``2.`` ordinals are removed from each step."""
        raw = "1. webSearch query='foo'\n2. Reply to user"
        assert _parse_plan(raw) == ["webSearch query='foo'", "Reply to user"]

    def test_strips_bullet_prefixes(self):
        """Dash, asterisk and round-bullet prefixes are all removed."""
        raw = "- step one\n* step two\n• step three"
        assert _parse_plan(raw) == ["step one", "step two", "step three"]

    def test_strips_wrapping_quotes(self):
        """Double quotes and backticks wrapping a whole step are peeled."""
        raw = '"step one"\n`step two`'
        assert _parse_plan(raw) == ["step one", "step two"]

    def test_ignores_json_fences_and_blank_lines(self):
        """Code-fence lines and empty lines contribute no steps."""
        raw = "```\nstep one\n\n```\nstep two"
        assert _parse_plan(raw) == ["step one", "step two"]

    def test_caps_at_max_steps(self):
        """Output longer than ``MAX_STEPS`` lines is cut to ``MAX_STEPS``."""
        raw = "\n".join(f"step {i}" for i in range(MAX_STEPS + 3))
        assert len(_parse_plan(raw)) == MAX_STEPS

    def test_truncates_overlong_step(self):
        """A single 500-character step is shortened and marked as truncated."""
        long = "a" * 500
        parsed = _parse_plan(long)
        assert len(parsed) == 1
        # The previous check was ``endswith("")``, which holds for every
        # string and so verified nothing. The 201-character bound below
        # (200 kept characters + 1 marker) implies a one-character marker.
        # NOTE(review): marker presumed to be the ellipsis U+2026 — confirm
        # against ``_parse_plan``.
        assert parsed[0].endswith("\u2026")
        assert len(parsed[0]) <= 201
class TestIsTrivialPlan:
    """``_is_trivial_plan`` is judged purely on the number of steps."""

    def test_empty_is_trivial(self):
        no_steps = []
        assert _is_trivial_plan(no_steps) is True

    def test_single_step_is_trivial_regardless_of_language(self):
        # Purely structural: any one-step plan counts as trivial, whatever
        # language (or tool call) the step is written in.
        one_step_plans = (
            ["Reply to the user."],
            ["Répondre à l'utilisateur."],
            ["ユーザーに返信する"],
            ["webSearch query='x'"],
        )
        for plan in one_step_plans:
            assert _is_trivial_plan(plan) is True

    def test_multi_step_is_not_trivial(self):
        multi_step_plans = (
            ["webSearch ...", "Reply to user"],
            ["a", "b", "c"],
        )
        for plan in multi_step_plans:
            assert _is_trivial_plan(plan) is False
class TestResolvePlannerModel:
    """Model selection: explicit planner model first, else the chat model."""

    def test_prefers_explicit_planner_model(self):
        config = _cfg(planner_model="gemma-plan", ollama_chat_model="chat")
        chosen = resolve_planner_model(config)
        assert chosen == "gemma-plan"

    def test_tracks_chat_model_by_default(self):
        config = _cfg(ollama_chat_model="gemma4:e2b")
        chosen = resolve_planner_model(config)
        assert chosen == "gemma4:e2b"

    def test_ignores_tool_router_model(self):
        # The planner follows the chat model, never the router: upgrading
        # the chat model through setup has to upgrade the planner as well.
        config = _cfg(tool_router_model="router-x", ollama_chat_model="chat-y")
        chosen = resolve_planner_model(config)
        assert chosen == "chat-y"

    def test_upgrading_chat_model_upgrades_planner(self):
        config = _cfg(ollama_chat_model="gpt-oss:20b")
        chosen = resolve_planner_model(config)
        assert chosen == "gpt-oss:20b"

    def test_returns_empty_when_no_candidates(self):
        config = _cfg(ollama_chat_model="")
        chosen = resolve_planner_model(config)
        assert chosen == ""
class TestPlanQuery:
    """End-to-end behaviour of ``plan_query`` with the LLM call patched out."""

    def test_short_query_returns_empty(self):
        """A very short utterance yields no plan."""
        steps = plan_query(_cfg(), "hi", "", [])
        assert steps == []

    def test_disabled_returns_empty(self):
        """With ``planner_enabled=False`` no plan is produced."""
        question = "what films did the director of Possessor make?"
        steps = plan_query(_cfg(planner_enabled=False), question, "", [])
        assert steps == []

    def test_missing_model_returns_empty(self):
        """With no chat model configured no plan is produced."""
        question = "what films did the director of Possessor make?"
        steps = plan_query(_cfg(ollama_chat_model=""), question, "", [])
        assert steps == []

    def test_returns_parsed_steps(self):
        """A three-line LLM answer comes back as three parsed steps."""
        llm_output = "\n".join(
            [
                "webSearch query='Possessor 2020 director'",
                "webSearch query='films by <director name from step 1>'",
                "Reply to the user with the combined findings.",
            ]
        )
        available_tools = [("webSearch", "Search the web.")]
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_output):
            steps = plan_query(
                _cfg(),
                "what films did the director of Possessor make?",
                "",
                available_tools,
            )
        assert len(steps) == 3
        first, last = steps[0], steps[-1]
        assert "Possessor" in first
        assert last.lower().startswith("reply")

    def test_single_reply_plan_is_preserved(self):
        """A one-step reply-only plan must come back intact, not as ``[]``.

        Such a plan is the planner's POSITIVE "no memory, no tools needed"
        signal. The engine tells ``[]`` (fail-open) apart from
        ``["Reply ..."]`` (explicit skip-everything decision) and uses the
        latter to bypass the memory extractor and tool router entirely.
        """
        with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
            steps = plan_query(_cfg(), "tell me a joke about cats please", "", [])
        assert steps == ["Reply to user."]

    def test_llm_failure_returns_empty(self):
        """A ``None`` result from the LLM call fails open to an empty plan."""
        available_tools = [("webSearch", "Search the web.")]
        with patch.object(planner_mod, "call_llm_direct", return_value=None):
            steps = plan_query(
                _cfg(),
                "what films did the director of Possessor make?",
                "",
                available_tools,
            )
        assert steps == []

    def test_memory_context_arg_still_accepted_for_back_compat(self):
        """``memory_context=`` is still accepted even though it is ignored.

        Older callers pass it positionally or by keyword. The planner now
        runs before memory search and no longer uses the value, but the
        signature has to keep accepting it so downstream code doesn't break.
        """
        with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
            steps = plan_query(
                _cfg(),
                "tell me a joke about cats please",
                "",
                [],
                memory_context="some old memory text",
            )
        assert steps == ["Reply to user."]

    def test_prompt_warns_against_fabricating_optional_arguments(self):
        """The prompt must say to omit unknown optional args, not invent them.

        2026-04-24 field regression: gemma4:e2b answered "how's the weather
        going to be today" with the plan step
        ``getWeather location='today'``. The temporal qualifier "today" was
        then geocoded to a village called "Todaya" in the Philippines,
        because the prompt had trained the small model to always supply a
        concrete argument even when the utterance offered none. This
        content assertion guards the fix, so the rule can't be silently
        reverted in a later prompt edit without a failing test pointing the
        editor at the behavioural consequence.
        """
        prompt_text = planner_mod._PROMPT_TEMPLATE.lower()
        assert "omit" in prompt_text, (
            "Planner prompt must tell the model to OMIT optional args "
            "when no value was provided."
        )
        # The guidance has to name the exact failure mode so the model
        # doesn't pattern-match on a generic 'omit' without knowing why.
        # (A prompt containing "do not fabricate" necessarily contains
        # "fabricate", so a single membership test covers both phrasings.)
        assert "fabricate" in prompt_text, (
            "Planner prompt must warn against fabricating argument values "
            "from unrelated words in the utterance."
        )
class TestFormatPlanBlock:
    """Rendering of a plan into the prompt block shown to the chat model."""

    def test_empty_returns_empty_string(self):
        rendered = format_plan_block([])
        assert rendered == ""

    def test_numbers_the_steps(self):
        rendered = format_plan_block(["step a", "step b"])
        # Each step is numbered from 1 and the block carries its heading.
        for expected_fragment in ("1. step a", "2. step b", "ACTION PLAN"):
            assert expected_fragment in rendered
class TestProgressNudge:
    """The reminder injected after each tool result tracks plan progress."""

    def test_empty_plan_returns_empty(self):
        nudge = progress_nudge([], 0)
        assert nudge == ""

    def test_single_reply_step_returns_empty(self):
        """A reply-only plan has no tool steps, hence nothing to nudge.

        The empty string tells the engine not to inject a progress reminder
        after the (non-existent) tool result.
        """
        nudge = progress_nudge(["Reply to user"], 0)
        assert nudge == ""

    def test_points_at_next_step(self):
        plan = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]

        before_any_tool = progress_nudge(plan, 0)
        assert "foo" in before_any_tool
        assert "0/2" in before_any_tool

        after_first_tool = progress_nudge(plan, 1)
        assert "bar" in after_first_tool
        assert "1/2" in after_first_tool

    def test_all_steps_done_prompts_synthesis(self):
        plan = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
        lowered = progress_nudge(plan, 2).lower()
        assert "all tool steps executed" in lowered or "synthes" in lowered
class TestResolveNextToolCall:
    """Turning one plan step into a concrete ``(tool name, arguments)`` pair.

    Covers both the LLM-backed resolver (patched via ``call_llm_direct``) and
    the deterministic fast path that parses concrete steps without the LLM.
    """

    @staticmethod
    def _function_tool(name, description, parameters):
        """Wrap the given pieces in a function-tool schema entry."""
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": parameters,
            },
        }

    def _schema(self):
        """Schema list exposing a single ``webSearch(query: string)`` tool."""
        web_search = self._function_tool(
            "webSearch",
            "Search the web.",
            {
                "type": "object",
                "properties": {"query": {"type": "string"}},
            },
        )
        return [web_search]

    def test_returns_tool_and_args(self):
        llm_answer = '{"name": "webSearch", "arguments": {"query": "weather in Paris"}}'
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), "webSearch query='weather in Paris'", [], self._schema()
            )
        assert resolved == ("webSearch", {"query": "weather in Paris"})

    def test_rejects_unknown_tool(self):
        llm_answer = '{"name": "mysteryTool", "arguments": {}}'
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), "do the thing", [], self._schema()
            )
        assert resolved is None

    def test_null_means_synthesis(self):
        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
            resolved = resolve_next_tool_call(
                _cfg(), "Reply to user", [], self._schema()
            )
        assert resolved is None

    def test_peels_markdown_fences(self):
        llm_answer = '```json\n{"name": "webSearch", "arguments": {"query": "x"}}\n```'
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), "search for x", [], self._schema()
            )
        assert resolved == ("webSearch", {"query": "x"})

    def test_invalid_json_returns_none(self):
        with patch.object(planner_mod, "call_llm_direct", return_value="not json"):
            resolved = resolve_next_tool_call(
                _cfg(), "do the thing", [], self._schema()
            )
        assert resolved is None

    def test_missing_schema_returns_none(self):
        resolved = resolve_next_tool_call(_cfg(), "do the thing", [], [])
        assert resolved is None

    def test_drops_unknown_argument_keys(self):
        llm_answer = (
            '{"name": "webSearch", "arguments": '
            '{"query": "weather", "evil_key": "shell"}}'
        )
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), "search weather", [], self._schema()
            )
        assert resolved == ("webSearch", {"query": "weather"})

    def test_deterministic_parse_skips_llm_for_concrete_step(self):
        """A fully concrete step is parsed without consulting the LLM.

        "Concrete" means a tool name plus ``key='value'`` args with no
        ``<placeholder>``.

        Motivation (2026-04-24 field trace): a follow-up query produced the
        plan `webSearch query='Justin Bieber most famous songs'` — trivially
        concrete — but the LLM resolver flaked (returned ``null`` or
        garbage), the engine fell back to the chat model, and that model
        refused. Parsing concrete steps deterministically removes the LLM
        call as a failure surface for the common case.
        """
        llm_invocations = []

        def _recording_llm(*args, **kwargs):
            llm_invocations.append((args, kwargs))
            return "null"

        with patch.object(planner_mod, "call_llm_direct", side_effect=_recording_llm):
            resolved = resolve_next_tool_call(
                _cfg(),
                "webSearch query='Justin Bieber most famous songs'",
                [],
                self._schema(),
            )
        expected = ("webSearch", {"query": "Justin Bieber most famous songs"})
        assert resolved == expected
        assert len(llm_invocations) == 0, (
            f"LLM should not be called for a concrete step; was called {len(llm_invocations)}×"
        )

    def test_deterministic_parse_still_rejects_unknown_tool(self):
        """The fast path honours the allow-list too.

        A concrete step naming a tool absent from the schema resolves to
        ``None`` rather than to an unfiltered dispatch.
        """
        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
            resolved = resolve_next_tool_call(
                _cfg(),
                "mysteryTool query='anything'",
                [],
                self._schema(),
            )
        assert resolved is None

    def test_falls_back_to_llm_when_step_has_placeholder(self):
        """Steps with an ``<entity from step N>`` placeholder go to the LLM.

        Substituting the entity from prior results needs the LLM resolver,
        so the fast path must decline and defer.
        """
        llm_answer = (
            '{"name": "webSearch", "arguments": '
            '{"query": "films directed by Brandon Cronenberg"}}'
        )
        prior_results = [
            (
                "webSearch",
                '{"query": "Possessor director"}',
                "Possessor directed by Brandon Cronenberg.",
            )
        ]
        with patch.object(
            planner_mod, "call_llm_direct", return_value=llm_answer
        ) as llm_mock:
            resolved = resolve_next_tool_call(
                _cfg(),
                "webSearch query='films directed by <director name from step 1>'",
                prior_results,
                self._schema(),
            )
        expected = ("webSearch", {"query": "films directed by Brandon Cronenberg"})
        assert resolved == expected
        assert llm_mock.called, "Placeholder substitution must go through the LLM"

    def test_deterministic_parse_accepts_bare_tool_name_as_empty_args(self):
        """A step that is just a tool name parses to ``(name, {})``, LLM-free.

        This is the shape the planner emits when it follows the "omit
        optional arguments" rule — e.g. a weather query with no named place
        plans as ``getWeather`` (no args), and the tool auto-derives the
        location from the user's geoip context.
        """
        weather_schema = [
            self._function_tool(
                "getWeather",
                "Weather.",
                {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": [],
                },
            )
        ]
        with patch.object(planner_mod, "call_llm_direct") as llm_mock:
            resolved = resolve_next_tool_call(_cfg(), "getWeather", [], weather_schema)
        assert resolved == ("getWeather", {})
        assert not llm_mock.called, (
            "Bare tool name must not trigger an LLM round-trip"
        )

    def test_deterministic_parse_handles_double_quoted_values(self):
        """Planner output occasionally uses double quotes — parse both."""
        with patch.object(planner_mod, "call_llm_direct") as llm_mock:
            resolved = resolve_next_tool_call(
                _cfg(),
                'webSearch query="weather in Paris"',
                [],
                self._schema(),
            )
        assert resolved == ("webSearch", {"query": "weather in Paris"})
        assert not llm_mock.called

    def test_deterministic_parse_handles_hyphenated_mcp_tool_name(self):
        """Hyphenated MCP tool names must parse on the fast path.

        Names like ``chrome-devtools__navigate_page`` contain hyphens. The
        fast-path parser has to accept them and yield a clean
        ``(name, args)`` without an LLM round-trip — otherwise the engine
        falls through to the chat model, which on small models flakes into
        the empty-reply fallback.
        """
        navigate_schema = [
            self._function_tool(
                "chrome-devtools__navigate_page",
                "Navigate the browser to a URL.",
                {
                    "type": "object",
                    "properties": {"url": {"type": "string"}},
                },
            )
        ]
        with patch.object(planner_mod, "call_llm_direct") as llm_mock:
            resolved = resolve_next_tool_call(
                _cfg(),
                "chrome-devtools__navigate_page url='https://youtube.com'",
                [],
                navigate_schema,
            )
        expected = (
            "chrome-devtools__navigate_page",
            {"url": "https://youtube.com"},
        )
        assert resolved == expected
        assert not llm_mock.called, (
            "Hyphenated MCP tool name must parse without an LLM round-trip"
        )

    def test_keeps_args_as_is_when_schema_has_no_properties(self):
        freeform_schema = [
            self._function_tool("freeform", "freeform", {"type": "object"})
        ]
        llm_answer = '{"name": "freeform", "arguments": {"anything": "goes"}}'
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(_cfg(), "do it", [], freeform_schema)
        assert resolved == ("freeform", {"anything": "goes"})
class TestUrlArgNormalisation:
    """Chrome/browser MCP tools must receive a fully-qualified URL.

    Field trace (2026-05): for the user query "navigate to youtube.com" the
    planner emitted ``page='[youtube.com](http://youtube.com)'``. The
    slow-path resolver remapped the key to ``url`` (the schema's actual
    property) but kept the markdown wrapper as the value, so
    chrome-devtools-mcp received
    ``{"url": "[youtube.com](http://youtube.com)"}`` and Puppeteer's
    Page.navigate rejected it with "Cannot navigate to invalid URL".
    A scheme-less bare ``youtube.com`` value fails the same way.

    The fix is generic: any URL-keyed string value is markdown-stripped and
    scheme-prepended before it leaves the planner.
    """

    # Tool name shared by the navigation scenarios below.
    _NAVIGATE = "chrome-devtools__navigate_page"

    def _navigate_schema(self):
        """Schema list exposing only the navigate tool with a ``url`` string."""
        parameters = {
            "type": "object",
            "properties": {
                "url": {"type": "string"},
            },
        }
        function_spec = {
            "name": "chrome-devtools__navigate_page",
            "description": "Navigate to a URL.",
            "parameters": parameters,
        }
        return [{"type": "function", "function": function_spec}]

    def test_strips_markdown_link_wrapper_in_slow_path(self):
        llm_answer = (
            '{"name": "chrome-devtools__navigate_page", "arguments": '
            '{"url": "[youtube.com](http://youtube.com)"}}'
        )
        step = f"{self._NAVIGATE} page='[youtube.com](http://youtube.com)'"
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), step, [], self._navigate_schema()
            )
        assert resolved == (self._NAVIGATE, {"url": "http://youtube.com"})

    def test_prepends_scheme_to_bare_domain_in_slow_path(self):
        llm_answer = (
            '{"name": "chrome-devtools__navigate_page", "arguments": '
            '{"url": "youtube.com"}}'
        )
        step = f"{self._NAVIGATE} page='youtube.com'"
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), step, [], self._navigate_schema()
            )
        assert resolved == (self._NAVIGATE, {"url": "https://youtube.com"})

    def test_prepends_scheme_to_bare_domain_in_fast_path(self):
        """Normalisation also applies on the deterministic fast path.

        ``url='youtube.com'`` is parsed without the LLM; the scheme must
        still be added so the common case, where the planner uses the right
        key name, does not regress.
        """
        step = f"{self._NAVIGATE} url='youtube.com'"
        with patch.object(
            planner_mod, "call_llm_direct", return_value="null"
        ) as llm_mock:
            resolved = resolve_next_tool_call(
                _cfg(), step, [], self._navigate_schema()
            )
        assert resolved == (self._NAVIGATE, {"url": "https://youtube.com"})
        assert llm_mock.call_count == 0, "fast path must not call the LLM resolver"

    def test_preserves_already_qualified_url(self):
        step = f"{self._NAVIGATE} url='https://youtube.com/feed/trending'"
        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
            resolved = resolve_next_tool_call(
                _cfg(), step, [], self._navigate_schema()
            )
        assert resolved == (
            self._NAVIGATE,
            {"url": "https://youtube.com/feed/trending"},
        )

    def test_does_not_touch_unrelated_string_args(self):
        """Only URL-keyed values are normalised.

        A ``query='youtube.com tutorials'`` arg on webSearch must stay
        literal.
        """
        search_schema = [
            {
                "type": "function",
                "function": {
                    "name": "webSearch",
                    "description": "Search.",
                    "parameters": {
                        "type": "object",
                        "properties": {"query": {"type": "string"}},
                    },
                },
            }
        ]
        llm_answer = (
            '{"name": "webSearch", "arguments": '
            '{"query": "youtube.com tutorials"}}'
        )
        with patch.object(planner_mod, "call_llm_direct", return_value=llm_answer):
            resolved = resolve_next_tool_call(
                _cfg(), "webSearch find tutorials", [], search_schema
            )
        assert resolved == ("webSearch", {"query": "youtube.com tutorials"})
class TestToolStepsOf:
    """``tool_steps_of`` keeps only the steps that invoke a real tool."""

    def test_multi_step_drops_final_synthesis_step(self):
        remaining = tool_steps_of(["a", "b", "reply"])
        assert remaining == ["a", "b"]

    def test_single_step_has_no_tool_steps(self):
        """A one-step plan is reply-only by contract (rule 9).

        It therefore contributes no tool steps; the engine relies on this
        to skip the direct-exec path and the progress nudge for pure-reply
        plans.
        """
        remaining = tool_steps_of(["only"])
        assert remaining == []

    def test_empty_plan(self):
        remaining = tool_steps_of([])
        assert remaining == []

    def test_strips_search_memory_directive(self):
        steps = [
            "searchMemory topic='user preferences'",
            "webSearch query='foo'",
            "Reply to the user.",
        ]
        remaining = tool_steps_of(steps)
        assert remaining == ["webSearch query='foo'"]
class TestIsSearchMemoryStep:
    """Recognition of the ``searchMemory`` planner directive."""

    def test_detects_directive(self):
        # Leading whitespace and upper-casing must not hide the directive.
        for step in ("searchMemory topic='x'", " SEARCHMEMORY topic='x'"):
            assert is_search_memory_step(step) is True

    def test_rejects_other_steps(self):
        for step in ("webSearch query='foo'", "Reply to the user."):
            assert is_search_memory_step(step) is False
class TestMemoryTopicOf:
    """Extraction of the ``topic=`` value from a ``searchMemory`` step."""

    def test_single_quoted(self):
        topic = memory_topic_of("searchMemory topic='pets'")
        assert topic == "pets"

    def test_double_quoted(self):
        topic = memory_topic_of('searchMemory topic="favourite films"')
        assert topic == "favourite films"

    def test_bare_value(self):
        topic = memory_topic_of("searchMemory topic=preferences")
        assert topic == "preferences"

    def test_missing_topic_returns_empty(self):
        topic = memory_topic_of("searchMemory")
        assert topic == ""
class TestPlanRequiresMemory:
    """A plan needs memory exactly when it carries a searchMemory step."""

    def test_true_when_directive_present(self):
        steps = ["searchMemory topic='pets'", "Reply to user"]
        assert plan_requires_memory(steps) is True

    def test_false_when_only_tools_and_reply(self):
        steps = ["webSearch query='foo'", "Reply to the user."]
        assert plan_requires_memory(steps) is False

    def test_false_for_empty(self):
        steps = []
        assert plan_requires_memory(steps) is False
class TestStripMemoryDirectives:
    """``strip_memory_directives`` drops searchMemory steps and nothing else."""

    def test_removes_directive(self):
        steps = ["searchMemory topic='pets'", "Reply to user"]
        cleaned = strip_memory_directives(steps)
        assert cleaned == ["Reply to user"]

    def test_leaves_tool_only_plan_untouched(self):
        steps = ["webSearch query='foo'", "Reply"]
        cleaned = strip_memory_directives(steps)
        assert cleaned == steps
class TestToolNamesInPlan:
    """Which known tool names a plan mentions, in first-seen order."""

    def test_extracts_known_names_in_order(self):
        steps = [
            "webSearch query='a'",
            "getWeather",
            "webSearch query='b'",  # repeated tool: reported only once
            "Reply to the user.",
        ]
        known_tools = ["webSearch", "getWeather", "stop"]
        found = tool_names_in_plan(steps, known_tools)
        assert found == ["webSearch", "getWeather"]

    def test_filters_unknown_names(self):
        steps = ["hallucinatedTool x='y'", "webSearch query='q'", "Reply"]
        found = tool_names_in_plan(steps, ["webSearch"])
        assert found == ["webSearch"]

    def test_ignores_search_memory_directive(self):
        steps = ["searchMemory topic='t'", "webSearch query='q'", "Reply"]
        found = tool_names_in_plan(steps, ["webSearch", "searchMemory"])
        assert found == ["webSearch"]

    def test_empty_plan(self):
        found = tool_names_in_plan([], ["webSearch"])
        assert found == []

    def test_extracts_hyphenated_mcp_tool_name(self):
        """Hyphenated MCP tool names must be extracted whole.

        MCP tool names embed the server in the prefix and use hyphens
        (e.g. ``chrome-devtools__navigate_page``). The head regex has to
        accept hyphens so that the planner-driven allow-list union and the
        ``_plan_under_specified`` guard don't misclassify a perfectly valid
        plan step as paraphrased prose.

        Field trace (2026-05-03): the user said "navigate to youtube.com".
        The planner correctly emitted
        ``chrome-devtools__navigate_page page='...'``, but the
        hyphen-stripping regex extracted only ``chrome``, which wasn't a
        known tool — so direct-exec was skipped and the small chat model
        flaked into the empty-reply fallback.
        """
        steps = [
            "chrome-devtools__navigate_page page='https://youtube.com'",
            "Reply to the user.",
        ]
        found = tool_names_in_plan(steps, ["chrome-devtools__navigate_page"])
        assert found == ["chrome-devtools__navigate_page"]
class TestPlanHasUnresolvedToolSteps:
    """Detects tool steps that paraphrase a tool instead of naming it."""

    def test_true_when_step_paraphrases_tool(self):
        steps = ["get the weather", "Reply to the user."]
        unresolved = plan_has_unresolved_tool_steps(steps, ["getWeather", "stop"])
        assert unresolved is True

    def test_false_when_step_names_tool(self):
        steps = ["getWeather", "Reply to the user."]
        unresolved = plan_has_unresolved_tool_steps(steps, ["getWeather"])
        assert unresolved is False

    def test_false_for_reply_only_plan(self):
        # No tool steps at all — the planner explicitly decided on no tools.
        steps = ["Reply to the user."]
        unresolved = plan_has_unresolved_tool_steps(steps, ["getWeather"])
        assert unresolved is False

    def test_false_for_empty_plan(self):
        unresolved = plan_has_unresolved_tool_steps([], ["getWeather"])
        assert unresolved is False

    def test_false_when_search_memory_only_and_reply(self):
        # searchMemory is a directive, not a tool — and no real tool step
        # is paraphrased here either.
        steps = ["searchMemory topic='t'", "Reply to the user."]
        unresolved = plan_has_unresolved_tool_steps(steps, ["getWeather"])
        assert unresolved is False

    def test_false_for_hyphenated_mcp_tool_step(self):
        """A concrete hyphenated MCP step is not under-specified.

        Treating it as such would make the engine skip direct-exec and
        force the chat model to take the turn instead.
        """
        steps = [
            "chrome-devtools__navigate_page page='https://youtube.com'",
            "Reply to the user.",
        ]
        unresolved = plan_has_unresolved_tool_steps(
            steps, ["chrome-devtools__navigate_page"]
        )
        assert unresolved is False