Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
790
tests/test_planner.py
Normal file
790
tests/test_planner.py
Normal file
@@ -0,0 +1,790 @@
|
||||
"""Unit tests for the task-list planner.
|
||||
|
||||
These tests verify behaviours, not implementation: the parser cleans up
|
||||
messy LLM output, trivial single-reply plans don't leak out, the
|
||||
fail-open paths return an empty list, and the progress_nudge reflects
|
||||
accurate step progression.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from jarvis.reply import planner as planner_mod
|
||||
from jarvis.reply.planner import (
|
||||
MAX_STEPS,
|
||||
SEARCH_MEMORY_DIRECTIVE,
|
||||
_is_trivial_plan,
|
||||
_parse_plan,
|
||||
format_plan_block,
|
||||
is_search_memory_step,
|
||||
memory_topic_of,
|
||||
plan_query,
|
||||
plan_requires_memory,
|
||||
progress_nudge,
|
||||
resolve_next_tool_call,
|
||||
resolve_planner_model,
|
||||
strip_memory_directives,
|
||||
plan_has_unresolved_tool_steps,
|
||||
tool_names_in_plan,
|
||||
tool_steps_of,
|
||||
)
|
||||
|
||||
|
||||
def _cfg(**overrides):
|
||||
base = {
|
||||
"ollama_base_url": "http://localhost:11434",
|
||||
"ollama_chat_model": "gemma4:e2b",
|
||||
"planner_model": "",
|
||||
"tool_router_model": "",
|
||||
"intent_judge_model": "",
|
||||
"planner_enabled": True,
|
||||
"planner_timeout_sec": 6.0,
|
||||
}
|
||||
base.update(overrides)
|
||||
return SimpleNamespace(**base)
|
||||
|
||||
|
||||
class TestParsePlan:
|
||||
def test_strips_numbering(self):
|
||||
raw = "1. webSearch query='foo'\n2. Reply to user"
|
||||
assert _parse_plan(raw) == ["webSearch query='foo'", "Reply to user"]
|
||||
|
||||
def test_strips_bullet_prefixes(self):
|
||||
raw = "- step one\n* step two\n• step three"
|
||||
assert _parse_plan(raw) == ["step one", "step two", "step three"]
|
||||
|
||||
def test_strips_wrapping_quotes(self):
|
||||
raw = '"step one"\n`step two`'
|
||||
assert _parse_plan(raw) == ["step one", "step two"]
|
||||
|
||||
def test_ignores_json_fences_and_blank_lines(self):
|
||||
raw = "```\nstep one\n\n```\nstep two"
|
||||
assert _parse_plan(raw) == ["step one", "step two"]
|
||||
|
||||
def test_caps_at_max_steps(self):
|
||||
raw = "\n".join(f"step {i}" for i in range(MAX_STEPS + 3))
|
||||
assert len(_parse_plan(raw)) == MAX_STEPS
|
||||
|
||||
def test_truncates_overlong_step(self):
|
||||
long = "a" * 500
|
||||
parsed = _parse_plan(long)
|
||||
assert len(parsed) == 1
|
||||
assert parsed[0].endswith("…")
|
||||
assert len(parsed[0]) <= 201
|
||||
|
||||
|
||||
class TestIsTrivialPlan:
|
||||
def test_empty_is_trivial(self):
|
||||
assert _is_trivial_plan([]) is True
|
||||
|
||||
def test_single_step_is_trivial_regardless_of_language(self):
|
||||
# Purely structural: any 1-step plan is trivial. Language-agnostic.
|
||||
assert _is_trivial_plan(["Reply to the user."]) is True
|
||||
assert _is_trivial_plan(["Répondre à l'utilisateur."]) is True
|
||||
assert _is_trivial_plan(["ユーザーに返信する"]) is True
|
||||
assert _is_trivial_plan(["webSearch query='x'"]) is True
|
||||
|
||||
def test_multi_step_is_not_trivial(self):
|
||||
assert _is_trivial_plan(["webSearch ...", "Reply to user"]) is False
|
||||
assert _is_trivial_plan(["a", "b", "c"]) is False
|
||||
|
||||
|
||||
class TestResolvePlannerModel:
|
||||
def test_prefers_explicit_planner_model(self):
|
||||
cfg = _cfg(planner_model="gemma-plan", ollama_chat_model="chat")
|
||||
assert resolve_planner_model(cfg) == "gemma-plan"
|
||||
|
||||
def test_tracks_chat_model_by_default(self):
|
||||
cfg = _cfg(ollama_chat_model="gemma4:e2b")
|
||||
assert resolve_planner_model(cfg) == "gemma4:e2b"
|
||||
|
||||
def test_ignores_tool_router_model(self):
|
||||
# Planner must track the chat model — not the router. Upgrading
|
||||
# the chat model through setup must upgrade the planner too.
|
||||
cfg = _cfg(tool_router_model="router-x", ollama_chat_model="chat-y")
|
||||
assert resolve_planner_model(cfg) == "chat-y"
|
||||
|
||||
def test_upgrading_chat_model_upgrades_planner(self):
|
||||
cfg = _cfg(ollama_chat_model="gpt-oss:20b")
|
||||
assert resolve_planner_model(cfg) == "gpt-oss:20b"
|
||||
|
||||
def test_returns_empty_when_no_candidates(self):
|
||||
cfg = _cfg(ollama_chat_model="")
|
||||
assert resolve_planner_model(cfg) == ""
|
||||
|
||||
|
||||
class TestPlanQuery:
|
||||
def test_short_query_returns_empty(self):
|
||||
cfg = _cfg()
|
||||
assert plan_query(cfg, "hi", "", []) == []
|
||||
|
||||
def test_disabled_returns_empty(self):
|
||||
cfg = _cfg(planner_enabled=False)
|
||||
long = "what films did the director of Possessor make?"
|
||||
assert plan_query(cfg, long, "", []) == []
|
||||
|
||||
def test_missing_model_returns_empty(self):
|
||||
cfg = _cfg(ollama_chat_model="")
|
||||
long = "what films did the director of Possessor make?"
|
||||
assert plan_query(cfg, long, "", []) == []
|
||||
|
||||
def test_returns_parsed_steps(self):
|
||||
cfg = _cfg()
|
||||
raw_plan = (
|
||||
"webSearch query='Possessor 2020 director'\n"
|
||||
"webSearch query='films by <director name from step 1>'\n"
|
||||
"Reply to the user with the combined findings."
|
||||
)
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw_plan):
|
||||
steps = plan_query(
|
||||
cfg,
|
||||
"what films did the director of Possessor make?",
|
||||
"",
|
||||
[("webSearch", "Search the web.")],
|
||||
)
|
||||
assert len(steps) == 3
|
||||
assert "Possessor" in steps[0]
|
||||
assert steps[-1].lower().startswith("reply")
|
||||
|
||||
def test_single_reply_plan_is_preserved(self):
|
||||
"""A 1-step reply-only plan is the planner's POSITIVE "no memory,
|
||||
no tools needed" signal. It must NOT be filtered to [] — the
|
||||
engine distinguishes [] (fail-open) from ["Reply ..."] (explicit
|
||||
skip-everything decision) and uses the latter to skip the
|
||||
memory extractor and tool router entirely.
|
||||
"""
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
|
||||
steps = plan_query(
|
||||
cfg,
|
||||
"tell me a joke about cats please",
|
||||
"",
|
||||
[],
|
||||
)
|
||||
assert steps == ["Reply to user."]
|
||||
|
||||
def test_llm_failure_returns_empty(self):
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=None):
|
||||
steps = plan_query(
|
||||
cfg,
|
||||
"what films did the director of Possessor make?",
|
||||
"",
|
||||
[("webSearch", "Search the web.")],
|
||||
)
|
||||
assert steps == []
|
||||
|
||||
def test_memory_context_arg_still_accepted_for_back_compat(self):
|
||||
"""Old callers pass `memory_context=` as a positional or keyword
|
||||
argument. Planner now ignores it (the planner runs before memory
|
||||
search), but the signature must still accept it so downstream
|
||||
code doesn't break."""
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
|
||||
steps = plan_query(
|
||||
cfg,
|
||||
"tell me a joke about cats please",
|
||||
"",
|
||||
[],
|
||||
memory_context="some old memory text",
|
||||
)
|
||||
assert steps == ["Reply to user."]
|
||||
|
||||
def test_prompt_warns_against_fabricating_optional_arguments(self):
|
||||
"""The planner prompt must explicitly tell the model to omit
|
||||
optional arguments when the user didn't supply a value, and warn
|
||||
against grabbing unrelated words from the utterance as fake values.
|
||||
|
||||
2026-04-24 field regression: gemma4:e2b responded to "how's the
|
||||
weather going to be today" with a plan step of
|
||||
``getWeather location='today'``. The temporal qualifier "today"
|
||||
was geocoded to a village called "Todaya" in the Philippines —
|
||||
because the small model was trained by our prompt to always give
|
||||
a concrete argument, even when the user's utterance had none to
|
||||
give. This content-assertion guards the fix so the rule can't be
|
||||
silently reverted during future prompt edits without a test
|
||||
failure pointing the editor at the behavioural consequence.
|
||||
"""
|
||||
prompt = planner_mod._PROMPT_TEMPLATE.lower()
|
||||
assert "omit" in prompt, (
|
||||
"Planner prompt must tell the model to OMIT optional args "
|
||||
"when no value was provided."
|
||||
)
|
||||
# The guidance must name the exact failure mode so the model
|
||||
# doesn't pattern-match on generic 'omit' without knowing why.
|
||||
assert "fabricate" in prompt or "do not fabricate" in prompt, (
|
||||
"Planner prompt must warn against fabricating argument values "
|
||||
"from unrelated words in the utterance."
|
||||
)
|
||||
|
||||
|
||||
class TestFormatPlanBlock:
|
||||
def test_empty_returns_empty_string(self):
|
||||
assert format_plan_block([]) == ""
|
||||
|
||||
def test_numbers_the_steps(self):
|
||||
block = format_plan_block(["step a", "step b"])
|
||||
assert "1. step a" in block
|
||||
assert "2. step b" in block
|
||||
assert "ACTION PLAN" in block
|
||||
|
||||
|
||||
class TestProgressNudge:
|
||||
def test_empty_plan_returns_empty(self):
|
||||
assert progress_nudge([], 0) == ""
|
||||
|
||||
def test_single_reply_step_returns_empty(self):
|
||||
"""A 1-step reply-only plan has no tool steps, so there is
|
||||
nothing to nudge. The empty string tells the engine to skip
|
||||
injecting a progress reminder after the (non-existent) tool
|
||||
result."""
|
||||
assert progress_nudge(["Reply to user"], 0) == ""
|
||||
|
||||
def test_points_at_next_step(self):
|
||||
steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
|
||||
msg = progress_nudge(steps, 0)
|
||||
assert "foo" in msg
|
||||
assert "0/2" in msg
|
||||
msg2 = progress_nudge(steps, 1)
|
||||
assert "bar" in msg2
|
||||
assert "1/2" in msg2
|
||||
|
||||
def test_all_steps_done_prompts_synthesis(self):
|
||||
steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
|
||||
msg = progress_nudge(steps, 2)
|
||||
assert "all tool steps executed" in msg.lower() or "synthes" in msg.lower()
|
||||
|
||||
|
||||
class TestResolveNextToolCall:
|
||||
def _schema(self):
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "webSearch",
|
||||
"description": "Search the web.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"query": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
def test_returns_tool_and_args(self):
|
||||
cfg = _cfg()
|
||||
raw = '{"name": "webSearch", "arguments": {"query": "weather in Paris"}}'
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg, "webSearch query='weather in Paris'", [], self._schema()
|
||||
)
|
||||
assert result == ("webSearch", {"query": "weather in Paris"})
|
||||
|
||||
def test_rejects_unknown_tool(self):
|
||||
cfg = _cfg()
|
||||
raw = '{"name": "mysteryTool", "arguments": {}}'
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
assert resolve_next_tool_call(
|
||||
cfg, "do the thing", [], self._schema()
|
||||
) is None
|
||||
|
||||
def test_null_means_synthesis(self):
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
|
||||
assert resolve_next_tool_call(
|
||||
cfg, "Reply to user", [], self._schema()
|
||||
) is None
|
||||
|
||||
def test_peels_markdown_fences(self):
|
||||
cfg = _cfg()
|
||||
raw = '```json\n{"name": "webSearch", "arguments": {"query": "x"}}\n```'
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg, "search for x", [], self._schema()
|
||||
)
|
||||
assert result == ("webSearch", {"query": "x"})
|
||||
|
||||
def test_invalid_json_returns_none(self):
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="not json"):
|
||||
assert resolve_next_tool_call(
|
||||
cfg, "do the thing", [], self._schema()
|
||||
) is None
|
||||
|
||||
def test_missing_schema_returns_none(self):
|
||||
cfg = _cfg()
|
||||
assert resolve_next_tool_call(cfg, "do the thing", [], []) is None
|
||||
|
||||
def test_drops_unknown_argument_keys(self):
|
||||
cfg = _cfg()
|
||||
raw = (
|
||||
'{"name": "webSearch", "arguments": '
|
||||
'{"query": "weather", "evil_key": "shell"}}'
|
||||
)
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg, "search weather", [], self._schema()
|
||||
)
|
||||
assert result == ("webSearch", {"query": "weather"})
|
||||
|
||||
def test_deterministic_parse_skips_llm_for_concrete_step(self):
|
||||
"""A fully concrete plan step (tool name + `key='value'` args, no
|
||||
``<placeholder>``) must be parsed deterministically without calling
|
||||
the LLM resolver at all.
|
||||
|
||||
Motivation (2026-04-24 field trace): a follow-up query produced the
|
||||
plan `webSearch query='Justin Bieber most famous songs'` — trivially
|
||||
concrete — but the LLM resolver flaked (returned ``null`` or
|
||||
garbage) and the engine fell back to the chat model, which then
|
||||
refused. Parsing concrete steps deterministically removes the LLM
|
||||
call as a failure surface for the common case.
|
||||
"""
|
||||
cfg = _cfg()
|
||||
call_count = [0]
|
||||
|
||||
def _spy(*args, **kwargs):
|
||||
call_count[0] += 1
|
||||
return "null"
|
||||
|
||||
with patch.object(planner_mod, "call_llm_direct", side_effect=_spy):
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"webSearch query='Justin Bieber most famous songs'",
|
||||
[],
|
||||
self._schema(),
|
||||
)
|
||||
|
||||
assert result == (
|
||||
"webSearch",
|
||||
{"query": "Justin Bieber most famous songs"},
|
||||
)
|
||||
assert call_count[0] == 0, (
|
||||
f"LLM should not be called for a concrete step; was called {call_count[0]}×"
|
||||
)
|
||||
|
||||
def test_deterministic_parse_still_rejects_unknown_tool(self):
|
||||
"""The fast path must still honour the allow-list — a concrete step
|
||||
naming a tool not in the schema falls through to ``None``, not to an
|
||||
unfiltered dispatch."""
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
|
||||
assert resolve_next_tool_call(
|
||||
cfg,
|
||||
"mysteryTool query='anything'",
|
||||
[],
|
||||
self._schema(),
|
||||
) is None
|
||||
|
||||
def test_falls_back_to_llm_when_step_has_placeholder(self):
|
||||
"""Steps containing an ``<entity from step N>`` placeholder need
|
||||
entity substitution from prior results — that requires the LLM
|
||||
resolver, so the fast path must decline and defer."""
|
||||
cfg = _cfg()
|
||||
raw = (
|
||||
'{"name": "webSearch", "arguments": '
|
||||
'{"query": "films directed by Brandon Cronenberg"}}'
|
||||
)
|
||||
with patch.object(
|
||||
planner_mod, "call_llm_direct", return_value=raw,
|
||||
) as spy:
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"webSearch query='films directed by <director name from step 1>'",
|
||||
[("webSearch", '{"query": "Possessor director"}',
|
||||
"Possessor directed by Brandon Cronenberg.")],
|
||||
self._schema(),
|
||||
)
|
||||
assert result == (
|
||||
"webSearch",
|
||||
{"query": "films directed by Brandon Cronenberg"},
|
||||
)
|
||||
assert spy.called, "Placeholder substitution must go through the LLM"
|
||||
|
||||
def test_deterministic_parse_accepts_bare_tool_name_as_empty_args(self):
|
||||
"""A plan step naming the tool with no trailing args must parse to
|
||||
``(name, {})`` without an LLM call.
|
||||
|
||||
This is the shape the planner emits when it follows the
|
||||
"omit optional arguments" rule — e.g. a weather query with no
|
||||
named place plans as ``getWeather`` (no args), and the tool
|
||||
auto-derives location from the user's geoip context.
|
||||
"""
|
||||
cfg = _cfg()
|
||||
schema = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "getWeather",
|
||||
"description": "Weather.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"location": {"type": "string"}},
|
||||
"required": [],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
with patch.object(planner_mod, "call_llm_direct") as spy:
|
||||
result = resolve_next_tool_call(cfg, "getWeather", [], schema)
|
||||
assert result == ("getWeather", {})
|
||||
assert not spy.called, (
|
||||
"Bare tool name must not trigger an LLM round-trip"
|
||||
)
|
||||
|
||||
def test_deterministic_parse_handles_double_quoted_values(self):
|
||||
"""Planner output occasionally uses double quotes — parse both."""
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct") as spy:
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
'webSearch query="weather in Paris"',
|
||||
[],
|
||||
self._schema(),
|
||||
)
|
||||
assert result == ("webSearch", {"query": "weather in Paris"})
|
||||
assert not spy.called
|
||||
|
||||
def test_deterministic_parse_handles_hyphenated_mcp_tool_name(self):
|
||||
"""MCP tool names like ``chrome-devtools__navigate_page`` contain
|
||||
hyphens. The fast-path parser must accept them and produce a clean
|
||||
``(name, args)`` without an LLM round-trip — otherwise the engine
|
||||
falls through to the chat model, which on small models flakes into
|
||||
the empty-reply fallback."""
|
||||
cfg = _cfg()
|
||||
schema = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "chrome-devtools__navigate_page",
|
||||
"description": "Navigate the browser to a URL.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"url": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
with patch.object(planner_mod, "call_llm_direct") as spy:
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"chrome-devtools__navigate_page url='https://youtube.com'",
|
||||
[],
|
||||
schema,
|
||||
)
|
||||
assert result == (
|
||||
"chrome-devtools__navigate_page",
|
||||
{"url": "https://youtube.com"},
|
||||
)
|
||||
assert not spy.called, (
|
||||
"Hyphenated MCP tool name must parse without an LLM round-trip"
|
||||
)
|
||||
|
||||
def test_keeps_args_as_is_when_schema_has_no_properties(self):
|
||||
cfg = _cfg()
|
||||
schema = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "freeform",
|
||||
"description": "freeform",
|
||||
"parameters": {"type": "object"},
|
||||
},
|
||||
}
|
||||
]
|
||||
raw = '{"name": "freeform", "arguments": {"anything": "goes"}}'
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(cfg, "do it", [], schema)
|
||||
assert result == ("freeform", {"anything": "goes"})
|
||||
|
||||
|
||||
class TestUrlArgNormalisation:
|
||||
"""The resolver must hand chrome/browser MCP tools a fully-qualified
|
||||
URL.
|
||||
|
||||
Field trace (2026-05): the planner emitted
|
||||
``page='[youtube.com](http://youtube.com)'`` for the user query
|
||||
"navigate to youtube.com". The slow-path resolver remapped the key
|
||||
to ``url`` (the schema's actual property) but preserved the markdown
|
||||
wrapper as the value, so chrome-devtools-mcp received
|
||||
``{"url": "[youtube.com](http://youtube.com)"}`` and Puppeteer's
|
||||
Page.navigate rejected with "Cannot navigate to invalid URL".
|
||||
A scheme-less bare ``youtube.com`` value fails the same way.
|
||||
|
||||
The fix is generic: any URL-keyed string value gets
|
||||
markdown-stripped and scheme-prepended before it leaves the planner.
|
||||
"""
|
||||
|
||||
def _navigate_schema(self):
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "chrome-devtools__navigate_page",
|
||||
"description": "Navigate to a URL.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {"type": "string"},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
def test_strips_markdown_link_wrapper_in_slow_path(self):
|
||||
cfg = _cfg()
|
||||
raw = (
|
||||
'{"name": "chrome-devtools__navigate_page", "arguments": '
|
||||
'{"url": "[youtube.com](http://youtube.com)"}}'
|
||||
)
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"chrome-devtools__navigate_page page='[youtube.com](http://youtube.com)'",
|
||||
[],
|
||||
self._navigate_schema(),
|
||||
)
|
||||
assert result == (
|
||||
"chrome-devtools__navigate_page",
|
||||
{"url": "http://youtube.com"},
|
||||
)
|
||||
|
||||
def test_prepends_scheme_to_bare_domain_in_slow_path(self):
|
||||
cfg = _cfg()
|
||||
raw = (
|
||||
'{"name": "chrome-devtools__navigate_page", "arguments": '
|
||||
'{"url": "youtube.com"}}'
|
||||
)
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"chrome-devtools__navigate_page page='youtube.com'",
|
||||
[],
|
||||
self._navigate_schema(),
|
||||
)
|
||||
assert result == (
|
||||
"chrome-devtools__navigate_page",
|
||||
{"url": "https://youtube.com"},
|
||||
)
|
||||
|
||||
def test_prepends_scheme_to_bare_domain_in_fast_path(self):
|
||||
"""Fast path parses ``url='youtube.com'`` deterministically; the
|
||||
normalisation must apply there too so we don't regress on the
|
||||
common case where the planner uses the right key name."""
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="null") as spy:
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"chrome-devtools__navigate_page url='youtube.com'",
|
||||
[],
|
||||
self._navigate_schema(),
|
||||
)
|
||||
assert result == (
|
||||
"chrome-devtools__navigate_page",
|
||||
{"url": "https://youtube.com"},
|
||||
)
|
||||
assert spy.call_count == 0, "fast path must not call the LLM resolver"
|
||||
|
||||
def test_preserves_already_qualified_url(self):
|
||||
cfg = _cfg()
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value="null"):
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"chrome-devtools__navigate_page url='https://youtube.com/feed/trending'",
|
||||
[],
|
||||
self._navigate_schema(),
|
||||
)
|
||||
assert result == (
|
||||
"chrome-devtools__navigate_page",
|
||||
{"url": "https://youtube.com/feed/trending"},
|
||||
)
|
||||
|
||||
def test_does_not_touch_unrelated_string_args(self):
|
||||
"""A ``query='youtube.com tutorials'`` arg on webSearch must stay
|
||||
literal — we only normalise values keyed as URLs."""
|
||||
cfg = _cfg()
|
||||
schema = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "webSearch",
|
||||
"description": "Search.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"query": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
raw = (
|
||||
'{"name": "webSearch", "arguments": '
|
||||
'{"query": "youtube.com tutorials"}}'
|
||||
)
|
||||
with patch.object(planner_mod, "call_llm_direct", return_value=raw):
|
||||
result = resolve_next_tool_call(
|
||||
cfg,
|
||||
"webSearch find tutorials",
|
||||
[],
|
||||
schema,
|
||||
)
|
||||
assert result == ("webSearch", {"query": "youtube.com tutorials"})
|
||||
|
||||
|
||||
class TestToolStepsOf:
|
||||
def test_multi_step_drops_final_synthesis_step(self):
|
||||
assert tool_steps_of(["a", "b", "reply"]) == ["a", "b"]
|
||||
|
||||
def test_single_step_has_no_tool_steps(self):
|
||||
"""A 1-step plan is reply-only by contract (rule 9), so it
|
||||
contributes no tool steps. Engine uses this to skip the
|
||||
direct-exec path and the progress nudge for pure-reply plans."""
|
||||
assert tool_steps_of(["only"]) == []
|
||||
|
||||
def test_empty_plan(self):
|
||||
assert tool_steps_of([]) == []
|
||||
|
||||
def test_strips_search_memory_directive(self):
|
||||
plan = [
|
||||
"searchMemory topic='user preferences'",
|
||||
"webSearch query='foo'",
|
||||
"Reply to the user.",
|
||||
]
|
||||
assert tool_steps_of(plan) == ["webSearch query='foo'"]
|
||||
|
||||
|
||||
class TestIsSearchMemoryStep:
|
||||
def test_detects_directive(self):
|
||||
assert is_search_memory_step("searchMemory topic='x'") is True
|
||||
assert is_search_memory_step(" SEARCHMEMORY topic='x'") is True
|
||||
|
||||
def test_rejects_other_steps(self):
|
||||
assert is_search_memory_step("webSearch query='foo'") is False
|
||||
assert is_search_memory_step("Reply to the user.") is False
|
||||
|
||||
|
||||
class TestMemoryTopicOf:
|
||||
def test_single_quoted(self):
|
||||
assert memory_topic_of("searchMemory topic='pets'") == "pets"
|
||||
|
||||
def test_double_quoted(self):
|
||||
assert memory_topic_of('searchMemory topic="favourite films"') == "favourite films"
|
||||
|
||||
def test_bare_value(self):
|
||||
assert memory_topic_of("searchMemory topic=preferences") == "preferences"
|
||||
|
||||
def test_missing_topic_returns_empty(self):
|
||||
assert memory_topic_of("searchMemory") == ""
|
||||
|
||||
|
||||
class TestPlanRequiresMemory:
|
||||
def test_true_when_directive_present(self):
|
||||
assert plan_requires_memory([
|
||||
"searchMemory topic='pets'",
|
||||
"Reply to user",
|
||||
]) is True
|
||||
|
||||
def test_false_when_only_tools_and_reply(self):
|
||||
assert plan_requires_memory([
|
||||
"webSearch query='foo'",
|
||||
"Reply to the user.",
|
||||
]) is False
|
||||
|
||||
def test_false_for_empty(self):
|
||||
assert plan_requires_memory([]) is False
|
||||
|
||||
|
||||
class TestStripMemoryDirectives:
|
||||
def test_removes_directive(self):
|
||||
plan = [
|
||||
"searchMemory topic='pets'",
|
||||
"Reply to user",
|
||||
]
|
||||
assert strip_memory_directives(plan) == ["Reply to user"]
|
||||
|
||||
def test_leaves_tool_only_plan_untouched(self):
|
||||
plan = ["webSearch query='foo'", "Reply"]
|
||||
assert strip_memory_directives(plan) == plan
|
||||
|
||||
|
||||
class TestToolNamesInPlan:
|
||||
def test_extracts_known_names_in_order(self):
|
||||
plan = [
|
||||
"webSearch query='a'",
|
||||
"getWeather",
|
||||
"webSearch query='b'", # duplicate should dedup
|
||||
"Reply to the user.",
|
||||
]
|
||||
names = tool_names_in_plan(plan, ["webSearch", "getWeather", "stop"])
|
||||
assert names == ["webSearch", "getWeather"]
|
||||
|
||||
def test_filters_unknown_names(self):
|
||||
plan = ["hallucinatedTool x='y'", "webSearch query='q'", "Reply"]
|
||||
assert tool_names_in_plan(plan, ["webSearch"]) == ["webSearch"]
|
||||
|
||||
def test_ignores_search_memory_directive(self):
|
||||
plan = ["searchMemory topic='t'", "webSearch query='q'", "Reply"]
|
||||
assert tool_names_in_plan(plan, ["webSearch", "searchMemory"]) == ["webSearch"]
|
||||
|
||||
def test_empty_plan(self):
|
||||
assert tool_names_in_plan([], ["webSearch"]) == []
|
||||
|
||||
def test_extracts_hyphenated_mcp_tool_name(self):
|
||||
"""MCP tool names embed the server in the prefix and use hyphens
|
||||
(e.g. ``chrome-devtools__navigate_page``). The head regex must accept
|
||||
hyphens so the planner-driven allow-list union and the
|
||||
``_plan_under_specified`` guard don't misclassify a perfectly valid
|
||||
plan step as paraphrased prose.
|
||||
|
||||
Field trace (2026-05-03): user said "navigate to youtube.com". Planner
|
||||
emitted ``chrome-devtools__navigate_page page='...'`` correctly, but
|
||||
the hyphen-stripping regex extracted only ``chrome``, which wasn't a
|
||||
known tool — so direct-exec was skipped and the small chat model
|
||||
flaked into the empty-reply fallback.
|
||||
"""
|
||||
plan = [
|
||||
"chrome-devtools__navigate_page page='https://youtube.com'",
|
||||
"Reply to the user.",
|
||||
]
|
||||
names = tool_names_in_plan(plan, ["chrome-devtools__navigate_page"])
|
||||
assert names == ["chrome-devtools__navigate_page"]
|
||||
|
||||
|
||||
class TestPlanHasUnresolvedToolSteps:
|
||||
def test_true_when_step_paraphrases_tool(self):
|
||||
plan = ["get the weather", "Reply to the user."]
|
||||
assert plan_has_unresolved_tool_steps(plan, ["getWeather", "stop"]) is True
|
||||
|
||||
def test_false_when_step_names_tool(self):
|
||||
plan = ["getWeather", "Reply to the user."]
|
||||
assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
|
||||
|
||||
def test_false_for_reply_only_plan(self):
|
||||
# No tool steps at all — the planner explicitly decided no tools.
|
||||
assert plan_has_unresolved_tool_steps(
|
||||
["Reply to the user."], ["getWeather"]
|
||||
) is False
|
||||
|
||||
def test_false_for_empty_plan(self):
|
||||
assert plan_has_unresolved_tool_steps([], ["getWeather"]) is False
|
||||
|
||||
def test_false_when_search_memory_only_and_reply(self):
|
||||
# searchMemory is a directive, not a tool — but there's also no
|
||||
# real tool step paraphrased either.
|
||||
plan = ["searchMemory topic='t'", "Reply to the user."]
|
||||
assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
|
||||
|
||||
def test_false_for_hyphenated_mcp_tool_step(self):
|
||||
"""A concrete plan step naming a hyphenated MCP tool must NOT be
|
||||
treated as under-specified — otherwise the engine skips direct-exec
|
||||
and forces the chat model to take the turn instead."""
|
||||
plan = [
|
||||
"chrome-devtools__navigate_page page='https://youtube.com'",
|
||||
"Reply to the user.",
|
||||
]
|
||||
assert plan_has_unresolved_tool_steps(
|
||||
plan, ["chrome-devtools__navigate_page"]
|
||||
) is False
|
||||
Reference in New Issue
Block a user