Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/tests/test_planner.py
+++ b/tests/test_planner.py
@@ -0,0 +1,790 @@
+"""Unit tests for the task-list planner.
+
+These tests verify behaviours, not implementation: the parser cleans up
+messy LLM output, trivial single-reply plans don't leak out, the
+fail-open paths return an empty list, and the progress_nudge reflects
+accurate step progression.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import pytest
+
+from jarvis.reply import planner as planner_mod
+from jarvis.reply.planner import (
+    MAX_STEPS,
+    SEARCH_MEMORY_DIRECTIVE,
+    _is_trivial_plan,
+    _parse_plan,
+    format_plan_block,
+    is_search_memory_step,
+    memory_topic_of,
+    plan_query,
+    plan_requires_memory,
+    progress_nudge,
+    resolve_next_tool_call,
+    resolve_planner_model,
+    strip_memory_directives,
+    plan_has_unresolved_tool_steps,
+    tool_names_in_plan,
+    tool_steps_of,
+)
+
+
+def _cfg(**overrides):
+    base = {
+        "ollama_base_url": "http://localhost:11434",
+        "ollama_chat_model": "gemma4:e2b",
+        "planner_model": "",
+        "tool_router_model": "",
+        "intent_judge_model": "",
+        "planner_enabled": True,
+        "planner_timeout_sec": 6.0,
+    }
+    base.update(overrides)
+    return SimpleNamespace(**base)
+
+
+class TestParsePlan:
+    def test_strips_numbering(self):
+        raw = "1. webSearch query='foo'\n2. Reply to user"
+        assert _parse_plan(raw) == ["webSearch query='foo'", "Reply to user"]
+
+    def test_strips_bullet_prefixes(self):
+        raw = "- step one\n* step two\n• step three"
+        assert _parse_plan(raw) == ["step one", "step two", "step three"]
+
+    def test_strips_wrapping_quotes(self):
+        raw = '"step one"\n`step two`'
+        assert _parse_plan(raw) == ["step one", "step two"]
+
+    def test_ignores_json_fences_and_blank_lines(self):
+        raw = "```\nstep one\n\n```\nstep two"
+        assert _parse_plan(raw) == ["step one", "step two"]
+
+    def test_caps_at_max_steps(self):
+        raw = "\n".join(f"step {i}" for i in range(MAX_STEPS + 3))
+        assert len(_parse_plan(raw)) == MAX_STEPS
+
+    def test_truncates_overlong_step(self):
+        long = "a" * 500
+        parsed = _parse_plan(long)
+        assert len(parsed) == 1
+        assert parsed[0].endswith("…")
+        assert len(parsed[0]) <= 201
+
+
+class TestIsTrivialPlan:
+    def test_empty_is_trivial(self):
+        assert _is_trivial_plan([]) is True
+
+    def test_single_step_is_trivial_regardless_of_language(self):
+        # Purely structural: any 1-step plan is trivial. Language-agnostic.
+        assert _is_trivial_plan(["Reply to the user."]) is True
+        assert _is_trivial_plan(["Répondre à l'utilisateur."]) is True
+        assert _is_trivial_plan(["ユーザーに返信する"]) is True
+        assert _is_trivial_plan(["webSearch query='x'"]) is True
+
+    def test_multi_step_is_not_trivial(self):
+        assert _is_trivial_plan(["webSearch ...", "Reply to user"]) is False
+        assert _is_trivial_plan(["a", "b", "c"]) is False
+
+
+class TestResolvePlannerModel:
+    def test_prefers_explicit_planner_model(self):
+        cfg = _cfg(planner_model="gemma-plan", ollama_chat_model="chat")
+        assert resolve_planner_model(cfg) == "gemma-plan"
+
+    def test_tracks_chat_model_by_default(self):
+        cfg = _cfg(ollama_chat_model="gemma4:e2b")
+        assert resolve_planner_model(cfg) == "gemma4:e2b"
+
+    def test_ignores_tool_router_model(self):
+        # Planner must track the chat model — not the router. Upgrading
+        # the chat model through setup must upgrade the planner too.
+        cfg = _cfg(tool_router_model="router-x", ollama_chat_model="chat-y")
+        assert resolve_planner_model(cfg) == "chat-y"
+
+    def test_upgrading_chat_model_upgrades_planner(self):
+        cfg = _cfg(ollama_chat_model="gpt-oss:20b")
+        assert resolve_planner_model(cfg) == "gpt-oss:20b"
+
+    def test_returns_empty_when_no_candidates(self):
+        cfg = _cfg(ollama_chat_model="")
+        assert resolve_planner_model(cfg) == ""
+
+
+class TestPlanQuery:
+    def test_short_query_returns_empty(self):
+        cfg = _cfg()
+        assert plan_query(cfg, "hi", "", []) == []
+
+    def test_disabled_returns_empty(self):
+        cfg = _cfg(planner_enabled=False)
+        long = "what films did the director of Possessor make?"
+        assert plan_query(cfg, long, "", []) == []
+
+    def test_missing_model_returns_empty(self):
+        cfg = _cfg(ollama_chat_model="")
+        long = "what films did the director of Possessor make?"
+        assert plan_query(cfg, long, "", []) == []
+
+    def test_returns_parsed_steps(self):
+        cfg = _cfg()
+        raw_plan = (
+            "webSearch query='Possessor 2020 director'\n"
+            "webSearch query='films by <director name from step 1>'\n"
+            "Reply to the user with the combined findings."
+        )
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw_plan):
+            steps = plan_query(
+                cfg,
+                "what films did the director of Possessor make?",
+                "",
+                [("webSearch", "Search the web.")],
+            )
+        assert len(steps) == 3
+        assert "Possessor" in steps[0]
+        assert steps[-1].lower().startswith("reply")
+
+    def test_single_reply_plan_is_preserved(self):
+        """A 1-step reply-only plan is the planner's POSITIVE "no memory,
+        no tools needed" signal. It must NOT be filtered to [] — the
+        engine distinguishes [] (fail-open) from ["Reply ..."] (explicit
+        skip-everything decision) and uses the latter to skip the
+        memory extractor and tool router entirely.
+        """
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
+            steps = plan_query(
+                cfg,
+                "tell me a joke about cats please",
+                "",
+                [],
+            )
+        assert steps == ["Reply to user."]
+
+    def test_llm_failure_returns_empty(self):
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value=None):
+            steps = plan_query(
+                cfg,
+                "what films did the director of Possessor make?",
+                "",
+                [("webSearch", "Search the web.")],
+            )
+        assert steps == []
+
+    def test_memory_context_arg_still_accepted_for_back_compat(self):
+        """Old callers pass `memory_context=` as a positional or keyword
+        argument. Planner now ignores it (the planner runs before memory
+        search), but the signature must still accept it so downstream
+        code doesn't break."""
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="Reply to user."):
+            steps = plan_query(
+                cfg,
+                "tell me a joke about cats please",
+                "",
+                [],
+                memory_context="some old memory text",
+            )
+        assert steps == ["Reply to user."]
+
+    def test_prompt_warns_against_fabricating_optional_arguments(self):
+        """The planner prompt must explicitly tell the model to omit
+        optional arguments when the user didn't supply a value, and warn
+        against grabbing unrelated words from the utterance as fake values.
+
+        2026-04-24 field regression: gemma4:e2b responded to "how's the
+        weather going to be today" with a plan step of
+        ``getWeather location='today'``. The temporal qualifier "today"
+        was geocoded to a village called "Todaya" in the Philippines —
+        because the small model was trained by our prompt to always give
+        a concrete argument, even when the user's utterance had none to
+        give. This content-assertion guards the fix so the rule can't be
+        silently reverted during future prompt edits without a test
+        failure pointing the editor at the behavioural consequence.
+        """
+        prompt = planner_mod._PROMPT_TEMPLATE.lower()
+        assert "omit" in prompt, (
+            "Planner prompt must tell the model to OMIT optional args "
+            "when no value was provided."
+        )
+        # The guidance must name the exact failure mode so the model
+        # doesn't pattern-match on generic 'omit' without knowing why.
+        assert "fabricate" in prompt or "do not fabricate" in prompt, (
+            "Planner prompt must warn against fabricating argument values "
+            "from unrelated words in the utterance."
+        )
+
+
+class TestFormatPlanBlock:
+    def test_empty_returns_empty_string(self):
+        assert format_plan_block([]) == ""
+
+    def test_numbers_the_steps(self):
+        block = format_plan_block(["step a", "step b"])
+        assert "1. step a" in block
+        assert "2. step b" in block
+        assert "ACTION PLAN" in block
+
+
+class TestProgressNudge:
+    def test_empty_plan_returns_empty(self):
+        assert progress_nudge([], 0) == ""
+
+    def test_single_reply_step_returns_empty(self):
+        """A 1-step reply-only plan has no tool steps, so there is
+        nothing to nudge. The empty string tells the engine to skip
+        injecting a progress reminder after the (non-existent) tool
+        result."""
+        assert progress_nudge(["Reply to user"], 0) == ""
+
+    def test_points_at_next_step(self):
+        steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
+        msg = progress_nudge(steps, 0)
+        assert "foo" in msg
+        assert "0/2" in msg
+        msg2 = progress_nudge(steps, 1)
+        assert "bar" in msg2
+        assert "1/2" in msg2
+
+    def test_all_steps_done_prompts_synthesis(self):
+        steps = ["webSearch query='foo'", "webSearch query='bar'", "Reply to user"]
+        msg = progress_nudge(steps, 2)
+        assert "all tool steps executed" in msg.lower() or "synthes" in msg.lower()
+
+
+class TestResolveNextToolCall:
+    def _schema(self):
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": "webSearch",
+                    "description": "Search the web.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"query": {"type": "string"}},
+                    },
+                },
+            }
+        ]
+
+    def test_returns_tool_and_args(self):
+        cfg = _cfg()
+        raw = '{"name": "webSearch", "arguments": {"query": "weather in Paris"}}'
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg, "webSearch query='weather in Paris'", [], self._schema()
+            )
+        assert result == ("webSearch", {"query": "weather in Paris"})
+
+    def test_rejects_unknown_tool(self):
+        cfg = _cfg()
+        raw = '{"name": "mysteryTool", "arguments": {}}'
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            assert resolve_next_tool_call(
+                cfg, "do the thing", [], self._schema()
+            ) is None
+
+    def test_null_means_synthesis(self):
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
+            assert resolve_next_tool_call(
+                cfg, "Reply to user", [], self._schema()
+            ) is None
+
+    def test_peels_markdown_fences(self):
+        cfg = _cfg()
+        raw = '```json\n{"name": "webSearch", "arguments": {"query": "x"}}\n```'
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg, "search for x", [], self._schema()
+            )
+        assert result == ("webSearch", {"query": "x"})
+
+    def test_invalid_json_returns_none(self):
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="not json"):
+            assert resolve_next_tool_call(
+                cfg, "do the thing", [], self._schema()
+            ) is None
+
+    def test_missing_schema_returns_none(self):
+        cfg = _cfg()
+        assert resolve_next_tool_call(cfg, "do the thing", [], []) is None
+
+    def test_drops_unknown_argument_keys(self):
+        cfg = _cfg()
+        raw = (
+            '{"name": "webSearch", "arguments": '
+            '{"query": "weather", "evil_key": "shell"}}'
+        )
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg, "search weather", [], self._schema()
+            )
+        assert result == ("webSearch", {"query": "weather"})
+
+    def test_deterministic_parse_skips_llm_for_concrete_step(self):
+        """A fully concrete plan step (tool name + `key='value'` args, no
+        ``<placeholder>``) must be parsed deterministically without calling
+        the LLM resolver at all.
+
+        Motivation (2026-04-24 field trace): a follow-up query produced the
+        plan `webSearch query='Justin Bieber most famous songs'` — trivially
+        concrete — but the LLM resolver flaked (returned ``null`` or
+        garbage) and the engine fell back to the chat model, which then
+        refused. Parsing concrete steps deterministically removes the LLM
+        call as a failure surface for the common case.
+        """
+        cfg = _cfg()
+        call_count = [0]
+
+        def _spy(*args, **kwargs):
+            call_count[0] += 1
+            return "null"
+
+        with patch.object(planner_mod, "call_llm_direct", side_effect=_spy):
+            result = resolve_next_tool_call(
+                cfg,
+                "webSearch query='Justin Bieber most famous songs'",
+                [],
+                self._schema(),
+            )
+
+        assert result == (
+            "webSearch",
+            {"query": "Justin Bieber most famous songs"},
+        )
+        assert call_count[0] == 0, (
+            f"LLM should not be called for a concrete step; was called {call_count[0]}×"
+        )
+
+    def test_deterministic_parse_still_rejects_unknown_tool(self):
+        """The fast path must still honour the allow-list — a concrete step
+        naming a tool not in the schema falls through to ``None``, not to an
+        unfiltered dispatch."""
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
+            assert resolve_next_tool_call(
+                cfg,
+                "mysteryTool query='anything'",
+                [],
+                self._schema(),
+            ) is None
+
+    def test_falls_back_to_llm_when_step_has_placeholder(self):
+        """Steps containing an ``<entity from step N>`` placeholder need
+        entity substitution from prior results — that requires the LLM
+        resolver, so the fast path must decline and defer."""
+        cfg = _cfg()
+        raw = (
+            '{"name": "webSearch", "arguments": '
+            '{"query": "films directed by Brandon Cronenberg"}}'
+        )
+        with patch.object(
+            planner_mod, "call_llm_direct", return_value=raw,
+        ) as spy:
+            result = resolve_next_tool_call(
+                cfg,
+                "webSearch query='films directed by <director name from step 1>'",
+                [("webSearch", '{"query": "Possessor director"}',
+                  "Possessor directed by Brandon Cronenberg.")],
+                self._schema(),
+            )
+        assert result == (
+            "webSearch",
+            {"query": "films directed by Brandon Cronenberg"},
+        )
+        assert spy.called, "Placeholder substitution must go through the LLM"
+
+    def test_deterministic_parse_accepts_bare_tool_name_as_empty_args(self):
+        """A plan step naming the tool with no trailing args must parse to
+        ``(name, {})`` without an LLM call.
+
+        This is the shape the planner emits when it follows the
+        "omit optional arguments" rule — e.g. a weather query with no
+        named place plans as ``getWeather`` (no args), and the tool
+        auto-derives location from the user's geoip context.
+        """
+        cfg = _cfg()
+        schema = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "getWeather",
+                    "description": "Weather.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"location": {"type": "string"}},
+                        "required": [],
+                    },
+                },
+            }
+        ]
+        with patch.object(planner_mod, "call_llm_direct") as spy:
+            result = resolve_next_tool_call(cfg, "getWeather", [], schema)
+        assert result == ("getWeather", {})
+        assert not spy.called, (
+            "Bare tool name must not trigger an LLM round-trip"
+        )
+
+    def test_deterministic_parse_handles_double_quoted_values(self):
+        """Planner output occasionally uses double quotes — parse both."""
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct") as spy:
+            result = resolve_next_tool_call(
+                cfg,
+                'webSearch query="weather in Paris"',
+                [],
+                self._schema(),
+            )
+        assert result == ("webSearch", {"query": "weather in Paris"})
+        assert not spy.called
+
+    def test_deterministic_parse_handles_hyphenated_mcp_tool_name(self):
+        """MCP tool names like ``chrome-devtools__navigate_page`` contain
+        hyphens. The fast-path parser must accept them and produce a clean
+        ``(name, args)`` without an LLM round-trip — otherwise the engine
+        falls through to the chat model, which on small models flakes into
+        the empty-reply fallback."""
+        cfg = _cfg()
+        schema = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "chrome-devtools__navigate_page",
+                    "description": "Navigate the browser to a URL.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"url": {"type": "string"}},
+                    },
+                },
+            }
+        ]
+        with patch.object(planner_mod, "call_llm_direct") as spy:
+            result = resolve_next_tool_call(
+                cfg,
+                "chrome-devtools__navigate_page url='https://youtube.com'",
+                [],
+                schema,
+            )
+        assert result == (
+            "chrome-devtools__navigate_page",
+            {"url": "https://youtube.com"},
+        )
+        assert not spy.called, (
+            "Hyphenated MCP tool name must parse without an LLM round-trip"
+        )
+
+    def test_keeps_args_as_is_when_schema_has_no_properties(self):
+        cfg = _cfg()
+        schema = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "freeform",
+                    "description": "freeform",
+                    "parameters": {"type": "object"},
+                },
+            }
+        ]
+        raw = '{"name": "freeform", "arguments": {"anything": "goes"}}'
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(cfg, "do it", [], schema)
+        assert result == ("freeform", {"anything": "goes"})
+
+
+class TestUrlArgNormalisation:
+    """The resolver must hand chrome/browser MCP tools a fully-qualified
+    URL.
+
+    Field trace (2026-05): the planner emitted
+    ``page='[youtube.com](http://youtube.com)'`` for the user query
+    "navigate to youtube.com". The slow-path resolver remapped the key
+    to ``url`` (the schema's actual property) but preserved the markdown
+    wrapper as the value, so chrome-devtools-mcp received
+    ``{"url": "[youtube.com](http://youtube.com)"}`` and Puppeteer's
+    Page.navigate rejected with "Cannot navigate to invalid URL".
+    A scheme-less bare ``youtube.com`` value fails the same way.
+
+    The fix is generic: any URL-keyed string value gets
+    markdown-stripped and scheme-prepended before it leaves the planner.
+    """
+
+    def _navigate_schema(self):
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": "chrome-devtools__navigate_page",
+                    "description": "Navigate to a URL.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "url": {"type": "string"},
+                        },
+                    },
+                },
+            }
+        ]
+
+    def test_strips_markdown_link_wrapper_in_slow_path(self):
+        cfg = _cfg()
+        raw = (
+            '{"name": "chrome-devtools__navigate_page", "arguments": '
+            '{"url": "[youtube.com](http://youtube.com)"}}'
+        )
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg,
+                "chrome-devtools__navigate_page page='[youtube.com](http://youtube.com)'",
+                [],
+                self._navigate_schema(),
+            )
+        assert result == (
+            "chrome-devtools__navigate_page",
+            {"url": "http://youtube.com"},
+        )
+
+    def test_prepends_scheme_to_bare_domain_in_slow_path(self):
+        cfg = _cfg()
+        raw = (
+            '{"name": "chrome-devtools__navigate_page", "arguments": '
+            '{"url": "youtube.com"}}'
+        )
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg,
+                "chrome-devtools__navigate_page page='youtube.com'",
+                [],
+                self._navigate_schema(),
+            )
+        assert result == (
+            "chrome-devtools__navigate_page",
+            {"url": "https://youtube.com"},
+        )
+
+    def test_prepends_scheme_to_bare_domain_in_fast_path(self):
+        """Fast path parses ``url='youtube.com'`` deterministically; the
+        normalisation must apply there too so we don't regress on the
+        common case where the planner uses the right key name."""
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="null") as spy:
+            result = resolve_next_tool_call(
+                cfg,
+                "chrome-devtools__navigate_page url='youtube.com'",
+                [],
+                self._navigate_schema(),
+            )
+        assert result == (
+            "chrome-devtools__navigate_page",
+            {"url": "https://youtube.com"},
+        )
+        assert spy.call_count == 0, "fast path must not call the LLM resolver"
+
+    def test_preserves_already_qualified_url(self):
+        cfg = _cfg()
+        with patch.object(planner_mod, "call_llm_direct", return_value="null"):
+            result = resolve_next_tool_call(
+                cfg,
+                "chrome-devtools__navigate_page url='https://youtube.com/feed/trending'",
+                [],
+                self._navigate_schema(),
+            )
+        assert result == (
+            "chrome-devtools__navigate_page",
+            {"url": "https://youtube.com/feed/trending"},
+        )
+
+    def test_does_not_touch_unrelated_string_args(self):
+        """A ``query='youtube.com tutorials'`` arg on webSearch must stay
+        literal — we only normalise values keyed as URLs."""
+        cfg = _cfg()
+        schema = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "webSearch",
+                    "description": "Search.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"query": {"type": "string"}},
+                    },
+                },
+            }
+        ]
+        raw = (
+            '{"name": "webSearch", "arguments": '
+            '{"query": "youtube.com tutorials"}}'
+        )
+        with patch.object(planner_mod, "call_llm_direct", return_value=raw):
+            result = resolve_next_tool_call(
+                cfg,
+                "webSearch find tutorials",
+                [],
+                schema,
+            )
+        assert result == ("webSearch", {"query": "youtube.com tutorials"})
+
+
+class TestToolStepsOf:
+    def test_multi_step_drops_final_synthesis_step(self):
+        assert tool_steps_of(["a", "b", "reply"]) == ["a", "b"]
+
+    def test_single_step_has_no_tool_steps(self):
+        """A 1-step plan is reply-only by contract (rule 9), so it
+        contributes no tool steps. Engine uses this to skip the
+        direct-exec path and the progress nudge for pure-reply plans."""
+        assert tool_steps_of(["only"]) == []
+
+    def test_empty_plan(self):
+        assert tool_steps_of([]) == []
+
+    def test_strips_search_memory_directive(self):
+        plan = [
+            "searchMemory topic='user preferences'",
+            "webSearch query='foo'",
+            "Reply to the user.",
+        ]
+        assert tool_steps_of(plan) == ["webSearch query='foo'"]
+
+
+class TestIsSearchMemoryStep:
+    def test_detects_directive(self):
+        assert is_search_memory_step("searchMemory topic='x'") is True
+        assert is_search_memory_step("  SEARCHMEMORY topic='x'") is True
+
+    def test_rejects_other_steps(self):
+        assert is_search_memory_step("webSearch query='foo'") is False
+        assert is_search_memory_step("Reply to the user.") is False
+
+
+class TestMemoryTopicOf:
+    def test_single_quoted(self):
+        assert memory_topic_of("searchMemory topic='pets'") == "pets"
+
+    def test_double_quoted(self):
+        assert memory_topic_of('searchMemory topic="favourite films"') == "favourite films"
+
+    def test_bare_value(self):
+        assert memory_topic_of("searchMemory topic=preferences") == "preferences"
+
+    def test_missing_topic_returns_empty(self):
+        assert memory_topic_of("searchMemory") == ""
+
+
+class TestPlanRequiresMemory:
+    def test_true_when_directive_present(self):
+        assert plan_requires_memory([
+            "searchMemory topic='pets'",
+            "Reply to user",
+        ]) is True
+
+    def test_false_when_only_tools_and_reply(self):
+        assert plan_requires_memory([
+            "webSearch query='foo'",
+            "Reply to the user.",
+        ]) is False
+
+    def test_false_for_empty(self):
+        assert plan_requires_memory([]) is False
+
+
+class TestStripMemoryDirectives:
+    def test_removes_directive(self):
+        plan = [
+            "searchMemory topic='pets'",
+            "Reply to user",
+        ]
+        assert strip_memory_directives(plan) == ["Reply to user"]
+
+    def test_leaves_tool_only_plan_untouched(self):
+        plan = ["webSearch query='foo'", "Reply"]
+        assert strip_memory_directives(plan) == plan
+
+
+class TestToolNamesInPlan:
+    def test_extracts_known_names_in_order(self):
+        plan = [
+            "webSearch query='a'",
+            "getWeather",
+            "webSearch query='b'",  # duplicate should dedup
+            "Reply to the user.",
+        ]
+        names = tool_names_in_plan(plan, ["webSearch", "getWeather", "stop"])
+        assert names == ["webSearch", "getWeather"]
+
+    def test_filters_unknown_names(self):
+        plan = ["hallucinatedTool x='y'", "webSearch query='q'", "Reply"]
+        assert tool_names_in_plan(plan, ["webSearch"]) == ["webSearch"]
+
+    def test_ignores_search_memory_directive(self):
+        plan = ["searchMemory topic='t'", "webSearch query='q'", "Reply"]
+        assert tool_names_in_plan(plan, ["webSearch", "searchMemory"]) == ["webSearch"]
+
+    def test_empty_plan(self):
+        assert tool_names_in_plan([], ["webSearch"]) == []
+
+    def test_extracts_hyphenated_mcp_tool_name(self):
+        """MCP tool names embed the server in the prefix and use hyphens
+        (e.g. ``chrome-devtools__navigate_page``). The head regex must accept
+        hyphens so the planner-driven allow-list union and the
+        ``_plan_under_specified`` guard don't misclassify a perfectly valid
+        plan step as paraphrased prose.
+
+        Field trace (2026-05-03): user said "navigate to youtube.com". Planner
+        emitted ``chrome-devtools__navigate_page page='...'`` correctly, but
+        the hyphen-stripping regex extracted only ``chrome``, which wasn't a
+        known tool — so direct-exec was skipped and the small chat model
+        flaked into the empty-reply fallback.
+        """
+        plan = [
+            "chrome-devtools__navigate_page page='https://youtube.com'",
+            "Reply to the user.",
+        ]
+        names = tool_names_in_plan(plan, ["chrome-devtools__navigate_page"])
+        assert names == ["chrome-devtools__navigate_page"]
+
+
+class TestPlanHasUnresolvedToolSteps:
+    def test_true_when_step_paraphrases_tool(self):
+        plan = ["get the weather", "Reply to the user."]
+        assert plan_has_unresolved_tool_steps(plan, ["getWeather", "stop"]) is True
+
+    def test_false_when_step_names_tool(self):
+        plan = ["getWeather", "Reply to the user."]
+        assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
+
+    def test_false_for_reply_only_plan(self):
+        # No tool steps at all — the planner explicitly decided no tools.
+        assert plan_has_unresolved_tool_steps(
+            ["Reply to the user."], ["getWeather"]
+        ) is False
+
+    def test_false_for_empty_plan(self):
+        assert plan_has_unresolved_tool_steps([], ["getWeather"]) is False
+
+    def test_false_when_search_memory_only_and_reply(self):
+        # searchMemory is a directive, not a tool — but there's also no
+        # real tool step paraphrased either.
+        plan = ["searchMemory topic='t'", "Reply to the user."]
+        assert plan_has_unresolved_tool_steps(plan, ["getWeather"]) is False
+
+    def test_false_for_hyphenated_mcp_tool_step(self):
+        """A concrete plan step naming a hyphenated MCP tool must NOT be
+        treated as under-specified — otherwise the engine skips direct-exec
+        and forces the chat model to take the turn instead."""
+        plan = [
+            "chrome-devtools__navigate_page page='https://youtube.com'",
+            "Reply to the user.",
+        ]
+        assert plan_has_unresolved_tool_steps(
+            plan, ["chrome-devtools__navigate_page"]
+        ) is False