# javis_bot/tests/test_engine_tool_search_loop.py

"""Integration test for the toolSearchTool escape hatch and related loop behaviours.

Scenario: the router picks a narrow initial tool set. Mid-loop the chat model
realises it needs a different tool and invokes ``toolSearchTool``. The engine
dispatches it, merges the returned tool names into the per-turn allow-list,
and the next turn calls the newly-surfaced tool (``getWeather``). The final
content is delivered immediately.
"""

from unittest.mock import patch

import pytest


def _assistant_tool_call(name: str, args: dict, call_id: str = "call_1"):
    return {
        "message": {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {"name": name, "arguments": args},
                }
            ],
        }
    }


def _assistant_content(text: str):
    return {"message": {"role": "assistant", "content": text}}


def test_loop_merges_toolsearchtool_results_into_allowlist(
    mock_config, db, dialogue_memory
):
    """A tool surfaced by ``toolSearchTool`` mid-loop can be called afterwards.

    The router is patched to return only ``webSearch`` and ``stop``. The
    scripted chat model first calls ``toolSearchTool``, then ``getWeather``
    (which was not in the router's pick), then emits the final text. The
    test checks both tools were dispatched in that order and that the reply
    mentions London.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"  # LARGE → no forced text tools

    # Canned reply text per tool. The toolSearchTool entry names a tool that
    # was NOT in the initial router pick; anything unlisted gets "result".
    canned_replies = {
        "toolSearchTool": "getWeather: Report current weather.",
        "getWeather": "London: 12C partly cloudy.",
    }
    dispatched: list[tuple[str, dict]] = []

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        dispatched.append((tool_name, tool_args or {}))
        return ToolExecutionResult(
            success=True,
            reply_text=canned_replies.get(tool_name, "result"),
            error_message=None,
        )

    # Turn 1: toolSearchTool; turn 2: the newly-surfaced getWeather;
    # turn 3: the final reply.
    search_turn = _assistant_tool_call(
        "toolSearchTool", {"query": "current weather in london"}
    )
    weather_turn = _assistant_tool_call(
        "getWeather", {"location": "London"}, call_id="call_2"
    )
    final_turn = _assistant_content("It's 12C and partly cloudy in London.")
    scripted = iter([search_turn, weather_turn, final_turn])

    def fake_chat(*args, **kwargs):
        # Once the script is exhausted, keep answering with plain content.
        return next(scripted, _assistant_content("Done."))

    tool_runner_patch = patch.object(
        engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner
    )
    chat_patch = patch.object(
        engine_mod, "chat_with_messages", side_effect=fake_chat
    )
    router_patch = patch.object(
        engine_mod, "select_tools", return_value=["webSearch", "stop"]
    )
    memory_patch = patch.object(
        engine_mod,
        "extract_search_params_for_memory",
        return_value={"keywords": []},
    )

    with tool_runner_patch, chat_patch, router_patch, memory_patch:
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="how's the weather in london?",
            dialogue_memory=dialogue_memory,
        )

    tool_names = [name for name, _args in dispatched]
    assert "toolSearchTool" in tool_names, (
        f"Expected toolSearchTool to be invoked; got {tool_names}"
    )
    assert "getWeather" in tool_names, (
        "Expected getWeather (surfaced mid-loop by toolSearchTool) to be "
        f"invoked on a subsequent turn; got {tool_names}"
    )
    # getWeather must follow toolSearchTool (the allow-list widening
    # happens after the tool result is appended).
    search_pos = tool_names.index("toolSearchTool")
    weather_pos = tool_names.index("getWeather")
    assert weather_pos > search_pos
    assert reply and "London" in reply


def test_initial_allowlist_always_includes_toolsearchtool(
    mock_config, db, dialogue_memory
):
    """Even when the router returns no additional tools, the engine must
    always append ``toolSearchTool`` so the escape hatch is reachable.

    The allow-list is not reached into directly. Instead,
    ``generate_tools_json_schema`` is wrapped with a spy that snapshots the
    ``allowed_tools`` argument of every call before delegating to the real
    implementation.
    """
    from jarvis.reply import engine as engine_mod

    mock_config.ollama_chat_model = "gpt-oss:20b"

    captured_allow_lists: list[list[str]] = []

    def fake_chat(*args, **kwargs):
        # A plain content reply ends the loop on the first turn; the
        # allow-list is observed through the schema spy below, not here.
        return _assistant_content("Hello back!")

    # Grab the real generator before patching so the spy can delegate to it.
    real_generate = engine_mod.generate_tools_json_schema

    def spy_schema(allowed_tools, *args, **kwargs):
        # Copy the list so later in-place mutation by the engine cannot
        # rewrite what was observed at call time.
        captured_allow_lists.append(list(allowed_tools))
        return real_generate(allowed_tools, *args, **kwargs)

    with patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat), \
         patch.object(engine_mod, "select_tools", return_value=["stop"]), \
         patch.object(
             engine_mod,
             "extract_search_params_for_memory",
             return_value={"keywords": []},
         ), \
         patch.object(
             engine_mod, "generate_tools_json_schema", side_effect=spy_schema
         ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="hi",
            dialogue_memory=dialogue_memory,
        )

    assert captured_allow_lists, "generate_tools_json_schema was never called"
    # The engine now runs the router before the planner, which builds an
    # auxiliary schema for the planner's tool catalogue (router-narrowed,
    # no escape hatch) before the final chat-model schema. The escape hatch
    # only joins in the chat-model allow-list. Assert it appears somewhere
    # in the captured calls — implementations are free to reuse the same
    # schema generator at multiple call sites.
    assert any("toolSearchTool" in al for al in captured_allow_lists), (
        f"toolSearchTool missing from any allow-list: {captured_allow_lists}"
    )


def test_schema_regenerated_after_toolsearchtool_merge(
    mock_config, db, dialogue_memory
):
    """F1: the tools schema is rebuilt once ``toolSearchTool`` widens the list.

    The scripted model calls ``toolSearchTool`` on the first turn and answers
    with text on the second. The ``tools`` keyword passed to each chat call
    is recorded, and the second call's schema must contain ``getWeather``.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"  # LARGE → native tools

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        is_search = tool_name == "toolSearchTool"
        text = "getWeather: Report current weather." if is_search else "done"
        return ToolExecutionResult(
            success=True, reply_text=text, error_message=None
        )

    scripted = iter(
        [
            _assistant_tool_call(
                "toolSearchTool", {"query": "weather"}, call_id="c1"
            ),
            _assistant_content("All good."),
        ]
    )
    captured_tools_params: list = []

    def fake_chat(*args, **kwargs):
        # Record the schema first so every call is captured, scripted or not.
        captured_tools_params.append(kwargs.get("tools"))
        return next(scripted, _assistant_content("done"))

    tool_runner_patch = patch.object(
        engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner
    )
    chat_patch = patch.object(
        engine_mod, "chat_with_messages", side_effect=fake_chat
    )
    router_patch = patch.object(
        engine_mod, "select_tools", return_value=["webSearch", "stop"]
    )
    memory_patch = patch.object(
        engine_mod,
        "extract_search_params_for_memory",
        return_value={"keywords": []},
    )

    with tool_runner_patch, chat_patch, router_patch, memory_patch:
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="weather?",
            dialogue_memory=dialogue_memory,
        )

    def _function_name(entry):
        # Tolerate malformed entries: only dict entries with a dict
        # "function" value can contribute a name.
        if not isinstance(entry, dict):
            return None
        function = entry.get("function")
        if not isinstance(function, dict):
            return None
        return function.get("name")

    # Two LLM calls: pre-merge and post-merge. The post-merge call must
    # include getWeather in its tools schema.
    assert len(captured_tools_params) >= 2
    post_merge_schema = captured_tools_params[1] or []
    names = [nm for nm in map(_function_name, post_merge_schema) if nm]
    assert "getWeather" in names, (
        f"Expected getWeather in post-merge tools schema; got {names}"
    )


def test_tool_search_max_calls_cap(mock_config, db, dialogue_memory):
    """F5: ``toolSearchTool`` dispatches are capped per reply.

    With ``tool_search_max_calls`` set to 2, the scripted model asks for
    ``toolSearchTool`` four times in a row before giving a text answer.
    Only two of those requests may reach the tool runner.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.tool_search_max_calls = 2

    search_dispatches = 0

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        nonlocal search_dispatches
        if tool_name != "toolSearchTool":
            return ToolExecutionResult(
                success=True, reply_text="ok", error_message=None
            )
        search_dispatches += 1
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found for that description.",
            error_message=None,
        )

    # Model keeps trying toolSearchTool (queries a-d, ids c1-c4); the last
    # turn emits final content.
    scripted = iter(
        [
            _assistant_tool_call(
                "toolSearchTool", {"query": query}, call_id=f"c{number}"
            )
            for number, query in enumerate("abcd", start=1)
        ]
        + [_assistant_content("All right, giving up.")]
    )

    def fake_chat(*args, **kwargs):
        return next(scripted, _assistant_content("done"))

    tool_runner_patch = patch.object(
        engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner
    )
    chat_patch = patch.object(
        engine_mod, "chat_with_messages", side_effect=fake_chat
    )
    router_patch = patch.object(
        engine_mod, "select_tools", return_value=["webSearch", "stop"]
    )
    memory_patch = patch.object(
        engine_mod,
        "extract_search_params_for_memory",
        return_value={"keywords": []},
    )

    with tool_runner_patch, chat_patch, router_patch, memory_patch:
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="hello",
            dialogue_memory=dialogue_memory,
        )

    assert search_dispatches == 2, (
        f"Expected cap to limit dispatch to 2; got "
        f"{search_dispatches}"
    )


def test_validate_tool_args_catches_unknown_keys():
    """The schema validator rejects an argument key the tool does not define.

    This is the failure mode seen in the field log: ``webSearch`` called
    with ``query``. The error must say the argument is unknown and mention
    ``search_query``.
    """
    from jarvis.reply.engine import _validate_tool_args_against_schema

    wrong_key_args = {"query": "tube strikes today"}
    message = _validate_tool_args_against_schema(
        "webSearch", wrong_key_args, mcp_tools=None
    )

    assert message is not None
    lowered = message.lower()
    assert "unknown argument" in lowered
    assert "search_query" in message


def test_validate_tool_args_passes_correct_keys():
    """The validator returns ``None`` when ``webSearch`` gets ``search_query``."""
    from jarvis.reply.engine import _validate_tool_args_against_schema

    valid_args = {"search_query": "tube strikes today"}
    message = _validate_tool_args_against_schema(
        "webSearch", valid_args, mcp_tools=None
    )

    assert message is None


def test_validate_tool_args_catches_missing_required():
    """Calling ``webSearch`` with no arguments yields a "missing required" error."""
    from jarvis.reply.engine import _validate_tool_args_against_schema

    no_args = {}
    message = _validate_tool_args_against_schema(
        "webSearch", no_args, mcp_tools=None
    )

    assert message is not None
    lowered = message.lower()
    assert "missing required" in lowered


def test_max_turns_produces_digest(mock_config, db, dialogue_memory):
    """Exhausting ``agentic_max_turns`` with tool calls only yields the digest.

    The fake chat model requests ``toolSearchTool`` on every turn and never
    produces content. ``digest_loop_for_max_turns`` is replaced with a fake
    that records its arguments; its return value must be the engine's reply.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.agentic_max_turns = 3

    digest_text = "Couldn't finish: I was still working through the request."
    captured = {}

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found.",
            error_message=None,
        )

    # The model keeps calling toolSearchTool every turn — no content is
    # ever produced, so the loop exhausts max_turns and the digest fires.
    def fake_chat(*args, **kwargs):
        return _assistant_tool_call("toolSearchTool", {"query": "a"}, call_id="c1")

    def fake_digest(user_query, loop_messages, cfg):
        captured.update(user_query=user_query, loop_messages=loop_messages)
        return digest_text

    chat_patch = patch.object(
        engine_mod, "chat_with_messages", side_effect=fake_chat
    )
    tool_runner_patch = patch.object(
        engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner
    )
    router_patch = patch.object(
        engine_mod, "select_tools", return_value=["toolSearchTool", "stop"]
    )
    memory_patch = patch.object(
        engine_mod,
        "extract_search_params_for_memory",
        return_value={"keywords": []},
    )
    digest_patch = patch.object(
        engine_mod, "digest_loop_for_max_turns", side_effect=fake_digest
    )

    with chat_patch, tool_runner_patch, router_patch, memory_patch, digest_patch:
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="do something complicated",
            dialogue_memory=dialogue_memory,
        )

    assert reply == "Couldn't finish: I was still working through the request."
    assert captured.get("user_query"), "digest should receive the user query"
    assert isinstance(captured.get("loop_messages"), list)


def test_max_turns_digest_failure_falls_back_to_generic_error(
    mock_config, db, dialogue_memory
):
    """A failed digest after a tool-only loop still yields a non-empty reply.

    ``digest_loop_for_max_turns`` is patched to return ``None`` (e.g. a
    timeout) and the fake model never produces content, so there is no
    candidate reply to fall back on. The engine must emit the generic error
    rather than returning ``None``.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.agentic_max_turns = 2

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found.",
            error_message=None,
        )

    # Pure tool-call loop — no content, so last_candidate_reply stays None.
    def fake_chat(*args, **kwargs):
        return _assistant_tool_call("toolSearchTool", {"query": "a"}, call_id="c1")

    chat_patch = patch.object(
        engine_mod, "chat_with_messages", side_effect=fake_chat
    )
    tool_runner_patch = patch.object(
        engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner
    )
    router_patch = patch.object(
        engine_mod, "select_tools", return_value=["toolSearchTool", "stop"]
    )
    memory_patch = patch.object(
        engine_mod,
        "extract_search_params_for_memory",
        return_value={"keywords": []},
    )
    digest_patch = patch.object(
        engine_mod, "digest_loop_for_max_turns", return_value=None
    )

    with chat_patch, tool_runner_patch, router_patch, memory_patch, digest_patch:
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="do something complicated",
            dialogue_memory=dialogue_memory,
        )

    # Must return some reply (generic error), not None.
    assert reply is not None and reply.strip()


def test_toolsearchtool_empty_result_does_not_register_sentence_as_tool(
    mock_config, db, dialogue_memory, capsys
):
    """Regression: when toolSearchTool surfaces nothing, it returns the
    plain sentence ``"No additional tools found for that description."``
    as ``reply_text``. The engine's line-splitting merger used to treat
    that whole sentence as a tool name and append it to ``allowed_tools``,
    producing the field-log line ``🔧 Discovered 1 tool(s): No additional
    tools found for that description.`` and polluting the allow-list
    with a bogus entry. The parser must reject anything that is not an
    actual tool name from the registry.

    Two symptoms are checked: the engine's stdout (captured via ``capsys``)
    and the ``tools`` schemas handed to the chat model on every call.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        if tool_name == "toolSearchTool":
            return ToolExecutionResult(
                success=True,
                reply_text="No additional tools found for that description.",
                error_message=None,
            )
        return ToolExecutionResult(
            success=True, reply_text="ok", error_message=None
        )

    chat_responses = iter(
        [
            _assistant_tool_call(
                "toolSearchTool", {"query": "open youtube"}, call_id="c1"
            ),
            _assistant_content("I could not find a tool for that."),
        ]
    )
    captured_tools_params: list = []

    def fake_chat(*args, **kwargs):
        # Record the schema of every call so the allow-list can be
        # inspected after the run.
        captured_tools_params.append(kwargs.get("tools"))
        try:
            return next(chat_responses)
        except StopIteration:
            return _assistant_content("done")

    with patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner), \
         patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat), \
         patch.object(engine_mod, "select_tools", return_value=["stop"]), \
         patch.object(
             engine_mod,
             "extract_search_params_for_memory",
             return_value={"keywords": []},
         ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="open youtube",
            dialogue_memory=dialogue_memory,
        )

    # The user-facing `🔧 Discovered N tool(s):` line is the first
    # symptom of the bug — if the parser accepts the empty-result
    # sentence as a tool name, the log prints it verbatim.
    stdout = capsys.readouterr().out
    assert "No additional tools found for that description" not in stdout or (
        "🔍 No new tools found" in stdout
    ), (
        "Engine's toolSearchTool merger printed the empty-result sentence "
        "as a discovered tool name. Expected `🔍 No new tools found` "
        "instead. Full stdout:\n" + stdout
    )
    assert "🔧 Discovered" not in stdout or (
        "No additional tools found" not in stdout
    ), (
        "Engine logged `🔧 Discovered ... No additional tools found ...` "
        "— the sentence was misclassified as a tool name. Stdout:\n" + stdout
    )

    # Second symptom: the allow-list itself. No schema handed to the chat
    # model may advertise the empty-result sentence as a function name.
    advertised_names = []
    for tools in captured_tools_params:
        for entry in tools or []:
            function = entry.get("function") if isinstance(entry, dict) else None
            if isinstance(function, dict) and isinstance(function.get("name"), str):
                advertised_names.append(function["name"])
    bogus_names = [
        name for name in advertised_names if "No additional tools found" in name
    ]
    assert not bogus_names, (
        "Empty-result sentence leaked into the tools schema as a tool name: "
        f"{bogus_names}"
    )