Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
520 lines
19 KiB
Python
520 lines
19 KiB
Python
"""Integration test for the toolSearchTool escape hatch and related loop behaviours.

Scenario: the router picks a narrow initial tool set. Mid-loop the chat model
realises it needs a different tool and invokes ``toolSearchTool``. The engine
dispatches it, merges the returned tool names into the per-turn allow-list,
and the next turn calls the newly-surfaced tool (``getWeather``). The final
content is delivered immediately.
"""
|
|
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
|
|
def _assistant_tool_call(name: str, args: dict, call_id: str = "call_1"):
    """Build a fake chat response in which the assistant requests one tool.

    Args:
        name: Tool name placed under ``function.name``.
        args: Argument mapping placed (unmodified) under ``function.arguments``.
        call_id: Identifier stored as the tool call's ``id``.

    Returns:
        A dict of the form ``{"message": {...}}`` whose message has the
        ``assistant`` role, empty content and a single-entry ``tool_calls``
        list.
    """
    invocation = {
        "id": call_id,
        "type": "function",
        "function": {"name": name, "arguments": args},
    }
    message = {"role": "assistant", "content": "", "tool_calls": [invocation]}
    return {"message": message}
|
|
|
|
|
|
def _assistant_content(text: str):
    """Build a fake chat response carrying plain assistant text.

    Args:
        text: The string stored as the message ``content``.

    Returns:
        ``{"message": {"role": "assistant", "content": text}}``.
    """
    message = {"role": "assistant", "content": text}
    return {"message": message}
|
|
|
|
|
|
def test_loop_merges_toolsearchtool_results_into_allowlist(
    mock_config, db, dialogue_memory
):
    """A tool surfaced by ``toolSearchTool`` can be dispatched on a later turn.

    ``select_tools`` is patched to return only ``webSearch`` and ``stop``.
    The scripted chat model first calls ``toolSearchTool`` (whose fake result
    names ``getWeather``), then calls ``getWeather``, then emits final text.
    The test checks that both tools reached the tool runner in that order and
    that the reply mentions London.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"  # LARGE → no forced text tools

    # Canned reply text per tool name; any other tool answers "result".
    canned_replies = {
        # Names a tool that was NOT in the initial pick.
        "toolSearchTool": "getWeather: Report current weather.",
        "getWeather": "London: 12C partly cloudy.",
    }
    dispatched: list[tuple[str, dict]] = []

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        dispatched.append((tool_name, tool_args or {}))
        return ToolExecutionResult(
            success=True,
            reply_text=canned_replies.get(tool_name, "result"),
            error_message=None,
        )

    scripted_turns = iter(
        (
            # Turn 1: model calls toolSearchTool.
            _assistant_tool_call(
                "toolSearchTool", {"query": "current weather in london"}
            ),
            # Turn 2: model uses the newly-surfaced getWeather.
            _assistant_tool_call(
                "getWeather", {"location": "London"}, call_id="call_2"
            ),
            # Turn 3: final reply.
            _assistant_content("It's 12C and partly cloudy in London."),
        )
    )

    def fake_chat(*args, **kwargs):
        # Once the script is exhausted, keep answering with a harmless reply.
        return next(scripted_turns, _assistant_content("Done."))

    with (
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "select_tools", return_value=["webSearch", "stop"]),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
    ):
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="how's the weather in london?",
            dialogue_memory=dialogue_memory,
        )

    tool_names = [entry[0] for entry in dispatched]
    assert "toolSearchTool" in tool_names, (
        f"Expected toolSearchTool to be invoked; got {tool_names}"
    )
    assert "getWeather" in tool_names, (
        "Expected getWeather (surfaced mid-loop by toolSearchTool) to be "
        f"invoked on a subsequent turn; got {tool_names}"
    )
    # Ordering matters: getWeather may only be dispatched after the
    # toolSearchTool call that surfaced it.
    search_pos = tool_names.index("toolSearchTool")
    weather_pos = tool_names.index("getWeather")
    assert weather_pos > search_pos
    assert reply and "London" in reply
|
|
|
|
|
|
def test_initial_allowlist_always_includes_toolsearchtool(
    mock_config, db, dialogue_memory
):
    """Even when the router returns no additional tools, the engine must
    always append ``toolSearchTool`` so the escape hatch is reachable.

    The allow-list is observed by wrapping ``generate_tools_json_schema``
    with a spy that records the ``allowed_tools`` argument of every call
    and then delegates to the real generator.
    """
    from jarvis.reply import engine as engine_mod

    mock_config.ollama_chat_model = "gpt-oss:20b"

    captured_allow_lists: list[list[str]] = []
    # Grab the real generator before it is patched so the spy can delegate.
    real_generate = engine_mod.generate_tools_json_schema

    def fake_chat(*args, **kwargs):
        # The chat model answers immediately; the allow-list itself is
        # captured by spy_schema below, not here.
        return _assistant_content("Hello back!")

    def spy_schema(allowed_tools, *args, **kwargs):
        # Snapshot with list(): the engine may mutate its allow-list later.
        # Extra positional/keyword arguments are forwarded untouched so the
        # spy does not depend on how the engine passes them.
        captured_allow_lists.append(list(allowed_tools))
        return real_generate(allowed_tools, *args, **kwargs)

    with (
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "select_tools", return_value=["stop"]),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
        patch.object(
            engine_mod, "generate_tools_json_schema", side_effect=spy_schema
        ),
    ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="hi",
            dialogue_memory=dialogue_memory,
        )

    assert captured_allow_lists, "generate_tools_json_schema was never called"
    # The engine now runs the router before the planner, which builds an
    # auxiliary schema for the planner's tool catalogue (router-narrowed,
    # no escape hatch) before the final chat-model schema. The escape hatch
    # only joins in the chat-model allow-list. Assert it appears somewhere
    # in the captured calls — implementations are free to reuse the same
    # schema generator at multiple call sites.
    assert any("toolSearchTool" in al for al in captured_allow_lists), (
        f"toolSearchTool missing from any allow-list: {captured_allow_lists}"
    )
|
|
|
|
|
|
def test_schema_regenerated_after_toolsearchtool_merge(
    mock_config, db, dialogue_memory
):
    """F1: the tools schema is rebuilt once ``toolSearchTool`` adds a tool.

    The fake ``toolSearchTool`` result names ``getWeather``. The ``tools``
    keyword argument of every chat call is recorded, and the second call
    (the one made after the merge) must list ``getWeather``.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"  # LARGE → native tools

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        is_search = tool_name == "toolSearchTool"
        return ToolExecutionResult(
            success=True,
            reply_text="getWeather: Report current weather." if is_search else "done",
            error_message=None,
        )

    scripted_turns = iter(
        (
            _assistant_tool_call("toolSearchTool", {"query": "weather"}, call_id="c1"),
            _assistant_content("All good."),
        )
    )
    captured_tools_params: list = []

    def fake_chat(*args, **kwargs):
        captured_tools_params.append(kwargs.get("tools"))
        return next(scripted_turns, _assistant_content("done"))

    def schema_entry_name(entry):
        """Return ``entry["function"]["name"]`` or None if the shape differs."""
        if not isinstance(entry, dict):
            return None
        function = entry.get("function")
        if not isinstance(function, dict):
            return None
        return function.get("name")

    with (
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "select_tools", return_value=["webSearch", "stop"]),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
    ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="weather?",
            dialogue_memory=dialogue_memory,
        )

    # At least two LLM calls are expected: one before the merge and one
    # after it. The second must carry getWeather in its tools schema.
    assert len(captured_tools_params) >= 2
    post_merge_schema = captured_tools_params[1] or []
    names = [nm for nm in map(schema_entry_name, post_merge_schema) if nm]
    assert "getWeather" in names, (
        f"Expected getWeather in post-merge tools schema; got {names}"
    )
|
|
|
|
|
|
def test_tool_search_max_calls_cap(mock_config, db, dialogue_memory):
    """F5: toolSearchTool invocations are capped per reply.

    With ``tool_search_max_calls`` set to 2, the scripted model requests
    ``toolSearchTool`` on four consecutive turns before answering; exactly
    two of those requests may reach the tool runner.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.tool_search_max_calls = 2

    search_dispatches = 0

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        nonlocal search_dispatches
        if tool_name != "toolSearchTool":
            return ToolExecutionResult(
                success=True, reply_text="ok", error_message=None
            )
        search_dispatches += 1
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found for that description.",
            error_message=None,
        )

    # The model keeps asking for toolSearchTool (queries a–d, ids c1–c4);
    # only the last scripted turn carries final content.
    scripted_turns = iter(
        [
            *(
                _assistant_tool_call("toolSearchTool", {"query": query}, call_id=f"c{n}")
                for n, query in enumerate("abcd", start=1)
            ),
            _assistant_content("All right, giving up."),
        ]
    )

    def fake_chat(*args, **kwargs):
        return next(scripted_turns, _assistant_content("done"))

    with (
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "select_tools", return_value=["webSearch", "stop"]),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
    ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="hello",
            dialogue_memory=dialogue_memory,
        )

    assert search_dispatches == 2, (
        f"Expected cap to limit dispatch to 2; got {search_dispatches}"
    )
|
|
|
|
|
|
def test_validate_tool_args_catches_unknown_keys():
    """Schema validator rejects an argument key the tool does not declare.

    ``webSearch`` is called with ``query``; the returned error must mention
    an unknown argument and name ``search_query``. This unknown-key case is
    the failure mode seen in the field log.
    """
    from jarvis.reply.engine import _validate_tool_args_against_schema

    wrong_key_args = {"query": "tube strikes today"}
    message = _validate_tool_args_against_schema(
        "webSearch", wrong_key_args, mcp_tools=None
    )

    assert message is not None
    assert "unknown argument" in message.lower()
    assert "search_query" in message
|
|
|
|
|
|
def test_validate_tool_args_passes_correct_keys():
    """Schema validator returns None when ``webSearch`` gets ``search_query``."""
    from jarvis.reply.engine import _validate_tool_args_against_schema

    valid_args = {"search_query": "tube strikes today"}
    message = _validate_tool_args_against_schema(
        "webSearch", valid_args, mcp_tools=None
    )

    assert message is None
|
|
|
|
|
|
def test_validate_tool_args_catches_missing_required():
    """Schema validator reports a missing required argument for empty args."""
    from jarvis.reply.engine import _validate_tool_args_against_schema

    no_args = {}
    message = _validate_tool_args_against_schema(
        "webSearch", no_args, mcp_tools=None
    )

    assert message is not None
    assert "missing required" in message.lower()
|
|
|
|
|
|
def test_max_turns_produces_digest(mock_config, db, dialogue_memory):
    """Exhausting ``agentic_max_turns`` with tool calls only yields the digest.

    The fake chat model requests ``toolSearchTool`` on every turn and never
    produces content, so with ``agentic_max_turns = 3`` the loop runs out of
    turns. ``digest_loop_for_max_turns`` is patched with a recorder whose
    return value must become the engine's reply.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.agentic_max_turns = 3

    digest_text = "Couldn't finish: I was still working through the request."
    captured = {}

    def fake_chat(*args, **kwargs):
        # Always another tool call, never content: forces the max-turns path.
        return _assistant_tool_call("toolSearchTool", {"query": "a"}, call_id="c1")

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found.",
            error_message=None,
        )

    def fake_digest(user_query, loop_messages, cfg):
        captured.update(user_query=user_query, loop_messages=loop_messages)
        return digest_text

    with (
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(
            engine_mod, "select_tools", return_value=["toolSearchTool", "stop"]
        ),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
        patch.object(
            engine_mod, "digest_loop_for_max_turns", side_effect=fake_digest
        ),
    ):
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="do something complicated",
            dialogue_memory=dialogue_memory,
        )

    assert reply == "Couldn't finish: I was still working through the request."
    assert captured.get("user_query"), "digest should receive the user query"
    assert isinstance(captured.get("loop_messages"), list)
|
|
|
|
|
|
def test_max_turns_digest_failure_falls_back_to_generic_error(
    mock_config, db, dialogue_memory
):
    """A failed digest after max turns still produces a non-empty reply.

    The chat model only ever emits tool calls, so no candidate reply text
    exists when ``agentic_max_turns`` (2) is reached. The digest is patched
    to return None (e.g. a timeout); the engine must then fall back to some
    non-blank reply instead of returning None.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"
    mock_config.agentic_max_turns = 2

    def fake_chat(*args, **kwargs):
        # Tool call on every turn: the loop never sees assistant content.
        return _assistant_tool_call("toolSearchTool", {"query": "a"}, call_id="c1")

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        return ToolExecutionResult(
            success=True,
            reply_text="No additional tools found.",
            error_message=None,
        )

    with (
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(
            engine_mod, "select_tools", return_value=["toolSearchTool", "stop"]
        ),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
        patch.object(
            engine_mod, "digest_loop_for_max_turns", return_value=None
        ),
    ):
        reply = engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="do something complicated",
            dialogue_memory=dialogue_memory,
        )

    # Some reply text (the generic error) is required; None or blank fails.
    assert reply is not None and reply.strip()
|
|
|
|
|
|
def test_toolsearchtool_empty_result_does_not_register_sentence_as_tool(
    mock_config, db, dialogue_memory, capsys
):
    """Regression: when toolSearchTool surfaces nothing, it returns the
    plain sentence ``"No additional tools found for that description."``
    as ``reply_text``. The engine's line-splitting merger used to treat
    that whole sentence as a tool name and append it to ``allowed_tools``,
    producing the field-log line ``🔧 Discovered 1 tool(s): No additional
    tools found for that description.`` and polluting the allow-list
    with a bogus entry. The parser must reject anything that is not an
    actual tool name from the registry.

    Two symptoms are checked: the captured stdout must not present the
    sentence as a discovered tool, and no tools schema handed to the chat
    model may contain an entry named after the sentence.
    """
    from jarvis.reply import engine as engine_mod
    from jarvis.tools.types import ToolExecutionResult

    mock_config.ollama_chat_model = "gpt-oss:20b"

    def fake_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        if tool_name == "toolSearchTool":
            return ToolExecutionResult(
                success=True,
                reply_text="No additional tools found for that description.",
                error_message=None,
            )
        return ToolExecutionResult(
            success=True, reply_text="ok", error_message=None
        )

    chat_responses = iter(
        [
            _assistant_tool_call(
                "toolSearchTool", {"query": "open youtube"}, call_id="c1"
            ),
            _assistant_content("I could not find a tool for that."),
        ]
    )
    captured_tools_params: list = []

    def fake_chat(*args, **kwargs):
        # Record the tools schema of every chat call for the schema-level
        # assertion at the end of the test.
        captured_tools_params.append(kwargs.get("tools"))
        return next(chat_responses, _assistant_content("done"))

    with (
        patch.object(engine_mod, "run_tool_with_retries", side_effect=fake_tool_runner),
        patch.object(engine_mod, "chat_with_messages", side_effect=fake_chat),
        patch.object(engine_mod, "select_tools", return_value=["stop"]),
        patch.object(
            engine_mod,
            "extract_search_params_for_memory",
            return_value={"keywords": []},
        ),
    ):
        engine_mod.run_reply_engine(
            db=db,
            cfg=mock_config,
            tts=None,
            text="open youtube",
            dialogue_memory=dialogue_memory,
        )

    # The user-facing `🔧 Discovered N tool(s):` line is the first
    # symptom of the bug — if the parser accepts the empty-result
    # sentence as a tool name, the log prints it verbatim.
    stdout = capsys.readouterr().out
    assert "No additional tools found for that description" not in stdout or (
        "🔍 No new tools found" in stdout
    ), (
        "Engine's toolSearchTool merger printed the empty-result sentence "
        "as a discovered tool name. Expected `🔍 No new tools found` "
        "instead. Full stdout:\n" + stdout
    )
    assert "🔧 Discovered" not in stdout or (
        "No additional tools found" not in stdout
    ), (
        "Engine logged `🔧 Discovered ... No additional tools found ...` "
        "— the sentence was misclassified as a tool name. Stdout:\n" + stdout
    )

    # Second symptom: the bogus entry leaking into a tools schema sent to
    # the chat model. Entries that are not dicts with a dict ``function``
    # are skipped; a None schema (no tools passed) counts as empty.
    offered_names = []
    for schema in captured_tools_params:
        for entry in schema or []:
            function = entry.get("function") if isinstance(entry, dict) else None
            if isinstance(function, dict) and function.get("name"):
                offered_names.append(str(function["name"]))
    assert not any("No additional tools found" in nm for nm in offered_names), (
        "Empty-result sentence leaked into a tools schema as a tool name: "
        f"{offered_names}"
    )
|