Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
563
tests/test_engine_tool_carryover_guard.py
Normal file
563
tests/test_engine_tool_carryover_guard.py
Normal file
@@ -0,0 +1,563 @@
|
||||
"""Engine-level tool carry-over guard.
|
||||
|
||||
Field trace (2026-05-03, gemma4:e2b):
|
||||
Turn 1 user: "how's the weather tomorrow Jarvis?" → no location set →
|
||||
assistant invokes ``getWeather``, tool returns ``success=False``
|
||||
("I couldn't auto-detect your location, please tell me a city"),
|
||||
assistant relays the request.
|
||||
Turn 2 user: "I'm in London" → small-model router picks ``webSearch``
|
||||
instead of ``getWeather``, planner falls back to a web search for
|
||||
"weather in london tomorrow", DDG fails, Wikipedia matches the 2014
|
||||
film "Edge of Tomorrow", and the assistant parrots the film summary
|
||||
as the weather answer.
|
||||
|
||||
Fix: when the previous assistant turn invoked a tool that reported
|
||||
``success=False`` on its ``ToolExecutionResult``, union the previous
|
||||
turn's tool name into the allow-list. The ``tool_failed`` flag stamped
|
||||
onto each recorded tool result is the truth source. Gating on failure
|
||||
(rather than recency or query length) means a successful chain followed
|
||||
by a genuine new short ask ("play some music") correctly does NOT carry
|
||||
over the prior tool.
|
||||
|
||||
The carry-over is an engine-side per-turn overlay: the router cache
|
||||
stores only the raw router output, so future identical queries are
|
||||
unaffected.
|
||||
"""
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.jarvis.memory.conversation import DialogueMemory
|
||||
from src.jarvis.reply.engine import run_reply_engine
|
||||
|
||||
|
||||
def _mock_cfg():
|
||||
cfg = Mock()
|
||||
cfg.ollama_base_url = "http://localhost:11434"
|
||||
cfg.ollama_chat_model = "test-large"
|
||||
cfg.voice_debug = False
|
||||
cfg.llm_tools_timeout_sec = 8.0
|
||||
cfg.llm_embed_timeout_sec = 10.0
|
||||
cfg.llm_chat_timeout_sec = 45.0
|
||||
cfg.llm_digest_timeout_sec = 8.0
|
||||
cfg.memory_enrichment_max_results = 5
|
||||
cfg.memory_enrichment_source = "diary"
|
||||
cfg.memory_digest_enabled = False
|
||||
cfg.tool_result_digest_enabled = False
|
||||
cfg.location_ip_address = None
|
||||
cfg.location_auto_detect = False
|
||||
cfg.location_enabled = False
|
||||
cfg.agentic_max_turns = 8
|
||||
cfg.tool_search_max_calls = 3
|
||||
cfg.tool_selection_strategy = "all"
|
||||
cfg.tool_carryover_max_turns = 2
|
||||
cfg.tool_carryover_per_entry_chars = 1200
|
||||
cfg.mcps = {}
|
||||
cfg.llm_thinking_enabled = False
|
||||
cfg.tts_engine = "none"
|
||||
cfg.ollama_embed_model = "test-embed"
|
||||
cfg.db_path = ":memory:"
|
||||
return cfg
|
||||
|
||||
|
||||
def _tool_names_from_chat_call(call) -> set[str]:
|
||||
"""Pull function names out of the OpenAI-style tools schema passed
|
||||
to chat_with_messages.
|
||||
"""
|
||||
schema = call.kwargs.get("tools") or []
|
||||
names: set[str] = set()
|
||||
for entry in schema:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
fn = entry.get("function") or {}
|
||||
nm = fn.get("name")
|
||||
if isinstance(nm, str):
|
||||
names.add(nm)
|
||||
return names
|
||||
|
||||
|
||||
def _failed_tool_turn(tool_name: str, tool_call_id: str = "c1") -> list[dict]:
|
||||
"""Plant a previous-turn tool turn where the tool was invoked and
|
||||
reported failure. Mirrors the message shape the engine records for a
|
||||
native tool call whose ``ToolExecutionResult.success`` was False.
|
||||
"""
|
||||
return [
|
||||
{"role": "assistant", "content": "", "tool_calls": [{
|
||||
"id": tool_call_id, "type": "function",
|
||||
"function": {"name": tool_name, "arguments": {}},
|
||||
}]},
|
||||
{"role": "tool", "tool_call_id": tool_call_id,
|
||||
"tool_name": tool_name,
|
||||
"content": "I couldn't auto-detect your location.",
|
||||
"tool_failed": True},
|
||||
]
|
||||
|
||||
|
||||
def _succeeded_tool_turn(tool_name: str, tool_call_id: str = "c1") -> list[dict]:
|
||||
"""Plant a previous-turn tool turn where the tool succeeded."""
|
||||
return [
|
||||
{"role": "assistant", "content": "", "tool_calls": [{
|
||||
"id": tool_call_id, "type": "function",
|
||||
"function": {"name": tool_name, "arguments": {"location": "London"}},
|
||||
}]},
|
||||
{"role": "tool", "tool_call_id": tool_call_id,
|
||||
"tool_name": tool_name,
|
||||
"content": "London: 15°C and partly cloudy.",
|
||||
"tool_failed": False},
|
||||
]
|
||||
|
||||
|
||||
def _failed_text_tool_turn(tool_name: str) -> list[dict]:
|
||||
"""Plant a previous-turn tool turn in the text-tool fallback shape
|
||||
(small models). Tool error is appended as a ``role=user`` message
|
||||
tagged with both ``tool_name`` and ``tool_failed=True``.
|
||||
"""
|
||||
return [
|
||||
{"role": "assistant",
|
||||
"content": (
|
||||
"```tool_call\n"
|
||||
'{"name": "' + tool_name + '", "arguments": {}}\n'
|
||||
"```"
|
||||
)},
|
||||
{"role": "user",
|
||||
"content": (
|
||||
"[Tool error: " + tool_name + "] I couldn't auto-detect "
|
||||
"your location."
|
||||
),
|
||||
"tool_name": tool_name,
|
||||
"tool_failed": True},
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_followup_carries_over_failed_previous_tool(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""Previous turn invoked ``getWeather`` and the tool reported failure;
|
||||
this turn's router only picked ``webSearch``. The engine must union
|
||||
``getWeather`` back in so the chat model can re-call it with the
|
||||
location the user just supplied.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Weather in London is 15°C."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Weather in London is 15°C."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "how's the weather tomorrow Jarvis")
|
||||
dm.record_tool_turn(_failed_tool_turn("getWeather"))
|
||||
dm.add_message("assistant", "I do not have a location set.")
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["webSearch"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="I'm in London", dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "webSearch" in tool_names, (
|
||||
f"router pick must remain visible; saw {sorted(tool_names)}"
|
||||
)
|
||||
assert "getWeather" in tool_names, (
|
||||
"previous-turn failed tool must be carried over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_successful_previous_tool_does_not_trigger_carryover(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""A successful prior tool call means the chain completed. A genuine
|
||||
new short ask ("log my breakfast") must NOT inherit the prior tool —
|
||||
that would noisily widen the allow-list for unrelated turns and
|
||||
risks small models replaying the previous tool. The router pick
|
||||
stands on its own.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Logged."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Logged."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "how's the weather in London")
|
||||
dm.record_tool_turn(_succeeded_tool_turn("getWeather"))
|
||||
dm.add_message("assistant", "It's 15°C and partly cloudy in London.")
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["logMeal"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="log my breakfast", dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "logMeal" in tool_names
|
||||
assert "getWeather" not in tool_names, (
|
||||
"successful prior tool must not be carried over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_cold_start_does_not_trigger_carryover(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""Empty dialogue memory — the carry-over path must be a no-op."""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Hello."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Hello."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory() # cold start — no prior turns
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["webSearch"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="hi", dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "webSearch" in tool_names
|
||||
assert "getWeather" not in tool_names
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_carryover_does_not_pollute_router_cache(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""The router cache stores the raw router output. Carry-over is a
|
||||
per-turn overlay layered on top — it must NOT be written back to the
|
||||
cache, otherwise every replay of the same query inherits a
|
||||
contaminated tool list.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Weather in London is 15°C."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Weather in London is 15°C."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "how's the weather tomorrow Jarvis")
|
||||
dm.record_tool_turn(_failed_tool_turn("getWeather"))
|
||||
dm.add_message("assistant", "I do not have a location set.")
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["webSearch"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="I'm in London", dialogue_memory=dm)
|
||||
|
||||
cached_router_entries = [
|
||||
(k, v) for k, v in dm._hot_cache.items() if k.startswith("router:")
|
||||
]
|
||||
assert cached_router_entries, "router output should have been cached"
|
||||
for key, (_ts, value) in cached_router_entries:
|
||||
assert value == ["webSearch"], (
|
||||
f"router cache for {key!r} should hold raw router output "
|
||||
f"['webSearch']; got {value!r}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_long_followup_still_carries_over_when_previous_failed(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""Failure-gated carry-over does NOT depend on query length. A long
|
||||
follow-up that supplies the missing arg ("Right, sorry — I'm in
|
||||
Edinburgh, please try the lookup again for tomorrow") must still
|
||||
carry over the failed tool. The earlier char-length heuristic was
|
||||
dropped because it false-negatived this shape; the failure flag is
|
||||
the right signal.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Edinburgh weather: 12°C."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Edinburgh weather: 12°C."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "how's the weather tomorrow Jarvis")
|
||||
dm.record_tool_turn(_failed_tool_turn("getWeather"))
|
||||
dm.add_message("assistant", "I do not have a location set.")
|
||||
|
||||
long_followup = (
|
||||
"Right, sorry — I'm in Edinburgh, please try the lookup again for "
|
||||
"tomorrow morning if you would."
|
||||
)
|
||||
assert len(long_followup) > 80
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["webSearch"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text=long_followup, dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "getWeather" in tool_names, (
|
||||
f"long follow-up to a failed tool must still carry over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_text_tool_fallback_failure_carries_over(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""Small-model path: the previous turn's tool error was stored as a
|
||||
``role=user`` message tagged with ``tool_name`` and
|
||||
``tool_failed=True``. The walker must collect the name from this
|
||||
shape too, not only from native ``assistant.tool_calls`` + ``role=tool``
|
||||
pairs.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Weather in Berlin is 9°C."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Weather in Berlin is 9°C."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "how's the weather")
|
||||
dm.record_tool_turn(_failed_text_tool_turn("getWeather"))
|
||||
dm.add_message("assistant", "I couldn't auto-detect your location.")
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["webSearch"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="I'm in Berlin", dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "getWeather" in tool_names, (
|
||||
"text-tool fallback failure shape must be carried over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
def test_multi_tool_call_only_failed_sibling_carries_over(
|
||||
mock_chat, mock_extract, _mock_extract_mem, _mock_plan,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""When an assistant message carries multiple tool_calls but only
|
||||
one of them failed, only the failed name must be carried over. The
|
||||
successful sibling stays the chat model's responsibility through
|
||||
its own routing.
|
||||
"""
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Sure."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Sure."]
|
||||
|
||||
db = Mock()
|
||||
cfg = _mock_cfg()
|
||||
dm = DialogueMemory()
|
||||
dm.add_message("user", "weather and search Pushkin")
|
||||
dm.record_tool_turn([
|
||||
{"role": "assistant", "content": "", "tool_calls": [
|
||||
{"id": "c-w", "type": "function",
|
||||
"function": {"name": "getWeather", "arguments": {}}},
|
||||
{"id": "c-s", "type": "function",
|
||||
"function": {"name": "webSearch", "arguments": {"query": "Pushkin"}}},
|
||||
]},
|
||||
# getWeather failed (no location), webSearch succeeded.
|
||||
{"role": "tool", "tool_call_id": "c-w",
|
||||
"tool_name": "getWeather",
|
||||
"content": "I couldn't auto-detect your location.",
|
||||
"tool_failed": True},
|
||||
{"role": "tool", "tool_call_id": "c-s",
|
||||
"tool_name": "webSearch",
|
||||
"content": "Pushkin was a Russian poet (1799-1837).",
|
||||
"tool_failed": False},
|
||||
])
|
||||
dm.add_message("assistant",
|
||||
"Pushkin was a Russian poet. I couldn't auto-detect "
|
||||
"your location for the weather lookup.")
|
||||
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["fetchWebPage"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="I'm in Paris", dialogue_memory=dm)
|
||||
|
||||
tool_names = _tool_names_from_chat_call(mock_chat.call_args_list[-1])
|
||||
assert "getWeather" in tool_names, (
|
||||
"failed sibling tool_call must be carried over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
assert "webSearch" not in tool_names, (
|
||||
"successful sibling tool_call must NOT be carried over; "
|
||||
f"saw {sorted(tool_names)}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
|
||||
@patch("src.jarvis.memory.graph_ops.build_warm_profile",
|
||||
return_value={"user": "", "directives": ""})
|
||||
@patch("src.jarvis.memory.graph.GraphMemoryStore")
|
||||
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
|
||||
@patch("src.jarvis.reply.engine.extract_text_from_response")
|
||||
@patch("src.jarvis.reply.engine.chat_with_messages")
|
||||
@patch("src.jarvis.reply.engine.run_tool_with_retries")
|
||||
def test_planner_direct_exec_stamps_tool_failed(
|
||||
mock_tool, mock_chat, mock_extract, _mock_extract_mem,
|
||||
_mock_graph, _mock_warm, _mock_fmt,
|
||||
):
|
||||
"""The planner's direct-exec path (text-tool mode + concrete plan
|
||||
step) appends tool results without going through the chat-model
|
||||
loop. Verify that path stamps ``tool_failed`` so the next turn's
|
||||
walker can see prior failures planted by direct-exec.
|
||||
"""
|
||||
from src.jarvis.tools.types import ToolExecutionResult
|
||||
|
||||
cfg = _mock_cfg()
|
||||
cfg.ollama_chat_model = "gemma4:e2b" # triggers SMALL/text-tool path
|
||||
|
||||
# First reply: planner emits a getWeather step, direct-exec runs the
|
||||
# tool which returns success=False (no location), then the chat
|
||||
# model produces a final text reply.
|
||||
mock_tool.return_value = ToolExecutionResult(
|
||||
success=False,
|
||||
reply_text="I couldn't auto-detect your location.",
|
||||
)
|
||||
mock_chat.side_effect = [
|
||||
{"message": {"content": "Tell me which city."}},
|
||||
]
|
||||
mock_extract.side_effect = ["Tell me which city."]
|
||||
|
||||
db = Mock()
|
||||
dm = DialogueMemory()
|
||||
|
||||
# Concrete plan step the resolver fast-path can parse without an LLM.
|
||||
with patch(
|
||||
"src.jarvis.reply.engine.plan_query",
|
||||
return_value=["getWeather", "Reply to the user."],
|
||||
), patch(
|
||||
"src.jarvis.reply.engine.select_tools",
|
||||
return_value=["getWeather"],
|
||||
):
|
||||
run_reply_engine(db=db, cfg=cfg, tts=None,
|
||||
text="how's the weather",
|
||||
dialogue_memory=dm)
|
||||
|
||||
# The direct-exec path should have recorded a tool turn with the
|
||||
# failure flag set so a follow-up turn can carry over getWeather.
|
||||
assert dm._tool_turns, (
|
||||
"planner direct-exec path must record a tool turn into "
|
||||
"dialogue memory carryover"
|
||||
)
|
||||
stored_msgs = [m for _ts, msgs in dm._tool_turns for m in msgs]
|
||||
failed_entries = [
|
||||
m for m in stored_msgs
|
||||
if m.get("tool_failed") and m.get("tool_name") == "getWeather"
|
||||
]
|
||||
assert failed_entries, (
|
||||
"direct-exec failure must stamp tool_failed=True; "
|
||||
f"stored messages: {stored_msgs}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_walker_logs_orphan_assistant_tool_call(caplog):
|
||||
"""When an assistant tool_call has no matching role=tool result in
|
||||
the recent window (e.g. truncation, scrub, eviction), the walker
|
||||
should fail-open and log a diagnostic — never crash, never silently
|
||||
widen the allow-list with the orphan name.
|
||||
"""
|
||||
from src.jarvis.reply.engine import _previous_turn_failed_tool_names
|
||||
|
||||
recent = [
|
||||
{"role": "user", "content": "weather please"},
|
||||
{"role": "assistant", "content": "", "tool_calls": [
|
||||
{"id": "c-orphan", "type": "function",
|
||||
"function": {"name": "getWeather", "arguments": {}}},
|
||||
]},
|
||||
# No matching role=tool result for c-orphan.
|
||||
{"role": "assistant", "content": "I couldn't auto-detect."},
|
||||
]
|
||||
|
||||
names = _previous_turn_failed_tool_names(recent)
|
||||
# No failed tool result was seen, so nothing carries over even
|
||||
# though an assistant tool_call exists.
|
||||
assert names == [], (
|
||||
f"orphan tool_call must not be carried over; got {names}"
|
||||
)
|
||||
Reference in New Issue
Block a user