Files
javis_bot/tests/test_engine_tool_carryover.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

228 lines
8.4 KiB
Python

"""End-to-end: tool-call + tool-result messages from one reply must be
visible to the LLM on the next reply within the hot window, so the model
can synthesise from prior results rather than re-fetching.
"""
from unittest.mock import Mock, patch
import pytest
from src.jarvis.memory.conversation import DialogueMemory
from src.jarvis.reply.engine import run_reply_engine
def _mock_cfg() -> Mock:
    """Build the mock configuration object used by the tests in this module.

    Every attribute is assigned a concrete value. Attributes that are not
    assigned here would be auto-created by ``Mock`` as child mocks, which are
    truthy and would not behave like real config values.

    Returns:
        A new ``Mock`` on every call, so a test can override individual
        attributes (for example ``ollama_chat_model``) without affecting
        other tests.
    """
    cfg = Mock()

    # Ollama endpoint and model names.
    cfg.ollama_base_url = "http://localhost:11434"
    cfg.ollama_chat_model = "test-large"  # avoid SMALL-model text-tool path
    cfg.ollama_embed_model = "test-embed"

    # Timeouts, in seconds.
    cfg.llm_tools_timeout_sec = 8.0
    cfg.llm_embed_timeout_sec = 10.0
    cfg.llm_chat_timeout_sec = 45.0
    cfg.llm_digest_timeout_sec = 8.0

    # Memory enrichment and digest switches.
    cfg.memory_enrichment_max_results = 5
    cfg.memory_enrichment_source = "diary"
    cfg.memory_digest_enabled = False
    cfg.tool_result_digest_enabled = False

    # Location features are all switched off.
    cfg.location_ip_address = None
    cfg.location_auto_detect = False
    cfg.location_enabled = False

    # Agentic loop and tool selection.
    cfg.agentic_max_turns = 8
    cfg.tool_search_max_calls = 3
    cfg.tool_selection_strategy = "all"

    # Tool carry-over settings exercised by the tests in this module.
    cfg.tool_carryover_max_turns = 2
    cfg.tool_carryover_per_entry_chars = 1200

    # Remaining flags.
    cfg.mcps = {}
    cfg.llm_thinking_enabled = False
    cfg.tts_engine = "none"
    cfg.voice_debug = False
    return cfg
@pytest.mark.unit
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.run_tool_with_retries")
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_tool_carryover_makes_prior_result_visible_to_next_turn(
    mock_chat, mock_extract, mock_tool, _mock_extract, _mock_plan
):
    """Turn 1 runs a tool; turn 2's LLM call must see that tool result.

    The first ``run_reply_engine`` call is scripted to emit a ``webSearch``
    tool call followed by a final reply. The second call, sharing the same
    ``DialogueMemory``, is then checked to have received a ``role == "tool"``
    message containing the turn-1 tool output in its ``messages`` argument.

    ``patch`` decorators are applied bottom-up, so the mock arguments arrive
    in the reverse order of the decorator list.
    """
    # Turn 1: model emits webSearch call, then final text.
    mock_tool.return_value = Mock(
        reply_text="Justin Bieber is a Canadian singer.",
        error_message=None,
    )
    mock_chat.side_effect = [
        # Turn 1a: tool call
        {"message": {"content": "", "tool_calls": [{
            "id": "c1", "type": "function",
            "function": {"name": "webSearch",
                         "arguments": {"query": "justin bieber"}},
        }]}},
        # Turn 1b: final reply
        {"message": {"content": "He is a Canadian singer."}},
        # Turn 2a: final reply directly — reuse from prior context
        {"message": {"content": "His breakout song was Baby."}},
    ]
    # One extracted text per scripted chat response, in the same order.
    mock_extract.side_effect = [
        "",
        "He is a Canadian singer.",
        "His breakout song was Baby.",
    ]

    db = Mock()
    cfg = _mock_cfg()
    dm = DialogueMemory()

    run_reply_engine(db=db, cfg=cfg, tts=None,
                     text="who is justin bieber",
                     dialogue_memory=dm)

    # Confirm carryover was recorded
    assert len(dm._tool_turns) == 1
    stored = dm._tool_turns[0][1]
    stored_roles = [m.get("role") for m in stored]
    assert "tool" in stored_roles
    assert any(m.get("tool_calls") for m in stored)

    # Turn 2: query on the same topic — the turn-2 LLM call should receive
    # the turn-1 tool messages in its `messages` argument.
    run_reply_engine(db=db, cfg=cfg, tts=None,
                     text="what is his most famous song",
                     dialogue_memory=dm)

    # The third chat_with_messages call is turn-2's only turn (single text).
    turn2_kwargs = mock_chat.call_args_list[-1].kwargs
    # Fall back to an empty list so a missing `messages` kwarg fails on the
    # descriptive assertion below rather than with a TypeError here.
    turn2_messages = turn2_kwargs.get("messages") or []
    roles_in_turn2 = [m.get("role") for m in turn2_messages]
    assert "tool" in roles_in_turn2, (
        "Expected prior tool-role message to be injected on turn 2; "
        f"got roles={roles_in_turn2}"
    )

    # The tool message content must be the prior webSearch result
    tool_contents = [
        m.get("content") for m in turn2_messages if m.get("role") == "tool"
    ]
    assert any("Canadian singer" in (c or "") for c in tool_contents)
@pytest.mark.unit
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.run_tool_with_retries")
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_stop_signal_clears_tool_carryover(
    mock_chat, mock_extract, mock_tool, _mock_extract, _mock_plan
):
    """Turn 1 runs a tool; turn 2 receives the stop signal. After turn 2,
    carryover must be empty so the next wake-word turn starts fresh.

    The tool mock returns a normal result for the first call and
    ``STOP_SIGNAL`` for the second. The test checks that the second
    ``run_reply_engine`` call returns ``None`` and that
    ``DialogueMemory._tool_turns`` is empty afterwards.
    """
    from src.jarvis.tools.builtin.stop import STOP_SIGNAL

    # First tool invocation yields a search result, the second the stop signal.
    mock_tool.side_effect = [
        Mock(reply_text="Justin Bieber is a Canadian singer.", error_message=None),
        Mock(reply_text=STOP_SIGNAL, error_message=None),
    ]
    mock_chat.side_effect = [
        # Turn 1a: tool call
        {"message": {"content": "", "tool_calls": [{
            "id": "c1", "type": "function",
            "function": {"name": "webSearch", "arguments": {"query": "bieber"}},
        }]}},
        # Turn 1b: final reply
        {"message": {"content": "He is a Canadian singer."}},
        # Turn 2: stop tool
        {"message": {"content": "", "tool_calls": [{
            "id": "c2", "type": "function",
            "function": {"name": "stop", "arguments": {}},
        }]}},
    ]
    # One extracted text per scripted chat response, in the same order.
    mock_extract.side_effect = ["", "He is a Canadian singer.", ""]

    db = Mock()
    cfg = _mock_cfg()
    dm = DialogueMemory()

    run_reply_engine(db=db, cfg=cfg, tts=None,
                     text="who is justin bieber", dialogue_memory=dm)
    assert len(dm._tool_turns) == 1, "turn-1 tool carryover should be recorded"

    reply = run_reply_engine(db=db, cfg=cfg, tts=None,
                             text="stop", dialogue_memory=dm)
    assert reply is None, "stop signal returns None"
    assert dm._tool_turns == [], (
        "stop signal must clear carryover so the next wake-word turn is clean"
    )
@pytest.mark.unit
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.run_tool_with_retries")
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_tool_carryover_text_tool_mode(
    mock_chat, mock_extract, mock_tool, _mock_extract, _mock_plan
):
    """Small-model path: tool results come back as role=user with a
    ``tool_name`` tag. Carryover must pick those up too.

    The chat model name is overridden so the engine takes the text-tool
    path, and the scripted first response carries the tool call inside a
    ``tool_call`` markdown fence instead of a native ``tool_calls`` field.
    """
    cfg = _mock_cfg()
    cfg.ollama_chat_model = "gemma4:e2b"  # triggers SMALL/text-tool path

    mock_tool.return_value = Mock(
        reply_text="Paris is the capital of France.", error_message=None,
    )
    fence_call = (
        "```tool_call\n"
        '{"name": "webSearch", "arguments": {"query": "paris"}}\n'
        "```"
    )
    mock_chat.side_effect = [
        # Turn 1a: text-tool call emitted inside a markdown fence
        {"message": {"content": fence_call}},
        # Turn 1b: final reply
        {"message": {"content": "Paris is in France."}},
        # Turn 2: follow-up reply
        {"message": {"content": "Its population is about 2.1 million."}},
    ]
    # One extracted text per scripted chat response, in the same order.
    mock_extract.side_effect = [
        fence_call,
        "Paris is in France.",
        "Its population is about 2.1 million.",
    ]

    db = Mock()
    dm = DialogueMemory()

    run_reply_engine(db=db, cfg=cfg, tts=None,
                     text="what about paris", dialogue_memory=dm)
    assert len(dm._tool_turns) == 1
    stored = dm._tool_turns[0][1]
    roles = [m.get("role") for m in stored]
    # Text-tool fallback stores tool results as role=user with tool_name.
    assert "user" in roles
    assert any(m.get("tool_name") == "webSearch" for m in stored)

    run_reply_engine(db=db, cfg=cfg, tts=None,
                     text="tell me more", dialogue_memory=dm)
    turn2_messages = mock_chat.call_args_list[-1].kwargs.get("messages") or []

    # The prior tool payload should appear in the turn-2 messages list —
    # either as role=tool (native) or role=user with tool_name (text-tool).
    tool_like = [
        m for m in turn2_messages
        if m.get("role") == "tool"
        or (m.get("role") == "user" and m.get("tool_name"))
    ]
    assert tool_like, (
        "expected prior text-tool result to be carried over; got roles="
        f"{[m.get('role') for m in turn2_messages]}"
    )
    assert any(
        "Paris" in (m.get("content") or "") for m in tool_like
    )