Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
996
evals/test_evaluator_loop.py
Normal file
996
evals/test_evaluator_loop.py
Normal file
@@ -0,0 +1,996 @@
|
||||
"""
|
||||
Evaluator-Driven Agentic Loop Evaluations
|
||||
|
||||
Covers the evaluator's end-to-end behaviour against a real small model
|
||||
(gemma4:e2b by default): the per-turn terminal/continue decision, nudge
|
||||
injection, nudge cap enforcement, max-turn digest fallback, the
|
||||
toolSearchTool escape hatch, and multi-turn multi-tool complexity.
|
||||
|
||||
These evals complement the mock-LLM unit tests in
|
||||
``tests/test_evaluator.py`` and ``tests/test_engine_tool_search_loop.py``
|
||||
by observing what a live small model actually does when looped through
|
||||
the evaluator. Tool *implementations* are mocked for determinism; the
|
||||
chat model and the evaluator model run for real.
|
||||
|
||||
Run: ./scripts/run_evals.sh
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import (
|
||||
JUDGE_MODEL,
|
||||
ToolCallCapture,
|
||||
assert_not_fallback_reply,
|
||||
assert_not_max_turns_digest,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Canned tool payloads — short, deterministic, keyword-rich so the chat model
|
||||
# has something concrete to talk about after the evaluator forces the call.
|
||||
# =============================================================================
|
||||
|
||||
# Canned ``getWeather`` payload for Paris.
MOCK_WEATHER_PARIS = (
    "Current weather in Paris, France:\n"
    "Conditions: Partly cloudy\n"
    "Temperature: 14.2C\n"
    "Feels like: 12C\n"
    "Humidity: 68%\n"
    "Wind: 10 km/h from the south-west\n"
)

# Canned ``getWeather`` payload for London.
MOCK_WEATHER_LONDON = (
    "Current weather in London, United Kingdom:\n"
    "Conditions: Light rain\n"
    "Temperature: 9.1C\n"
    "Feels like: 7C\n"
    "Humidity: 82%\n"
    "Wind: 18 km/h from the west\n"
)

# Canned success reply for ``chrome-devtools__navigate_page`` (a JSON string).
MOCK_NAV_SUCCESS = '{"status": "ok", "url": "https://youtube.com"}'

# Canned ``toolSearchTool`` reply that surfaces the navigation tool.
MOCK_TOOLSEARCH_NAV = (
    "chrome-devtools__navigate_page: Navigate the active browser tab to a URL.\n"
    "stop: Explicit end-of-turn sentinel."
)

# Canned ``toolSearchTool`` reply for searches that surface nothing new.
MOCK_TOOLSEARCH_EMPTY = "No additional tools were found for this query."

# Canned ``webSearch`` reply: who directed Possessor (first hop of the chain).
MOCK_POSSESSOR_SEARCH = (
    "Web search results for 'Possessor film director':\n"
    "Possessor is a 2020 sci-fi horror film directed by Brandon Cronenberg, "
    "son of David Cronenberg. It stars Andrea Riseborough and Christopher "
    "Abbott.\n"
)

# Canned ``webSearch`` reply: the director's other films (second hop).
MOCK_CRONENBERG_FILMOGRAPHY = (
    "Web search results for 'Brandon Cronenberg filmography':\n"
    "Brandon Cronenberg's films include Antiviral (2012), Possessor (2020), "
    "and Infinity Pool (2023).\n"
)

# Canned ``webSearch`` reply: biography used to establish the entity in turn 1.
MOCK_HARRY_STYLES_BIO = (
    "Web search results for 'Harry Styles':\n"
    "Harry Styles is an English singer-songwriter, born 1 February 1994. "
    "Former member of One Direction; solo albums include Fine Line (2019) "
    "and Harry's House (2022).\n"
)

# Canned ``webSearch`` reply: songs, returned for song/music/album queries.
MOCK_HARRY_STYLES_SONGS = (
    "Web search results for 'Harry Styles famous songs':\n"
    "Notable songs: 'Watermelon Sugar' (2019), 'As It Was' (2022), "
    "'Sign of the Times' (2017), 'Adore You' (2019).\n"
)

# Canned ``webSearch`` reply with no live score; served on the first search.
MOCK_MADRID_STALE = (
    "Web search results for 'Real Madrid':\n"
    "Real Madrid CF is a Spanish football club founded in 1902. "
    "The club plays at the Santiago Bernabeu stadium.\n"
)

# Canned ``webSearch`` reply with the live score; served on later searches.
MOCK_MADRID_LIVE = (
    "Web search results for 'Real Madrid match live score':\n"
    "Real Madrid 2 - 1 Getafe (78'). Goals by Vinicius Jr and Bellingham.\n"
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _configure(mock_config):
    """Point ``mock_config`` at the local live model with the evaluator on.

    The object is mutated in place and also returned, so the call can be used
    either as a statement or as an expression.
    """
    overrides = {
        "ollama_base_url": "http://localhost:11434",
        "ollama_chat_model": JUDGE_MODEL,
        # Set explicitly even though the default (None) already enables the
        # evaluator for SMALL models: a change in model-size detection then
        # cannot silently turn it off and blur the cause of a failure.
        "evaluator_enabled": True,
        "evaluator_nudge_max": 2,
        "tool_search_max_calls": 3,
    }
    for attr_name, attr_value in overrides.items():
        setattr(mock_config, attr_name, attr_value)
    return mock_config
|
||||
|
||||
|
||||
def _make_router_stub(tools):
    """Build a stand-in for ``select_tools`` that ignores whatever it is given.

    Each invocation hands back a fresh ``list`` copy of ``tools``, so a caller
    mutating the result cannot affect later calls.
    """
    return lambda *_args, **_kwargs: list(tools)
|
||||
|
||||
|
||||
def _make_tool_runner(capture: ToolCallCapture, responder):
    """Build a ``run_tool_with_retries`` replacement backed by ``responder``.

    ``responder`` is called as ``responder(name, args)`` and returns the reply
    text for that tool call; a ``None`` result is reported as ``"OK"``. Every
    call is recorded on ``capture`` before the responder runs, and the call is
    always reported as successful.
    """
    from jarvis.tools.types import ToolExecutionResult

    def _runner(db, cfg, tool_name, tool_args, **kwargs):
        # Falsy arguments (None, empty dict) are normalised to a fresh dict.
        call_args = tool_args if tool_args else {}
        capture.record(tool_name, call_args)
        text = responder(tool_name, call_args)
        return ToolExecutionResult(
            success=True,
            reply_text="OK" if text is None else text,
        )

    return _runner
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 1. Premature-prose nudge: router says "just call the tool" but turn-1 is prose
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestPrematureProseNudge:
    """The router pre-seeds a tool that can do the job, yet the model may
    open with prose; the evaluator is expected to steer it back into the
    tool call."""

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.xfail(
        reason=(
            "Plumbing verified in unit tests (tests/test_engine_tool_search_loop.py, "
            "tests/test_evaluator.py). Live behaviour on gemma4:e2b is flaky: "
            "the small model sometimes refuses in prose despite the nudge. "
            "Tracked for iterative prompt tuning; architecture ships as-is."
        ),
        strict=False,
    )
    def test_navigate_prose_gets_nudged_into_tool_call(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """Asking to open YouTube must end in a navigate_page tool call."""
        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        capture = ToolCallCapture()

        # Canned reply per tool name; anything else answers "OK".
        canned = {
            "chrome-devtools__navigate_page": MOCK_NAV_SUCCESS,
            "toolSearchTool": MOCK_TOOLSEARCH_NAV,
        }

        def _respond(name, args):
            return canned.get(name, "OK")

        router = _make_router_stub(["chrome-devtools__navigate_page", "stop"])
        runner = _make_tool_runner(capture, _respond)

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: Kensington, UK", None),
                ):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Open the YouTube homepage.",
                dialogue_memory=eval_dialogue_memory,
            )

        names = capture.tool_names()
        shown = reply or ""
        print("\n📊 Premature-prose nudge:")
        print(f" tool calls: {names}")
        print(f" reply: {shown[:160]}...")

        assert "chrome-devtools__navigate_page" in names, (
            "Evaluator should have nudged the model into calling "
            "chrome-devtools__navigate_page. "
            f"Tools actually called: {names}. Reply: {shown[:200]!r}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 2. Terminal-on-success: one tool call, no thrashing
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestTerminalOnSuccessfulToolUse:
    """When the agent uses the correct tool and summarises the result, the
    evaluator must mark terminal; a single call should be enough."""

    @pytest.mark.eval
    @requires_judge_llm
    def test_single_weather_call_terminates(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """One ``getWeather`` call, then a grounded reply about Paris."""
        import re

        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        capture = ToolCallCapture()

        def _respond(name, args):
            if name == "getWeather":
                return MOCK_WEATHER_PARIS
            return "OK"

        router = _make_router_stub(["getWeather", "stop"])
        runner = _make_tool_runner(capture, _respond)

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: Paris, France", None),
                ):
            reply = run_reply_engine(
                db=eval_db, cfg=mock_config, tts=None,
                text="What's the weather in Paris?",
                dialogue_memory=eval_dialogue_memory,
            )

        weather_calls = [c for c in capture.calls if c["name"] == "getWeather"]
        print("\n📊 Terminal-on-success — Paris weather:")
        print(f" getWeather calls: {len(weather_calls)}")
        print(f" all tool calls: {capture.tool_names()}")
        print(f" reply: {(reply or '')[:200]}...")

        # Guard against the two shields that used to mask evaluator failures
        # here: the malformed-output fallback and the max-turns digest
        # caveat. Either means the loop did not terminate cleanly on the
        # first grounded tool summary, even when the surrounding content
        # reads correctly.
        assert_not_fallback_reply(reply, context="single-weather-terminal")
        assert_not_max_turns_digest(reply, context="single-weather-terminal")

        assert len(weather_calls) == 1, (
            f"Expected exactly one getWeather call (evaluator should terminate "
            f"after the first successful summary). Got {len(weather_calls)}: "
            f"{capture.tool_names()}"
        )
        assert reply, "Reply should be non-empty"
        lower = reply.lower()
        assert "paris" in lower, f"Reply should mention Paris. Got: {reply[:200]!r}"

        weather_terms = ["weather", "cloud", "temperat", "14", "°c"]
        # A number followed by a Celsius unit ("14.2c", "12 c", "14 °c",
        # "14 celsius"). This replaces the former bare "c " substring, which
        # matched any word ending in "c" before a space ("music ", "specific ")
        # and so let almost any reply through.
        celsius_reading = re.search(
            r"\d+(?:\.\d+)?\s*°?\s*(?:c\b|celsius)", lower
        )
        assert celsius_reading or any(t in lower for t in weather_terms), (
            f"Reply should reference weather facts from the tool payload. "
            f"Got: {reply[:200]!r}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 3. Terminal on honest "can't do": no action tool available
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestTerminalOnHonestCantDo:
    """No tool in the allow-list can perform the action and toolSearchTool
    finds nothing: the agent should decline honestly and the loop should end,
    without continuing forever or inventing a success."""

    @pytest.mark.eval
    @requires_judge_llm
    def test_no_email_tool_declines_honestly(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """An email request with no email tool must not be reported as sent."""
        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        capture = ToolCallCapture()

        canned = {
            "toolSearchTool": MOCK_TOOLSEARCH_EMPTY,
            "getWeather": MOCK_WEATHER_LONDON,
        }

        def _respond(name, args):
            return canned.get(name, "OK")

        # The allow-list deliberately contains nothing that can send email.
        router = _make_router_stub(["getWeather", "stop"])
        runner = _make_tool_runner(capture, _respond)

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: London, UK", None),
                ):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Send an email to my mum saying I'll be late.",
                dialogue_memory=eval_dialogue_memory,
            )

        print("\n📊 Honest can't-do:")
        print(f" tool calls: {capture.tool_names()}")
        print(f" reply: {(reply or '')[:240]}...")

        assert reply and reply.strip(), "Reply must not be empty"

        # Plain substring matching rather than a natural-language judgement,
        # so a failure points directly at the offending phrase.
        lower = reply.lower()
        forbidden = (
            "email has been sent",
            "i have sent",
            "i've sent",
            "i sent the email",
            "email sent successfully",
        )
        false_claims = [phrase for phrase in forbidden if phrase in lower]
        assert not false_claims, (
            f"❌ Reply falsely claims to have sent the email (no email tool "
            f"was available). Reply: {reply[:300]!r}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 4. Nudge-cap enforcement: pathological loop is capped cleanly
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestNudgeCapEnforcement:
    """If the evaluator keeps wanting to nudge while the model will not
    comply, the nudge cap is meant to end the loop before agentic_max_turns,
    and a non-empty reply must still come out.

    NOTE(review): the test below only asserts that the reply is non-empty; it
    does not itself check that the cap fired before the turn limit.
    """

    @pytest.mark.eval
    @requires_judge_llm
    def test_nudge_cap_stops_loop(self, mock_config, eval_db, eval_dialogue_memory):
        """A poem request with only a weather tool still yields a reply."""
        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        mock_config.evaluator_nudge_max = 1  # tight cap so the test is fast
        mock_config.agentic_max_turns = 4
        capture = ToolCallCapture()

        canned = {
            "getWeather": MOCK_WEATHER_LONDON,
            "toolSearchTool": MOCK_TOOLSEARCH_EMPTY,
        }

        def _respond(name, args):
            return canned.get(name, "OK")

        # The pre-seeded tool does not fit the request; the evaluator may
        # still try to nudge toward it, and the cap must stop the ping-pong.
        router = _make_router_stub(["getWeather", "stop"])
        runner = _make_tool_runner(capture, _respond)

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: London, UK", None),
                ):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Tell me a long poem about the sea.",
                dialogue_memory=eval_dialogue_memory,
            )

        shown = reply or ""
        print("\n📊 Nudge-cap enforcement:")
        print(f" tool calls: {capture.tool_names()}")
        print(f" reply length: {len(shown)}")
        print(f" reply: {shown[:240]}...")

        assert reply and reply.strip(), (
            "Reply must be non-empty even when the evaluator keeps wanting "
            "to nudge — the cap backstop must still deliver a reply."
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 5. Max-turn digest caveat: the loop never terminates, digest fires
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestMaxTurnDigestCaveat:
    """Behaviour: if the agentic loop uses up ``agentic_max_turns`` without
    ever producing a natural-language reply (a pathological loop of nothing
    but tool calls), the engine must still return a non-empty reply by
    running the digest backstop.

    Evaluator-driven coverage was removed when the evaluator was retired in
    favour of the planner. What the user cares about — "you must never be
    left with an empty reply, even if the loop misbehaves" — is asserted
    here without coupling to deprecated internals."""

    @pytest.mark.eval
    @requires_judge_llm
    def test_max_turn_triggers_digest(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """A chat model that only ever emits tool calls must hit the digest."""
        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        mock_config.agentic_max_turns = 3
        capture = ToolCallCapture()

        def _respond(name, args):
            return MOCK_WEATHER_LONDON if name == "getWeather" else "OK"

        router = _make_router_stub(["getWeather", "stop"])
        runner = _make_tool_runner(capture, _respond)

        # One entry per digest invocation, recording what it was handed.
        digest_spy_calls: list[dict] = []

        def _spy_digest(*, user_query, loop_messages, cfg, **_kwargs):
            record = {
                "user_query": user_query,
                "loop_messages_len": len(loop_messages),
            }
            digest_spy_calls.append(record)
            return (
                "(Heads up, I couldn't finish this one) Based on what I "
                "gathered so far, I don't have a complete answer."
            )

        # Stand-in chat model: every turn is a structured tool_call with empty
        # content, so the loop never sees a terminal text reply and runs out
        # of turns. A new dict is built on each call.
        def _always_tool_call(*_args, **_kwargs):
            weather_call = {
                "function": {
                    "name": "getWeather",
                    "arguments": {"location": "London"},
                }
            }
            assistant_message = {
                "role": "assistant",
                "content": "",
                "tool_calls": [weather_call],
            }
            return {"message": assistant_message}

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: London, UK", None),
                ), \
                patch("jarvis.reply.engine.chat_with_messages", side_effect=_always_tool_call), \
                patch("jarvis.reply.engine.digest_loop_for_max_turns", side_effect=_spy_digest):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Write me a very long essay about abstract algebra.",
                dialogue_memory=eval_dialogue_memory,
            )

        print("\n📊 Max-turn digest caveat:")
        print(f" digest invocations: {len(digest_spy_calls)}")
        print(f" tool calls: {capture.tool_names()}")
        print(f" reply: {(reply or '')[:240]}...")

        assert digest_spy_calls, (
            "digest_loop_for_max_turns must fire when the loop exhausts "
            "agentic_max_turns without producing a text reply."
        )
        first_digest = digest_spy_calls[0]
        assert first_digest["loop_messages_len"] > 0, (
            "Digest must receive the loop's accumulated messages, not an empty "
            "list. Got len=0."
        )
        assert reply and reply.strip(), "Reply must be non-empty after digest"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 6. toolSearchTool escape hatch: widen allow-list mid-loop, then act
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestToolSearchToolEscapeHatch:
    """The initial router pick is too narrow, so the model should call
    ``toolSearchTool`` to widen the allow-list and then call the tool that
    surfaces. Ordering is part of the contract: navigate comes AFTER
    toolSearchTool."""

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.xfail(
        reason=(
            "Plumbing verified in unit tests (tests/test_tool_search_tool.py, "
            "tests/test_engine_tool_search_loop.py). Live behaviour on "
            "gemma4:e2b is flaky: the small model often falls back to "
            "webSearch rather than invoking toolSearchTool. Tracked for "
            "iterative prompt tuning; architecture ships as-is."
        ),
        strict=False,
    )
    def test_toolsearchtool_widens_then_navigate(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """toolSearchTool is called first, navigate_page afterwards."""
        from jarvis.reply.engine import run_reply_engine

        _configure(mock_config)
        capture = ToolCallCapture()

        canned = {
            "toolSearchTool": MOCK_TOOLSEARCH_NAV,
            "chrome-devtools__navigate_page": MOCK_NAV_SUCCESS,
            "webSearch": "Web search results: YouTube is a video-sharing site.\n",
        }

        def _respond(name, args):
            return canned.get(name, "OK")

        # The router offers only webSearch, so the navigation tool can be
        # reached solely through the escape hatch.
        router = _make_router_stub(["webSearch", "stop"])
        runner = _make_tool_runner(capture, _respond)

        request_text = (
            "Open YouTube and tell me the title of the first trending "
            "video."
        )

        with patch("jarvis.reply.engine.select_tools", side_effect=router), \
                patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
                patch(
                    "jarvis.reply.engine.get_location_context_with_timezone",
                    return_value=("Location: Kensington, UK", None),
                ):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=request_text,
                dialogue_memory=eval_dialogue_memory,
            )

        names = capture.tool_names()
        print("\n📊 toolSearchTool escape hatch:")
        print(f" tool calls: {names}")
        print(f" reply: {(reply or '')[:240]}...")

        assert "toolSearchTool" in names, (
            f"Model must invoke toolSearchTool when the pre-seeded allow-list "
            f"has no navigation tool. Tools called: {names}"
        )
        assert "chrome-devtools__navigate_page" in names, (
            f"Navigation tool should have been invoked after toolSearchTool "
            f"widened the allow-list. Tools called: {names}"
        )
        # Positions of the first occurrence of each tool in the call sequence.
        search_pos = names.index("toolSearchTool")
        navigate_pos = names.index("chrome-devtools__navigate_page")
        assert navigate_pos > search_pos, (
            f"chrome-devtools__navigate_page must be invoked AFTER "
            f"toolSearchTool. Sequence: {names}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 7. Complex multi-turn / multi-tool scenarios
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestComplexMultiTurnMultiTool:
|
||||
"""Flavours of end-to-end complexity that stress the evaluator loop:
|
||||
chained research, parallel comparisons, cross-turn pronoun resolution,
|
||||
nudge-driven query refinement, and an escape-hatch follow-up."""
|
||||
|
||||
# ---- 7a ---------------------------------------------------------------
|
||||
@pytest.mark.eval
@requires_judge_llm
def test_chained_research_possessor_director(
    self, mock_config, eval_db, eval_dialogue_memory
):
    """Expect two different webSearch calls: the entity, then the filmography."""
    from jarvis.reply.engine import run_reply_engine

    _configure(mock_config)
    capture = ToolCallCapture()

    def _query_text(call_args):
        # Lower-cased concatenation of the string-valued arguments.
        return " ".join(
            str(v) for v in (call_args or {}).values() if isinstance(v, str)
        ).lower()

    # Any of these in the query marks it as the follow-up (filmography) hop.
    followup_markers = ("cronenberg", "filmograph", "directed", "brandon")

    def _respond(name, args):
        if name != "webSearch":
            return "OK"
        query = _query_text(args)
        if any(marker in query for marker in followup_markers):
            return MOCK_CRONENBERG_FILMOGRAPHY
        return MOCK_POSSESSOR_SEARCH

    router = _make_router_stub(["webSearch", "stop"])
    runner = _make_tool_runner(capture, _respond)

    with patch("jarvis.reply.engine.select_tools", side_effect=router), \
            patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
            patch(
                "jarvis.reply.engine.get_location_context_with_timezone",
                return_value=("Location: London, UK", None),
            ):
        reply = run_reply_engine(
            db=eval_db,
            cfg=mock_config,
            tts=None,
            text="Who directed Possessor and what else have they directed?",
            dialogue_memory=eval_dialogue_memory,
        )

    searches = [c for c in capture.calls if c["name"] == "webSearch"]
    print("\n📊 Chained research — Possessor + filmography:")
    print(f" webSearch count: {len(searches)}")
    for call in searches:
        print(f" args: {call['args']}")
    print(f" reply: {(reply or '')[:240]}...")

    assert len(searches) >= 2, (
        f"Expected at least two webSearch calls (entity, then "
        f"filmography). Got {len(searches)}: "
        f"{[c['args'] for c in searches]}"
    )
    # Identical queries would mean the second hop never advanced the chain.
    arg_fingerprints = {_query_text(c["args"]) for c in searches}
    assert len(arg_fingerprints) >= 2, (
        f"Both webSearch calls had identical args — chain was not "
        f"progressed. Args: {arg_fingerprints}"
    )
|
||||
|
||||
# ---- 7b ---------------------------------------------------------------
|
||||
@pytest.mark.eval
@requires_judge_llm
def test_parallel_comparison_paris_vs_london(
    self, mock_config, eval_db, eval_dialogue_memory
):
    """Two getWeather calls, different locations, reply mentions both."""
    from jarvis.reply.engine import run_reply_engine

    _configure(mock_config)
    capture = ToolCallCapture()

    def _respond(name, args):
        if name == "getWeather":
            loc = " ".join(
                str(v) for v in (args or {}).values() if isinstance(v, str)
            ).lower()
            # Anything that is not London is answered with the Paris payload.
            if "london" in loc:
                return MOCK_WEATHER_LONDON
            return MOCK_WEATHER_PARIS
        return "OK"

    router = _make_router_stub(["getWeather", "stop"])
    runner = _make_tool_runner(capture, _respond)

    with patch("jarvis.reply.engine.select_tools", side_effect=router), \
            patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
            patch(
                "jarvis.reply.engine.get_location_context_with_timezone",
                return_value=("Location: London, UK", None),
            ):
        reply = run_reply_engine(
            db=eval_db, cfg=mock_config, tts=None,
            text="Compare the weather in Paris and London right now.",
            dialogue_memory=eval_dialogue_memory,
        )

    weather_calls = [c for c in capture.calls if c["name"] == "getWeather"]
    # Distinct lower-cased argument strings across all getWeather calls.
    locs = {
        " ".join(
            str(v) for v in (c["args"] or {}).values() if isinstance(v, str)
        ).lower()
        for c in weather_calls
    }
    print("\n📊 Parallel comparison — Paris vs London:")
    print(f" getWeather calls: {len(weather_calls)}")
    print(f" distinct location args: {locs}")
    print(f" reply: {(reply or '')[:240]}...")

    assert len(weather_calls) >= 2, (
        f"Expected at least two getWeather calls (one per city). Got "
        f"{len(weather_calls)}: {[c['args'] for c in weather_calls]}"
    )
    has_paris = any("paris" in loc for loc in locs)
    has_london = any("london" in loc for loc in locs)
    assert has_paris and has_london, (
        f"getWeather must have been called for BOTH Paris and London. "
        f"Got location args: {locs}"
    )
    # Previously the reply check sat under ``if reply:``, so an empty or
    # missing reply passed silently. An empty reply is a failure here.
    assert reply and reply.strip(), (
        "Reply must be non-empty after both weather lookups."
    )
    lower = reply.lower()
    assert "paris" in lower and "london" in lower, (
        f"Reply should mention both Paris and London. Got: "
        f"{reply[:300]!r}"
    )
|
||||
|
||||
# ---- 7c ---------------------------------------------------------------
|
||||
@pytest.mark.eval
@requires_judge_llm
def test_cross_turn_pronoun_resolution(
    self, mock_config, eval_db, eval_dialogue_memory
):
    """Turn 2 resolves 'his' to the entity established in turn 1."""
    from jarvis.reply.engine import run_reply_engine

    _configure(mock_config)
    capture = ToolCallCapture()

    def _respond(name, args):
        if name == "webSearch":
            arg_str = " ".join(
                str(v) for v in (args or {}).values() if isinstance(v, str)
            ).lower()
            # Song-flavoured queries get the songs payload, all others the bio.
            if "song" in arg_str or "music" in arg_str or "album" in arg_str:
                return MOCK_HARRY_STYLES_SONGS
            return MOCK_HARRY_STYLES_BIO
        return "OK"

    router = _make_router_stub(["webSearch", "stop"])
    runner = _make_tool_runner(capture, _respond)

    with patch("jarvis.reply.engine.select_tools", side_effect=router), \
            patch("jarvis.reply.engine.run_tool_with_retries", side_effect=runner), \
            patch(
                "jarvis.reply.engine.get_location_context_with_timezone",
                return_value=("Location: London, UK", None),
            ):
        # Turn 1: establish entity
        capture.clear()
        run_reply_engine(
            db=eval_db, cfg=mock_config, tts=None,
            text="Who is Harry Styles?",
            dialogue_memory=eval_dialogue_memory,
        )
        turn1 = list(capture.calls)

        # Turn 2: pronoun. The capture is reset so only this turn's calls
        # are inspected below.
        capture.clear()
        reply2 = run_reply_engine(
            db=eval_db, cfg=mock_config, tts=None,
            text="What are his most famous songs?",
            dialogue_memory=eval_dialogue_memory,
        )
        turn2 = list(capture.calls)

    print("\n📊 Cross-turn pronoun resolution:")
    print(f" Turn 1 calls: {[c['name'] for c in turn1]}")
    print(f" Turn 2 calls: {turn2}")
    print(f" Turn 2 reply: {(reply2 or '')[:200]}...")

    turn2_searches = [c for c in turn2 if c["name"] == "webSearch"]
    assert turn2_searches, (
        f"Turn 2 must trigger a webSearch to answer the follow-up. "
        f"Got: {[c['name'] for c in turn2]}"
    )
    # At least one search arg must name the entity.
    resolved = False
    for c in turn2_searches:
        arg_str = " ".join(
            str(v) for v in (c["args"] or {}).values() if isinstance(v, str)
        ).lower()
        if "harry" in arg_str or "styles" in arg_str:
            resolved = True
            break
    assert resolved, (
        f"Turn 2 webSearch arg did not resolve 'his' to the entity "
        f"established in turn 1. Args: {[c['args'] for c in turn2_searches]}"
    )
    # Previously the reply check sat under ``if reply2:``, so an empty or
    # missing turn-2 reply passed silently. An empty reply is a failure here.
    assert reply2 and reply2.strip(), (
        "Turn 2 reply must be non-empty after the follow-up search."
    )
    lower = reply2.lower()
    mentions_song = any(
        k in lower for k in ("song", "watermelon", "as it was", "sign", "adore")
    )
    assert mentions_song, (
        f"Turn 2 reply should address the songs question. "
        f"Got: {reply2[:300]!r}"
    )
|
||||
|
||||
# ---- 7d ---------------------------------------------------------------
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_correction_loop_accepts_single_or_retry(
    self, mock_config, eval_db, eval_dialogue_memory
):
    """A live-score question must produce one or more ``webSearch`` calls.

    The stubbed search answers with the stale payload first and the live
    payload on any later call, so a retry triggered by a nudge is
    tolerated. A run that never searches fails the test.
    """
    from jarvis.reply.engine import run_reply_engine

    _configure(mock_config)
    capture = ToolCallCapture()

    def _respond(name, args):
        if name != "webSearch":
            return "OK"
        # Per the original author's note, capture.record has already run
        # for the current call, so the count includes it: a count of 1
        # means "first search" (stale), anything above means a retry (live).
        search_count = len([c for c in capture.calls if c["name"] == "webSearch"])
        if search_count > 1:
            return MOCK_MADRID_LIVE
        return MOCK_MADRID_STALE

    router = _make_router_stub(["webSearch", "stop"])
    runner = _make_tool_runner(capture, _respond)

    # Build the patchers up front; they only take effect inside ``with``.
    select_tools_patch = patch(
        "jarvis.reply.engine.select_tools", side_effect=router
    )
    tool_runner_patch = patch(
        "jarvis.reply.engine.run_tool_with_retries", side_effect=runner
    )
    location_patch = patch(
        "jarvis.reply.engine.get_location_context_with_timezone",
        return_value=("Location: London, UK", None),
    )

    with select_tools_patch, tool_runner_patch, location_patch:
        reply = run_reply_engine(
            db=eval_db,
            cfg=mock_config,
            tts=None,
            text="What's the score in the Real Madrid game?",
            dialogue_memory=eval_dialogue_memory,
        )

    web_searches = [c for c in capture.calls if c["name"] == "webSearch"]

    print(f"\n📊 Correction loop — Real Madrid score:")
    print(f" webSearch count: {len(web_searches)}")
    print(f" reply: {(reply or '')[:240]}...")

    assert len(web_searches) >= 1, (
        f"At least one webSearch must fire for a live-score query. "
        f"Tools called: {capture.tool_names()}"
    )
|
||||
|
||||
# ---- 7e ---------------------------------------------------------------
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Plumbing verified in unit tests. Live behaviour on gemma4:e2b "
|
||||
"is flaky on multi-turn escape-hatch flows: the small model "
|
||||
"sometimes refuses turn 1 in prose despite the nudge. Tracked "
|
||||
"for iterative prompt tuning; architecture ships as-is."
|
||||
),
|
||||
strict=False,
|
||||
)
|
||||
def test_escape_hatch_then_follow_up_action(
    self, mock_config, eval_db, eval_dialogue_memory
):
    """Two-turn flow: "Open YouTube." followed by "Now search for lo-fi beats."

    The router stub offers only ``webSearch`` and ``stop``, so the first
    turn is expected to need the ``toolSearchTool`` escape hatch to reach
    navigation. Both turns must make at least one tool call, and some
    turn-2 call must carry a string argument that names the subject
    ('lo-fi', 'lofi', 'lo fi' or 'beats').
    """
    from jarvis.reply.engine import run_reply_engine

    _configure(mock_config)
    capture = ToolCallCapture()

    # Canned tool outputs keyed by tool name; anything else answers "OK".
    canned_outputs = {
        "toolSearchTool": MOCK_TOOLSEARCH_NAV,
        "chrome-devtools__navigate_page": MOCK_NAV_SUCCESS,
        "webSearch": (
            "Web search results for 'lo-fi beats':\n"
            "Top results: Lofi Girl's YouTube radio, Chillhop Music, "
            "and Nujabes playlists.\n"
        ),
    }

    def _respond(name, args):
        return canned_outputs.get(name, "OK")

    # Narrow initial pick so the escape hatch is needed.
    router = _make_router_stub(["webSearch", "stop"])
    runner = _make_tool_runner(capture, _respond)

    def _run_turn(utterance):
        """Reset the capture, run one engine turn, return (reply, calls)."""
        capture.clear()
        answer = run_reply_engine(
            db=eval_db,
            cfg=mock_config,
            tts=None,
            text=utterance,
            dialogue_memory=eval_dialogue_memory,
        )
        return answer, list(capture.calls)

    def _string_args(call):
        """Lower-cased, space-joined string argument values of one call."""
        values = (call["args"] or {}).values()
        return " ".join(str(v) for v in values if isinstance(v, str)).lower()

    select_tools_patch = patch(
        "jarvis.reply.engine.select_tools", side_effect=router
    )
    tool_runner_patch = patch(
        "jarvis.reply.engine.run_tool_with_retries", side_effect=runner
    )
    location_patch = patch(
        "jarvis.reply.engine.get_location_context_with_timezone",
        return_value=("Location: London, UK", None),
    )

    with select_tools_patch, tool_runner_patch, location_patch:
        _, turn1 = _run_turn("Open YouTube.")
        reply2, turn2 = _run_turn("Now search for lo-fi beats.")

    print(f"\n📊 Escape hatch + follow-up:")
    print(f" Turn 1 calls: {[c['name'] for c in turn1]}")
    print(f" Turn 2 calls: {turn2}")
    print(f" Turn 2 reply: {(reply2 or '')[:200]}...")

    assert turn1, "Turn 1 should have at least one tool call"
    assert turn2, "Turn 2 should have at least one tool call"

    # Turn 2's tool call arg must contain the self-contained keyword.
    keywords = ("lo-fi", "lofi", "lo fi", "beats")
    found_lofi = any(
        any(keyword in _string_args(call) for keyword in keywords)
        for call in turn2
    )
    assert found_lofi, (
        f"Turn 2 tool arg must contain the self-contained keyword "
        f"'lo-fi' (or a reasonable paraphrase). Calls: {turn2}"
    )
|
||||
|
||||
|
||||
# =============================================================================
# 8. Structured tool_call emission — a textual nudge alone is not enough: the
# evaluator must also emit a structured {name, arguments} payload that the
# engine can execute directly. This is the recovery path for small chat models
# that routinely ignore textual nudges.
# =============================================================================
|
||||
|
||||
|
||||
class TestStructuredToolCallEmission:
    """Live check that the evaluator fills in the structured ``tool_call``.

    The evaluator is asked for a machine-executable ``tool_call`` in
    addition to its textual nudge. This suite confirms that a live
    small-model evaluator supplies one when the user's intent is
    unambiguous.
    """

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.xfail(
        reason=(
            "Prompt compliance depends on the live small evaluator model. "
            "Deterministic coverage lives in tests/test_evaluator.py "
            "(parse) and tests/test_engine_tool_search_loop.py (direct-exec). "
            "Tracked for iterative prompt tuning; architecture ships as-is."
        ),
        strict=False,
    )
    def test_evaluator_emits_structured_tool_call_for_obvious_search(
        self, mock_config
    ):
        """An offer to search, made in prose, must become a ``webSearch`` call.

        The evaluator is shown an assistant reply that merely offers to
        search. It must decline to end the turn and return a ``tool_call``
        dict naming ``webSearch`` with non-empty arguments that mention
        'china'.
        """
        from jarvis.reply.evaluator import evaluate_turn

        _configure(mock_config)

        # The assistant asked permission instead of calling the tool.
        prose_only_reply = (
            "I can look that up for you. Would you like me to search the "
            "web for an overview of China?"
        )
        tool_catalogue = [
            ("webSearch", "Search the web and return ranked results."),
            ("stop", "Explicit end-of-turn sentinel."),
        ]

        verdict = evaluate_turn(
            user_query="Give me an overview of China.",
            assistant_response_summary=prose_only_reply,
            available_tools=tool_catalogue,
            turns_used=1,
            cfg=mock_config,
        )

        print(f"\n📊 Structured tool_call emission:")
        print(f" terminal: {verdict.terminal}")
        print(f" nudge: {verdict.nudge!r}")
        print(f" tool_call: {verdict.tool_call!r}")

        assert verdict.terminal is False, (
            "Evaluator should continue: the agent offered prose instead of "
            "calling webSearch. "
            f"Got terminal={verdict.terminal}, reason={verdict.reason!r}."
        )
        assert isinstance(verdict.tool_call, dict), (
            "Evaluator should emit a structured tool_call so the engine can "
            "run the search directly without relying on the chat model to "
            f"parse the textual nudge. Got tool_call={verdict.tool_call!r}."
        )
        assert verdict.tool_call.get("name") == "webSearch", (
            f"Structured tool_call.name should be 'webSearch'. "
            f"Got {verdict.tool_call!r}."
        )

        call_args = verdict.tool_call.get("arguments") or {}
        assert isinstance(call_args, dict) and call_args, (
            "Structured tool_call.arguments should be a non-empty dict with "
            f"the intended query. Got {verdict.tool_call!r}."
        )

        # Only string-valued arguments are searched for the subject.
        lowered_values = [
            str(value).lower()
            for value in call_args.values()
            if isinstance(value, str)
        ]
        arg_blob = " ".join(lowered_values)
        assert "china" in arg_blob, (
            f"Structured tool_call.arguments should mention 'china'. "
            f"Got {verdict.tool_call!r}."
        )
|
||||
Reference in New Issue
Block a user