Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
171 lines
6.6 KiB
Python
171 lines
6.6 KiB
Python
"""
|
|
End-to-end eval — two-turn flow where the user supplies a missing tool
argument on the second turn.

Field trace (2026-05-03, gemma4:e2b):

  Turn 1: "how's the weather tomorrow Jarvis?"
    → location not configured → getWeather reports "no location set"
    → assistant asks the user for a location.

  Turn 2: "I'm in London"
    → small router picks webSearch (not getWeather), planner does
      `webSearch query='weather in london tomorrow'`, DDG bot-challenges,
      Wikipedia fallback matches "Edge of Tomorrow" (the 2014 Tom Cruise
      film) on the keyword "tomorrow", and the assistant parrots the film
      summary as the weather answer.

The fix lives at the engine level: when the previous assistant turn
invoked a tool and the current user query is a short follow-up
(≤ ~80 chars), the previous tool name is unioned back into the allow-list
so the chat model can continue the original tool chain with the new info.

This eval drives the full reply engine over both turns and asserts that
``getWeather`` is invoked twice — once with empty args (turn 1) and once
with ``location='London'`` (turn 2) — and that the final reply mentions
the London forecast, not "Edge of Tomorrow".

Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh followup_supplies_missing_tool_arg
|
|
"""
|
|
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from conftest import requires_judge_llm
|
|
from helpers import (
|
|
ToolCallCapture,
|
|
assert_not_fallback_reply,
|
|
JUDGE_MODEL,
|
|
)
|
|
|
|
|
|
# Canned forecast text that the mocked getWeather runner returns whenever a
# non-empty location is supplied. Adjacent string literals are concatenated
# at compile time into a single three-line string.
_LONDON_FORECAST = (
    "Weather for London, UK:\n"
    "Today: 15°C, partly cloudy. High 17°C, low 10°C.\n"
    "Tomorrow: 14°C, light rain, high 16°C, low 9°C."
)
|
|
|
|
|
|
def _make_get_weather_runner(capture: ToolCallCapture):
    """Build a stand-in for ``run_tool_with_retries`` keyed on the tool name.

    Every invocation is first recorded on ``capture`` (tool name plus the
    argument mapping, with a missing/empty mapping normalised to ``{}``).
    The result then depends on the tool:

    * ``getWeather`` with no usable ``location`` → ``success=False`` and a
      "could not auto-detect location" message. Per the eval's design this
      mirrors the real getWeather behaviour and stamps ``tool_failed=True``
      on the recorded tool turn (the turn-1 shape).
    * ``getWeather`` with any non-empty ``location`` → ``success=True`` and
      the canned ``_LONDON_FORECAST`` text.
    * ``webSearch`` → ``success=True`` with an "Edge of Tomorrow" film
      extract, i.e. content that cannot honestly be turned into a forecast.
    * Any other tool → ``success=True`` with the text ``"OK"``.

    Args:
        capture: Recorder whose ``record(name, args)`` method is called once
            per tool invocation.

    Returns:
        A callable with the ``(db, cfg, tool_name, tool_args, **kwargs)``
        signature, suitable as a ``side_effect`` for ``unittest.mock.patch``.
    """
    # Imported lazily so that merely importing this module does not require
    # the jarvis package to be importable.
    from jarvis.tools.types import ToolExecutionResult

    def _runner(db, cfg, tool_name, tool_args, **kwargs):
        # Normalise once: the engine may pass None/empty for "no arguments".
        args = tool_args or {}
        capture.record(tool_name, args)

        if tool_name == "getWeather":
            location = (args.get("location") or "").strip()
            if location:
                return ToolExecutionResult(
                    success=True,
                    reply_text=_LONDON_FORECAST,
                )
            return ToolExecutionResult(
                success=False,
                reply_text=(
                    "I couldn't auto-detect your location. Please "
                    "tell me which city to check the weather for."
                ),
            )

        # If the model misroutes to webSearch, make sure the assertion can't
        # be satisfied by a confabulated success: hand back something the
        # model cannot honestly turn into a London forecast.
        if tool_name == "webSearch":
            return ToolExecutionResult(
                success=True,
                reply_text=(
                    "UNTRUSTED WEB EXTRACT:\n"
                    "Edge of Tomorrow is a 2014 American science fiction "
                    "action film directed by Doug Liman, starring Tom Cruise."
                ),
            )

        return ToolExecutionResult(success=True, reply_text="OK")

    return _runner
|
|
|
|
|
|
@pytest.mark.eval
@requires_judge_llm
class TestFollowupSuppliesMissingToolArg:
    """End-to-end regression for the engine-level tool carry-over guard."""

    def test_short_followup_continues_previous_tool_chain(
        self, mock_config, eval_db, eval_dialogue_memory,
    ):
        """Drive two turns through the reply engine and check the tool chain.

        Turn 1 asks for tomorrow's weather with no location available; turn 2
        is the short follow-up "I'm in London". Tool execution is replaced by
        the mock runner, so the assertions look only at which tools were
        requested, with which arguments, and at the final reply text.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        # Geoip disabled — the only way the model gets a location is
        # from the user supplying one on turn 2.
        mock_config.location_enabled = False

        capture = ToolCallCapture()

        def _ask(utterance):
            # Both turns share the db, config and dialogue memory so the
            # second turn sees the first as conversation history.
            return run_reply_engine(
                db=eval_db, cfg=mock_config, tts=None,
                text=utterance,
                dialogue_memory=eval_dialogue_memory,
            )

        with patch(
            "jarvis.reply.engine.run_tool_with_retries",
            side_effect=_make_get_weather_runner(capture),
        ):
            turn1 = _ask("how's the weather tomorrow Jarvis?")
            turn2 = _ask("I'm in London")

        # Diagnostic dump for reading eval logs.
        print(f"\n Followup Carry-over ({JUDGE_MODEL}):")
        print(f" Turn 1 reply: {(turn1 or '')[:200]}")
        print(f" Turn 2 reply: {(turn2 or '')[:200]}")
        print(f" Tools called: {capture.tool_names()}")
        for call in capture.calls:
            print(f" - {call['name']}({call['args']})")

        assert_not_fallback_reply(turn1, context="turn-1")
        assert_not_fallback_reply(turn2, context="turn-2")

        weather_calls = [
            call for call in capture.calls if call["name"] == "getWeather"
        ]
        assert len(weather_calls) >= 2, (
            "Expected getWeather to be invoked at least twice (once with "
            "empty args on turn 1, once with location='London' on turn 2). "
            f"Tools observed: {capture.tool_names()}. Calls: {capture.calls}"
        )

        # Turn-2 call must carry the location the user supplied.
        london_calls = [
            call for call in weather_calls
            if "london" in (call["args"].get("location") or "").lower()
        ]
        assert london_calls, (
            "getWeather was never re-invoked with location='London' on "
            "turn 2 — the carry-over guard did not preserve the previous "
            "tool's place in the allow-list. All getWeather calls: "
            f"{[call['args'] for call in weather_calls]}"
        )

        # webSearch must NOT have been the path — that's the field-trace
        # failure mode (Edge of Tomorrow). If it fired anyway, the user
        # answer must still be about London weather, not the film.
        turn2_lower = (turn2 or "").lower()
        assert "edge of tomorrow" not in turn2_lower, (
            "Reply parroted the Wikipedia fallback for 'Edge of Tomorrow'. "
            f"Reply: {(turn2 or '')[:400]}"
        )
        assert "london" in turn2_lower, (
            "Turn-2 reply does not mention London weather. "
            f"Reply: {(turn2 or '')[:400]}"
        )
|