Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.
- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md
Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
137
evals/test_graph_supplies_missing_tool_arg.py
Normal file
137
evals/test_graph_supplies_missing_tool_arg.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
End-to-end eval — single-turn flow where the user's location lives in the
|
||||
User branch of the knowledge graph (warm profile). The warm profile is
|
||||
always-loaded into the system prompt, so the chat model and planner can
|
||||
ground ``getWeather`` on it without a ``searchMemory`` step.
|
||||
|
||||
This stresses the warm-profile-injection path. It complements:
|
||||
- ``evals/test_followup_supplies_missing_tool_arg.py`` (hot-window
|
||||
carry-over, two-turn).
|
||||
- ``evals/test_diary_supplies_missing_tool_arg.py`` (diary recall via
|
||||
planner-emitted ``searchMemory``).
|
||||
|
||||
Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh graph_supplies_missing_tool_arg
|
||||
"""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import (
|
||||
ToolCallCapture,
|
||||
assert_not_fallback_reply,
|
||||
JUDGE_MODEL,
|
||||
)
|
||||
|
||||
|
||||
_EDINBURGH_FORECAST = (
|
||||
"Weather for Edinburgh, UK:\n"
|
||||
"Today: 11°C, partly cloudy. High 13°C, low 7°C.\n"
|
||||
"Tomorrow: 12°C, light rain, high 14°C, low 8°C."
|
||||
)
|
||||
|
||||
|
||||
def _make_runner(capture: ToolCallCapture):
    """Build a tool-runner stub that logs every call on *capture*.

    The returned callable is used as the ``side_effect`` when
    ``run_tool_with_retries`` is patched. Its behaviour per tool:

    - ``getWeather`` with a blank or missing ``location``: an unsuccessful
      result asking the user which city to check.
    - ``getWeather`` with a location: the canned Edinburgh forecast.
    - any other tool: a successful result with reply text ``"OK"``.
    """
    # Imported lazily, inside the factory, as in the original module.
    from jarvis.tools.types import ToolExecutionResult

    def _runner(db, cfg, tool_name, tool_args, **kwargs):
        # Normalise a missing/empty args payload to a dict once.
        args = tool_args or {}
        capture.record(tool_name, args)

        if tool_name != "getWeather":
            return ToolExecutionResult(success=True, reply_text="OK")

        requested = (args.get("location") or "").strip()
        if requested:
            return ToolExecutionResult(
                success=True,
                reply_text=_EDINBURGH_FORECAST,
            )

        # No usable location: report a failure asking for a city.
        return ToolExecutionResult(
            success=False,
            reply_text=(
                "I couldn't auto-detect your location. Please "
                "tell me which city to check the weather for."
            ),
        )

    return _runner
|
||||
|
||||
|
||||
@pytest.mark.eval
@requires_judge_llm
class TestGraphSuppliesMissingToolArg:
    """Eval for the warm-profile injection path.

    The patched warm profile carries a User-branch fact ("lives in
    Edinburgh"). The test expects the chat model to pass that fact as the
    ``location`` argument of ``getWeather`` without an extra memory search.
    """

    def test_warm_profile_user_fact_grounds_get_weather_call(
        self, mock_config, eval_db, eval_dialogue_memory,
    ):
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        # With geoip switched off, the warm profile is the only place the
        # model can learn a location from.
        mock_config.location_enabled = False

        capture = ToolCallCapture()

        # Patch ``build_warm_profile`` to hand back the User-branch fact
        # directly instead of seeding the SQLite-backed graph store, so the
        # test does not depend on graph-mutation listeners or branch-root
        # bootstrapping in the test DB.
        warm_profile = {
            "user": "The user lives in Edinburgh.",
            "directives": "",
        }

        with patch(
            "jarvis.memory.graph_ops.build_warm_profile",
            return_value=warm_profile,
        ):
            with patch(
                "jarvis.reply.engine.run_tool_with_retries",
                side_effect=_make_runner(capture),
            ):
                response = run_reply_engine(
                    db=eval_db,
                    cfg=mock_config,
                    tts=None,
                    text="how's the weather, Jarvis?",
                    dialogue_memory=eval_dialogue_memory,
                )

        # Diagnostic dump of what was called and what came back.
        print(f"\n Graph Supplies Missing Tool Arg ({JUDGE_MODEL}):")
        print(f" Tools called: {capture.tool_names()}")
        for call in capture.calls:
            print(f" - {call['name']}({call['args']})")
        print(f" Response: {(response or '')[:300]}")

        assert_not_fallback_reply(response, context="warm-profile")

        # Collect getWeather calls, then those whose location names Edinburgh.
        weather_calls = []
        edinburgh_calls = []
        for call in capture.calls:
            if call["name"] != "getWeather":
                continue
            weather_calls.append(call)
            location = call["args"].get("location") or ""
            if "edinburgh" in location.lower():
                edinburgh_calls.append(call)

        assert edinburgh_calls, (
            "getWeather was not invoked with location='Edinburgh' even "
            "though the warm profile names Edinburgh as the user's home. "
            "The chat model must use always-loaded user facts as tool "
            "arguments without an explicit prompt to do so. "
            f"All getWeather calls: {[call['args'] for call in weather_calls]}. "
            f"Tools observed: {capture.tool_names()}. "
            f"Response: {(response or '')[:400]}"
        )

        response_lower = (response or "").lower()

        # The reply itself must name the city from the warm profile.
        assert "edinburgh" in response_lower, (
            "Reply does not mention Edinburgh despite the warm profile "
            f"naming it as the user's location. Response: {(response or '')[:400]}"
        )

        # Guard against the reply naming Hackney instead of the profile's city.
        assert "hackney" not in response_lower, (
            "Reply mentions Hackney — the warm profile clearly states "
            "Edinburgh, and geoip is disabled in this test. The model "
            f"leaked a hardcoded default. Response: {(response or '')[:400]}"
        )
|
||||
Reference in New Issue
Block a user