Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
179 lines
7.6 KiB
Python
179 lines
7.6 KiB
Python
"""
|
|
Tool Router — Context-Aware Selection (Live)
|
|
|
|
Guards that the LLM tool router, when handed a compact summary of what the
|
|
main assistant can already see at reply time (current local time, resolved
|
|
location, recent dialogue), correctly returns 'none' for queries fully
|
|
answerable from that context — instead of embed-matching an adjacent tool.
|
|
|
|
Motivating field incident (2026-04-20):
|
|
User asked "what time is it, Jarvis?". The router, having no view of the
|
|
assistant's live context, picked `getWeather` as the closest temporal tool
|
|
on the catalogue. With only `getWeather, stop` in the allowed list, the
|
|
main model dutifully called getWeather and the reply parroted the weather
|
|
back as if it had answered the time question.
|
|
|
|
The fix is upstream: pass the router the same compact context hint the
|
|
memory extractor already uses, and let it judge for itself whether the
|
|
query is answerable from context. Location may not always resolve, so the
|
|
hint degrades gracefully — the router falls back to content-based selection
|
|
when context is missing or partial, and should not over-commit to 'none'
|
|
for queries whose answer was NOT visible in the hint.
|
|
|
|
Run:
|
|
EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_tool_router_context_aware.py -v
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from conftest import requires_judge_llm
|
|
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
|
|
|
|
|
|
_TIME_LOCATION_HINT = (
|
|
"Current local time: Sunday, 2026-04-20 17:42 (Europe/London). "
|
|
"Location: Hackney, Hackney, United Kingdom."
|
|
)
|
|
|
|
# Deliberately omits location — exercises the graceful-degradation path.
|
|
_TIME_ONLY_HINT = "Current local time: Sunday, 2026-04-20 17:42 UTC."
|
|
|
|
|
|
def _route(query: str, context_hint):
|
|
"""Invoke the real LLM router with the builtin tool catalogue."""
|
|
from jarvis.tools.registry import BUILTIN_TOOLS
|
|
from jarvis.tools.selection import select_tools, ToolSelectionStrategy
|
|
|
|
return select_tools(
|
|
query=query,
|
|
builtin_tools=BUILTIN_TOOLS,
|
|
mcp_tools={},
|
|
strategy=ToolSelectionStrategy.LLM,
|
|
llm_base_url=JUDGE_BASE_URL,
|
|
llm_model=JUDGE_MODEL,
|
|
llm_timeout_sec=30.0,
|
|
context_hint=context_hint,
|
|
)
|
|
|
|
|
|
@pytest.mark.eval
|
|
@requires_judge_llm
|
|
class TestRouterReturnsNoneWhenContextAnswers:
|
|
"""Router must opt out when the answer is already visible in context."""
|
|
|
|
def test_time_query_with_time_in_context_returns_none(self):
|
|
selected = _route("what time is it, Jarvis?", _TIME_LOCATION_HINT)
|
|
real = [t for t in selected if t != "stop"]
|
|
print(f"\n Selected: {selected}")
|
|
if real:
|
|
pytest.xfail(
|
|
f"Small router model {JUDGE_MODEL} still picked real tools "
|
|
f"({real}) for a query fully answerable from context."
|
|
)
|
|
assert not real, f"Router should opt out, got: {selected}"
|
|
|
|
def test_date_query_with_date_in_context_returns_none(self):
|
|
selected = _route("what's today's date?", _TIME_LOCATION_HINT)
|
|
real = [t for t in selected if t != "stop"]
|
|
print(f"\n Selected: {selected}")
|
|
if real:
|
|
pytest.xfail(
|
|
f"Router picked real tools ({real}) for a date query "
|
|
f"answerable from context."
|
|
)
|
|
assert not real
|
|
|
|
def test_location_query_with_location_in_context_returns_none(self):
|
|
selected = _route("where am I right now?", _TIME_LOCATION_HINT)
|
|
real = [t for t in selected if t != "stop"]
|
|
print(f"\n Selected: {selected}")
|
|
if real:
|
|
pytest.xfail(
|
|
f"Router picked real tools ({real}) for a location query "
|
|
f"answerable from context."
|
|
)
|
|
assert not real
|
|
|
|
|
|
@pytest.mark.eval
|
|
@requires_judge_llm
|
|
class TestRouterPicksToolsWhenContextDoesNotAnswer:
|
|
"""Regression guard: router must not over-commit to 'none'."""
|
|
|
|
def test_weather_query_still_picks_getWeather(self):
|
|
"""Context has time+location, but weather itself is not in context —
|
|
the router must still pick getWeather."""
|
|
selected = _route("what's the weather like?", _TIME_LOCATION_HINT)
|
|
print(f"\n Selected: {selected}")
|
|
assert "getWeather" in selected, (
|
|
f"Router dropped getWeather for an explicit weather query. "
|
|
f"Got: {selected}"
|
|
)
|
|
|
|
def test_location_query_with_partial_hint_still_routes_sensibly(self):
|
|
"""KNOWN LIMITATION on small router models (gemma4:e2b).
|
|
|
|
When location failed to resolve (hint lacks it), a location query
|
|
should not be silenced as 'none' — it must either route to a tool
|
|
that can surface location or accept the fallback, but must not
|
|
confidently claim the answer is in context when it isn't.
|
|
|
|
Observed behaviour on gemma4:e2b: the mere presence of an
|
|
ALREADY IN CONTEXT block primes the router to return 'none' for
|
|
context-shaped queries even when the specific fact is absent
|
|
from the block. Attempts to fix this purely at prompt level
|
|
(adding "the block is NOT exhaustive" wording) regress the
|
|
positive cases (time/date queries stop routing to 'none').
|
|
The practical impact is bounded: when location genuinely fails
|
|
to resolve, the follow-up layers (main model + memory recall)
|
|
still have a chance to produce a sensible answer, and this only
|
|
fires on the narrow path where the hint is partial.
|
|
|
|
Parked as xfail rather than deleted so that a future router
|
|
model (or prompt iteration) will surface the improvement as an
|
|
unexpected pass. If fixed, delete the xfail branch and assert
|
|
`selected != ["stop"]` unconditionally.
|
|
"""
|
|
selected = _route("where am I right now?", _TIME_ONLY_HINT)
|
|
print(f"\n Selected: {selected}")
|
|
if selected == ["stop"]:
|
|
pytest.xfail(
|
|
f"Router returned 'none' for a location query whose answer "
|
|
f"was NOT in the partial hint. Known small-model limit — "
|
|
f"see test docstring."
|
|
)
|
|
|
|
def test_followup_naming_place_routes_to_getWeather(self):
|
|
"""Field capture 2026-04-20: assistant asked "Which city should I
|
|
check the weather for?" and the user replied "I'm in London". The
|
|
router saw only "I'm in London" as the query and returned 'none' —
|
|
reading it as idle chatter instead of a continuation.
|
|
|
|
With the split-hint prompt (KNOWN FACTS + RECENT DIALOGUE), the
|
|
router must merge intent across turns and route to getWeather."""
|
|
hint = (
|
|
"Current local time: Sunday, 2026-04-20 17:42 UTC.\n\n"
|
|
"Recent dialogue (short-term memory):\n"
|
|
"- user: what's the weather like?\n"
|
|
"- assistant: Which city should I check the weather for?"
|
|
)
|
|
selected = _route("I'm in London", hint)
|
|
print(f"\n Selected: {selected}")
|
|
if "getWeather" not in selected:
|
|
pytest.xfail(
|
|
f"Router did not resolve follow-up 'I'm in London' after the "
|
|
f"assistant asked for a city. Got: {selected}. Known small-"
|
|
f"model limit — the prompt change lands first, the eval "
|
|
f"tracks the improvement."
|
|
)
|
|
|
|
def test_no_hint_at_all_still_routes_sensibly(self):
|
|
"""With context_hint=None (e.g. first turn, location lookup failed
|
|
entirely), the router must still work — selecting content-relevant
|
|
tools. This guards the graceful-degradation path."""
|
|
selected = _route("what's the weather like?", None)
|
|
print(f"\n Selected: {selected}")
|
|
assert "getWeather" in selected, (
|
|
f"Router broke when context_hint was None. Got: {selected}"
|
|
)
|