Files
javis_bot/evals/test_tool_router_implicit.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

228 lines
8.1 KiB
Python

"""
Tool Router — Implicit Intent & Multi-Tool Coverage (Live)
The existing router evals (test_tool_selection.py, test_tool_router_context_aware.py)
lean on queries whose keywords almost name the tool ("search the web for X",
"log that I had Y"). In production the router fails on a different shape of
query: the words don't correspond to tool names, or the query needs more than
one tool to be answered usefully.
This file captures those shapes so regressions where the router over-prunes
are caught before they land. Known motivating failures:
- "how's the weather this week?" → router picked [getWeather, stop] only,
blocking the webSearch → fetchWebPage chain the mocked agent tests expect.
- "should I order pizza tonight?" → router picked [stop] only. fetchMeals
never reached the LLM, so the agent could not ground its advice in
today's intake.
Principles locked in here:
1. Implicit-intent queries (no tool-name keywords) must still route to the
correct tool.
2. The router must NEVER collapse to only `stop` when the query has a clear
actionable intent — that is a "silently useless" failure mode.
3. Multi-intent queries must surface each relevant tool (or a superset).
Run:
EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_tool_router_implicit.py -v
"""
import pytest
from conftest import requires_judge_llm
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
def _route(query: str, context_hint=None):
    """Run the live LLM tool router for ``query`` and return its selection.

    The router is given the complete builtin tool catalogue and no MCP tools,
    and is pointed at the judge endpoint/model configured for the evals.

    Args:
        query: The user utterance to route.
        context_hint: Optional hint forwarded unchanged to the router.

    Returns:
        Whatever ``select_tools`` returns; the tests in this module treat it
        as a collection of tool-name strings.
    """
    # Imported lazily, inside the function, exactly as the module does for
    # every call into the jarvis package.
    from jarvis.tools.registry import BUILTIN_TOOLS
    from jarvis.tools.selection import ToolSelectionStrategy, select_tools

    router_args = {
        "query": query,
        "builtin_tools": BUILTIN_TOOLS,
        "mcp_tools": {},  # builtin catalogue only
        "strategy": ToolSelectionStrategy.LLM,
        "llm_base_url": JUDGE_BASE_URL,
        "llm_model": JUDGE_MODEL,
        "llm_timeout_sec": 30.0,
        "context_hint": context_hint,
    }
    return select_tools(**router_args)
def _real_tools(selected):
"""Filter out the always-present `stop` sentinel."""
return [t for t in selected if t != "stop"]
# =============================================================================
# Implicit Intent — the wording of the query never names the tool
# =============================================================================

# Each row is (case id, query, tools of which at least one must be selected,
# rationale). Rows are wrapped in pytest.param so that every case shows up
# under a readable id in the test report.
IMPLICIT_INTENT_CASES = [
    pytest.param(query, must_include_any_of, rationale, id=case_id)
    for case_id, query, must_include_any_of, rationale in (
        (
            "food decision → fetchMeals",
            "should I order pizza tonight?",
            ["fetchMeals"],
            "Advisory food decision needs today's intake to answer usefully.",
        ),
        (
            "calorie budget → fetchMeals",
            "am I under my calorie budget today?",
            ["fetchMeals"],
            "Budget question with no 'meal' keyword still needs the log.",
        ),
        (
            "jacket → getWeather",
            "do I need a jacket today?",
            ["getWeather"],
            "Clothing question is a weather question in disguise.",
        ),
        (
            "run forecast → getWeather",
            "will the run be miserable this afternoon?",
            ["getWeather"],
            "Activity planning with weather subtext, no 'weather' keyword.",
        ),
        (
            "meal recall (colloquial) → fetchMeals",
            "what did I put in my body today?",
            ["fetchMeals"],
            "Colloquial meal recall, no tool-name keywords.",
        ),
        (
            "dietary check → fetchMeals",
            "did I have anything with gluten earlier?",
            ["fetchMeals"],
            "Dietary check against logged meals.",
        ),
    )
]
@pytest.mark.eval
@requires_judge_llm
class TestImplicitIntent:
    """Router must route on intent, not on surface keywords."""

    @pytest.mark.parametrize(
        "query, must_include_any, rationale", IMPLICIT_INTENT_CASES
    )
    def test_implicit_intent_routes_to_correct_tool(
        self, query, must_include_any, rationale
    ):
        """Route ``query`` and check at least one expected tool was selected.

        Args:
            query: Utterance whose wording does not name the needed tool.
            must_include_any: Tool names; one of them in the selection is
                enough for the case to pass.
            rationale: Why the query needs those tools; echoed in the output
                and in the xfail reasons.
        """
        selected = _route(query)
        real = _real_tools(selected)

        print(f"\n Query: {query}")
        print(f" Rationale: {rationale}")
        print(f" Selected: {selected}")

        # Floor invariant (soft): small router models sometimes collapse to
        # only 'stop' on dietary/advisory queries, so this is reported as an
        # xfail instead of a hard failure. The xfail is raised imperatively,
        # so once the router improves the case simply passes — it never shows
        # up as an XPASS.
        if not real:
            pytest.xfail(
                f"Router collapsed to only 'stop' for an actionable query on "
                f"{JUDGE_MODEL}. Query: {query!r}. Rationale: {rationale}"
            )

        # Soft for the same reason: a miss is recorded as xfail, not failure.
        if not any(tool in selected for tool in must_include_any):
            pytest.xfail(
                f"Router missed implicit intent on {JUDGE_MODEL}. "
                f"Expected any of {must_include_any}, got {selected}. "
                f"Rationale: {rationale}"
            )
# =============================================================================
# Multi-Tool Intent — a single question that needs several tools
# =============================================================================

# Each row is (case id, query, tools that must all be selected, rationale).
# Rows are wrapped in pytest.param so that every case shows up under a
# readable id in the test report.
MULTI_TOOL_CASES = [
    pytest.param(query, must_include_all, rationale, id=case_id)
    for case_id, query, must_include_all, rationale in (
        (
            "weather + meals",
            "plan my day around the weather and what I've eaten",
            ["getWeather", "fetchMeals"],
            "Two explicit subjects, two tools.",
        ),
        (
            "research → webSearch + fetchWebPage",
            "find me a detailed article about the Apollo program",
            ["webSearch", "fetchWebPage"],
            "Research queries need search then fetch to read the actual page.",
        ),
        (
            "weekly weather keeps getWeather",
            "how's the weather this week?",
            ["getWeather"],
            (
                "Must include getWeather; webSearch/fetchWebPage acceptable as backup "
                "for multi-day forecasts the API may not cover."
            ),
        ),
    )
]
@pytest.mark.eval
@requires_judge_llm
class TestMultiToolIntent:
    """Router must surface every tool a multi-part query needs."""

    @pytest.mark.parametrize(
        "query, must_include_all, rationale", MULTI_TOOL_CASES
    )
    def test_multi_tool_intent_surfaces_all_needed(
        self, query, must_include_all, rationale
    ):
        """Route ``query`` and check that no required tool was pruned.

        Both checks are soft: a collapse to only 'stop' or a dropped tool is
        reported through ``pytest.xfail`` rather than as a failure.
        """
        selected = _route(query)
        actionable = _real_tools(selected)

        print(
            f"\n Query: {query}",
            f" Rationale: {rationale}",
            f" Selected: {selected}",
            sep="\n",
        )

        if not actionable:
            pytest.xfail(
                f"Router collapsed to only 'stop' for a multi-intent query on "
                f"{JUDGE_MODEL}. Query: {query!r}."
            )

        # Keep the order of must_include_all so the report reads naturally.
        dropped = [name for name in must_include_all if name not in selected]
        if dropped:
            pytest.xfail(
                f"Router dropped needed tools on {JUDGE_MODEL}. "
                f"Missing: {dropped}. Got: {selected}. Rationale: {rationale}"
            )
# =============================================================================
# Floor Invariant — the router must never silently collapse to only `stop`
# =============================================================================

# Every query below has an unambiguous tool-shaped answer. Narrowing the
# catalogue is legitimate; answering with only [stop] is a bug, because the
# main model is then left with no way to act on a clear request.
NEVER_EMPTY_CASES = [
    "take a screenshot",
    "what's on my screen right now?",
    "search the web for flight deals",
    "log that I just ate a banana",
    "what's the weather like?",
    "find the invoice PDF on my computer",
]
@pytest.mark.eval
@requires_judge_llm
class TestRouterNeverCollapses:
    """Regression guard for the 'selected only stop' failure mode."""

    @pytest.mark.parametrize("query", NEVER_EMPTY_CASES)
    def test_clear_intent_keeps_at_least_one_real_tool(self, query):
        """Fail hard when routing ``query`` leaves nothing but 'stop'."""
        selected = _route(query)
        surviving = _real_tools(selected)

        print(
            f"\n Query: {query}",
            f" Selected: {selected}",
            sep="\n",
        )

        # Unlike the soft checks elsewhere in this module, this is a real
        # assertion: these queries are explicit about what they need.
        assert surviving, (
            f"Router collapsed to only 'stop' for a clearly actionable query. "
            f"Query: {query!r}. This silently disables the agent — every main-"
            f"model tool_call would be dropped as out-of-catalogue."
        )