Files
javis_bot/evals/test_tool_selection.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

155 lines
5.0 KiB
Python

"""
Tool Selection Evaluations
Tests that the embedding-based tool selection strategy actually filters tools
meaningfully — a weather query should select weather-related tools, not all tools.
Run: .venv/bin/python -m pytest evals/test_tool_selection.py -v
"""
import pytest
from conftest import requires_judge_llm
from helpers import JUDGE_MODEL
# =============================================================================
# Test Data
# =============================================================================
# Queries paired with the tools they MUST include and a maximum tool count.
# The max count ensures the strategy actually filters rather than passing everything.
# Each row is (pytest id, user query, tools that must be selected, max tool count).
# The rows are expanded into pytest.param objects in the order listed here.
TOOL_SELECTION_CASES = [
    pytest.param(query, required_tools, tool_limit, id=case_id)
    for case_id, query, required_tools, tool_limit in (
        (
            "weather query selects getWeather and few others",
            "what's the weather like tomorrow",
            ["getWeather"],
            5,
        ),
        (
            "location weather query selects getWeather and few others",
            "what's the weather in London this weekend",
            ["getWeather"],
            5,
        ),
        (
            "meal logging selects logMeal and few others",
            "log that I had a chicken salad for lunch",
            ["logMeal"],
            5,
        ),
        (
            "meal recall selects fetchMeals and few others",
            "what did I eat yesterday",
            ["fetchMeals"],
            5,
        ),
        (
            "web search query selects webSearch and few others",
            "search the web for Python tutorials",
            ["webSearch"],
            5,
        ),
    )
]
@pytest.mark.eval
class TestToolSelectionFiltering:
    """Checks that the embedding strategy narrows the tool set for a query."""

    @requires_judge_llm
    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
    def test_embedding_selects_relevant_tools(
        self, mock_config, query, must_include, max_tools
    ):
        """Embedding selection returns the required tools within the size cap.

        The embedding strategy relies on a fixed embed model
        (nomic-embed-text) that does not depend on the judge model, so the
        cases are executed only during the gemma4 phase of an eval run and
        skipped in every other phase to save time.
        """
        # Bail out early for every judge phase except gemma4.
        if "gemma4" not in JUDGE_MODEL:
            skip_reason = f"Tool selection uses fixed embed model; only runs in gemma4 phase (current: {JUDGE_MODEL})"
            pytest.skip(skip_reason)

        # Project imports stay local to the test, as in the original.
        from jarvis.tools.selection import ToolSelectionStrategy, select_tools
        from jarvis.tools.registry import BUILTIN_TOOLS

        selection_kwargs = {
            "query": query,
            "builtin_tools": BUILTIN_TOOLS,
            "mcp_tools": {},
            "strategy": ToolSelectionStrategy.EMBEDDING,
            "llm_base_url": mock_config.ollama_base_url,
            "embed_model": mock_config.ollama_embed_model,
            "embed_timeout_sec": 10.0,
        }
        selected = select_tools(**selection_kwargs)
        total_builtin = len(BUILTIN_TOOLS)

        # 1) Every tool the case requires has to be present.
        for tool in must_include:
            assert tool in selected, f"Expected '{tool}' in selected tools but got: {selected}"

        # 2) 'stop' is expected in every selection.
        assert "stop" in selected, f"'stop' should always be included, got: {selected}"

        # 3) The selection must stay within the cap; exceeding it means the
        #    strategy is not really filtering.
        assert len(selected) <= max_tools, f"Expected at most {max_tools} tools but got {len(selected)}/{total_builtin}: {selected}"

        print(f" ✅ Selected {len(selected)}/{total_builtin} tools: {selected}")
@pytest.mark.eval
class TestToolSelectionFilteringLLM:
    """Checks that the LLM-router strategy narrows the tool set for a query.

    The embedding strategy is pinned to nomic-embed-text; this class instead
    drives the default `llm` strategy with whichever judge model is active,
    so the same cases are exercised once per supported chat model.
    """

    @requires_judge_llm
    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
    def test_llm_selects_relevant_tools(
        self, mock_config, query, must_include, max_tools
    ):
        """LLM selection returns the required tools within the size cap."""
        # Project imports stay local to the test, as in the original.
        from jarvis.tools.selection import ToolSelectionStrategy, select_tools
        from jarvis.tools.registry import BUILTIN_TOOLS

        router_kwargs = {
            "query": query,
            "builtin_tools": BUILTIN_TOOLS,
            "mcp_tools": {},
            "strategy": ToolSelectionStrategy.LLM,
            "llm_base_url": mock_config.ollama_base_url,
            "llm_model": JUDGE_MODEL,
            "llm_timeout_sec": 15.0,
        }
        selected = select_tools(**router_kwargs)
        total_builtin = len(BUILTIN_TOOLS)

        # Required tools first, then the always-present 'stop', then the cap.
        for tool in must_include:
            assert tool in selected, f"Expected '{tool}' in selected tools but got: {selected}"

        assert "stop" in selected, f"'stop' should always be included, got: {selected}"

        assert len(selected) <= max_tools, f"Expected at most {max_tools} tools but got {len(selected)}/{total_builtin}: {selected}"

        print(f" ✅ [{JUDGE_MODEL}] Selected {len(selected)}/{total_builtin} tools: {selected}")