Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
154
evals/test_tool_selection.py
Normal file
154
evals/test_tool_selection.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Tool Selection Evaluations
|
||||
|
||||
Tests that the embedding-based tool selection strategy actually filters tools
|
||||
meaningfully — a weather query should select weather-related tools, not all tools.
|
||||
|
||||
Run: .venv/bin/python -m pytest evals/test_tool_selection.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Data
|
||||
# =============================================================================
|
||||
|
||||
# Queries paired with the tools they MUST include and a maximum tool count.
|
||||
# The max count ensures the strategy actually filters rather than passing everything.
|
||||
# Each row is (query, tools that must be selected, max tool count, pytest id);
# the rows are expanded into pytest.param objects for parametrize below.
TOOL_SELECTION_CASES = [
    pytest.param(query, required_tools, tool_limit, id=case_id)
    for query, required_tools, tool_limit, case_id in (
        (
            "what's the weather like tomorrow",
            ["getWeather"],
            5,
            "weather query selects getWeather and few others",
        ),
        (
            "what's the weather in London this weekend",
            ["getWeather"],
            5,
            "location weather query selects getWeather and few others",
        ),
        (
            "log that I had a chicken salad for lunch",
            ["logMeal"],
            5,
            "meal logging selects logMeal and few others",
        ),
        (
            "what did I eat yesterday",
            ["fetchMeals"],
            5,
            "meal recall selects fetchMeals and few others",
        ),
        (
            "search the web for Python tutorials",
            ["webSearch"],
            5,
            "web search query selects webSearch and few others",
        ),
    )
]
|
||||
|
||||
|
||||
@pytest.mark.eval
class TestToolSelectionFiltering:
    """Checks that the EMBEDDING selection strategy narrows the tool set."""

    @requires_judge_llm
    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
    def test_embedding_selects_relevant_tools(
        self, mock_config, query, must_include, max_tools
    ):
        """Run embedding-based selection for one query and check the result.

        The selection must contain every tool in ``must_include``, must
        contain ``stop``, and must hold no more than ``max_tools`` entries.

        The embed model is fixed (nomic-embed-text) and independent of the
        judge model, so repeating this for every judge model adds nothing.
        The test therefore runs only while the judge model name contains
        "gemma4" and is skipped otherwise.
        """
        in_gemma_phase = "gemma4" in JUDGE_MODEL
        if not in_gemma_phase:
            pytest.skip(
                "Tool selection uses fixed embed model; "
                "only runs in gemma4 phase "
                f"(current: {JUDGE_MODEL})"
            )

        # Imported after the skip check, so a skipped run never imports jarvis.
        from jarvis.tools.selection import select_tools, ToolSelectionStrategy
        from jarvis.tools.registry import BUILTIN_TOOLS

        chosen = select_tools(
            query=query,
            builtin_tools=BUILTIN_TOOLS,
            mcp_tools={},
            strategy=ToolSelectionStrategy.EMBEDDING,
            llm_base_url=mock_config.ollama_base_url,
            embed_model=mock_config.ollama_embed_model,
            embed_timeout_sec=10.0,
        )
        builtin_count = len(BUILTIN_TOOLS)

        # Every tool the query clearly needs has to survive the filter.
        for required in must_include:
            assert required in chosen, (
                f"Expected '{required}' in selected tools but got: {chosen}"
            )

        # 'stop' is expected in every selection.
        assert "stop" in chosen, f"'stop' should always be included, got: {chosen}"

        # A selection above the cap means the strategy is not really filtering.
        assert len(chosen) <= max_tools, (
            f"Expected at most {max_tools} tools but got {len(chosen)}/{builtin_count}: {chosen}"
        )

        print(f" ✅ Selected {len(chosen)}/{builtin_count} tools: {chosen}")
|
||||
|
||||
|
||||
@pytest.mark.eval
class TestToolSelectionFilteringLLM:
    """Checks that the LLM-router selection strategy narrows the tool set.

    The embedding strategy is pinned to nomic-embed-text; this class instead
    drives the default `llm` strategy with whichever judge model is active,
    so the shared cases are exercised once per supported chat model.
    """

    @requires_judge_llm
    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
    def test_llm_selects_relevant_tools(
        self, mock_config, query, must_include, max_tools
    ):
        """Run LLM-based selection for one query and check the result.

        The selection must contain every tool in ``must_include``, must
        contain ``stop``, and must hold no more than ``max_tools`` entries.
        """
        from jarvis.tools.selection import select_tools, ToolSelectionStrategy
        from jarvis.tools.registry import BUILTIN_TOOLS

        chosen = select_tools(
            query=query,
            builtin_tools=BUILTIN_TOOLS,
            mcp_tools={},
            strategy=ToolSelectionStrategy.LLM,
            llm_base_url=mock_config.ollama_base_url,
            llm_model=JUDGE_MODEL,
            llm_timeout_sec=15.0,
        )
        builtin_count = len(BUILTIN_TOOLS)

        # Every tool the query clearly needs has to survive the filter.
        for required in must_include:
            assert required in chosen, (
                f"Expected '{required}' in selected tools but got: {chosen}"
            )

        # 'stop' is expected in every selection.
        assert "stop" in chosen, f"'stop' should always be included, got: {chosen}"

        # A selection above the cap means the router is not really filtering.
        assert len(chosen) <= max_tools, (
            f"Expected at most {max_tools} tools but got {len(chosen)}/{builtin_count}: {chosen}"
        )

        print(f" ✅ [{JUDGE_MODEL}] Selected {len(chosen)}/{builtin_count} tools: {chosen}")
|
||||
Reference in New Issue
Block a user