Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
583
tests/test_tool_selection.py
Normal file
583
tests/test_tool_selection.py
Normal file
@@ -0,0 +1,583 @@
|
||||
"""Tests for tool selection strategies."""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from jarvis.tools.selection import (
|
||||
select_tools,
|
||||
ToolSelectionStrategy,
|
||||
_tokenise,
|
||||
_build_tool_keywords,
|
||||
_ALWAYS_INCLUDED,
|
||||
_RELATIVE_THRESHOLD,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class FakeTool:
|
||||
"""Minimal tool stand-in for testing."""
|
||||
def __init__(self, name: str, description: str):
|
||||
self._name = name
|
||||
self._description = description
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
return self._name
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
return self._description
|
||||
|
||||
|
||||
class FakeToolSpec:
|
||||
"""Minimal ToolSpec stand-in for testing."""
|
||||
def __init__(self, name: str, description: str):
|
||||
self.name = name
|
||||
self.description = description
|
||||
|
||||
|
||||
def _builtin():
|
||||
"""Return a small set of fake builtin tools."""
|
||||
return {
|
||||
"webSearch": FakeTool("webSearch", "Search the web using DuckDuckGo for current information, news, or general queries."),
|
||||
"getWeather": FakeTool("getWeather", "Get current weather conditions."),
|
||||
"logMeal": FakeTool("logMeal", "Log a single meal when the user mentions eating or drinking something."),
|
||||
"fetchMeals": FakeTool("fetchMeals", "Retrieve meals from the database for a given time range."),
|
||||
"screenshot": FakeTool("screenshot", "Capture a selected screen region and OCR the text."),
|
||||
"localFiles": FakeTool("localFiles", "Safely read, write, list, append, or delete files within your home directory."),
|
||||
"stop": FakeTool("stop", "End the current conversation."),
|
||||
}
|
||||
|
||||
|
||||
def _mcp():
|
||||
"""Return a small set of fake MCP tools."""
|
||||
return {
|
||||
"homeassistant__turn_on": FakeToolSpec("homeassistant__turn_on", "Turn on a smart home device."),
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Enum
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestToolSelectionStrategy:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_enum_values(self):
|
||||
assert ToolSelectionStrategy.ALL.value == "all"
|
||||
assert ToolSelectionStrategy.KEYWORD.value == "keyword"
|
||||
assert ToolSelectionStrategy.EMBEDDING.value == "embedding"
|
||||
assert ToolSelectionStrategy.LLM.value == "llm"
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_enum_from_string(self):
|
||||
assert ToolSelectionStrategy("all") == ToolSelectionStrategy.ALL
|
||||
assert ToolSelectionStrategy("keyword") == ToolSelectionStrategy.KEYWORD
|
||||
assert ToolSelectionStrategy("embedding") == ToolSelectionStrategy.EMBEDDING
|
||||
assert ToolSelectionStrategy("llm") == ToolSelectionStrategy.LLM
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_invalid_value_raises(self):
|
||||
with pytest.raises(ValueError):
|
||||
ToolSelectionStrategy("banana")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tokenisation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTokenise:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_basic_tokenise(self):
|
||||
tokens = _tokenise("What's the weather in London?")
|
||||
assert "weather" in tokens
|
||||
assert "london" in tokens
|
||||
assert "the" not in tokens
|
||||
assert "in" not in tokens
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_empty_string(self):
|
||||
assert _tokenise("") == []
|
||||
|
||||
|
||||
class TestBuildToolKeywords:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_camel_case_split(self):
|
||||
kw = _build_tool_keywords("fetchWebPage", "Fetch content from a URL.")
|
||||
assert "fetch" in kw
|
||||
assert "web" in kw
|
||||
assert "page" in kw
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_description_tokens(self):
|
||||
kw = _build_tool_keywords("getWeather", "Get current weather conditions.")
|
||||
assert "weather" in kw
|
||||
assert "conditions" in kw
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy: all
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAllStrategy:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_returns_everything(self):
|
||||
result = select_tools("hello", _builtin(), _mcp(), strategy=ToolSelectionStrategy.ALL)
|
||||
assert len(result) == len(_builtin()) + len(_mcp())
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_default_strategy_is_all(self):
|
||||
result = select_tools("hello", _builtin(), _mcp())
|
||||
assert len(result) == len(_builtin()) + len(_mcp())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy: keyword
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestKeywordStrategy:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_weather_query_selects_weather_tool(self):
|
||||
result = select_tools("what's the weather in London", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "getWeather" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_weather_query_excludes_irrelevant(self):
|
||||
result = select_tools("what's the weather in London", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "logMeal" not in result
|
||||
assert "screenshot" not in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_meal_query_selects_meal_tools(self):
|
||||
result = select_tools("what did I eat yesterday", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "fetchMeals" in result or "logMeal" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_search_query_selects_web_search(self):
|
||||
result = select_tools("search for python tutorials", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "webSearch" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_stop_always_included(self):
|
||||
result = select_tools("what's the weather", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "stop" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_vague_query_falls_back_to_all(self):
|
||||
result = select_tools("hmm", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert len(result) == len(_builtin())
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_mcp_tools_included(self):
|
||||
result = select_tools("turn on the lights", _builtin(), _mcp(), strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "homeassistant__turn_on" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_file_query_selects_local_files(self):
|
||||
result = select_tools("read the config file", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
|
||||
assert "localFiles" in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy: embedding
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEmbeddingStrategy:
|
||||
|
||||
def _mock_embedding(self, text_to_vec):
|
||||
"""Return a mock get_embedding that maps text substrings to vectors."""
|
||||
def mock_get_embedding(text, base_url, model, timeout_sec=10.0):
|
||||
for key, vec in text_to_vec.items():
|
||||
if key in text.lower():
|
||||
return vec
|
||||
# Default: zero vector
|
||||
return [0.0] * 4
|
||||
return mock_get_embedding
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_selects_similar_tools(self):
|
||||
"""Weather query should rank getWeather highest."""
|
||||
mock_embed = self._mock_embedding({
|
||||
"weather": [1.0, 0.0, 0.0, 0.0], # query + weather tool
|
||||
"search": [0.0, 1.0, 0.0, 0.0],
|
||||
"meal": [0.0, 0.0, 1.0, 0.0],
|
||||
"screen": [0.0, 0.0, 0.0, 1.0],
|
||||
"file": [0.1, 0.1, 0.1, 0.1],
|
||||
"conversation": [0.1, 0.1, 0.1, 0.1],
|
||||
})
|
||||
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
|
||||
result = select_tools(
|
||||
"what's the weather",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.EMBEDDING,
|
||||
llm_base_url="http://localhost",
|
||||
embed_model="nomic-embed-text",
|
||||
)
|
||||
assert "getWeather" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_stop_always_included(self):
|
||||
"""Stop tool must be present even if not semantically matched."""
|
||||
mock_embed = self._mock_embedding({
|
||||
"weather": [1.0, 0.0, 0.0, 0.0],
|
||||
})
|
||||
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
|
||||
result = select_tools(
|
||||
"what's the weather",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.EMBEDDING,
|
||||
llm_base_url="http://localhost",
|
||||
embed_model="nomic-embed-text",
|
||||
)
|
||||
assert "stop" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_failed_query_embedding_falls_back(self):
|
||||
"""If query embedding fails, fall back to all tools."""
|
||||
def mock_fail(text, base_url, model, timeout_sec=10.0):
|
||||
return None
|
||||
|
||||
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_fail):
|
||||
result = select_tools(
|
||||
"anything",
|
||||
_builtin(), _mcp(),
|
||||
strategy=ToolSelectionStrategy.EMBEDDING,
|
||||
llm_base_url="http://localhost",
|
||||
embed_model="nomic-embed-text",
|
||||
)
|
||||
assert len(result) == len(_builtin()) + len(_mcp())
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_returns_minimum_tools(self):
|
||||
"""Should return at least _MIN_SELECTED tools even if similarity is low."""
|
||||
# All tools get zero similarity (orthogonal to query)
|
||||
call_count = [0]
|
||||
def mock_embed(text, base_url, model, timeout_sec=10.0):
|
||||
call_count[0] += 1
|
||||
if call_count[0] == 1: # query
|
||||
return [1.0, 0.0, 0.0, 0.0]
|
||||
return [0.0, 0.0, 0.0, 1.0] # all tools orthogonal
|
||||
|
||||
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
|
||||
result = select_tools(
|
||||
"something obscure",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.EMBEDDING,
|
||||
llm_base_url="http://localhost",
|
||||
embed_model="nomic-embed-text",
|
||||
)
|
||||
# Should still have at least _MIN_SELECTED + stop
|
||||
assert len(result) >= 3
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_relative_threshold_filters_low_similarity(self):
|
||||
"""Relative threshold keeps only tools near the top score, not everything."""
|
||||
import math
|
||||
|
||||
# Simulate realistic scores with a clear top cluster and a weak tail.
|
||||
# query = [1, 0, 0, 0]
|
||||
# strong → cos_sim ≈ 0.90 (getWeather)
|
||||
# good → cos_sim ≈ 0.88 (webSearch — within 85% of top)
|
||||
# weak → cos_sim ≈ 0.40 (everything else — well below cutoff)
|
||||
#
|
||||
# cutoff = 0.90 * 0.85 = 0.765
|
||||
# strong (0.90) and good (0.88) pass; weak (0.40) do not.
|
||||
# With _MIN_SELECTED=3, top-3 would apply if <3 passed, but 2 pass + stop = 3 total.
|
||||
|
||||
strong = [0.9, 0.436, 0, 0]
|
||||
s_norm = math.sqrt(sum(x*x for x in strong))
|
||||
strong = [x / s_norm for x in strong]
|
||||
|
||||
good = [0.88, 0.475, 0, 0]
|
||||
g_norm = math.sqrt(sum(x*x for x in good))
|
||||
good = [x / g_norm for x in good]
|
||||
|
||||
weak = [0.4, 0.917, 0, 0]
|
||||
w_norm = math.sqrt(sum(x*x for x in weak))
|
||||
weak = [x / w_norm for x in weak]
|
||||
|
||||
mock_map = {
|
||||
"weather": [1.0, 0.0, 0.0, 0.0], # query
|
||||
"get weather": strong, # getWeather → high sim
|
||||
"web search": good, # webSearch → just above cutoff
|
||||
"log meal": weak, # logMeal → low sim
|
||||
"fetch meals": weak, # fetchMeals → low sim
|
||||
"screen": weak, # screenshot → low sim
|
||||
"file": weak, # localFiles → low sim
|
||||
}
|
||||
|
||||
def mock_embed(text, base_url, model, timeout_sec=10.0):
|
||||
text_lower = text.lower()
|
||||
for key, vec in mock_map.items():
|
||||
if key in text_lower:
|
||||
return vec
|
||||
return [0.0] * 4
|
||||
|
||||
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
|
||||
result = select_tools(
|
||||
"what's the weather",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.EMBEDDING,
|
||||
llm_base_url="http://localhost",
|
||||
embed_model="nomic-embed-text",
|
||||
)
|
||||
|
||||
# Strong and good matches must be included
|
||||
assert "getWeather" in result
|
||||
assert "webSearch" in result
|
||||
|
||||
# stop is always included
|
||||
assert "stop" in result
|
||||
|
||||
# Fewer tools than total — the relative threshold actually filtered
|
||||
total_non_stop = len([t for t in _builtin() if t != "stop"])
|
||||
selected_non_stop = len([t for t in result if t != "stop"])
|
||||
assert selected_non_stop < total_non_stop, (
|
||||
f"Expected fewer than {total_non_stop} tools but got {selected_non_stop}: {result}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Strategy: llm
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLLMStrategy:
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_parses_comma_separated_response(self):
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return "webSearch, getWeather"
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"what's the weather",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
assert "webSearch" in result
|
||||
assert "getWeather" in result
|
||||
assert "stop" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_none_response_returns_only_mandatory(self):
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return "none"
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"hello",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
assert result == ["stop"]
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_llm_failure_falls_back_to_keyword(self):
|
||||
"""When the router LLM raises (timeout, network, etc.) the fallback is
|
||||
keyword scoring — not the full catalogue. A 30+-tool fall-open kills
|
||||
small chat models (they choke on 41-tool prompts) and pins the
|
||||
conversation cache to "everything"; keyword narrowing preserves at
|
||||
least some routing on tool-name overlap with the query."""
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
raise TimeoutError("LLM timed out")
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"weather in London",
|
||||
_builtin(), _mcp(),
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
# Keyword strategy on "weather" picks getWeather (its name + desc both
|
||||
# contain "weather"); irrelevant tools like fetchMeals must NOT appear.
|
||||
assert "getWeather" in result
|
||||
assert "fetchMeals" not in result
|
||||
assert "homeassistant__turn_on" not in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_empty_response_falls_back_to_keyword(self):
|
||||
"""Empty router response is treated identically to a hard failure:
|
||||
fall back to keyword scoring rather than to the full catalogue."""
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return ""
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"weather report",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
assert "getWeather" in result
|
||||
assert "fetchMeals" not in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_unparseable_response_falls_back_to_keyword(self):
|
||||
"""When the router response is non-empty but no token matches a known
|
||||
tool name (small-model garbage), the fallback is keyword scoring.
|
||||
Field trace: a small router occasionally produces text like "I think
|
||||
we should..." that the parser strips to nothing — pre-fix this fell
|
||||
open to all 41 tools; post-fix it narrows on query keywords."""
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return "I think we should pick one" # no known tool name
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"navigate to youtube.com",
|
||||
_builtin(),
|
||||
{"chrome-devtools__navigate_page": FakeToolSpec(
|
||||
"chrome-devtools__navigate_page",
|
||||
"Navigate the browser to a given URL.",
|
||||
)},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
# Keyword scoring matches "navigate" → chrome-devtools__navigate_page.
|
||||
assert "chrome-devtools__navigate_page" in result
|
||||
# The full catalogue must NOT be returned — that's the regression we're
|
||||
# fixing (small-model 41-tool overload).
|
||||
assert len(result) < len(_builtin()) + 1
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_ignores_hallucinated_tool_names(self):
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return "webSearch, nonExistentTool, getWeather"
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"search and weather",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
assert "webSearch" in result
|
||||
assert "getWeather" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_parses_markdown_and_backtick_wrapped_names(self):
|
||||
"""Chatty routers wrap names in backticks, bullets, or JSON brackets.
|
||||
The parser must strip that formatting before matching — a literal
|
||||
`webSearch` should resolve to the tool called webSearch, not be
|
||||
silently dropped as an unknown token."""
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
# A realistic worst case combining bullets, backticks, and a
|
||||
# bracketed list tail — all of which have appeared from gemma-class
|
||||
# routers in practice.
|
||||
return "- `webSearch`, * `getWeather`, [logMeal]"
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"chatty router",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
assert "webSearch" in result
|
||||
assert "getWeather" in result
|
||||
assert "logMeal" in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_caps_chatty_router_output_at_max(self):
|
||||
"""A router that echoes the whole catalogue must still produce a
|
||||
compact selection — the hard cap guarantees downstream prompt size."""
|
||||
from jarvis.tools.selection import _LLM_MAX_SELECTED
|
||||
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
return "webSearch, getWeather, logMeal, fetchMeals, screenshot, localFiles, homeassistant__turn_on"
|
||||
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
result = select_tools(
|
||||
"arbitrary query",
|
||||
_builtin(), _mcp(),
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
)
|
||||
# Non-mandatory selections are capped; always-included tools are
|
||||
# appended on top of that cap.
|
||||
non_mandatory = [t for t in result if t not in _ALWAYS_INCLUDED]
|
||||
assert len(non_mandatory) <= _LLM_MAX_SELECTED, (
|
||||
f"Expected at most {_LLM_MAX_SELECTED} non-mandatory tools, got "
|
||||
f"{len(non_mandatory)}: {non_mandatory}"
|
||||
)
|
||||
# Ranking is preserved — first N from the router's list survive.
|
||||
assert non_mandatory[0] == "webSearch"
|
||||
assert "nonExistentTool" not in result
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_context_hint_splits_into_known_facts_and_recent_dialogue(self):
|
||||
"""When the hint carries a 'Recent dialogue' subsection, the router
|
||||
prompt must surface facts and dialogue under separate labels so the
|
||||
router can read a short follow-up ("I'm in London") as a continuation
|
||||
of the prior turn rather than as standalone idle chatter."""
|
||||
captured = {}
|
||||
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
captured["sys"] = sys
|
||||
captured["user"] = user
|
||||
return "getWeather"
|
||||
|
||||
hint = (
|
||||
"Current local time: Sunday, 2026-04-20 17:42 (Europe/London).\n\n"
|
||||
"Recent dialogue (short-term memory):\n"
|
||||
"- user: what's the weather like?\n"
|
||||
"- assistant: Sure — where should I check?"
|
||||
)
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
select_tools(
|
||||
"I'm in London",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
context_hint=hint,
|
||||
)
|
||||
|
||||
assert "KNOWN FACTS" in captured["user"]
|
||||
assert "RECENT DIALOGUE" in captured["user"]
|
||||
# Dialogue lines must actually reach the prompt under the dialogue label.
|
||||
dialogue_idx = captured["user"].index("RECENT DIALOGUE")
|
||||
assert "where should I check" in captured["user"][dialogue_idx:]
|
||||
# System prompt must tell the router to treat follow-ups as continuations.
|
||||
assert "continuation" in captured["sys"].lower()
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_context_hint_without_dialogue_uses_known_facts_only(self):
|
||||
"""When the hint carries no dialogue subsection (first turn, no
|
||||
recent messages), the router must still work — the facts flow
|
||||
through under the KNOWN FACTS label with no dialogue block."""
|
||||
captured = {}
|
||||
|
||||
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
|
||||
captured["user"] = user
|
||||
return "getWeather"
|
||||
|
||||
hint = "Current local time: Sunday, 2026-04-20 17:42 (Europe/London)."
|
||||
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
|
||||
select_tools(
|
||||
"what's the weather?",
|
||||
_builtin(), {},
|
||||
strategy=ToolSelectionStrategy.LLM,
|
||||
llm_base_url="http://localhost",
|
||||
llm_model="test",
|
||||
context_hint=hint,
|
||||
)
|
||||
|
||||
assert "KNOWN FACTS" in captured["user"]
|
||||
assert "RECENT DIALOGUE" not in captured["user"]
|
||||
Reference in New Issue
Block a user