Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

View File

@@ -0,0 +1,583 @@
"""Tests for tool selection strategies."""
import pytest
from unittest.mock import patch
from jarvis.tools.selection import (
select_tools,
ToolSelectionStrategy,
_tokenise,
_build_tool_keywords,
_ALWAYS_INCLUDED,
_RELATIVE_THRESHOLD,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class FakeTool:
"""Minimal tool stand-in for testing."""
def __init__(self, name: str, description: str):
self._name = name
self._description = description
@property
def name(self):
return self._name
@property
def description(self):
return self._description
class FakeToolSpec:
"""Minimal ToolSpec stand-in for testing."""
def __init__(self, name: str, description: str):
self.name = name
self.description = description
def _builtin():
"""Return a small set of fake builtin tools."""
return {
"webSearch": FakeTool("webSearch", "Search the web using DuckDuckGo for current information, news, or general queries."),
"getWeather": FakeTool("getWeather", "Get current weather conditions."),
"logMeal": FakeTool("logMeal", "Log a single meal when the user mentions eating or drinking something."),
"fetchMeals": FakeTool("fetchMeals", "Retrieve meals from the database for a given time range."),
"screenshot": FakeTool("screenshot", "Capture a selected screen region and OCR the text."),
"localFiles": FakeTool("localFiles", "Safely read, write, list, append, or delete files within your home directory."),
"stop": FakeTool("stop", "End the current conversation."),
}
def _mcp():
"""Return a small set of fake MCP tools."""
return {
"homeassistant__turn_on": FakeToolSpec("homeassistant__turn_on", "Turn on a smart home device."),
}
# ---------------------------------------------------------------------------
# Enum
# ---------------------------------------------------------------------------
class TestToolSelectionStrategy:
@pytest.mark.unit
def test_enum_values(self):
assert ToolSelectionStrategy.ALL.value == "all"
assert ToolSelectionStrategy.KEYWORD.value == "keyword"
assert ToolSelectionStrategy.EMBEDDING.value == "embedding"
assert ToolSelectionStrategy.LLM.value == "llm"
@pytest.mark.unit
def test_enum_from_string(self):
assert ToolSelectionStrategy("all") == ToolSelectionStrategy.ALL
assert ToolSelectionStrategy("keyword") == ToolSelectionStrategy.KEYWORD
assert ToolSelectionStrategy("embedding") == ToolSelectionStrategy.EMBEDDING
assert ToolSelectionStrategy("llm") == ToolSelectionStrategy.LLM
@pytest.mark.unit
def test_invalid_value_raises(self):
with pytest.raises(ValueError):
ToolSelectionStrategy("banana")
# ---------------------------------------------------------------------------
# Tokenisation
# ---------------------------------------------------------------------------
class TestTokenise:
@pytest.mark.unit
def test_basic_tokenise(self):
tokens = _tokenise("What's the weather in London?")
assert "weather" in tokens
assert "london" in tokens
assert "the" not in tokens
assert "in" not in tokens
@pytest.mark.unit
def test_empty_string(self):
assert _tokenise("") == []
class TestBuildToolKeywords:
@pytest.mark.unit
def test_camel_case_split(self):
kw = _build_tool_keywords("fetchWebPage", "Fetch content from a URL.")
assert "fetch" in kw
assert "web" in kw
assert "page" in kw
@pytest.mark.unit
def test_description_tokens(self):
kw = _build_tool_keywords("getWeather", "Get current weather conditions.")
assert "weather" in kw
assert "conditions" in kw
# ---------------------------------------------------------------------------
# Strategy: all
# ---------------------------------------------------------------------------
class TestAllStrategy:
@pytest.mark.unit
def test_returns_everything(self):
result = select_tools("hello", _builtin(), _mcp(), strategy=ToolSelectionStrategy.ALL)
assert len(result) == len(_builtin()) + len(_mcp())
@pytest.mark.unit
def test_default_strategy_is_all(self):
result = select_tools("hello", _builtin(), _mcp())
assert len(result) == len(_builtin()) + len(_mcp())
# ---------------------------------------------------------------------------
# Strategy: keyword
# ---------------------------------------------------------------------------
class TestKeywordStrategy:
@pytest.mark.unit
def test_weather_query_selects_weather_tool(self):
result = select_tools("what's the weather in London", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "getWeather" in result
@pytest.mark.unit
def test_weather_query_excludes_irrelevant(self):
result = select_tools("what's the weather in London", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "logMeal" not in result
assert "screenshot" not in result
@pytest.mark.unit
def test_meal_query_selects_meal_tools(self):
result = select_tools("what did I eat yesterday", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "fetchMeals" in result or "logMeal" in result
@pytest.mark.unit
def test_search_query_selects_web_search(self):
result = select_tools("search for python tutorials", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "webSearch" in result
@pytest.mark.unit
def test_stop_always_included(self):
result = select_tools("what's the weather", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "stop" in result
@pytest.mark.unit
def test_vague_query_falls_back_to_all(self):
result = select_tools("hmm", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert len(result) == len(_builtin())
@pytest.mark.unit
def test_mcp_tools_included(self):
result = select_tools("turn on the lights", _builtin(), _mcp(), strategy=ToolSelectionStrategy.KEYWORD)
assert "homeassistant__turn_on" in result
@pytest.mark.unit
def test_file_query_selects_local_files(self):
result = select_tools("read the config file", _builtin(), {}, strategy=ToolSelectionStrategy.KEYWORD)
assert "localFiles" in result
# ---------------------------------------------------------------------------
# Strategy: embedding
# ---------------------------------------------------------------------------
class TestEmbeddingStrategy:
def _mock_embedding(self, text_to_vec):
"""Return a mock get_embedding that maps text substrings to vectors."""
def mock_get_embedding(text, base_url, model, timeout_sec=10.0):
for key, vec in text_to_vec.items():
if key in text.lower():
return vec
# Default: zero vector
return [0.0] * 4
return mock_get_embedding
@pytest.mark.unit
def test_selects_similar_tools(self):
"""Weather query should rank getWeather highest."""
mock_embed = self._mock_embedding({
"weather": [1.0, 0.0, 0.0, 0.0], # query + weather tool
"search": [0.0, 1.0, 0.0, 0.0],
"meal": [0.0, 0.0, 1.0, 0.0],
"screen": [0.0, 0.0, 0.0, 1.0],
"file": [0.1, 0.1, 0.1, 0.1],
"conversation": [0.1, 0.1, 0.1, 0.1],
})
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
result = select_tools(
"what's the weather",
_builtin(), {},
strategy=ToolSelectionStrategy.EMBEDDING,
llm_base_url="http://localhost",
embed_model="nomic-embed-text",
)
assert "getWeather" in result
@pytest.mark.unit
def test_stop_always_included(self):
"""Stop tool must be present even if not semantically matched."""
mock_embed = self._mock_embedding({
"weather": [1.0, 0.0, 0.0, 0.0],
})
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
result = select_tools(
"what's the weather",
_builtin(), {},
strategy=ToolSelectionStrategy.EMBEDDING,
llm_base_url="http://localhost",
embed_model="nomic-embed-text",
)
assert "stop" in result
@pytest.mark.unit
def test_failed_query_embedding_falls_back(self):
"""If query embedding fails, fall back to all tools."""
def mock_fail(text, base_url, model, timeout_sec=10.0):
return None
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_fail):
result = select_tools(
"anything",
_builtin(), _mcp(),
strategy=ToolSelectionStrategy.EMBEDDING,
llm_base_url="http://localhost",
embed_model="nomic-embed-text",
)
assert len(result) == len(_builtin()) + len(_mcp())
@pytest.mark.unit
def test_returns_minimum_tools(self):
"""Should return at least _MIN_SELECTED tools even if similarity is low."""
# All tools get zero similarity (orthogonal to query)
call_count = [0]
def mock_embed(text, base_url, model, timeout_sec=10.0):
call_count[0] += 1
if call_count[0] == 1: # query
return [1.0, 0.0, 0.0, 0.0]
return [0.0, 0.0, 0.0, 1.0] # all tools orthogonal
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
result = select_tools(
"something obscure",
_builtin(), {},
strategy=ToolSelectionStrategy.EMBEDDING,
llm_base_url="http://localhost",
embed_model="nomic-embed-text",
)
# Should still have at least _MIN_SELECTED + stop
assert len(result) >= 3
@pytest.mark.unit
def test_relative_threshold_filters_low_similarity(self):
"""Relative threshold keeps only tools near the top score, not everything."""
import math
# Simulate realistic scores with a clear top cluster and a weak tail.
# query = [1, 0, 0, 0]
# strong → cos_sim ≈ 0.90 (getWeather)
# good → cos_sim ≈ 0.88 (webSearch — within 85% of top)
# weak → cos_sim ≈ 0.40 (everything else — well below cutoff)
#
# cutoff = 0.90 * 0.85 = 0.765
# strong (0.90) and good (0.88) pass; weak (0.40) do not.
# With _MIN_SELECTED=3, top-3 would apply if <3 passed, but 2 pass + stop = 3 total.
strong = [0.9, 0.436, 0, 0]
s_norm = math.sqrt(sum(x*x for x in strong))
strong = [x / s_norm for x in strong]
good = [0.88, 0.475, 0, 0]
g_norm = math.sqrt(sum(x*x for x in good))
good = [x / g_norm for x in good]
weak = [0.4, 0.917, 0, 0]
w_norm = math.sqrt(sum(x*x for x in weak))
weak = [x / w_norm for x in weak]
mock_map = {
"weather": [1.0, 0.0, 0.0, 0.0], # query
"get weather": strong, # getWeather → high sim
"web search": good, # webSearch → just above cutoff
"log meal": weak, # logMeal → low sim
"fetch meals": weak, # fetchMeals → low sim
"screen": weak, # screenshot → low sim
"file": weak, # localFiles → low sim
}
def mock_embed(text, base_url, model, timeout_sec=10.0):
text_lower = text.lower()
for key, vec in mock_map.items():
if key in text_lower:
return vec
return [0.0] * 4
with patch("jarvis.memory.embeddings.get_embedding", side_effect=mock_embed):
result = select_tools(
"what's the weather",
_builtin(), {},
strategy=ToolSelectionStrategy.EMBEDDING,
llm_base_url="http://localhost",
embed_model="nomic-embed-text",
)
# Strong and good matches must be included
assert "getWeather" in result
assert "webSearch" in result
# stop is always included
assert "stop" in result
# Fewer tools than total — the relative threshold actually filtered
total_non_stop = len([t for t in _builtin() if t != "stop"])
selected_non_stop = len([t for t in result if t != "stop"])
assert selected_non_stop < total_non_stop, (
f"Expected fewer than {total_non_stop} tools but got {selected_non_stop}: {result}"
)
# ---------------------------------------------------------------------------
# Strategy: llm
# ---------------------------------------------------------------------------
class TestLLMStrategy:
@pytest.mark.unit
def test_parses_comma_separated_response(self):
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return "webSearch, getWeather"
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"what's the weather",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
assert "webSearch" in result
assert "getWeather" in result
assert "stop" in result
@pytest.mark.unit
def test_none_response_returns_only_mandatory(self):
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return "none"
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"hello",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
assert result == ["stop"]
@pytest.mark.unit
def test_llm_failure_falls_back_to_keyword(self):
"""When the router LLM raises (timeout, network, etc.) the fallback is
keyword scoring — not the full catalogue. A 30+-tool fall-open kills
small chat models (they choke on 41-tool prompts) and pins the
conversation cache to "everything"; keyword narrowing preserves at
least some routing on tool-name overlap with the query."""
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
raise TimeoutError("LLM timed out")
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"weather in London",
_builtin(), _mcp(),
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
# Keyword strategy on "weather" picks getWeather (its name + desc both
# contain "weather"); irrelevant tools like fetchMeals must NOT appear.
assert "getWeather" in result
assert "fetchMeals" not in result
assert "homeassistant__turn_on" not in result
@pytest.mark.unit
def test_empty_response_falls_back_to_keyword(self):
"""Empty router response is treated identically to a hard failure:
fall back to keyword scoring rather than to the full catalogue."""
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return ""
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"weather report",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
assert "getWeather" in result
assert "fetchMeals" not in result
@pytest.mark.unit
def test_unparseable_response_falls_back_to_keyword(self):
"""When the router response is non-empty but no token matches a known
tool name (small-model garbage), the fallback is keyword scoring.
Field trace: a small router occasionally produces text like "I think
we should..." that the parser strips to nothing — pre-fix this fell
open to all 41 tools; post-fix it narrows on query keywords."""
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return "I think we should pick one" # no known tool name
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"navigate to youtube.com",
_builtin(),
{"chrome-devtools__navigate_page": FakeToolSpec(
"chrome-devtools__navigate_page",
"Navigate the browser to a given URL.",
)},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
# Keyword scoring matches "navigate" → chrome-devtools__navigate_page.
assert "chrome-devtools__navigate_page" in result
# The full catalogue must NOT be returned — that's the regression we're
# fixing (small-model 41-tool overload).
assert len(result) < len(_builtin()) + 1
@pytest.mark.unit
def test_ignores_hallucinated_tool_names(self):
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return "webSearch, nonExistentTool, getWeather"
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"search and weather",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
assert "webSearch" in result
assert "getWeather" in result
@pytest.mark.unit
def test_parses_markdown_and_backtick_wrapped_names(self):
"""Chatty routers wrap names in backticks, bullets, or JSON brackets.
The parser must strip that formatting before matching — a literal
`webSearch` should resolve to the tool called webSearch, not be
silently dropped as an unknown token."""
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
# A realistic worst case combining bullets, backticks, and a
# bracketed list tail — all of which have appeared from gemma-class
# routers in practice.
return "- `webSearch`, * `getWeather`, [logMeal]"
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"chatty router",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
assert "webSearch" in result
assert "getWeather" in result
assert "logMeal" in result
@pytest.mark.unit
def test_caps_chatty_router_output_at_max(self):
"""A router that echoes the whole catalogue must still produce a
compact selection — the hard cap guarantees downstream prompt size."""
from jarvis.tools.selection import _LLM_MAX_SELECTED
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
return "webSearch, getWeather, logMeal, fetchMeals, screenshot, localFiles, homeassistant__turn_on"
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
result = select_tools(
"arbitrary query",
_builtin(), _mcp(),
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
)
# Non-mandatory selections are capped; always-included tools are
# appended on top of that cap.
non_mandatory = [t for t in result if t not in _ALWAYS_INCLUDED]
assert len(non_mandatory) <= _LLM_MAX_SELECTED, (
f"Expected at most {_LLM_MAX_SELECTED} non-mandatory tools, got "
f"{len(non_mandatory)}: {non_mandatory}"
)
# Ranking is preserved — first N from the router's list survive.
assert non_mandatory[0] == "webSearch"
assert "nonExistentTool" not in result
@pytest.mark.unit
def test_context_hint_splits_into_known_facts_and_recent_dialogue(self):
"""When the hint carries a 'Recent dialogue' subsection, the router
prompt must surface facts and dialogue under separate labels so the
router can read a short follow-up ("I'm in London") as a continuation
of the prior turn rather than as standalone idle chatter."""
captured = {}
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
captured["sys"] = sys
captured["user"] = user
return "getWeather"
hint = (
"Current local time: Sunday, 2026-04-20 17:42 (Europe/London).\n\n"
"Recent dialogue (short-term memory):\n"
"- user: what's the weather like?\n"
"- assistant: Sure — where should I check?"
)
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
select_tools(
"I'm in London",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
context_hint=hint,
)
assert "KNOWN FACTS" in captured["user"]
assert "RECENT DIALOGUE" in captured["user"]
# Dialogue lines must actually reach the prompt under the dialogue label.
dialogue_idx = captured["user"].index("RECENT DIALOGUE")
assert "where should I check" in captured["user"][dialogue_idx:]
# System prompt must tell the router to treat follow-ups as continuations.
assert "continuation" in captured["sys"].lower()
@pytest.mark.unit
def test_context_hint_without_dialogue_uses_known_facts_only(self):
"""When the hint carries no dialogue subsection (first turn, no
recent messages), the router must still work — the facts flow
through under the KNOWN FACTS label with no dialogue block."""
captured = {}
def mock_llm(base_url, model, sys, user, timeout_sec=8.0):
captured["user"] = user
return "getWeather"
hint = "Current local time: Sunday, 2026-04-20 17:42 (Europe/London)."
with patch("jarvis.llm.call_llm_direct", side_effect=mock_llm):
select_tools(
"what's the weather?",
_builtin(), {},
strategy=ToolSelectionStrategy.LLM,
llm_base_url="http://localhost",
llm_model="test",
context_hint=hint,
)
assert "KNOWN FACTS" in captured["user"]
assert "RECENT DIALOGUE" not in captured["user"]