Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_tool_selection.py
+++ b/evals/test_tool_selection.py
@@ -0,0 +1,154 @@
+"""
+Tool Selection Evaluations
+
+Tests that the embedding-based tool selection strategy actually filters tools
+meaningfully — a weather query should select weather-related tools, not all tools.
+
+Run: .venv/bin/python -m pytest evals/test_tool_selection.py -v
+"""
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_MODEL
+
+
+# =============================================================================
+# Test Data
+# =============================================================================
+
+# Queries paired with the tools they MUST include and a maximum tool count.
+# The max count ensures the strategy actually filters rather than passing everything.
+TOOL_SELECTION_CASES = [
+    pytest.param(
+        "what's the weather like tomorrow",
+        ["getWeather"],
+        5,
+        id="weather query selects getWeather and few others",
+    ),
+    pytest.param(
+        "what's the weather in London this weekend",
+        ["getWeather"],
+        5,
+        id="location weather query selects getWeather and few others",
+    ),
+    pytest.param(
+        "log that I had a chicken salad for lunch",
+        ["logMeal"],
+        5,
+        id="meal logging selects logMeal and few others",
+    ),
+    pytest.param(
+        "what did I eat yesterday",
+        ["fetchMeals"],
+        5,
+        id="meal recall selects fetchMeals and few others",
+    ),
+    pytest.param(
+        "search the web for Python tutorials",
+        ["webSearch"],
+        5,
+        id="web search query selects webSearch and few others",
+    ),
+]
+
+
+@pytest.mark.eval
+class TestToolSelectionFiltering:
+    """Validates that embedding tool selection meaningfully filters tools."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
+    def test_embedding_selects_relevant_tools(
+        self,
+        mock_config,
+        query,
+        must_include,
+        max_tools,
+    ):
+        """Embedding strategy should select relevant tools, not all of them.
+
+        Tool selection uses a fixed embed model (nomic-embed-text) regardless of
+        the judge model, so we only run this once per eval run (during the
+        gemma4 phase) to save time.
+        """
+        if "gemma4" not in JUDGE_MODEL:
+            pytest.skip(f"Tool selection uses fixed embed model; only runs in gemma4 phase (current: {JUDGE_MODEL})")
+
+        from jarvis.tools.selection import select_tools, ToolSelectionStrategy
+        from jarvis.tools.registry import BUILTIN_TOOLS
+
+        selected = select_tools(
+            query=query,
+            builtin_tools=BUILTIN_TOOLS,
+            mcp_tools={},
+            strategy=ToolSelectionStrategy.EMBEDDING,
+            llm_base_url=mock_config.ollama_base_url,
+            embed_model=mock_config.ollama_embed_model,
+            embed_timeout_sec=10.0,
+        )
+
+        total_builtin = len(BUILTIN_TOOLS)
+
+        # Must include the expected tools
+        for tool in must_include:
+            assert tool in selected, (
+                f"Expected '{tool}' in selected tools but got: {selected}"
+            )
+
+        # Must include 'stop' (always included)
+        assert "stop" in selected, f"'stop' should always be included, got: {selected}"
+
+        # Must NOT include everything — that means filtering isn't working
+        assert len(selected) <= max_tools, (
+            f"Expected at most {max_tools} tools but got {len(selected)}/{total_builtin}: {selected}"
+        )
+
+        print(f"  ✅ Selected {len(selected)}/{total_builtin} tools: {selected}")
+
+
+@pytest.mark.eval
+class TestToolSelectionFilteringLLM:
+    """Validates that LLM-router tool selection meaningfully filters tools.
+
+    Unlike the embedding strategy (pinned to nomic-embed-text), this exercises
+    the default `llm` strategy against whichever judge model is active, so the
+    same cases run once per supported chat model.
+    """
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("query, must_include, max_tools", TOOL_SELECTION_CASES)
+    def test_llm_selects_relevant_tools(
+        self,
+        mock_config,
+        query,
+        must_include,
+        max_tools,
+    ):
+        from jarvis.tools.selection import select_tools, ToolSelectionStrategy
+        from jarvis.tools.registry import BUILTIN_TOOLS
+
+        selected = select_tools(
+            query=query,
+            builtin_tools=BUILTIN_TOOLS,
+            mcp_tools={},
+            strategy=ToolSelectionStrategy.LLM,
+            llm_base_url=mock_config.ollama_base_url,
+            llm_model=JUDGE_MODEL,
+            llm_timeout_sec=15.0,
+        )
+
+        total_builtin = len(BUILTIN_TOOLS)
+
+        for tool in must_include:
+            assert tool in selected, (
+                f"Expected '{tool}' in selected tools but got: {selected}"
+            )
+
+        assert "stop" in selected, f"'stop' should always be included, got: {selected}"
+
+        assert len(selected) <= max_tools, (
+            f"Expected at most {max_tools} tools but got {len(selected)}/{total_builtin}: {selected}"
+        )
+
+        print(f"  ✅ [{JUDGE_MODEL}] Selected {len(selected)}/{total_builtin} tools: {selected}")