Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_tool_router_context_aware.py
+++ b/evals/test_tool_router_context_aware.py
@@ -0,0 +1,178 @@
+"""
+Tool Router — Context-Aware Selection (Live)
+
+Guards that the LLM tool router, when handed a compact summary of what the
+main assistant can already see at reply time (current local time, resolved
+location, recent dialogue), correctly returns 'none' for queries fully
+answerable from that context — instead of embed-matching an adjacent tool.
+
+Motivating field incident (2026-04-20):
+  User asked "what time is it, Jarvis?". The router, having no view of the
+  assistant's live context, picked `getWeather` as the closest temporal tool
+  on the catalogue. With only `getWeather, stop` in the allowed list, the
+  main model dutifully called getWeather and the reply parroted the weather
+  back as if it had answered the time question.
+
+The fix is upstream: pass the router the same compact context hint the
+memory extractor already uses, and let it judge for itself whether the
+query is answerable from context. Location may not always resolve, so the
+hint degrades gracefully — the router falls back to content-based selection
+when context is missing or partial, and should not over-commit to 'none'
+for queries whose answer was NOT visible in the hint.
+
+Run:
+    EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_tool_router_context_aware.py -v
+"""
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_BASE_URL, JUDGE_MODEL
+
+
+_TIME_LOCATION_HINT = (
+    "Current local time: Sunday, 2026-04-20 17:42 (Europe/London). "
+    "Location: Hackney, Hackney, United Kingdom."
+)
+
+# Deliberately omits location — exercises the graceful-degradation path.
+_TIME_ONLY_HINT = "Current local time: Sunday, 2026-04-20 17:42 UTC."
+
+
+def _route(query: str, context_hint):
+    """Invoke the real LLM router with the builtin tool catalogue."""
+    from jarvis.tools.registry import BUILTIN_TOOLS
+    from jarvis.tools.selection import select_tools, ToolSelectionStrategy
+
+    return select_tools(
+        query=query,
+        builtin_tools=BUILTIN_TOOLS,
+        mcp_tools={},
+        strategy=ToolSelectionStrategy.LLM,
+        llm_base_url=JUDGE_BASE_URL,
+        llm_model=JUDGE_MODEL,
+        llm_timeout_sec=30.0,
+        context_hint=context_hint,
+    )
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestRouterReturnsNoneWhenContextAnswers:
+    """Router must opt out when the answer is already visible in context."""
+
+    def test_time_query_with_time_in_context_returns_none(self):
+        selected = _route("what time is it, Jarvis?", _TIME_LOCATION_HINT)
+        real = [t for t in selected if t != "stop"]
+        print(f"\n  Selected: {selected}")
+        if real:
+            pytest.xfail(
+                f"Small router model {JUDGE_MODEL} still picked real tools "
+                f"({real}) for a query fully answerable from context."
+            )
+        assert not real, f"Router should opt out, got: {selected}"
+
+    def test_date_query_with_date_in_context_returns_none(self):
+        selected = _route("what's today's date?", _TIME_LOCATION_HINT)
+        real = [t for t in selected if t != "stop"]
+        print(f"\n  Selected: {selected}")
+        if real:
+            pytest.xfail(
+                f"Router picked real tools ({real}) for a date query "
+                f"answerable from context."
+            )
+        assert not real
+
+    def test_location_query_with_location_in_context_returns_none(self):
+        selected = _route("where am I right now?", _TIME_LOCATION_HINT)
+        real = [t for t in selected if t != "stop"]
+        print(f"\n  Selected: {selected}")
+        if real:
+            pytest.xfail(
+                f"Router picked real tools ({real}) for a location query "
+                f"answerable from context."
+            )
+        assert not real
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestRouterPicksToolsWhenContextDoesNotAnswer:
+    """Regression guard: router must not over-commit to 'none'."""
+
+    def test_weather_query_still_picks_getWeather(self):
+        """Context has time+location, but weather itself is not in context —
+        the router must still pick getWeather."""
+        selected = _route("what's the weather like?", _TIME_LOCATION_HINT)
+        print(f"\n  Selected: {selected}")
+        assert "getWeather" in selected, (
+            f"Router dropped getWeather for an explicit weather query. "
+            f"Got: {selected}"
+        )
+
+    def test_location_query_with_partial_hint_still_routes_sensibly(self):
+        """KNOWN LIMITATION on small router models (gemma4:e2b).
+
+        When location failed to resolve (hint lacks it), a location query
+        should not be silenced as 'none' — it must either route to a tool
+        that can surface location or accept the fallback, but must not
+        confidently claim the answer is in context when it isn't.
+
+        Observed behaviour on gemma4:e2b: the mere presence of an
+        ALREADY IN CONTEXT block primes the router to return 'none' for
+        context-shaped queries even when the specific fact is absent
+        from the block. Attempts to fix this purely at prompt level
+        (adding "the block is NOT exhaustive" wording) regress the
+        positive cases (time/date queries stop routing to 'none').
+        The practical impact is bounded: when location genuinely fails
+        to resolve, the follow-up layers (main model + memory recall)
+        still have a chance to produce a sensible answer, and this only
+        fires on the narrow path where the hint is partial.
+
+        Parked as xfail rather than deleted so that a future router
+        model (or prompt iteration) will surface the improvement as an
+        unexpected pass. If fixed, delete the xfail branch and assert
+        `selected != ["stop"]` unconditionally.
+        """
+        selected = _route("where am I right now?", _TIME_ONLY_HINT)
+        print(f"\n  Selected: {selected}")
+        if selected == ["stop"]:
+            pytest.xfail(
+                f"Router returned 'none' for a location query whose answer "
+                f"was NOT in the partial hint. Known small-model limit — "
+                f"see test docstring."
+            )
+
+    def test_followup_naming_place_routes_to_getWeather(self):
+        """Field capture 2026-04-20: assistant asked "Which city should I
+        check the weather for?" and the user replied "I'm in London". The
+        router saw only "I'm in London" as the query and returned 'none' —
+        reading it as idle chatter instead of a continuation.
+
+        With the split-hint prompt (KNOWN FACTS + RECENT DIALOGUE), the
+        router must merge intent across turns and route to getWeather."""
+        hint = (
+            "Current local time: Sunday, 2026-04-20 17:42 UTC.\n\n"
+            "Recent dialogue (short-term memory):\n"
+            "- user: what's the weather like?\n"
+            "- assistant: Which city should I check the weather for?"
+        )
+        selected = _route("I'm in London", hint)
+        print(f"\n  Selected: {selected}")
+        if "getWeather" not in selected:
+            pytest.xfail(
+                f"Router did not resolve follow-up 'I'm in London' after the "
+                f"assistant asked for a city. Got: {selected}. Known small-"
+                f"model limit — the prompt change lands first, the eval "
+                f"tracks the improvement."
+            )
+
+    def test_no_hint_at_all_still_routes_sensibly(self):
+        """With context_hint=None (e.g. first turn, location lookup failed
+        entirely), the router must still work — selecting content-relevant
+        tools. This guards the graceful-degradation path."""
+        selected = _route("what's the weather like?", None)
+        print(f"\n  Selected: {selected}")
+        assert "getWeather" in selected, (
+            f"Router broke when context_hint was None. Got: {selected}"
+        )