Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_weather_autoderive_location.py
+++ b/evals/test_weather_autoderive_location.py
@@ -0,0 +1,194 @@
+"""
+Regression eval: getWeather must be called without asking for location.
+
+Field failures captured 2026-04-20 and 2026-04-21:
+
+  - 2026-04-20 "what's the weather this week": the LLM replied "What location
+    are you asking about?" without calling the tool.
+  - 2026-04-21 "How's the weather, Jarvis?": with ten prior diary entries
+    about weather loaded (~890 char digest), gemma produced malformed
+    output and the engine shipped the canned fallback "I had trouble
+    understanding that request." The tool was never invoked.
+
+The tool's description explicitly states it uses the user's current location
+when none is given. This eval asserts the model respects that contract
+instead of asking for an argument the tool already handles — AND that a
+warm memory state (the normal production condition) doesn't tip gemma into
+scaffolding mode where the malformed guard silently eats the turn.
+
+Two parametrised variants cover:
+  - ``cold-memory``: fresh dialogue memory + empty diary (old behaviour).
+  - ``warm-memory``: ten prior weather-related diary summaries, matching
+    the field log at 2026-04-21. This is the state that actually ships
+    to users and was previously never exercised in evals.
+
+Historical note: this eval used to ``pytest.xfail`` every gemma failure
+as "flakiness", which meant the exact field regressions above were
+recorded as expected-failures rather than real failures. The xfail
+escape hatches have been removed — if gemma breaks here, we want CI
+to shout.
+
+Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh weather_autoderive
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import (
+    ToolCallCapture,
+    assert_not_fallback_reply,
+    create_mock_tool_run,
+    seed_diary_summaries,
+)
+
+
+# Phrases that indicate the model deflected to asking for location instead of
+# calling the tool. These are English-language signals for the gpt-oss/gemma
+# judge models we evaluate against. CLAUDE.md forbids hardcoded language
+# patterns in production code paths (the assistant supports arbitrary
+# languages), but eval assertions against a specific English-speaking judge
+# model are scoped to that judge and don't leak into the product.
+_LOCATION_CLARIFICATION_PHRASES = (
+    "what location",
+    "which location",
+    "where are you",
+    "your location",
+    "specify a location",
+    "specify the location",
+    "tell me your location",
+    "tell me the location",
+    "what city",
+    "which city",
+    "where do you want",
+)
+
+
+# Ten dated summaries approximating the field-log state where the user has
+# asked about weather repeatedly over a fortnight. The digest built from
+# these is ~800-900 chars, matching the production shape that tipped
+# gemma into malformed output.
+_WARM_WEATHER_DIARY = [
+    ("2026-04-07", "The user asked whether it would rain in Hackney in the evening; the assistant provided the forecast showing light rain after 18:00."),
+    ("2026-04-08", "The user inquired about the weekend weather; the assistant reported dry conditions with highs of 15°C."),
+    ("2026-04-10", "The user requested a weather check for Tuesday; the assistant replied with partly cloudy 13°C."),
+    ("2026-04-11", "The user asked about the weather for tomorrow; the assistant returned cool and overcast conditions."),
+    ("2026-04-13", "The user asked about this afternoon's weather; the assistant reported bright sun and mild temperatures."),
+    ("2026-04-15", "The user inquired about the weather for tomorrow; since no location was supplied, the assistant used Hackney and returned the forecast."),
+    ("2026-04-16", "The user asked what the weather was doing; the assistant reported intermittent rain and temperatures around 11°C."),
+    ("2026-04-17", "The user inquired about the current weather; the assistant provided a snapshot showing overcast and mild."),
+    ("2026-04-18", "The user asked about the weekend outlook; the assistant reported mixed conditions with rain Sunday afternoon."),
+    ("2026-04-20", "The user asked about the weather this week; the assistant delivered a multi-day forecast for Hackney."),
+]
+
+
+def _run_weather_query(mock_config, eval_db, eval_dialogue_memory, query: str):
+    from helpers import JUDGE_MODEL
+    from jarvis.reply.engine import run_reply_engine
+
+    mock_config.ollama_base_url = "http://localhost:11434"
+    mock_config.ollama_chat_model = JUDGE_MODEL
+    mock_config.location_enabled = True
+
+    capture = ToolCallCapture()
+
+    weather_payload = (
+        "Weather for Hackney, London, UK:\n"
+        "Today: 14°C, partly cloudy. High 16°C, low 9°C.\n"
+        "This week: mixed cloud, some rain Thursday, sunny Saturday."
+    )
+
+    with patch(
+        'jarvis.utils.location.get_location_info',
+        return_value={"city": "Hackney", "region": "England", "country": "UK"},
+    ), patch(
+        'jarvis.reply.engine.run_tool_with_retries',
+        side_effect=create_mock_tool_run(capture, {
+            "getWeather": weather_payload,
+        }),
+    ):
+        response = run_reply_engine(
+            db=eval_db, cfg=mock_config, tts=None,
+            text=query, dialogue_memory=eval_dialogue_memory,
+        )
+    return capture, response
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestWeatherAutoDerivesLocation:
+    """Regression guard: getWeather must be called without nagging for location,
+    even under warm memory state."""
+
+    @pytest.mark.parametrize(
+        "variant,query",
+        [
+            ("cold-memory-week-forecast", "what's the weather this week"),
+            ("cold-memory-short-query", "how's the weather"),
+            ("warm-memory-short-query", "how's the weather"),
+        ],
+        ids=lambda v: v if isinstance(v, str) else "",
+    )
+    def test_weather_query_calls_tool_and_grounds_reply(
+        self, mock_config, eval_db, eval_dialogue_memory, variant, query,
+    ):
+        from helpers import JUDGE_MODEL
+
+        if variant.startswith("warm-memory"):
+            seed_diary_summaries(eval_db, _WARM_WEATHER_DIARY)
+
+        capture, response = _run_weather_query(
+            mock_config, eval_db, eval_dialogue_memory, query,
+        )
+
+        print(f"\n  Weather Auto-Derive [{variant}] ({JUDGE_MODEL}):")
+        print(f"  Query: '{query}'")
+        print(f"  Tools called: {capture.tool_names() or 'none'}")
+        print(f"  Response: {(response or '')[:300]}")
+
+        # Shield against the engine silently shipping the "I had trouble
+        # understanding that request" canned fallback — that's the malformed
+        # guard firing, which masks the real model failure from eval
+        # assertions that only check tool calls.
+        assert_not_fallback_reply(response, context=variant)
+
+        lowered = (response or "").lower()
+        asked_for_location = next(
+            (p for p in _LOCATION_CLARIFICATION_PHRASES if p in lowered), None,
+        )
+
+        assert capture.has_tool("getWeather"), (
+            f"[{variant}] Model failed to call getWeather despite the "
+            f"tool's description stating it uses the user's current "
+            f"location when none is given, and the user's location being "
+            f"injected into the system prompt. "
+            f"Tools called: {capture.tool_names() or 'none'}. "
+            f"Location-clarification phrase hit: {asked_for_location!r}. "
+            f"Response: {(response or '')[:400]}"
+        )
+
+        assert asked_for_location is None, (
+            f"[{variant}] Model called getWeather but also asked the user "
+            f"for a location — that's the deflection pattern the prompt "
+            f"clause is meant to prevent. "
+            f"Phrase hit: {asked_for_location!r}. "
+            f"Response: {(response or '')[:400]}"
+        )
+
+        # Args guard: the queries here never name a place, so getWeather
+        # must be called with no `location` arg (or empty string). The
+        # 2026-04-24 field regression had the planner stuffing a temporal
+        # qualifier into `location=` (e.g. `location='today'`, which
+        # geocoded to "Todaya" in the Philippines); the mock happily
+        # returned the canned payload regardless, so an args-blind eval
+        # would pass over this silently.
+        weather_args = capture.get_args("getWeather") or {}
+        location_arg = (weather_args.get("location") or "").strip()
+        assert location_arg == "", (
+            f"[{variant}] getWeather was called with a fabricated location "
+            f"argument: location={location_arg!r}. The user named no place, "
+            f"so the tool must be called with empty args so it auto-uses "
+            f"the user's detected location. Full args: {weather_args!r}. "
+            f"Response: {(response or '')[:400]}"
+        )