Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer), because Discord blocks video from bot accounts; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_multi_turn_context.py
+++ b/evals/test_multi_turn_context.py
@@ -0,0 +1,506 @@
+"""
+Multi-Turn Context Evaluations
+
+Tests the agent's ability to handle multi-turn conversations correctly:
+1. Topic Switching - Selecting correct tool when conversation topic changes
+2. Context Anchoring - Not getting "stuck" on previous turn's tool
+3. Follow-up Handling - Using context from previous turns when relevant
+
+These evals are critical for catching regressions where the model might:
+- Call the wrong tool after a topic change (e.g., getWeather for store hours)
+- Ignore context from previous turns
+- Fail to follow up on established conversation context
+
+Run: ./scripts/run_evals.sh
+"""
+
+import pytest
+from unittest.mock import patch
+
+from conftest import requires_judge_llm
+from helpers import (
+    MockConfig, ToolCallCapture,
+    create_mock_tool_run,
+    JUDGE_MODEL,
+)
+
+
+# =============================================================================
+# Test Data - Consistent tool responses for reproducibility
+# =============================================================================
+# Canned getWeather reply handed back by the mocked tool runner.
+MOCK_WEATHER_RESPONSE = """Current weather in Kensington, Royal Kensington and Chelsea, United Kingdom:
+Conditions: Overcast
+Temperature: 7.8°C
+Feels like: 5°C
+Humidity: 75%
+Wind: 12 km/h from the west
+"""
+# Canned webSearch reply used for the CEX store-hours scenarios.
+MOCK_STORE_HOURS_SEARCH = """Web search results for 'CEX store hours Kensington':
+
+**Content from top result:**
+CEX Kensington High Street
+Opening Hours:
+Monday - Saturday: 10:00 AM - 6:00 PM
+Sunday: 11:00 AM - 5:00 PM
+
+**Other search results:**
+1. **CEX Kensington - Store Info** - https://uk.webuy.com/store/kensington
+2. **CEX Store Locator** - https://uk.webuy.com/stores
+"""
+# Canned webSearch reply used for the tech-news scenarios.
+MOCK_NEWS_SEARCH = """Web search results for 'tech news today':
+
+**Content from top result:**
+Today's Tech Headlines:
+- Apple announces new M4 chip
+- OpenAI releases GPT-5
+- SpaceX Starship completes orbital test
+
+**Other search results:**
+1. **TechCrunch** - https://techcrunch.com
+2. **The Verge** - https://theverge.com
+"""
+
+
+# =============================================================================
+# Topic Switching Evaluations (Live LLM)
+# =============================================================================
+
+class TestTopicSwitching:
+    """
+    Tests that the agent picks the right tool once the conversation topic
+    changes from one turn to the next.
+
+    Inference uses a real LLM so actual model behavior is exercised; tool
+    execution is mocked so every run sees the same tool output.
+    """
+
+    @pytest.mark.eval
+    @requires_judge_llm
+    def test_weather_then_store_hours(self, mock_config, eval_db, eval_dialogue_memory):
+        """
+        After a weather query, a store-hours question should use webSearch.
+
+        Scenario:
+        - Turn 1: "How's the weather today?" -> getWeather
+        - Turn 2: "... can you check how long CEX is open for?" -> webSearch
+
+        Guards against the bug where llama3.2:3b called getWeather for a
+        store-hours query because it anchored on the previous turn's tool.
+        """
+        from jarvis.reply.engine import run_reply_engine
+
+        mock_config.ollama_base_url = "http://localhost:11434"
+        mock_config.ollama_chat_model = JUDGE_MODEL
+
+        # Hold the exact utterances so the log reports what was really sent.
+        turn1_query = "How's the weather today?"
+        turn2_query = "Yeah, I could do but can you check how long CEX is open for?"
+
+        capture = ToolCallCapture()
+        mock_tool_run = create_mock_tool_run(capture, {
+            "getWeather": MOCK_WEATHER_RESPONSE,
+            "webSearch": MOCK_STORE_HOURS_SEARCH,
+        })
+
+        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \
+             patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, Royal Kensington and Chelsea, United Kingdom", None)):
+            # Turn 1: Weather query
+            capture.clear()
+            response1 = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text=turn1_query,
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn1_tools = capture.tool_sequence().copy()  # snapshot before next clear()
+
+            # Turn 2: Store hours query (topic change)
+            capture.clear()
+            response2 = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text=turn2_query,
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn2_tools = capture.tool_sequence().copy()
+
+        print("\n📊 Topic Switching - Weather → Store Hours:")
+        print(f"   Turn 1 query: '{turn1_query}'")
+        print(f"   Turn 1 tools: {turn1_tools}")
+        print(f"   Turn 1 response: {response1[:100] if response1 else 'None'}...")
+        print(f"   Turn 2 query: '{turn2_query}'")
+        print(f"   Turn 2 tools: {turn2_tools}")
+        print(f"   Turn 2 response: {response2[:100] if response2 else 'None'}...")
+
+        assert "getWeather" in turn1_tools, \
+            f"Turn 1 should use getWeather for weather query. Used: {turn1_tools}"
+
+        # Turn 2 MUST use webSearch, NOT getWeather. Anchoring on the previous
+        # turn's tool gets its own failure message so it is easy to spot.
+        if "getWeather" in turn2_tools and "webSearch" not in turn2_tools:
+            pytest.fail(
+                "❌ CONTEXT ANCHORING BUG: Model used getWeather for store hours!\n"
+                f"   Turn 2 tools: {turn2_tools}\n"
+                "   Expected: webSearch\n"
+                "   The model got 'stuck' on the previous turn's tool.\n"
+                f"   Response: {response2[:200] if response2 else 'None'}"
+            )
+
+        assert "webSearch" in turn2_tools, \
+            f"Turn 2 should use webSearch for store hours. Used: {turn2_tools}"
+
+        print("   ✅ Correctly switched from getWeather to webSearch")
+
+    @pytest.mark.eval
+    @requires_judge_llm
+    def test_search_then_weather(self, mock_config, eval_db, eval_dialogue_memory):
+        """
+        After a web search, a weather question should use getWeather.
+
+        Reverse direction of the weather -> search case: the model must not
+        stay stuck on webSearch once the user asks about the weather.
+        """
+        from jarvis.reply.engine import run_reply_engine
+
+        mock_config.ollama_base_url = "http://localhost:11434"
+        mock_config.ollama_chat_model = JUDGE_MODEL
+
+        capture = ToolCallCapture()
+        mock_tool_run = create_mock_tool_run(capture, {
+            "getWeather": MOCK_WEATHER_RESPONSE,
+            "webSearch": MOCK_NEWS_SEARCH,
+        })
+
+        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \
+             patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)):
+
+            # Turn 1: News search
+            capture.clear()
+            run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="What's the latest tech news?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn1_tools = capture.tool_sequence().copy()  # snapshot before next clear()
+
+            # Turn 2: Weather
+            capture.clear()
+            response2 = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="How's the weather outside?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn2_tools = capture.tool_sequence().copy()
+
+        print("\n📊 Topic Switching - News → Weather:")
+        print(f"   Turn 1 tools: {turn1_tools}")
+        print(f"   Turn 2 tools: {turn2_tools}")
+
+        assert "webSearch" in turn1_tools, \
+            f"Turn 1 should use webSearch for news. Used: {turn1_tools}"
+
+        # Reverse anchoring: webSearch reused for a weather question
+        if "webSearch" in turn2_tools and "getWeather" not in turn2_tools:
+            pytest.fail(
+                "❌ CONTEXT ANCHORING BUG: Model used webSearch for weather query!\n"
+                f"   Turn 2 tools: {turn2_tools}\n"
+                f"   Response: {response2[:200] if response2 else 'None'}"
+            )
+
+        assert "getWeather" in turn2_tools, \
+            f"Turn 2 should use getWeather for weather query. Used: {turn2_tools}"
+
+        print("   ✅ Correctly switched from webSearch to getWeather")
+
+
+# =============================================================================
+# Follow-Up Context Evaluations (Live LLM)
+# =============================================================================
+
+class TestFollowUpContext:
+    """
+    Tests that the agent carries context over from earlier turns when it
+    answers a follow-up question.
+    """
+
+    @pytest.mark.eval
+    @requires_judge_llm
+    def test_follow_up_references_previous_context(self, mock_config, eval_db, eval_dialogue_memory):
+        """
+        A follow-up question should draw on information from the previous turn.
+
+        Scenario:
+        - Turn 1: "How's the weather today?" -> getWeather (overcast, 7.8°C)
+        - Turn 2: "Should I bring an umbrella?" -> reply should mention weather
+
+        Only turn 1 is asserted; the turn-2 weather reference is logged, not failed.
+        """
+        from jarvis.reply.engine import run_reply_engine
+
+        mock_config.ollama_base_url = "http://localhost:11434"
+        mock_config.ollama_chat_model = JUDGE_MODEL
+
+        capture = ToolCallCapture()
+        mock_tool_run = create_mock_tool_run(capture, {"getWeather": MOCK_WEATHER_RESPONSE})
+
+        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \
+             patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None)):
+
+            # Turn 1: Weather query
+            capture.clear()
+            response1 = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="How's the weather today?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn1_tools = capture.tool_sequence().copy()  # snapshot before next clear()
+
+            # Turn 2: Follow-up about umbrella
+            capture.clear()
+            response2 = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="Should I bring an umbrella?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            turn2_tools = capture.tool_sequence().copy()
+
+        print("\n📊 Follow-Up Context - Weather → Umbrella:")
+        print(f"   Turn 1 tools: {turn1_tools}")
+        print(f"   Turn 1 response: {response1[:80] if response1 else 'None'}...")
+        print(f"   Turn 2 tools: {turn2_tools}")
+        print(f"   Turn 2 response: {response2[:120] if response2 else 'None'}...")
+
+        assert "getWeather" in turn1_tools, "Turn 1 should fetch weather"
+
+        # Turn 2 is a soft check: the reply is scanned for weather vocabulary and
+        # the outcome is only logged (calling getWeather again is acceptable).
+        if response2:
+            # Bare digits such as "7" or "8" match almost any reply containing a
+            # number, so only the full reading and unit-style terms are used.
+            weather_terms = ["overcast", "cloud", "rain", "weather", "chilly", "cold", "7.8", "degree", "°c", "temperature"]
+            references_weather = any(term in response2.lower() for term in weather_terms)
+            print(f"   References weather context: {references_weather}")
+            if not references_weather:
+                print("   ⚠️ Response doesn't seem to reference weather context")
+        else:
+            print("   ⚠️ Turn 2 produced no response; weather-context check skipped")
+
+
+# =============================================================================
+# Self-Contained Tool Argument Evaluations (Live LLM)
+# =============================================================================
+
+# Canned webSearch reply for queries that do not mention songs, music or albums.
+MOCK_HARRY_STYLES_SEARCH = """Web search results for 'Harry Styles':
+
+**Content from top result:**
+Harry Styles is an English singer and songwriter, born 1 February 1994.
+He rose to fame as a member of the boy band One Direction and has since
+released several solo albums including Fine Line (2019) and Harry's House (2022).
+
+**Other search results:**
+1. **Harry Styles - Wikipedia** - https://en.wikipedia.org/wiki/Harry_Styles
+"""
+# Canned webSearch reply for queries that mention songs, music or albums.
+MOCK_HARRY_STYLES_SONGS_SEARCH = """Web search results for 'Harry Styles most famous songs':
+
+**Content from top result:**
+Harry Styles' most famous songs include:
+- "Watermelon Sugar" (2019)
+- "As It Was" (2022)
+- "Sign of the Times" (2017)
+- "Adore You" (2019)
+
+**Other search results:**
+1. **Harry Styles Discography** - https://en.wikipedia.org/wiki/Harry_Styles_discography
+"""
+
+
+class TestSelfContainedToolArguments:
+    """
+    Checks that a follow-up containing an unresolved pronoun still yields a
+    tool call whose arguments spell out the referent from earlier turns.
+
+    Tools never see the conversation history. Passing "what are his most
+    famous songs?" straight to webSearch would drop the entity and fetch
+    irrelevant results, so the model has to rewrite the argument into
+    something like "Harry Styles most famous songs".
+    """
+
+    @pytest.mark.eval
+    @requires_judge_llm
+    def test_follow_up_resolves_pronoun_in_search_query(
+        self, mock_config, eval_db, eval_dialogue_memory
+    ):
+        """
+        Scenario:
+        - Turn 1: "Who is Harry Styles?" establishes the entity.
+        - Turn 2: "What are his most famous songs?" must trigger webSearch
+                  with "harry" or "styles" in its string arguments.
+        """
+        from jarvis.reply.engine import run_reply_engine
+
+        mock_config.ollama_base_url = "http://localhost:11434"
+        mock_config.ollama_chat_model = JUDGE_MODEL
+
+        capture = ToolCallCapture()
+
+        def fake_run_tool(db, cfg, tool_name, tool_args, **kwargs):
+            from jarvis.tools.types import ToolExecutionResult
+            capture.record(tool_name, tool_args or {})
+            if tool_name != "webSearch":
+                return ToolExecutionResult(success=True, reply_text="OK")
+            haystack = str(tool_args).lower() if tool_args else ""
+            wants_songs = any(word in haystack for word in ("song", "music", "album"))
+            payload = MOCK_HARRY_STYLES_SONGS_SEARCH if wants_songs else MOCK_HARRY_STYLES_SEARCH
+            return ToolExecutionResult(success=True, reply_text=payload)
+
+        tool_patch = patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_run_tool)
+        location_patch = patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None))
+        with tool_patch, location_patch:
+            # Turn 1: introduce the entity by name
+            capture.clear()
+            run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="Who is Harry Styles?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            first_turn_calls = list(capture.calls)
+
+            # Turn 2: refer back to the entity with a pronoun only
+            capture.clear()
+            followup_reply = run_reply_engine(
+                db=eval_db, cfg=mock_config, tts=None,
+                text="What are his most famous songs?",
+                dialogue_memory=eval_dialogue_memory
+            )
+            followup_calls = list(capture.calls)
+
+        print("\n📊 Self-contained tool arguments — Harry Styles follow-up:")
+        print(f"   Turn 1 calls: {first_turn_calls}")
+        print(f"   Turn 2 calls: {followup_calls}")
+        print(f"   Turn 2 response: {(followup_reply or '')[:120]}...")
+
+        # The follow-up has to go through webSearch at least once
+        search_calls = [entry for entry in followup_calls if entry["name"] == "webSearch"]
+        assert search_calls, (
+            "Turn 2 should call webSearch to answer the follow-up. "
+            f"Got: {[entry['name'] for entry in followup_calls]}"
+        )
+
+        # Each webSearch call must carry the entity in its string arguments
+        for search_call in search_calls:
+            args = search_call["args"] or {}
+            text_values = [str(value) for value in args.values() if isinstance(value, str)]
+            arg_text = " ".join(text_values).lower()
+            names_entity = "harry" in arg_text or "styles" in arg_text
+            assert names_entity, (
+                "❌ PRONOUN-RESOLUTION BUG: webSearch argument did not include "
+                "the entity from the previous turn.\n"
+                f"   Args: {args}\n"
+                "   Expected the string to contain 'Harry' or 'Styles' — the "
+                "tool has no access to conversation history, so 'his' must be "
+                "resolved by the model before the tool call."
+            )
+
+        print("   ✅ webSearch argument resolved the pronoun correctly")
+
+
+# =============================================================================
+# Extended Multi-Turn Evaluations (Live LLM)
+# =============================================================================
+
+class TestMultiTurnExtended:
+    """
+    Longer multi-turn scenarios: more turns per conversation and more
+    involved topic changes.
+    """
+
+    @pytest.mark.eval
+    @requires_judge_llm
+    def test_three_turn_topic_changes(self, mock_config, eval_db, eval_dialogue_memory):
+        """
+        Run a three-turn conversation in which the topic changes twice.
+
+        Turn 1: weather query      -> getWeather
+        Turn 2: store hours query  -> webSearch (topic change from weather)
+        Turn 3: news query         -> webSearch (topic change from store)
+
+        Each turn should select the appropriate tool. All turns are executed
+        first; any mismatches are then reported together in a single failure.
+        """
+        from jarvis.reply.engine import run_reply_engine
+
+        mock_config.ollama_base_url = "http://localhost:11434"
+        mock_config.ollama_chat_model = JUDGE_MODEL
+
+        capture = ToolCallCapture()
+        # (utterance, tool expected for that turn), in conversation order
+        scenario = [
+            ("How's the weather today?", "getWeather"),
+            ("What time does CEX close?", "webSearch"),
+            ("What's happening in tech news?", "webSearch"),
+        ]
+
+        def fake_run_tool(db, cfg, tool_name, tool_args, **kwargs):
+            """Record the call and answer with canned text for the requested tool."""
+            from jarvis.tools.types import ToolExecutionResult
+            capture.record(tool_name, tool_args or {})
+
+            if tool_name == "getWeather":
+                canned = MOCK_WEATHER_RESPONSE
+            elif tool_name != "webSearch":
+                canned = "OK"
+            else:
+                # Pick the search payload that fits the query
+                query_text = str(tool_args).lower() if tool_args else ""
+                store_related = any(key in query_text for key in ("cex", "store", "hour"))
+                canned = MOCK_STORE_HOURS_SEARCH if store_related else MOCK_NEWS_SEARCH
+            return ToolExecutionResult(success=True, reply_text=canned)
+
+        tool_patch = patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_run_tool)
+        location_patch = patch('jarvis.reply.engine.get_location_context_with_timezone', return_value=("Location: Kensington, UK", None))
+
+        # (query, expected tool, tools called, reply) per turn
+        observed = []
+        # Run the turns in order against the same dialogue memory
+        with tool_patch, location_patch:
+            for query, expected_tool in scenario:
+                capture.clear()
+                reply = run_reply_engine(
+                    db=eval_db, cfg=mock_config, tts=None,
+                    text=query,
+                    dialogue_memory=eval_dialogue_memory
+                )
+                observed.append((query, expected_tool, capture.tool_sequence().copy(), reply))
+
+        # Report every turn, collecting mismatches instead of stopping early
+        print("\n📊 Three-Turn Topic Changes:")
+        failures = []
+        # Expected tool of the preceding turn (None for the first turn)
+        previous_expected = None
+        for turn_no, (query, expected, tools, _reply) in enumerate(observed, 1):
+            matched = expected in tools
+            print(f"   Turn {turn_no}: '{query[:35]}...'")
+            print(f"      Expected: {expected}, Got: {tools} {'✅' if matched else '❌'}")
+
+            if not matched:
+                # Separate anchoring on the previous turn's tool from other misses
+                if previous_expected is not None and previous_expected in tools:
+                    failures.append(
+                        f"Turn {turn_no}: Context anchoring bug - used {tools} (previous turn's tool) "
+                        f"instead of {expected}"
+                    )
+                else:
+                    failures.append(f"Turn {turn_no}: Expected {expected}, got {tools}")
+            previous_expected = expected
+
+        if failures:
+            pytest.fail(
+                "❌ Multi-turn tool selection failures:\n" +
+                "\n".join(f"   - {f}" for f in failures)
+            )
+
+        print("   ✅ All turns selected correct tools")
+