Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

View File

@@ -0,0 +1,506 @@
"""
Multi-Turn Context Evaluations
Tests the agent's ability to handle multi-turn conversations correctly:
1. Topic Switching - Selecting correct tool when conversation topic changes
2. Context Anchoring - Not getting "stuck" on previous turn's tool
3. Follow-up Handling - Using context from previous turns when relevant
These evals are critical for catching regressions where the model might:
- Call the wrong tool after a topic change (e.g., getWeather for store hours)
- Ignore context from previous turns
- Fail to follow up on established conversation context
Run: ./scripts/run_evals.sh
"""
import pytest
from unittest.mock import patch
from conftest import requires_judge_llm
from helpers import (
MockConfig, ToolCallCapture,
create_mock_tool_run,
JUDGE_MODEL,
)
# =============================================================================
# Test Data - Consistent tool responses for reproducibility
# =============================================================================
# Canned getWeather output. The tests below hand this to the mocked tool
# runner as the reply for every getWeather call, so the model always sees the
# same conditions (overcast, 7.8°C).
MOCK_WEATHER_RESPONSE = """Current weather in Kensington, Royal Kensington and Chelsea, United Kingdom:
Conditions: Overcast
Temperature: 7.8°C
Feels like: 5°C
Humidity: 75%
Wind: 12 km/h from the west
"""
# Canned webSearch output for the CEX store-hours scenarios.
MOCK_STORE_HOURS_SEARCH = """Web search results for 'CEX store hours Kensington':
**Content from top result:**
CEX Kensington High Street
Opening Hours:
Monday - Saturday: 10:00 AM - 6:00 PM
Sunday: 11:00 AM - 5:00 PM
**Other search results:**
1. **CEX Kensington - Store Info** - https://uk.webuy.com/store/kensington
2. **CEX Store Locator** - https://uk.webuy.com/stores
"""
# Canned webSearch output for the tech-news scenarios.
MOCK_NEWS_SEARCH = """Web search results for 'tech news today':
**Content from top result:**
Today's Tech Headlines:
- Apple announces new M4 chip
- OpenAI releases GPT-5
- SpaceX Starship completes orbital test
**Other search results:**
1. **TechCrunch** - https://techcrunch.com
2. **The Verge** - https://theverge.com
"""
# =============================================================================
# Topic Switching Evaluations (Live LLM)
# =============================================================================
class TestTopicSwitching:
    """
    Checks that tool choice tracks the conversation when the subject changes
    from one turn to the next.

    The reply engine is driven by live LLM inference so the model's real
    behaviour is measured; tool execution is patched with canned responses
    so every run feeds the model the same tool output.
    """

    @pytest.mark.eval
    @requires_judge_llm
    def test_weather_then_store_hours(self, mock_config, eval_db, eval_dialogue_memory):
        """
        A store-hours question that follows a weather question must go to webSearch.

        Scenario:
        - Turn 1: "How's the weather?" -> getWeather (correct)
        - Turn 2: "Can you check when CEX closes?" -> webSearch (NOT getWeather!)

        Reproduces the bug where llama3.2:3b anchored on the previous turn's
        tool and called getWeather for a store-hours query.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        capture = ToolCallCapture()
        canned_replies = {
            "getWeather": MOCK_WEATHER_RESPONSE,
            "webSearch": MOCK_STORE_HOURS_SEARCH,
        }
        fake_tool_runner = create_mock_tool_run(capture, canned_replies)

        # Build both patches up front, then activate them together.
        tool_patch = patch(
            'jarvis.reply.engine.run_tool_with_retries',
            side_effect=fake_tool_runner,
        )
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: Kensington, Royal Kensington and Chelsea, United Kingdom", None),
        )

        with tool_patch, location_patch:
            # First turn: the weather question.
            capture.clear()
            weather_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="How's the weather today?",
                dialogue_memory=eval_dialogue_memory,
            )
            weather_turn_tools = capture.tool_sequence()

            # Second turn: unrelated store-hours question (topic change).
            capture.clear()
            store_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Yeah, I could do but can you check how long CEX is open for?",
                dialogue_memory=eval_dialogue_memory,
            )
            store_turn_tools = capture.tool_sequence()

        print(f"\n📊 Topic Switching - Weather → Store Hours:")
        print(f" Turn 1 query: 'How's the weather today?'")
        print(f" Turn 1 tools: {weather_turn_tools}")
        print(f" Turn 1 response: {weather_reply[:100] if weather_reply else 'None'}...")
        print(f" Turn 2 query: 'can you check how long CEX is open for?'")
        print(f" Turn 2 tools: {store_turn_tools}")
        print(f" Turn 2 response: {store_reply[:100] if store_reply else 'None'}...")

        # The weather turn has to reach the weather tool.
        assert "getWeather" in weather_turn_tools, (
            f"Turn 1 should use getWeather for weather query. Used: {weather_turn_tools}"
        )

        # Critical check: a getWeather call with no webSearch on the second
        # turn means the model stayed anchored on the previous tool. Report
        # that case with a dedicated message before the generic assertion.
        if "webSearch" not in store_turn_tools and "getWeather" in store_turn_tools:
            pytest.fail(
                f"❌ CONTEXT ANCHORING BUG: Model used getWeather for store hours!\n"
                f" Turn 2 tools: {store_turn_tools}\n"
                f" Expected: webSearch\n"
                f" The model got 'stuck' on the previous turn's tool.\n"
                f" Response: {store_reply[:200] if store_reply else 'None'}"
            )

        assert "webSearch" in store_turn_tools, (
            f"Turn 2 should use webSearch for store hours. Used: {store_turn_tools}"
        )
        print(f" ✅ Correctly switched from getWeather to webSearch")

    @pytest.mark.eval
    @requires_judge_llm
    def test_search_then_weather(self, mock_config, eval_db, eval_dialogue_memory):
        """
        A weather question that follows a web search must go to getWeather.

        Mirror image of the weather -> store-hours case: the model must not
        stay stuck on webSearch once the user asks about the weather.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        capture = ToolCallCapture()
        canned_replies = {
            "getWeather": MOCK_WEATHER_RESPONSE,
            "webSearch": MOCK_NEWS_SEARCH,
        }
        fake_tool_runner = create_mock_tool_run(capture, canned_replies)

        tool_patch = patch(
            'jarvis.reply.engine.run_tool_with_retries',
            side_effect=fake_tool_runner,
        )
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: Kensington, UK", None),
        )

        with tool_patch, location_patch:
            # First turn: news search (the reply itself is not inspected).
            capture.clear()
            run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="What's the latest tech news?",
                dialogue_memory=eval_dialogue_memory,
            )
            news_turn_tools = capture.tool_sequence()

            # Second turn: weather question.
            capture.clear()
            weather_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="How's the weather outside?",
                dialogue_memory=eval_dialogue_memory,
            )
            weather_turn_tools = capture.tool_sequence()

        print(f"\n📊 Topic Switching - News → Weather:")
        print(f" Turn 1 tools: {news_turn_tools}")
        print(f" Turn 2 tools: {weather_turn_tools}")

        assert "webSearch" in news_turn_tools, (
            f"Turn 1 should use webSearch for news. Used: {news_turn_tools}"
        )

        # Reverse anchoring: webSearch reused and getWeather never called.
        if "getWeather" not in weather_turn_tools and "webSearch" in weather_turn_tools:
            pytest.fail(
                f"❌ CONTEXT ANCHORING BUG: Model used webSearch for weather query!\n"
                f" Turn 2 tools: {weather_turn_tools}\n"
                f" Response: {weather_reply[:200] if weather_reply else 'None'}"
            )

        assert "getWeather" in weather_turn_tools, (
            f"Turn 2 should use getWeather for weather query. Used: {weather_turn_tools}"
        )
        print(f" ✅ Correctly switched from webSearch to getWeather")
# =============================================================================
# Follow-Up Context Evaluations (Live LLM)
# =============================================================================
class TestFollowUpContext:
    """
    Checks that a follow-up question is answered with the context
    established by the previous turn.
    """

    @pytest.mark.eval
    @requires_judge_llm
    def test_follow_up_references_previous_context(self, mock_config, eval_db, eval_dialogue_memory):
        """
        A follow-up should draw on information gathered in the earlier turn.

        Scenario:
        - Turn 1: "How's the weather?" -> (gets weather data showing overcast, 7.8°C)
        - Turn 2: "Should I bring an umbrella?" -> Response should reference weather

        Only turn 1 is asserted. Whether turn 2 mentions the weather is
        logged as a warning rather than failing the test.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        capture = ToolCallCapture()
        fake_tool_runner = create_mock_tool_run(capture, {"getWeather": MOCK_WEATHER_RESPONSE})

        tool_patch = patch(
            'jarvis.reply.engine.run_tool_with_retries',
            side_effect=fake_tool_runner,
        )
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: Kensington, UK", None),
        )

        with tool_patch, location_patch:
            # First turn: establish the weather context.
            capture.clear()
            weather_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="How's the weather today?",
                dialogue_memory=eval_dialogue_memory,
            )
            weather_turn_tools = capture.tool_sequence()

            # Second turn: umbrella follow-up that relies on that context.
            capture.clear()
            umbrella_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Should I bring an umbrella?",
                dialogue_memory=eval_dialogue_memory,
            )
            umbrella_turn_tools = capture.tool_sequence()

        print(f"\n📊 Follow-Up Context - Weather → Umbrella:")
        print(f" Turn 1 tools: {weather_turn_tools}")
        print(f" Turn 1 response: {weather_reply[:80] if weather_reply else 'None'}...")
        print(f" Turn 2 tools: {umbrella_turn_tools}")
        print(f" Turn 2 response: {umbrella_reply[:120] if umbrella_reply else 'None'}...")

        # The first turn has to fetch the weather.
        assert "getWeather" in weather_turn_tools, "Turn 1 should fetch weather"

        # With no second reply there is nothing to inspect.
        # (Calling getWeather again on turn 2 is acceptable either way.)
        if not umbrella_reply:
            return

        lowered_reply = umbrella_reply.lower()
        weather_terms = ("overcast", "cloud", "rain", "weather", "chilly", "cold", "7", "8")
        references_weather = False
        for term in weather_terms:
            if term in lowered_reply:
                references_weather = True
                break
        print(f" References weather context: {references_weather}")

        # Soft check only: a reply that ignores the weather is logged, not failed.
        if not references_weather:
            print(f" ⚠️ Response doesn't seem to reference weather context")
# =============================================================================
# Self-Contained Tool Argument Evaluations (Live LLM)
# =============================================================================
# Canned webSearch output for the entity-establishing query about Harry
# Styles; the mock runner below returns it for any webSearch call that does
# not mention songs, music or albums.
MOCK_HARRY_STYLES_SEARCH = """Web search results for 'Harry Styles':
**Content from top result:**
Harry Styles is an English singer and songwriter, born 1 February 1994.
He rose to fame as a member of the boy band One Direction and has since
released several solo albums including Fine Line (2019) and Harry's House (2022).
**Other search results:**
1. **Harry Styles - Wikipedia** - https://en.wikipedia.org/wiki/Harry_Styles
"""
# Canned webSearch output for the follow-up query; returned when the search
# arguments mention "song", "music" or "album".
MOCK_HARRY_STYLES_SONGS_SEARCH = """Web search results for 'Harry Styles most famous songs':
**Content from top result:**
Harry Styles' most famous songs include:
- "Watermelon Sugar" (2019)
- "As It Was" (2022)
- "Sign of the Times" (2017)
- "Adore You" (2019)
**Other search results:**
1. **Harry Styles Discography** - https://en.wikipedia.org/wiki/Harry_Styles_discography
"""
class TestSelfContainedToolArguments:
    """
    Checks that a follow-up containing an unresolved pronoun leads to tool
    calls whose arguments name the referent from conversation history.

    A tool never sees earlier turns. If the model forwards "what are his
    most famous songs?" to webSearch unchanged, the search loses the entity
    and returns irrelevant results, so the model has to rewrite the argument
    into something like "Harry Styles most famous songs".
    """

    @pytest.mark.eval
    @requires_judge_llm
    def test_follow_up_resolves_pronoun_in_search_query(
        self, mock_config, eval_db, eval_dialogue_memory
    ):
        """
        Scenario:
        - Turn 1: "Who is Harry Styles?" -> webSearch("Harry Styles ...")
        - Turn 2: "What are his most famous songs?" -> every webSearch call
          must carry "harry" or "styles" in a string argument (pronoun
          resolved from context).
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        capture = ToolCallCapture()

        def mock_tool_run(db, cfg, tool_name, tool_args, **kwargs):
            """Record the call, then return the canned result for it."""
            from jarvis.tools.types import ToolExecutionResult

            capture.record(tool_name, tool_args or {})

            # Anything other than a web search gets a generic success.
            if tool_name != "webSearch":
                return ToolExecutionResult(success=True, reply_text="OK")

            # Pick the songs fixture when the arguments look music-related.
            args_str = str(tool_args).lower() if tool_args else ""
            wants_songs = any(word in args_str for word in ("song", "music", "album"))
            reply = MOCK_HARRY_STYLES_SONGS_SEARCH if wants_songs else MOCK_HARRY_STYLES_SEARCH
            return ToolExecutionResult(success=True, reply_text=reply)

        tool_patch = patch(
            'jarvis.reply.engine.run_tool_with_retries',
            side_effect=mock_tool_run,
        )
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: Kensington, UK", None),
        )

        with tool_patch, location_patch:
            # First turn: introduce the entity.
            capture.clear()
            run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="Who is Harry Styles?",
                dialogue_memory=eval_dialogue_memory,
            )
            first_turn_calls = list(capture.calls)

            # Second turn: follow-up that only uses a pronoun.
            capture.clear()
            follow_up_reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text="What are his most famous songs?",
                dialogue_memory=eval_dialogue_memory,
            )
            second_turn_calls = list(capture.calls)

        print(f"\n📊 Self-contained tool arguments — Harry Styles follow-up:")
        print(f" Turn 1 calls: {first_turn_calls}")
        print(f" Turn 2 calls: {second_turn_calls}")
        print(f" Turn 2 response: {(follow_up_reply or '')[:120]}...")

        # The follow-up has to trigger at least one web search.
        search_calls = [entry for entry in second_turn_calls if entry["name"] == "webSearch"]
        assert search_calls, (
            f"Turn 2 should call webSearch to answer the follow-up. "
            f"Got: {[c['name'] for c in second_turn_calls]}"
        )

        # Each search call must name the entity in one of its string arguments.
        for call in search_calls:
            args = call["args"] or {}
            string_values = [str(value) for value in args.values() if isinstance(value, str)]
            arg_values = " ".join(string_values).lower()
            names_entity = any(token in arg_values for token in ("harry", "styles"))
            assert names_entity, (
                f"❌ PRONOUN-RESOLUTION BUG: webSearch argument did not include "
                f"the entity from the previous turn.\n"
                f" Args: {args}\n"
                f" Expected the string to contain 'Harry' or 'Styles' — the "
                f"tool has no access to conversation history, so 'his' must be "
                f"resolved by the model before the tool call."
            )

        print(f" ✅ webSearch argument resolved the pronoun correctly")
# =============================================================================
# Extended Multi-Turn Evaluations (Live LLM)
# =============================================================================
class TestMultiTurnExtended:
    """
    Extended multi-turn scenarios testing longer conversations
    and more complex topic changes.
    """

    @pytest.mark.eval
    @requires_judge_llm
    def test_three_turn_topic_changes(self, mock_config, eval_db, eval_dialogue_memory):
        """
        Three-turn conversation with multiple topic changes.

        Turn 1: Weather query
        Turn 2: Store hours query (topic change from weather)
        Turn 3: News query (topic change from store)

        Each turn should select the appropriate tool. All turns are run
        first and then evaluated together, so a single failure message
        lists every turn that picked the wrong tool.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        capture = ToolCallCapture()
        all_turns = []

        def mock_tool_run(db, cfg, tool_name, tool_args, **kwargs):
            """Record the call and return a canned result chosen by tool and query."""
            from jarvis.tools.types import ToolExecutionResult

            capture.record(tool_name, tool_args or {})
            if tool_name == "getWeather":
                return ToolExecutionResult(success=True, reply_text=MOCK_WEATHER_RESPONSE)
            if tool_name == "webSearch":
                # Return appropriate content based on the query text.
                args_str = str(tool_args).lower() if tool_args else ""
                if "cex" in args_str or "store" in args_str or "hour" in args_str:
                    return ToolExecutionResult(success=True, reply_text=MOCK_STORE_HOURS_SEARCH)
                return ToolExecutionResult(success=True, reply_text=MOCK_NEWS_SEARCH)
            return ToolExecutionResult(success=True, reply_text="OK")

        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=mock_tool_run), \
                patch('jarvis.reply.engine.get_location_context_with_timezone',
                      return_value=("Location: Kensington, UK", None)):
            queries = [
                ("How's the weather today?", "getWeather"),
                ("What time does CEX close?", "webSearch"),
                ("What's happening in tech news?", "webSearch"),
            ]
            for query, expected_tool in queries:
                capture.clear()
                response = run_reply_engine(
                    db=eval_db, cfg=mock_config, tts=None,
                    text=query,
                    dialogue_memory=eval_dialogue_memory
                )
                all_turns.append({
                    "query": query,
                    "expected": expected_tool,
                    # Snapshot the sequence; presumably guards against the next
                    # capture.clear() mutating a shared list — TODO confirm.
                    "tools": capture.tool_sequence().copy(),
                    "response": response,
                })

        print("\n📊 Three-Turn Topic Changes:")
        failures = []
        for i, turn in enumerate(all_turns, 1):
            tools = turn["tools"]
            expected = turn["expected"]
            has_expected = expected in tools
            # Per-turn pass/fail marker. Both branches used to be the empty
            # string, so the marker was never printed.
            status = "✅" if has_expected else "❌"
            print(f" Turn {i}: '{turn['query'][:35]}...'")
            print(f" Expected: {expected}, Got: {tools} {status}")
            if has_expected:
                continue

            # Distinguish context anchoring (the previous turn's expected tool
            # was reused) from any other wrong selection. all_turns is 0-based
            # while i is 1-based, hence the i - 2 index for the previous turn.
            previous_expected = all_turns[i - 2]["expected"] if i > 1 else None
            if previous_expected is not None and previous_expected in tools:
                failures.append(
                    f"Turn {i}: Context anchoring bug - used {tools} (previous turn's tool) "
                    f"instead of {expected}"
                )
            else:
                failures.append(f"Turn {i}: Expected {expected}, got {tools}")

        if failures:
            pytest.fail(
                "❌ Multi-turn tool selection failures:\n" +
                "\n".join(f" - {f}" for f in failures)
            )
        print(" ✅ All turns selected correct tools")