Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
319
evals/test_greeting_no_tools.py
Normal file
319
evals/test_greeting_no_tools.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""
|
||||
Greeting No-Tools Evaluations (Live)
|
||||
|
||||
Live tests that verify greetings don't trigger tool calls with real LLM inference.
|
||||
Mocked equivalents live in tests/test_greeting_no_tools.py as unit tests.
|
||||
|
||||
Run: ./scripts/run_evals.sh test_greeting
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import MockConfig, ToolCallCapture, create_mock_tool_run
|
||||
|
||||
|
||||
def _assert_no_tools(capture, query, is_small, model_name):
|
||||
"""Assert no tools were called; xfail for small models."""
|
||||
if capture.has_any_tool():
|
||||
if is_small:
|
||||
pytest.xfail(
|
||||
f"Small model {model_name} called tools for '{query}'. "
|
||||
f"Known limitation. Called: {capture.tool_names()}"
|
||||
)
|
||||
else:
|
||||
pytest.fail(
|
||||
f"Large model '{query}' should NOT trigger tools. "
|
||||
f"Called: {capture.tool_names()}"
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Live Tests with Real LLM
|
||||
# =============================================================================
|
||||
|
||||
def _is_small_model(model_name: str) -> bool:
|
||||
"""Check if model is classified as small by the model size detector."""
|
||||
from jarvis.reply.prompts import detect_model_size, ModelSize
|
||||
return detect_model_size(model_name) == ModelSize.SMALL
|
||||
|
||||
|
||||
class TestGreetingNoToolsLive:
|
||||
"""
|
||||
Live tests with real LLM inference.
|
||||
|
||||
These verify that the prompt changes actually work with real models.
|
||||
|
||||
NOTE: Small models (1b-7b) may still incorrectly call tools for greetings
|
||||
despite explicit prompt constraints. This is a fundamental limitation of
|
||||
small model reasoning capacity. These tests document this behaviour.
|
||||
"""
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
@pytest.mark.parametrize("query,should_use_tools", [
|
||||
pytest.param("hello", False, id="Greeting: hello"),
|
||||
pytest.param("ni hao", False, id="Greeting: ni hao (Chinese)"),
|
||||
])
|
||||
def test_greeting_no_tools_live(
|
||||
self,
|
||||
query: str,
|
||||
should_use_tools: bool,
|
||||
mock_config,
|
||||
eval_db,
|
||||
eval_dialogue_memory
|
||||
):
|
||||
"""Live test: greetings should not trigger tool calls."""
|
||||
from jarvis.reply.engine import run_reply_engine
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
# Use the judge model (which may be small or large)
|
||||
mock_config.ollama_base_url = "http://localhost:11434"
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
|
||||
# Small models may fail this test due to limited reasoning capacity
|
||||
# This documents the limitation rather than masking it
|
||||
is_small = _is_small_model(JUDGE_MODEL)
|
||||
|
||||
capture = ToolCallCapture()
|
||||
|
||||
with patch('jarvis.reply.engine.run_tool_with_retries',
|
||||
side_effect=create_mock_tool_run(capture)):
|
||||
response = run_reply_engine(
|
||||
db=eval_db, cfg=mock_config, tts=None,
|
||||
text=query, dialogue_memory=eval_dialogue_memory
|
||||
)
|
||||
|
||||
print(f"\n Live Greeting Test ({JUDGE_MODEL}):")
|
||||
print(f" Query: '{query}'")
|
||||
print(f" Tools called: {capture.tool_names() or 'none'}")
|
||||
print(f" Response: {(response or '')[:100]}...")
|
||||
print(f" Model size: {'small' if is_small else 'large'}")
|
||||
|
||||
# For greetings, we expect NO tool calls
|
||||
if not should_use_tools:
|
||||
_assert_no_tools(capture, query, is_small, JUDGE_MODEL)
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
@pytest.mark.parametrize("query,should_use_tools", [
|
||||
pytest.param("always use Celsius when telling me temperatures", False, id="Instruction: use Celsius"),
|
||||
pytest.param("be more brief in your responses", False, id="Instruction: be more brief"),
|
||||
])
|
||||
def test_user_instructions_no_tools_live(
|
||||
self,
|
||||
query: str,
|
||||
should_use_tools: bool,
|
||||
mock_config,
|
||||
eval_db,
|
||||
eval_dialogue_memory
|
||||
):
|
||||
"""Live test: user instructions about behaviour should not trigger tool calls."""
|
||||
from jarvis.reply.engine import run_reply_engine
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
mock_config.ollama_base_url = "http://localhost:11434"
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
|
||||
is_small = _is_small_model(JUDGE_MODEL)
|
||||
|
||||
capture = ToolCallCapture()
|
||||
|
||||
with patch('jarvis.reply.engine.run_tool_with_retries',
|
||||
side_effect=create_mock_tool_run(capture)):
|
||||
response = run_reply_engine(
|
||||
db=eval_db, cfg=mock_config, tts=None,
|
||||
text=query, dialogue_memory=eval_dialogue_memory
|
||||
)
|
||||
|
||||
print(f"\n Live User Instruction Test ({JUDGE_MODEL}):")
|
||||
print(f" Query: '{query}'")
|
||||
print(f" Tools called: {capture.tool_names() or 'none'}")
|
||||
print(f" Response: {(response or '')[:100]}...")
|
||||
print(f" Model size: {'small' if is_small else 'large'}")
|
||||
|
||||
_assert_no_tools(capture, query, is_small, JUDGE_MODEL)
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
@pytest.mark.parametrize("query", [
|
||||
pytest.param("what do you know about the Possessor movie", id="Unknown entity: Possessor (film)"),
|
||||
pytest.param("tell me about the book Piranesi", id="Unknown entity: Piranesi (book)"),
|
||||
# Permission-framed phrasing. Regression: the small model previously
|
||||
# read "what can you tell me" as "tell me what you can do" and deflected
|
||||
# with "I can search the web if you'd like" instead of calling webSearch.
|
||||
pytest.param("what can you tell me about the movie Possessor", id="Unknown entity: permission-framed (Possessor)"),
|
||||
# "Have you heard of" is another common permission-framed variant.
|
||||
pytest.param("have you heard of the film Piranesi", id="Unknown entity: have-you-heard-of (Piranesi)"),
|
||||
])
|
||||
def test_unknown_named_entity_triggers_web_search_live(
|
||||
self,
|
||||
query: str,
|
||||
mock_config,
|
||||
eval_db,
|
||||
eval_dialogue_memory,
|
||||
):
|
||||
"""Live test: questions about specific named entities should trigger a web lookup.
|
||||
|
||||
The model should recognise it has no concrete facts about the entity and call
|
||||
webSearch rather than denying knowledge or asking for a link.
|
||||
"""
|
||||
from jarvis.reply.engine import run_reply_engine
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
mock_config.ollama_base_url = "http://localhost:11434"
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
is_small = _is_small_model(JUDGE_MODEL)
|
||||
|
||||
capture = ToolCallCapture()
|
||||
|
||||
with patch('jarvis.reply.engine.run_tool_with_retries',
|
||||
side_effect=create_mock_tool_run(capture, {
|
||||
"webSearch": "Search result: relevant details about the requested entity.",
|
||||
"fetchWebPage": "Page content: relevant details about the requested entity.",
|
||||
})):
|
||||
response = run_reply_engine(
|
||||
db=eval_db, cfg=mock_config, tts=None,
|
||||
text=query, dialogue_memory=eval_dialogue_memory,
|
||||
)
|
||||
|
||||
print(f"\n Live Unknown-Entity Test ({JUDGE_MODEL}):")
|
||||
print(f" Query: '{query}'")
|
||||
print(f" Tools called: {capture.tool_names() or 'none'}")
|
||||
print(f" Response: {(response or '')[:120]}...")
|
||||
print(f" Model size: {'small' if is_small else 'large'}")
|
||||
|
||||
if not capture.has_tool("webSearch"):
|
||||
msg = (
|
||||
f"Query about unknown named entity should trigger webSearch. "
|
||||
f"Called: {capture.tool_names() or 'none'}. Response: {(response or '')[:200]}"
|
||||
)
|
||||
if is_small:
|
||||
pytest.xfail(f"Small model {JUDGE_MODEL} did not call webSearch. {msg}")
|
||||
else:
|
||||
pytest.fail(msg)
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_unknown_entity_with_poisoned_diary_still_triggers_web_search_live(
|
||||
self,
|
||||
mock_config,
|
||||
eval_db,
|
||||
eval_dialogue_memory,
|
||||
):
|
||||
"""Reproduces the Possessor field regression.
|
||||
|
||||
A prior diary entry narrates the assistant's past deflection ("the assistant
|
||||
offered to search the web"). When the same entity is asked about again, the
|
||||
diary entry is retrieved as enrichment and — without the reference-only
|
||||
framing — the small model imitates the narrated deflection instead of
|
||||
calling webSearch.
|
||||
|
||||
The defences this test guards:
|
||||
1. Summariser should not produce such entries in the first place (the
|
||||
seeded entry simulates a legacy poisoned summary from before the fix).
|
||||
2. The reply engine must frame the enrichment as reference-only so the
|
||||
model doesn't treat "the assistant offered to search" as a template.
|
||||
"""
|
||||
from jarvis.reply.engine import run_reply_engine
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
mock_config.ollama_base_url = "http://localhost:11434"
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
is_small = _is_small_model(JUDGE_MODEL)
|
||||
|
||||
# Seed a poisoned diary entry — matches the shape of the real 2026-04-19
|
||||
# entry from the field failure. Uses the exact deflection phrasing we're
|
||||
# trying to stop the model from imitating.
|
||||
poisoned_summary = (
|
||||
'[2026-04-19] The conversation began with the user asking for information about '
|
||||
'the movie "Possessor." The assistant initially could not provide details. '
|
||||
'Subsequently, the user asked for details about "Possessor," prompting the '
|
||||
'assistant to state it lacked specific context and offer to search the web.'
|
||||
)
|
||||
|
||||
# Also seed short-term dialogue memory with a prior deflection turn —
|
||||
# mirrors the real field session where the model had already said it
|
||||
# lacked info earlier in the same conversation, which then primes it
|
||||
# to repeat the same pattern on the follow-up.
|
||||
eval_dialogue_memory.add_message("user", "what do you know about the Possessor movie")
|
||||
eval_dialogue_memory.add_message(
|
||||
"assistant",
|
||||
"I don't have specific information about the film Possessor. "
|
||||
"I could search the web for it if you'd like.",
|
||||
)
|
||||
|
||||
query = "tell me more about Possessor"
|
||||
capture = ToolCallCapture()
|
||||
|
||||
# Patch the keyword search to guarantee the poisoned entry reaches the
|
||||
# system prompt. Going through the FTS/vector hybrid would make the test
|
||||
# flaky on seeded data that lacks vector embeddings.
|
||||
with patch(
|
||||
'jarvis.memory.conversation.search_conversation_memory_by_keywords',
|
||||
return_value=[poisoned_summary],
|
||||
), patch(
|
||||
'jarvis.reply.engine.run_tool_with_retries',
|
||||
side_effect=create_mock_tool_run(capture, {
|
||||
"webSearch": "Search result: Possessor is a 2020 film directed by Brandon Cronenberg.",
|
||||
"fetchWebPage": "Page content: relevant details about the requested entity.",
|
||||
}),
|
||||
):
|
||||
response = run_reply_engine(
|
||||
db=eval_db, cfg=mock_config, tts=None,
|
||||
text=query, dialogue_memory=eval_dialogue_memory,
|
||||
)
|
||||
|
||||
print(f"\n Live Poisoned-Diary Test ({JUDGE_MODEL}):")
|
||||
print(f" Query: '{query}'")
|
||||
print(f" Tools called: {capture.tool_names() or 'none'}")
|
||||
print(f" Response: {(response or '')[:200]}...")
|
||||
print(f" Model size: {'small' if is_small else 'large'}")
|
||||
|
||||
if not capture.has_tool("webSearch"):
|
||||
msg = (
|
||||
f"With a poisoned diary entry narrating past deflection, the model still "
|
||||
f"must call webSearch. Called: {capture.tool_names() or 'none'}. "
|
||||
f"Response: {(response or '')[:300]}"
|
||||
)
|
||||
if is_small:
|
||||
pytest.xfail(f"Small model {JUDGE_MODEL} regressed under poisoned diary. {msg}")
|
||||
else:
|
||||
pytest.fail(msg)
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_weather_still_triggers_tools_live(
|
||||
self,
|
||||
mock_config,
|
||||
eval_db,
|
||||
eval_dialogue_memory
|
||||
):
|
||||
"""Live test: weather query should still trigger tools."""
|
||||
from jarvis.reply.engine import run_reply_engine
|
||||
from helpers import JUDGE_MODEL
|
||||
|
||||
query = "what's the weather today"
|
||||
mock_config.ollama_base_url = "http://localhost:11434"
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
|
||||
capture = ToolCallCapture()
|
||||
|
||||
with patch('jarvis.reply.engine.run_tool_with_retries',
|
||||
side_effect=create_mock_tool_run(capture, {
|
||||
"getWeather": "Weather: 22C, partly cloudy",
|
||||
})):
|
||||
response = run_reply_engine(
|
||||
db=eval_db, cfg=mock_config, tts=None,
|
||||
text=query, dialogue_memory=eval_dialogue_memory
|
||||
)
|
||||
|
||||
print(f"\n Live Weather Test ({JUDGE_MODEL}):")
|
||||
print(f" Query: '{query}'")
|
||||
print(f" Tools called: {capture.tool_names() or 'none'}")
|
||||
print(f" Response: {(response or '')[:100]}...")
|
||||
|
||||
# Weather should trigger tools (getWeather or webSearch)
|
||||
assert capture.has_any_tool(), \
|
||||
f"Weather query should trigger tools. Response: {response}"
|
||||
Reference in New Issue
Block a user