Files
javis_bot/evals/test_greeting_no_tools.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

320 lines
13 KiB
Python

"""
Greeting No-Tools Evaluations (Live)
Live tests that verify greetings don't trigger tool calls with real LLM inference.
Mocked equivalents live in tests/test_greeting_no_tools.py as unit tests.
Run: ./scripts/run_evals.sh test_greeting
"""
import pytest
from unittest.mock import patch
from conftest import requires_judge_llm
from helpers import MockConfig, ToolCallCapture, create_mock_tool_run
def _assert_no_tools(capture, query, is_small, model_name):
    """Assert that no tools were called while answering ``query``.

    Returns silently when ``capture`` recorded no tool calls. Otherwise the
    outcome depends on the model size: a small model is reported as xfail
    (calling tools here is a known limitation), any other model fails the test.

    Args:
        capture: Tool-call recorder exposing ``has_any_tool()`` and
            ``tool_names()``.
        query: User text that was sent to the reply engine.
        is_small: Whether ``model_name`` is classified as a small model.
        model_name: Name of the model under test; included in the outcome
            message so the result can be attributed to a specific model.
    """
    if not capture.has_any_tool():
        return

    called = capture.tool_names()
    if is_small:
        pytest.xfail(
            f"Small model {model_name} called tools for '{query}'. "
            f"Known limitation. Called: {called}"
        )
    else:
        # Report the model as well as the query: the previous message placed
        # the query right after "Large model", which read as the model name.
        pytest.fail(
            f"Large model {model_name} called tools for '{query}', "
            f"which should NOT trigger tools. Called: {called}"
        )
# =============================================================================
# Live Tests with Real LLM
# =============================================================================
def _is_small_model(model_name: str) -> bool:
    """Report whether the model size detector classifies ``model_name`` as small."""
    # Function-scope import: the jarvis prompt module is only loaded when
    # this helper actually runs.
    from jarvis.reply.prompts import detect_model_size, ModelSize

    detected_size = detect_model_size(model_name)
    return detected_size == ModelSize.SMALL
class TestGreetingNoToolsLive:
    """
    Live evaluations that run the reply engine against real LLM inference.

    They check that the prompt changes actually hold up with real models.

    NOTE: Small models (1b-7b) may still incorrectly call tools for greetings
    despite explicit prompt constraints. This is a fundamental limitation of
    small model reasoning capacity. These tests document this behaviour.
    """

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.parametrize("query,should_use_tools", [
        pytest.param("hello", False, id="Greeting: hello"),
        pytest.param("ni hao", False, id="Greeting: ni hao (Chinese)"),
    ])
    def test_greeting_no_tools_live(
        self,
        query: str,
        should_use_tools: bool,
        mock_config,
        eval_db,
        eval_dialogue_memory
    ):
        """Live test: a plain greeting must not lead to any tool call."""
        from jarvis.reply.engine import run_reply_engine
        from helpers import JUDGE_MODEL

        # Route the engine to the judge model, which may be small or large.
        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        # Small models may fail here because of limited reasoning capacity;
        # the size flag lets the outcome document that instead of hiding it.
        small_model = _is_small_model(JUDGE_MODEL)

        tool_log = ToolCallCapture()
        fake_tool_run = create_mock_tool_run(tool_log)
        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_tool_run):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        size_label = 'small' if small_model else 'large'
        print(f"\n Live Greeting Test ({JUDGE_MODEL}):")
        print(f" Query: '{query}'")
        print(f" Tools called: {tool_log.tool_names() or 'none'}")
        print(f" Response: {(reply or '')[:100]}...")
        print(f" Model size: {size_label}")

        # Only cases flagged as tool-free are checked for stray tool calls.
        if should_use_tools:
            return
        _assert_no_tools(tool_log, query, small_model, JUDGE_MODEL)

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.parametrize("query,should_use_tools", [
        pytest.param("always use Celsius when telling me temperatures", False, id="Instruction: use Celsius"),
        pytest.param("be more brief in your responses", False, id="Instruction: be more brief"),
    ])
    def test_user_instructions_no_tools_live(
        self,
        query: str,
        should_use_tools: bool,
        mock_config,
        eval_db,
        eval_dialogue_memory
    ):
        """Live test: behavioural instructions from the user must not trigger tool calls."""
        from jarvis.reply.engine import run_reply_engine
        from helpers import JUDGE_MODEL

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        small_model = _is_small_model(JUDGE_MODEL)

        tool_log = ToolCallCapture()
        fake_tool_run = create_mock_tool_run(tool_log)
        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_tool_run):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        size_label = 'small' if small_model else 'large'
        print(f"\n Live User Instruction Test ({JUDGE_MODEL}):")
        print(f" Query: '{query}'")
        print(f" Tools called: {tool_log.tool_names() or 'none'}")
        print(f" Response: {(reply or '')[:100]}...")
        print(f" Model size: {size_label}")

        # Checked unconditionally; should_use_tools is not consulted here.
        _assert_no_tools(tool_log, query, small_model, JUDGE_MODEL)

    @pytest.mark.eval
    @requires_judge_llm
    @pytest.mark.parametrize("query", [
        pytest.param("what do you know about the Possessor movie", id="Unknown entity: Possessor (film)"),
        pytest.param("tell me about the book Piranesi", id="Unknown entity: Piranesi (book)"),
        # Permission-framed wording. Regression: the small model used to read
        # "what can you tell me" as "tell me what you can do" and deflected
        # with "I can search the web if you'd like" rather than calling webSearch.
        pytest.param("what can you tell me about the movie Possessor", id="Unknown entity: permission-framed (Possessor)"),
        # "Have you heard of" is a second common permission-framed variant.
        pytest.param("have you heard of the film Piranesi", id="Unknown entity: have-you-heard-of (Piranesi)"),
    ])
    def test_unknown_named_entity_triggers_web_search_live(
        self,
        query: str,
        mock_config,
        eval_db,
        eval_dialogue_memory,
    ):
        """Live test: a question about a specific named entity should cause a web lookup.

        The model is expected to notice it holds no concrete facts about the
        entity and call webSearch, instead of denying knowledge or asking the
        user for a link.
        """
        from jarvis.reply.engine import run_reply_engine
        from helpers import JUDGE_MODEL

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        small_model = _is_small_model(JUDGE_MODEL)

        tool_log = ToolCallCapture()
        # Canned tool outputs handed to the mock tool runner.
        canned_results = {
            "webSearch": "Search result: relevant details about the requested entity.",
            "fetchWebPage": "Page content: relevant details about the requested entity.",
        }
        fake_tool_run = create_mock_tool_run(tool_log, canned_results)
        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_tool_run):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        size_label = 'small' if small_model else 'large'
        print(f"\n Live Unknown-Entity Test ({JUDGE_MODEL}):")
        print(f" Query: '{query}'")
        print(f" Tools called: {tool_log.tool_names() or 'none'}")
        print(f" Response: {(reply or '')[:120]}...")
        print(f" Model size: {size_label}")

        if tool_log.has_tool("webSearch"):
            return

        msg = (
            f"Query about unknown named entity should trigger webSearch. "
            f"Called: {tool_log.tool_names() or 'none'}. Response: {(reply or '')[:200]}"
        )
        if small_model:
            pytest.xfail(f"Small model {JUDGE_MODEL} did not call webSearch. {msg}")
        else:
            pytest.fail(msg)

    @pytest.mark.eval
    @requires_judge_llm
    def test_unknown_entity_with_poisoned_diary_still_triggers_web_search_live(
        self,
        mock_config,
        eval_db,
        eval_dialogue_memory,
    ):
        """Reproduces the Possessor field regression.

        An earlier diary entry narrates the assistant's past deflection ("the
        assistant offered to search the web"). When the same entity comes up
        again, that entry is retrieved as enrichment and — without the
        reference-only framing — the small model copies the narrated
        deflection instead of calling webSearch.

        The defences guarded by this test:
          1. The summariser should not produce such entries at all (the seeded
             entry stands in for a legacy poisoned summary from before the fix).
          2. The reply engine must frame the enrichment as reference-only so
             the model does not treat "the assistant offered to search" as a
             template.
        """
        from jarvis.reply.engine import run_reply_engine
        from helpers import JUDGE_MODEL

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        small_model = _is_small_model(JUDGE_MODEL)

        # Poisoned diary entry shaped like the real 2026-04-19 entry from the
        # field failure, using the exact deflection phrasing the model must
        # not imitate.
        poisoned_summary = (
            '[2026-04-19] The conversation began with the user asking for information about '
            'the movie "Possessor." The assistant initially could not provide details. '
            'Subsequently, the user asked for details about "Possessor," prompting the '
            'assistant to state it lacked specific context and offer to search the web.'
        )

        # Short-term dialogue memory also gets a prior deflection turn,
        # mirroring the real field session in which the model had already
        # said it lacked info earlier in the conversation, priming it to
        # repeat the pattern on the follow-up.
        eval_dialogue_memory.add_message("user", "what do you know about the Possessor movie")
        eval_dialogue_memory.add_message(
            "assistant",
            "I don't have specific information about the film Possessor. "
            "I could search the web for it if you'd like.",
        )

        query = "tell me more about Possessor"
        tool_log = ToolCallCapture()
        canned_results = {
            "webSearch": "Search result: Possessor is a 2020 film directed by Brandon Cronenberg.",
            "fetchWebPage": "Page content: relevant details about the requested entity.",
        }

        # The keyword search is patched so the poisoned entry is guaranteed to
        # reach the system prompt. Relying on the FTS/vector hybrid would make
        # the test flaky on seeded data that has no vector embeddings.
        with patch(
            'jarvis.memory.conversation.search_conversation_memory_by_keywords',
            return_value=[poisoned_summary],
        ), patch(
            'jarvis.reply.engine.run_tool_with_retries',
            side_effect=create_mock_tool_run(tool_log, canned_results),
        ):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        size_label = 'small' if small_model else 'large'
        print(f"\n Live Poisoned-Diary Test ({JUDGE_MODEL}):")
        print(f" Query: '{query}'")
        print(f" Tools called: {tool_log.tool_names() or 'none'}")
        print(f" Response: {(reply or '')[:200]}...")
        print(f" Model size: {size_label}")

        if tool_log.has_tool("webSearch"):
            return

        msg = (
            f"With a poisoned diary entry narrating past deflection, the model still "
            f"must call webSearch. Called: {tool_log.tool_names() or 'none'}. "
            f"Response: {(reply or '')[:300]}"
        )
        if small_model:
            pytest.xfail(f"Small model {JUDGE_MODEL} regressed under poisoned diary. {msg}")
        else:
            pytest.fail(msg)

    @pytest.mark.eval
    @requires_judge_llm
    def test_weather_still_triggers_tools_live(
        self,
        mock_config,
        eval_db,
        eval_dialogue_memory
    ):
        """Live test: a weather question should still result in a tool call."""
        from jarvis.reply.engine import run_reply_engine
        from helpers import JUDGE_MODEL

        query = "what's the weather today"
        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL

        tool_log = ToolCallCapture()
        canned_results = {
            "getWeather": "Weather: 22C, partly cloudy",
        }
        fake_tool_run = create_mock_tool_run(tool_log, canned_results)
        with patch('jarvis.reply.engine.run_tool_with_retries', side_effect=fake_tool_run):
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        print(f"\n Live Weather Test ({JUDGE_MODEL}):")
        print(f" Query: '{query}'")
        print(f" Tools called: {tool_log.tool_names() or 'none'}")
        print(f" Response: {(reply or '')[:100]}...")

        # Any tool counts here (getWeather or webSearch).
        assert tool_log.has_any_tool(), (
            f"Weather query should trigger tools. Response: {reply}"
        )