Files
javis_bot/evals/helpers.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

653 lines
24 KiB
Python

"""
Helper functions and data classes for eval tests.
"""
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, List, Callable, Tuple
import os
# LLM-as-judge / model-under-test configuration.
#
# This single knob does double duty: it's both the model the eval uses as
# the chat LLM being tested AND the judge used to assess open-ended
# responses. Field failures on the production default surface here first,
# so the default MUST match what users actually run — which is the smallest
# supported model in the README ("gemma4:e2b"), not the largest we
# internally test against. Opt into larger models with EVAL_JUDGE_MODEL=…
# when you want a sanity check of the upper tier.
#
# Historical note: the default was gpt-oss:20b until 2026-04-20, at which
# point two field regressions on gemma4:e2b (tool selected but not invoked;
# native "tool_code" fallback syntax) slipped past CI because the evals
# were only testing the 20B tier. Defaulting to the small tier is the
# cheapest way to stop that happening again.
JUDGE_MODEL = os.environ.get("EVAL_JUDGE_MODEL", "gemma4:e2b")
JUDGE_BASE_URL = os.environ.get("EVAL_JUDGE_BASE_URL", "http://localhost:11434")
# =============================================================================
# Tool Call Capture
# =============================================================================
# =============================================================================
# Fallback-reply detection
# =============================================================================
#
# When the malformed-output guard fires in the reply engine (engine.py), the
# user gets one of these canned strings. From the user's perspective that is
# a FAILURE — they asked a question and got a shrug — but historically several
# evals treated it as neutral because "no malformed text reached the user" is
# technically true. Treating these strings as test failures turns a silent
# shield into a loud alarm: if gemma keeps tripping the guard under a given
# context shape (warm memory, large digest, odd phrasing), the evals will
# finally flag it.
#
# The helper asserts at the call site of an eval rather than globally,
# because a handful of evals (e.g. `TestMalformedResponseAfterTools` itself)
# are specifically asserting the fallback fires and must NOT use this helper.
FALLBACK_REPLY_PHRASES = (
"i had trouble understanding that request",
"i had trouble processing that",
"sorry, i had trouble",
)
def is_fallback_reply(response: Optional[str]) -> bool:
"""Return True when ``response`` is the engine's canned malformed-guard
fallback reply — i.e. the user got a shrug instead of an answer."""
if not response:
return False
lowered = response.lower()
return any(phrase in lowered for phrase in FALLBACK_REPLY_PHRASES)
def assert_not_fallback_reply(response: Optional[str], context: str = "") -> None:
"""Fail the test when the response is the engine's canned fallback.
A fallback reply means the malformed-output guard fired — which is a
safety net masking an underlying model failure. In most evals, seeing
this string means the test SHOULD fail even if the rest of the
assertions happen to pass, because the user experience is "the
assistant gave up".
"""
import pytest
if is_fallback_reply(response):
prefix = f"[{context}] " if context else ""
pytest.fail(
f"{prefix}Response is the engine's canned malformed-guard "
f"fallback reply — the model produced garbled output and the "
f"guard shielded the user. From the user's perspective the "
f"assistant gave up. Treat this as a real failure. "
f"Response: {(response or '')[:400]}"
)
# =============================================================================
# Max-turns digest caveat detection
# =============================================================================
#
# When the agentic loop exhausts ``agentic_max_turns`` without the evaluator
# ever firing terminal, ``digest_loop_for_max_turns`` in ``enrichment.py``
# produces a reply whose first sentence is a caveat noting the request was
# not fully finished (e.g. "I could not fully finish your request…").
#
# From the user's perspective that caveat is a FAILURE for simple,
# single-tool queries — the tool ran, the answer was in hand, and yet the
# evaluator kept saying "continue" until the turn cap fired the digest
# summariser. The answer that follows the caveat is typically correct, so
# naive grounding assertions pass and the regression hides. Treating the
# caveat as a failure turns that silent shield into a loud alarm for the
# evaluator's terminal-detection quality.
#
# The digest prompt (``_LOOP_DIGEST_SYSTEM_PROMPT`` in
# ``src/jarvis/reply/enrichment.py``) instructs the LLM to open with a
# caveat about not finishing. The phrases below are the canonical English
# shapes that prompt produces; a drift pin test keeps them aligned with
# the source prompt.
MAX_TURNS_DIGEST_PHRASES = (
"could not fully finish",
"couldn't fully finish",
"was unable to fully finish",
"wasn't able to fully finish",
)
def is_max_turns_digest(response: Optional[str]) -> bool:
"""Return True when ``response`` looks like the max-turns digest
caveat — i.e. the agentic loop ran out of turns without the evaluator
ever firing terminal."""
if not response:
return False
lowered = response.lower()
return any(phrase in lowered for phrase in MAX_TURNS_DIGEST_PHRASES)
def assert_not_max_turns_digest(response: Optional[str], context: str = "") -> None:
"""Fail the test when the response opens with the max-turns digest
caveat. For simple single-tool queries, hitting the digest path means
the evaluator failed to recognise a grounded, terminal reply — even if
the content that follows the caveat happens to be correct."""
import pytest
if is_max_turns_digest(response):
prefix = f"[{context}] " if context else ""
pytest.fail(
f"{prefix}Response begins with the max-turns digest caveat — "
f"the agentic loop exhausted ``agentic_max_turns`` without the "
f"evaluator returning terminal on a grounded reply. For simple "
f"queries this is an evaluator quality failure, not a success. "
f"Response: {(response or '')[:400]}"
)
# =============================================================================
# Warm-memory seeding
# =============================================================================
#
# The default eval fixtures (`eval_db`, `eval_dialogue_memory`) start empty,
# which does NOT reproduce the real-world state where the user's memory
# already carries weeks of diary summaries. Field failures consistently
# correlate with loaded context: gemma produces clean tool calls on empty
# memory and slides into scaffolding leaks when a multi-hundred-char memory
# digest is prepended to the system message.
#
# This helper seeds the diary table with dated summaries on a given topic
# so the memory-search path hits real entries and produces a digest that
# matches the production shape.
def seed_diary_summaries(
db,
topic_summaries: List[Tuple[str, str]],
) -> None:
"""Seed ``conversation_summaries`` with the given (date_utc, summary) pairs.
``date_utc`` must be ``YYYY-MM-DD``. The helper is a thin wrapper around
``db.upsert_conversation_summary`` intended for evals that need a warm
memory state — e.g. "user has asked about the weather ten times in the
last fortnight" — to reproduce the loaded-context failure mode that the
reply engine hits in production.
"""
for date_utc, summary in topic_summaries:
db.upsert_conversation_summary(
date_utc=date_utc,
summary=summary,
topics=None,
source_app="jarvis",
)
@dataclass
class ToolCallCapture:
"""Captures tool calls during evaluation."""
calls: List[Dict[str, Any]] = field(default_factory=list)
def record(self, name: str, args: Dict[str, Any]):
self.calls.append({"name": name, "args": args})
def has_tool(self, name: str) -> bool:
return any(c["name"] == name for c in self.calls)
def has_any_tool(self) -> bool:
return len(self.calls) > 0
def get_args(self, name: str) -> Optional[Dict[str, Any]]:
for c in self.calls:
if c["name"] == name:
return c["args"]
return None
def tool_names(self) -> List[str]:
return [c["name"] for c in self.calls]
# Alias for backward compatibility
tool_sequence = tool_names
def clear(self):
self.calls = []
# =============================================================================
# Mock Tool Run Factory
# =============================================================================
def create_mock_tool_run(
capture: ToolCallCapture,
responses: Optional[Dict[str, str]] = None,
):
"""Create a mock tool runner that captures calls and returns canned responses.
Args:
capture: ToolCallCapture instance to record calls
responses: Dict mapping tool name → response text. Unmatched tools return "OK".
Returns:
A function suitable for patching ``run_tool_with_retries``.
"""
responses = responses or {}
def mock_tool_run(db, cfg, tool_name, tool_args, **kwargs):
from jarvis.tools.types import ToolExecutionResult
capture.record(tool_name, tool_args or {})
reply = responses.get(tool_name, "OK")
return ToolExecutionResult(success=True, reply_text=reply)
return mock_tool_run
@dataclass
class MockConfig:
"""Minimal config object for eval tests."""
ollama_base_url: str = "http://localhost:11434"
ollama_chat_model: str = "gemma4:e2b"
ollama_embed_model: str = "nomic-embed-text"
db_path: str = ":memory:"
sqlite_vss_path: Optional[str] = None
voice_debug: bool = True
tts_enabled: bool = False
tts_engine: str = "piper" # "piper" (default) or "chatterbox"
tts_voice: Optional[str] = None
tts_rate: int = 200
# Piper TTS settings
tts_piper_model_path: Optional[str] = None
tts_piper_speaker: Optional[int] = None
tts_piper_length_scale: float = 1.0
tts_piper_noise_scale: float = 0.667
tts_piper_noise_w: float = 0.8
tts_piper_sentence_silence: float = 0.2
# Chatterbox TTS settings
tts_chatterbox_device: str = "cpu"
tts_chatterbox_audio_prompt: Optional[str] = None
tts_chatterbox_exaggeration: float = 0.5
tts_chatterbox_cfg_weight: float = 0.5
web_search_enabled: bool = True
brave_search_api_key: str = ""
wikipedia_fallback_enabled: bool = True
llm_profile_select_timeout_sec: float = 10.0
llm_tools_timeout_sec: float = 8.0
llm_embed_timeout_sec: float = 10.0
llm_chat_timeout_sec: float = 120.0
agentic_max_turns: int = 8
memory_enrichment_max_results: int = 5
active_profiles: List[str] = field(default_factory=lambda: ["developer", "business", "life"])
location_enabled: bool = True
location_ip_address: Optional[str] = None
location_auto_detect: bool = False
location_cgnat_resolve_public_ip: bool = False
dialogue_memory_timeout: int = 300
mcps: Dict[str, Any] = field(default_factory=dict)
use_stdin: bool = True
@dataclass
class EvalResult:
"""Result of a single eval test case."""
query: str
response: Optional[str]
is_passed: bool
failure_reason: Optional[str] = None
tool_calls_made: List[str] = field(default_factory=list)
turn_count: int = 0
def __str__(self) -> str:
status = "✅ PASS" if self.is_passed else "❌ FAIL"
lines = [
f"{status}: {self.query[:50]}...",
f" Response: {(self.response or '')[:100]}...",
f" Tools used: {', '.join(self.tool_calls_made) or 'none'}",
f" Turns: {self.turn_count}",
]
if self.failure_reason:
lines.append(f" Reason: {self.failure_reason}")
return "\n".join(lines)
@dataclass
class EvalCase:
"""A single eval test case definition."""
name: str
query: str
expected_tool_calls: List[str] = field(default_factory=list)
response_should_contain: List[str] = field(default_factory=list)
response_should_not_contain: List[str] = field(default_factory=list)
custom_validator: Optional[Callable[[str], bool]] = None
profile_hint: Optional[str] = None
def assert_response_quality(result: EvalResult, case: EvalCase) -> None:
"""Assert that the response meets quality criteria."""
response = result.response or ""
response_lower = response.lower()
# Check expected content
for expected in case.response_should_contain:
assert expected.lower() in response_lower, (
f"Response should contain '{expected}' but got: {response[:200]}..."
)
# Check excluded content
for excluded in case.response_should_not_contain:
assert excluded.lower() not in response_lower, (
f"Response should NOT contain '{excluded}' but got: {response[:200]}..."
)
# Check custom validator
if case.custom_validator:
assert case.custom_validator(response), (
f"Custom validation failed for response: {response[:200]}..."
)
def is_generic_greeting(response: str) -> bool:
"""Check if response is a generic greeting that ignores the query."""
generic_patterns = [
"how can i help you",
"what can i do for you",
"what would you like",
"how may i assist",
"is there something",
"let me know what",
"feel free to ask",
]
response_lower = response.lower()
return any(pattern in response_lower for pattern in generic_patterns)
def response_addresses_topic(response: str, topic_keywords: List[str]) -> bool:
"""Check if response addresses the topic by mentioning relevant keywords."""
response_lower = response.lower()
return any(kw.lower() in response_lower for kw in topic_keywords)
def create_mock_llm_response(content: str, tool_calls: Optional[List[Dict]] = None) -> Dict[str, Any]:
"""Create a mock LLM response in Ollama format."""
message = {"content": content, "role": "assistant"}
if tool_calls:
message["tool_calls"] = tool_calls
return {"message": message}
def create_tool_call(name: str, args: Dict[str, Any]) -> Dict[str, Any]:
"""Create a tool call in OpenAI format."""
return {
"id": f"call_{name}_001",
"function": {
"name": name,
"arguments": args
}
}
# =============================================================================
# LLM-as-Judge Evaluation
# =============================================================================
@dataclass
class JudgeVerdict:
"""Result from LLM judge evaluation."""
is_passed: bool
score: float # 0.0 to 1.0
reasoning: str
criteria_scores: Dict[str, float] = field(default_factory=dict)
def is_judge_llm_available() -> bool:
"""Check if the judge LLM is available and the model exists."""
import requests
try:
# First check if Ollama is running
resp = requests.get(f"{JUDGE_BASE_URL.rstrip('/')}/api/tags", timeout=2)
if resp.status_code != 200:
return False
# Check if the judge model is available
data = resp.json()
models = data.get("models", [])
model_names = [m.get("name", "").split(":")[0] for m in models]
# Check if our judge model (or a variant) is available
judge_base = JUDGE_MODEL.split(":")[0]
return any(judge_base in name for name in model_names)
except Exception:
return False
def call_judge_llm(system_prompt: str, user_prompt: str, timeout_sec: float = 120.0) -> Optional[str]:
"""Call the judge LLM with a prompt."""
import requests
payload = {
"model": JUDGE_MODEL,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"stream": False,
"options": {"num_ctx": 4096},
}
try:
resp = requests.post(
f"{JUDGE_BASE_URL.rstrip('/')}/api/chat",
json=payload,
timeout=timeout_sec
)
resp.raise_for_status()
data = resp.json()
if isinstance(data, dict) and "message" in data:
return data["message"].get("content", "")
except Exception as e:
print(f"⚠️ Judge LLM call failed: {e}")
return None
return None
def judge_response_answers_query(query: str, response: str, context: Optional[str] = None) -> JudgeVerdict:
"""
Use LLM to judge if the response actually answers the user's query.
Args:
query: The user's original question
response: The assistant's response
context: Optional context about what data was available (e.g., tool results)
Returns:
JudgeVerdict with pass/fail, score, and reasoning
"""
system_prompt = """You are an evaluation judge for a voice assistant. Your job is to determine if the assistant's response actually answers the user's question with real information.
Score the response on these criteria (0-10 each):
1. RELEVANCE: Does the response address the specific question asked? Score 0 if it doesn't mention the topic at all.
2. COMPLETENESS: Does it provide the information the user was seeking? Score 0 for empty acknowledgments like "Sure!", "OK!", "Got it!" that provide no actual information.
3. ACCURACY: Is the information factually plausible (based on any context provided)? Score 0 if no factual information is provided.
4. NO_DEFLECTION: Does it avoid generic greetings, deflections like "How can I help you?", or empty acknowledgments? Score 0 for responses under 20 characters that don't answer the question.
IMPORTANT: A response that just acknowledges without providing any actual information (e.g., "Sure thing!", "OK!", "Got it!") should score 0 on COMPLETENESS and fail overall.
Output your evaluation in this EXACT format:
RELEVANCE: [0-10]
COMPLETENESS: [0-10]
ACCURACY: [0-10]
NO_DEFLECTION: [0-10]
OVERALL: [PASS/FAIL]
REASONING: [One paragraph explaining your verdict]"""
user_prompt = f"""User Query: {query}
Assistant Response: {response}"""
if context:
user_prompt += f"\n\nContext (data available to assistant):\n{context[:2000]}"
judge_response = call_judge_llm(system_prompt, user_prompt)
if not judge_response:
# Fallback to heuristic evaluation if judge fails
return JudgeVerdict(
is_passed=not is_generic_greeting(response) and len(response) > 50,
score=0.5,
reasoning="Judge LLM unavailable, using heuristic fallback"
)
# Parse the judge response
return _parse_judge_response(judge_response)
def judge_search_query_quality(
user_query: str,
search_query: str,
location: Optional[str] = None,
time_context: Optional[str] = None
) -> JudgeVerdict:
"""
Use LLM to judge if the search query is well-formed for the user's intent.
Args:
user_query: What the user asked
search_query: The search query the assistant generated
location: User's known location (should be included if relevant)
time_context: Time-related context (e.g., "this week", "tomorrow")
Returns:
JudgeVerdict evaluating search query quality
"""
system_prompt = """You are evaluating search queries generated by a voice assistant.
Score the search query on these criteria (0-10 each):
1. INTENT_MATCH: Does the search query capture the user's actual intent?
2. LOCATION_AWARENESS: If location is known and relevant, is it included appropriately?
3. TIME_AWARENESS: If the query has time context, is it reflected in the search?
4. SPECIFICITY: Is the query specific enough to get useful results?
Output your evaluation in this EXACT format:
INTENT_MATCH: [0-10]
LOCATION_AWARENESS: [0-10]
TIME_AWARENESS: [0-10]
SPECIFICITY: [0-10]
OVERALL: [PASS/FAIL]
REASONING: [One paragraph explaining your verdict]"""
user_prompt = f"""User Query: "{user_query}"
Generated Search Query: "{search_query}"
"""
if location:
user_prompt += f"User's Known Location: {location}\n"
if time_context:
user_prompt += f"Time Context: {time_context}\n"
judge_response = call_judge_llm(system_prompt, user_prompt)
if not judge_response:
# Heuristic fallback
has_location = location and any(
loc_part.lower() in search_query.lower()
for loc_part in location.split(",")[0].split()
)
return JudgeVerdict(
is_passed=has_location if location else True,
score=0.5,
reasoning="Judge LLM unavailable, using heuristic fallback"
)
return _parse_judge_response(judge_response)
def _parse_judge_response(response: str) -> JudgeVerdict:
"""Parse the structured judge response into a JudgeVerdict."""
lines = response.strip().split("\n")
criteria_scores = {}
is_passed = False
reasoning = ""
for line in lines:
line = line.strip()
if ":" in line:
key, value = line.split(":", 1)
key = key.strip().upper()
value = value.strip()
if key == "OVERALL":
is_passed = "PASS" in value.upper()
elif key == "REASONING":
reasoning = value
else:
# Try to parse as score
try:
score = float(value.split()[0])
criteria_scores[key.lower()] = score / 10.0 # Normalize to 0-1
except (ValueError, IndexError):
pass
# Calculate average score
avg_score = sum(criteria_scores.values()) / len(criteria_scores) if criteria_scores else 0.5
return JudgeVerdict(
is_passed=is_passed,
score=avg_score,
reasoning=reasoning,
criteria_scores=criteria_scores
)
def judge_tool_usage_appropriateness(
query: str,
tools_called: List[str],
tool_args: List[Dict[str, Any]],
expected_tools: Optional[List[str]] = None
) -> JudgeVerdict:
"""
Judge whether the tools used were appropriate for the query.
Args:
query: User's question
tools_called: List of tool names that were called
tool_args: List of arguments passed to each tool
expected_tools: Optional list of tools that should have been called
Returns:
JudgeVerdict on tool usage
"""
system_prompt = """You are evaluating tool usage by a voice assistant.
Score on these criteria (0-10 each):
1. TOOL_SELECTION: Were the right tools chosen for the task?
2. ARG_QUALITY: Were the tool arguments well-formed and appropriate?
3. EFFICIENCY: Was there unnecessary tool calling or missing necessary calls?
Output your evaluation in this EXACT format:
TOOL_SELECTION: [0-10]
ARG_QUALITY: [0-10]
EFFICIENCY: [0-10]
OVERALL: [PASS/FAIL]
REASONING: [One paragraph explaining your verdict]"""
tool_info = "\n".join([
f"- {name}: {args}" for name, args in zip(tools_called, tool_args)
]) if tools_called else "No tools called"
user_prompt = f"""User Query: "{query}"
Tools Called:
{tool_info}
"""
if expected_tools:
user_prompt += f"\nExpected Tools: {', '.join(expected_tools)}"
judge_response = call_judge_llm(system_prompt, user_prompt)
if not judge_response:
# Heuristic fallback
has_expected = not expected_tools or all(t in tools_called for t in expected_tools)
return JudgeVerdict(
is_passed=has_expected,
score=0.5,
reasoning="Judge LLM unavailable, using heuristic fallback"
)
return _parse_judge_response(judge_response)