Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
717 lines
29 KiB
Python
717 lines
29 KiB
Python
"""
|
|
Shared fixtures and configuration for evals.
|
|
|
|
Evals test end-to-end quality of the reply engine with real or mock LLM responses.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, List, Optional
|
|
import pytest
|
|
|
|
# Locate the repository root: the closest ancestor directory that contains
# ``src/jarvis``. If no ancestor qualifies, fall back to two levels above
# this file.
_this_file = Path(__file__).resolve()
ROOT = next(
    (
        ancestor
        for ancestor in _this_file.parents
        if (ancestor / "src" / "jarvis").exists()
    ),
    None,
)
if ROOT is None:
    ROOT = _this_file.parent.parent

SRC = ROOT / "src"
EVALS = ROOT / "evals"

# Each missing entry is pushed to the front of sys.path in turn, so the
# entry inserted last ends up first in the search order.
for _import_root in (str(ROOT), str(SRC), str(EVALS)):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)
|
|
|
|
from helpers import MockConfig, JUDGE_MODEL, is_judge_llm_available
|
|
|
|
|
|
# =============================================================================
|
|
# Shared Markers
|
|
# =============================================================================
|
|
|
|
# Availability is evaluated once, at import time, and reused by the marker.
_JUDGE_LLM_AVAILABLE = is_judge_llm_available()

# Decorate a test with this marker to skip it when the judge is unavailable.
requires_judge_llm = pytest.mark.skipif(
    not _JUDGE_LLM_AVAILABLE, reason="Judge LLM not available"
)
|
|
|
|
|
|
# =============================================================================
|
|
# Test Case Descriptions
|
|
# =============================================================================
|
|
|
|
# One-line, human-readable summary per test class. The markdown report prints
# it as a quote under the class heading when the class name has an entry here.
CLASS_DESCRIPTIONS = {
    # Core reply-engine behaviour
    "TestResponseQuality": "LLM-as-judge evaluations for response quality",
    "TestContextUtilization": "Tests that agent uses location/time/memory context",
    "TestToolUsage": "Validates tool selection and argument quality",
    "TestMultiStepReasoning": "Complex scenarios requiring tool chaining and synthesis",
    "TestMemoryEnrichment": "Tests automatic memory enrichment keyword extraction",
    "TestLiveEndToEnd": "End-to-end tests against real LLM inference",
    # Nutrition
    "TestNutritionExtraction": "Tests LLM nutrition extraction accuracy for meal logging",
    "TestNutritionToolIntegration": "Tests full meal logging tool with macro extraction",
    "TestNutritionModelComparison": "Baseline tests for comparing nutrition extraction across models",
    # Intent judge
    "TestIntentJudgeAccuracy": "Intent judge accuracy for voice command classification",
    "TestIntentJudgePromptQuality": "Intent judge prompt construction quality",
    "TestIntentJudgeFallback": "Intent judge fallback behaviour when unavailable",
    "TestIntentJudgeMultiSegment": "Intent judge with multi-segment buffers and multi-person conversations",
    # Listener integration
    "TestWakeWordValidationSafetyNet": "Integration: listener rejects judge hallucinations when no wake word present",
    "TestEchoReasoningDistrust": "Integration: listener overrides judge echo claims when EchoDetector cleared",
    "TestHotWindowHeuristicAccuracy": "Integration: could_be_hot_window heuristic passes correct mode to judge",
    "TestProcessedSegmentFilteringIntegration": "Integration: processed segments excluded from judge prompt",
    "TestHotWindowUsesRawText": "Integration: hot window preserves full user text, wake word uses judge extraction",
    "TestMultiSegmentBufferIntegration": "Integration: multi-segment buffer with TTS echoes handled correctly",
    "TestStopCommandBypassesJudge": "Integration: stop commands during TTS bypass judge entirely",
    # Knowledge extraction
    "TestKnowledgeExtractionQuality": "Tests that novel knowledge is correctly extracted from summaries",
    "TestKnowledgeExtractionRejection": "Tests that noise, stale data, and common knowledge are rejected",
    "TestKnowledgeExtractionReframing": "Tests that interaction descriptions are reframed as knowledge",
    "TestKnowledgeExtractionJudge": "LLM-as-judge evaluations of extraction quality",
    # Multi-turn conversations
    "TestTopicSwitching": "Tests correct tool selection when conversation topic changes",
    "TestFollowUpContext": "Tests context retention for follow-up questions",
    "TestMultiTurnExtended": "Extended multi-turn scenarios with longer conversations",
    "TestGreetingNoToolsLive": "Tests that greetings don't trigger tool calls",
    "TestHelpfulness": "Tests that agent uses tools proactively instead of deflecting",
    # Recency
    "TestDiaryRecencyOrder": "Tests that diary search returns newer entries before older ones",
    "TestGraphRecencySuperseding": "Tests that graph handles contradicting facts with date context",
    "TestRecencyJudge": "LLM judge evaluates whether newer information is preferred over older",
    # Complex flows
    "TestMalformedResponseAfterTools": "Tests that malformed LLM output after tool results is not surfaced",
    "TestCelebrityIdentityThenFollowUp": "Two-turn celebrity flow: identity query then pronoun follow-up",
    "TestSearchFailureWikipediaRescue": "Wikipedia-rescue payload is consumed correctly, not confabulated over",
    "TestMultiStepEntityQuery": "Single query requiring two sequential webSearch calls (director + filmography)",
}
|
|
|
|
# Report labels for tests that have no parametrize id, keyed by the bare test
# function name. Tests missing from this table fall back to a humanised form
# of their function name.
TEST_DESCRIPTIONS = {
    # General reply-engine tests
    "test_weather_response_quality": "Judge evaluates weather response quality",
    "test_location_context_in_search": "Location context flows to search queries",
    "test_simple_search_flow": "Agent calls webSearch for info queries",
    "test_tool_chaining_search_then_fetch": "Agent chains search → fetch for details",
    "test_nutrition_advice_uses_memory_and_data": "Agent uses memory + nutrition data",
    "test_enrichment_extracts_correct_keywords": "Enrichment extracts personalization keywords",
    "test_enrichment_provides_context_to_llm": "Enrichment results appear in system message",
    "test_llm_uses_enrichment_for_personalised_queries": "LLM uses enrichment-surfaced interests for personalised search",
    "test_weather_query_live": "Weather query is answered with current conditions",
    "test_personalized_query_recalls_memory_live": "Assistant checks memory before asking about interests",
    "test_interest_flavoured_query_live": "Interest-flavoured phrasings surface seeded interests in the reply",
    # Nutrition extraction tests
    "test_meal_extraction_accuracy": "Extracts accurate macros for common meals",
    "test_extraction_returns_valid_json_structure": "Returns valid JSON with all required fields",
    "test_extraction_handles_ambiguous_portions": "Handles ambiguous portion descriptions",
    "test_extraction_rejects_non_food": "Returns NONE for non-food inputs",
    "test_log_meal_tool_extracts_macros": "LogMealTool stores meals with macros",
    "test_simple_meal_extraction": "Simple meal baseline (2 boiled eggs)",
    "test_extraction_with_quantities": "Extraction with explicit quantities",
    # Multi-turn context tests
    "test_weather_then_store_hours": "Topic switch: weather → store hours uses webSearch",
    "test_weather_then_restaurant_search": "Topic switch: weather → restaurant uses webSearch",
    "test_search_then_weather": "Topic switch: search → weather uses getWeather",
    "test_follow_up_references_previous_context": "Follow-up references previous turn context",
    "test_three_turn_topic_changes": "3-turn conversation with topic changes",
    "test_rapid_topic_switching": "Rapid back-and-forth topic switching",
    # Greeting no-tools live tests
    "test_greeting_no_tools_live": "Greetings do not trigger tool calls",
    "test_user_instructions_no_tools_live": "User instructions do not trigger tool calls",
    "test_weather_still_triggers_tools_live": "Weather query still triggers tools after a greeting",
    # Helpfulness / anti-deflection tests
    "test_no_deflection_for_weather_forecast_live": "No deflection on weather forecast questions",
    "test_no_deflection_for_answerable_queries_live": "No deflection on answerable questions",
    "test_tool_retry_after_failure_live": "Assistant retries a tool after the first attempt fails",
    "test_graph_knowledge_surfaced_in_reply_live": "Graph-enriched facts surface in the reply, no denial",
    "test_does_not_deny_long_term_memory_live": "Assistant does not deny having long-term memory",
    # Multi-step entity / complex flow tests
    "test_chained_research_possessor_director": "Chained research: who directed Possessor and what else have they made",
    "test_parallel_comparison_paris_vs_london": "Parallel weather lookup: compare Paris and London",
    "test_director_then_filmography_requires_two_searches": "Director-then-filmography needs two searches",
    "test_two_turn_celebrity_flow": "Two-turn celebrity flow: identity then pronoun follow-up",
    "test_single_weather_call_terminates": "Single weather query ends after one tool call",
    "test_max_turn_triggers_digest": "Max-turn cap delivers a digest reply, never silence",
    # Knowledge extraction
    "test_judge_mixed_summary_filters_noise": "Mixed summary: keep novel facts, drop stale weather/recommendations",
    "test_judge_empty_conversation_returns_empty": "Trivial conversations produce no extracted facts",
    "test_open_ended_prompt_grounds_in_graph_context_live": "Open-ended prompt grounds in stored knowledge",
}
|
|
|
|
|
|
def _parse_parametrize_id(node_id: str) -> Optional[str]:
    """Return the parametrize case ID embedded in a pytest node ID.

    The case ID is the text inside the trailing ``[...]`` of ``node_id``,
    e.g. ``'test_foo[case-name]'`` yields ``'case-name'``.

    Returns None when ``node_id`` has no trailing bracket, or when the
    bracket holds only a pytest-repeat counter such as ``'1-3'``.
    """
    bracket = re.search(r'\[(.+)\]$', node_id)
    if bracket is None:
        return None

    inner = bracket.group(1)

    # A bare "N-M" (run number, total runs) comes from pytest-repeat and is
    # not a parametrize id at all.
    if re.match(r'^\d+-\d+$', inner) is not None:
        return None

    # A real id may still carry the repeat counter at its end
    # ("greeting-1-3" -> "greeting"); drop it.
    return re.sub(r'-\d+-\d+$', '', inner)
|
|
|
|
|
|
def _extract_judge_notes(stdout: Optional[str]) -> Optional[Dict[str, str]]:
    """Parse judge evaluation output from captured stdout.

    Looks for three labelled fragments and returns whichever are present:

    * ``"score"``     - the number following ``Score:``
    * ``"reasoning"`` - the rest of the line following ``Reasoning:``
    * ``"response"``  - the text following ``Response:`` up to the first
      ``...`` (or the end of the output)

    Args:
        stdout: Captured stdout of a test, or None / empty.

    Returns:
        A dict with any of the keys above, or None when ``stdout`` is empty
        or none of the fragments were found.
    """
    if not stdout:
        return None

    notes: Dict[str, str] = {}

    # Capture a well-formed number only. A looser character class would
    # also swallow sentence punctuation ("Score: 0.8." -> "0.8.") or bare
    # dots, producing a value that float() rejects downstream.
    score_match = re.search(r'Score:\s*(\d+(?:\.\d*)?|\.\d+)', stdout)
    if score_match:
        notes["score"] = score_match.group(1)

    reasoning_match = re.search(r'Reasoning:\s*(.+?)(?:\n|$)', stdout)
    if reasoning_match:
        notes["reasoning"] = reasoning_match.group(1).strip()

    # The response being evaluated
    response_match = re.search(r'Response:\s*(.+?)(?:\.\.\.|$)', stdout)
    if response_match:
        notes["response"] = response_match.group(1).strip()

    return notes if notes else None
|
|
|
|
|
|
def _humanise_test_name(test_name: str) -> str:
    """Convert ``test_some_thing_does_X`` into ``Some thing does X``.

    Used as the final fallback for a test that has neither a parametrize id
    nor an entry in TEST_DESCRIPTIONS, so report readers are not shown raw
    Python identifiers.

    Returns ``test_name`` unchanged when nothing readable is left after
    removing the ``test_`` prefix and the underscores.
    """
    prefix = "test_"
    stem = test_name[len(prefix):] if test_name.startswith(prefix) else test_name
    words = stem.replace("_", " ").strip()
    if words == "":
        return test_name
    # Only the first character is upper-cased; the rest keeps its casing.
    return words[:1].upper() + words[1:]
|
|
|
|
|
|
def _strip_redundant_prefix(label: str) -> str:
    """Drop noisy prefixes and suffixes from a human-readable case label.

    Two kinds of noise are removed:

    * a trailing ``-<model>`` suffix (e.g. ``-gpt-oss:20b``) that pytest's
      parametrize cross-product appends to ids; the report's Model column
      already carries that information;
    * a leading ``Live:`` / ``Live `` prefix (case-insensitive), which is
      uninformative because every eval runs against a live model.

    After a prefix is removed the first remaining character is upper-cased.
    A label that consists of nothing but the prefix is left as it is, so the
    result is never emptied by prefix stripping.

    Args:
        label: The raw case label.

    Returns:
        The cleaned label.
    """
    s = label.strip()

    # Trailing "-<model>" suffix injected by the parametrize cross-product.
    for suffix in ("-gpt-oss:20b", "-gemma4:e2b", "-gemma4:e4b"):
        if s.endswith(suffix):
            s = s[: -len(suffix)].rstrip()
            break

    # "live:" also covers "live: " because the remainder is left-stripped;
    # listing it without the space additionally handles "Live:foo".
    lower = s.lower()
    for prefix in ("live:", "live "):
        if lower.startswith(prefix):
            remainder = s[len(prefix):].lstrip()
            if remainder:
                s = remainder[0].upper() + remainder[1:]
            break

    return s
|
|
|
|
|
|
def _get_test_description(test_name: str, case_id: Optional[str]) -> str:
    """Pick the report label for a test case.

    Resolution order:

    1. A non-empty ``case_id`` (the ``id=`` given to ``pytest.param``) is the
       label for parametrized tests.
    2. Otherwise the TEST_DESCRIPTIONS entry for ``test_name``, if any.
    3. Otherwise a humanised form of ``test_name``.

    Labels from the first two sources have redundant prefixes stripped.
    """
    if not case_id:
        looked_up = TEST_DESCRIPTIONS.get(test_name)
        if looked_up is None:
            # No curated label: make the identifier readable instead.
            return _humanise_test_name(test_name)
        return _strip_redundant_prefix(looked_up)

    return _strip_redundant_prefix(case_id)
|
|
|
|
|
|
# =============================================================================
|
|
# Markdown Report Generation
|
|
# =============================================================================
|
|
|
|
@dataclass
class TestResult:
    """Captured result from a single test run."""

    # The name starts with "Test", which matches pytest's default class
    # collection pattern. Opt out explicitly so importing this record into a
    # test module does not make pytest try (and warn about failing) to
    # collect it. Not annotated, so it does not become a dataclass field.
    __test__ = False

    name: str  # full pytest node ID of the run
    outcome: str  # passed, failed, skipped, xfailed, xpassed
    duration: float  # duration reported by pytest for the recorded phase
    class_name: str  # test class the run is grouped under in the report
    test_name: str  # test function name without any "[...]" suffix
    case_id: Optional[str] = None  # parametrize id, if the test has one
    description: str = ""  # human-readable label shown in the report
    reason: Optional[str] = None  # skip reason, when one was captured
    stdout: Optional[str] = None  # captured stdout of the run, if any
    judge_notes: Optional[Dict[str, str]] = None  # fields parsed from stdout
|
|
|
|
|
|
@dataclass
class AggregatedTestResult:
    """All recorded runs of one test, with summary statistics over them.

    Passed and xpassed runs count as passes. Only passes and failures are
    "countable": skipped and xfailed runs are excluded from the pass rate.
    """

    name: str
    class_name: str
    test_name: str
    description: str
    runs: List["TestResult"] = field(default_factory=list)

    def _count(self, *outcomes: str) -> int:
        """Return how many runs ended with one of ``outcomes``."""
        return sum(1 for run in self.runs if run.outcome in outcomes)

    @property
    def pass_count(self) -> int:
        """Number of passed or xpassed runs."""
        return self._count("passed", "xpassed")

    @property
    def fail_count(self) -> int:
        """Number of failed runs."""
        return self._count("failed")

    @property
    def skip_count(self) -> int:
        """Number of skipped runs."""
        return self._count("skipped")

    @property
    def xfail_count(self) -> int:
        """Number of expected-failure runs."""
        return self._count("xfailed")

    @property
    def total_runs(self) -> int:
        """Number of recorded runs of any outcome."""
        return len(self.runs)

    @property
    def pass_rate(self) -> float:
        """Passes as a percentage of countable runs (0.0 when there are none)."""
        countable = self.pass_count + self.fail_count
        return (self.pass_count / countable * 100) if countable > 0 else 0.0

    @property
    def total_duration(self) -> float:
        """Sum of the durations of all runs."""
        return sum(run.duration for run in self.runs)

    @property
    def avg_duration(self) -> float:
        """Mean duration per run (0.0 when there are no runs)."""
        return self.total_duration / len(self.runs) if self.runs else 0.0

    @property
    def overall_outcome(self) -> str:
        """Determine overall outcome based on pass rate.

        Uses the same countable runs as ``pass_rate`` so the status always
        agrees with the displayed rate:

        * ``"passed"``  - every countable run passed (rate 100%)
        * ``"failed"``  - every countable run failed (rate 0%)
        * ``"partial"`` - a mix of passes and failures
        * ``"xfailed"`` - no countable runs and every run was xfailed
        * ``"skipped"`` - no countable runs otherwise (including no runs)
        """
        passes = self.pass_count
        failures = self.fail_count

        if passes + failures == 0:
            if self.runs and self.xfail_count == self.total_runs:
                return "xfailed"
            return "skipped"
        if failures == 0:
            return "passed"
        if passes == 0:
            return "failed"
        return "partial"

    @property
    def pass_rate_str(self) -> str:
        """Format pass rate as 'X/Y (Z%)'.

        Without countable runs the result is ``"SKIPPED"``,
        ``"<n>/<total> XFAIL"`` or ``"N/A"`` instead.
        """
        countable = self.pass_count + self.fail_count
        if countable == 0:
            if self.skip_count > 0:
                return "SKIPPED"
            if self.xfail_count > 0:
                return f"{self.xfail_count}/{self.total_runs} XFAIL"
            return "N/A"
        return f"{self.pass_count}/{countable} ({self.pass_rate:.0f}%)"

    @property
    def judge_notes(self) -> Optional[Dict[str, str]]:
        """Return judge notes from the first run that has them, else None."""
        return next((run.judge_notes for run in self.runs if run.judge_notes), None)

    @property
    def reason(self) -> Optional[str]:
        """Return the reason from the first run that has one, else None."""
        return next((run.reason for run in self.runs if run.reason), None)
|
|
|
|
|
|
def _strip_repeat_suffix(node_id: str) -> str:
    """Remove a trailing pytest-repeat counter from a node ID.

    pytest-repeat tags repeated tests with ``[1-3]``, ``[2-3]``, ... at the
    end of the node ID. Removing that tag gives the base identifier shared
    by all repetitions. Only a purely numeric ``[N-M]`` is removed, so
    parametrize ids such as ``[greeting-en]`` or ``[weather-query]`` are
    left alone.
    """
    repeat_counter = re.compile(r'\[(\d+)-(\d+)\]$')
    return repeat_counter.sub('', node_id)
|
|
|
|
|
|
def _get_aggregation_key(result: TestResult) -> str:
    """Build the key under which repeated runs of one test are grouped.

    The key is ``class_name::test_name``, followed by ``::case_id`` when the
    result has a non-empty case id. The case id is expected to arrive with
    any repeat counter already removed.
    """
    identity = (result.class_name, result.test_name)
    if result.case_id:
        identity += (result.case_id,)
    return "::".join(identity)
|
|
|
|
|
|
@dataclass
class EvalReport:
    """Session-wide collection of eval results plus the markdown renderer.

    Individual runs are stored as they arrive. Repeated runs of the same test
    are grouped on demand so the report can show a pass rate per test.
    """

    results: List["TestResult"] = field(default_factory=list)
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    judge_model: str = ""

    # Emoji shown for each aggregated outcome. Deliberately not annotated so
    # the dataclass machinery does not treat it as a field.
    _STATUS_EMOJI = {
        "passed": "✅",
        "failed": "❌",
        "skipped": "⏭️",
        "xfailed": "🔸",
        "partial": "⚠️",
    }

    def add_result(self, result: "TestResult") -> None:
        """Record one test run."""
        self.results.append(result)

    def get_aggregated_results(self) -> List["AggregatedTestResult"]:
        """Group the recorded runs by test, in first-seen order."""
        aggregated: Dict[str, AggregatedTestResult] = {}

        for result in self.results:
            key = _get_aggregation_key(result)
            bucket = aggregated.get(key)
            if bucket is None:
                # The description already has repeat suffixes removed; the
                # node ID still needs it.
                bucket = aggregated[key] = AggregatedTestResult(
                    name=_strip_repeat_suffix(result.name),
                    class_name=result.class_name,
                    test_name=result.test_name,
                    description=result.description,
                )
            bucket.runs.append(result)

        return list(aggregated.values())

    def _count_runs(self, outcome: str) -> int:
        """Return how many individual runs ended with ``outcome``."""
        return sum(1 for r in self.results if r.outcome == outcome)

    @property
    def total_unique_tests(self) -> int:
        """Number of distinct tests after grouping repeated runs."""
        return len(self.get_aggregated_results())

    @property
    def total_runs(self) -> int:
        """Number of individual runs recorded."""
        return len(self.results)

    @property
    def passed(self) -> int:
        return self._count_runs("passed")

    @property
    def failed(self) -> int:
        return self._count_runs("failed")

    @property
    def skipped(self) -> int:
        return self._count_runs("skipped")

    @property
    def xfailed(self) -> int:
        return self._count_runs("xfailed")

    @property
    def xpassed(self) -> int:
        return self._count_runs("xpassed")

    @property
    def pass_rate(self) -> float:
        """Passed plus xpassed runs as a percentage of passed+failed+xpassed."""
        countable = self.passed + self.failed + self.xpassed
        return (self.passed + self.xpassed) / countable * 100 if countable > 0 else 0.0

    @property
    def duration(self) -> float:
        """Sum of the durations of all recorded runs."""
        return sum(r.duration for r in self.results)

    @staticmethod
    def _score_bar(raw_score: str) -> Optional[str]:
        """Render a ten-dot bar for a judge score, or None if it is not a number.

        The score is multiplied by ten to get the number of filled dots, which
        assumes a 0-1 scale (TODO confirm against the judge prompt). The
        count is clamped to 0..10 so an out-of-range score cannot distort the
        bar.
        """
        try:
            filled = int(float(raw_score) * 10)
        except (TypeError, ValueError, OverflowError):
            return None
        filled = max(0, min(10, filled))
        return "●" * filled + "○" * (10 - filled)

    def _pass_rate_line(self) -> str:
        """Build the overall pass-rate line (bar, percentage, run counts)."""
        pass_rate = self.pass_rate
        bar_filled = int(pass_rate / 5)  # 20 chars max
        bar = "█" * bar_filled + "░" * (20 - bar_filled)
        emoji = "🟢" if pass_rate >= 80 else "🟡" if pass_rate >= 50 else "🔴"
        # Same numerator/denominator as pass_rate, so the fraction and the
        # percentage always describe the same runs (xpassed included).
        passing_runs = self.passed + self.xpassed
        countable_runs = passing_runs + self.failed
        return (
            f"**Overall Pass Rate:** {emoji} `{bar}` **{pass_rate:.1f}%** "
            f"({passing_runs}/{countable_runs} runs)"
        )

    def _class_emoji(self, class_results) -> str:
        """Pick the heading emoji for one test class.

        Skipped tests are ignored. A class in which every test was skipped
        gets the skip emoji, since nothing in it failed.
        """
        fully_passed = sum(1 for r in class_results if r.overall_outcome == "passed")
        considered = sum(1 for r in class_results if r.overall_outcome != "skipped")
        if considered == 0:
            return self._STATUS_EMOJI["skipped"]
        if fully_passed == considered:
            return "✅"
        return "⚠️" if fully_passed > 0 else "❌"

    def _judge_detail_lines(self, result) -> List[str]:
        """Render the detailed (non-table) section for one judged test."""
        status_emoji = self._STATUS_EMOJI.get(result.overall_outcome, "❓")
        section = [
            f"#### {status_emoji} {result.description}",
            "",
            f"**Pass Rate:** {result.pass_rate_str}",
        ]

        notes = result.judge_notes
        if notes:
            if "response" in notes:
                section.append(f"**Input:** `{notes['response']}`")
            if "score" in notes:
                score_bar = self._score_bar(notes['score'])
                if score_bar is None:
                    # Unparseable score: show it verbatim instead of failing
                    # the whole report.
                    section.append(f"**Score:** {notes['score']}")
                else:
                    section.append(f"**Score:** {score_bar} ({notes['score']})")
            if "reasoning" in notes:
                section.append(f"**Judge notes:** {notes['reasoning']}")
            section.append("")

        section.append(f"*Avg Duration: {result.avg_duration:.2f}s*")
        section.append("")
        return section

    def _table_row(self, result) -> str:
        """Render one markdown table row for a non-judge test."""
        status_emoji = self._STATUS_EMOJI.get(result.overall_outcome, "❓")
        status_text = result.overall_outcome.upper()
        reason = result.reason
        if reason:
            # Keep the table narrow: show at most 30 characters of the reason.
            reason_short = reason[:30] + "..." if len(reason) > 30 else reason
            status_text += f" ({reason_short})"
        return (
            f"| {result.description} | {result.pass_rate_str} | "
            f"{status_emoji} {status_text} | {result.avg_duration:.2f}s |"
        )

    def generate_markdown(self) -> str:
        """Generate a pretty markdown report with pass rates from multiple runs.

        Layout: header, summary table of per-test outcomes, overall pass-rate
        bar (per run), then one section per test class. Classes whose name
        contains "Judge" and that have judge notes get a detailed per-test
        layout; all others get a table.
        """
        aggregated_results = self.get_aggregated_results()
        total_tests = len(aggregated_results)

        # Tally per-test outcomes once; by_class preserves first-seen order.
        outcome_totals: Dict[str, int] = {}
        by_class: Dict[str, List[AggregatedTestResult]] = {}
        for agg in aggregated_results:
            outcome = agg.overall_outcome
            outcome_totals[outcome] = outcome_totals.get(outcome, 0) + 1
            by_class.setdefault(agg.class_name, []).append(agg)

        fully_passed = outcome_totals.get("passed", 0)
        fully_failed = outcome_totals.get("failed", 0)
        partial = outcome_totals.get("partial", 0)
        skipped = outcome_totals.get("skipped", 0)
        xfailed = outcome_totals.get("xfailed", 0)

        generated = self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else 'N/A'
        runs_per_test = self.total_runs // total_tests if total_tests > 0 else 0

        lines = [
            # Header
            "# 🧪 Jarvis Evaluation Report",
            "",
            f"**Generated:** {generated}",
            f"**Judge Model:** `{self.judge_model}`",
            f"**Duration:** {self.duration:.2f}s",
            f"**Runs per test:** {runs_per_test}",
            "",
            # Summary stats
            "## 📊 Summary",
            "",
            "| Metric | Count |",
            "|--------|-------|",
            f"| ✅ Fully Passed (100%) | {fully_passed} |",
            f"| ⚠️ Partial Pass | {partial} |",
            f"| ❌ Fully Failed (0%) | {fully_failed} |",
            f"| ⏭️ Skipped | {skipped} |",
            f"| 🔸 Expected Fail | {xfailed} |",
            f"| **Unique Tests** | **{total_tests}** |",
            f"| **Total Runs** | **{self.total_runs}** |",
            "",
            # Pass rate bar (based on individual runs)
            self._pass_rate_line(),
            "",
            # Detailed results
            "---",
            "",
            "## 📋 Detailed Results",
            "",
        ]

        for class_name, class_results in by_class.items():
            # Class header with description
            lines.append(f"### {self._class_emoji(class_results)} {class_name}")
            if class_name in CLASS_DESCRIPTIONS:
                lines.append(f"> {CLASS_DESCRIPTIONS[class_name]}")
            lines.append("")

            # The detailed layout is reserved for judge classes with notes.
            is_judge_class = "Judge" in class_name
            has_judge_notes = is_judge_class and any(r.judge_notes for r in class_results)

            if has_judge_notes:
                for result in class_results:
                    lines.extend(self._judge_detail_lines(result))
            else:
                lines.append("| Test Case | Pass Rate | Status | Avg Duration |")
                lines.append("|-----------|-----------|--------|--------------|")
                for result in class_results:
                    lines.append(self._table_row(result))

            lines.append("")

        # Footer
        lines.append("---")
        lines.append("")
        lines.append("*Report generated by Jarvis eval suite*")

        return "\n".join(lines)
|
|
|
|
|
|
# Session-wide report collector. Stays None unless report generation is
# switched on, in which case the hooks below record into it.
_eval_report: Optional[EvalReport] = None


def pytest_configure(config):
    """Create the eval report at session start when it has been requested.

    Reporting is opt-in: it is enabled only when the environment variable
    ``EVAL_GENERATE_REPORT`` is exactly ``"1"``.
    """
    global _eval_report

    report_requested = os.environ.get("EVAL_GENERATE_REPORT") == "1"
    if not report_requested:
        return

    _eval_report = EvalReport(start_time=datetime.now(), judge_model=JUDGE_MODEL)
|
|
|
|
|
|
def _should_record_report(report) -> bool:
    """Decide whether a phase report is the one that represents the test.

    Recorded reports are:

    * every ``call`` phase report (passed / failed / skipped in the body);
    * ``setup`` or ``teardown`` reports that failed (errors);
    * ``setup`` reports that were skipped. Skips decided by markers such as
      ``skipif`` are reported in the setup phase and produce no call report,
      so without this they would never reach the eval report.
    """
    if report.when == "call":
        return True
    if report.when in ("setup", "teardown") and report.outcome == "failed":
        return True
    return report.when == "setup" and report.outcome == "skipped"


def _split_node_id(node_id: str) -> tuple:
    """Split a pytest node ID into ``(class_name, test_name_with_params)``.

    A node ID looks like ``path::Class::test[param]`` for a method and
    ``path::test[param]`` for a module-level function. In the second form
    there is no class component, so the class name is ``"Unknown"``.
    """
    parts = node_id.split("::")
    class_name = parts[1] if len(parts) > 2 else "Unknown"
    return class_name, parts[-1]


def pytest_runtest_logreport(report):
    """Capture each test result into the session's eval report.

    Does nothing when report generation is disabled. Otherwise the relevant
    phase report is converted into a TestResult: class and test name from
    the node ID, a human-readable description, the outcome (with xfail /
    xpass folded in), the skip reason, and any judge notes found in the
    captured stdout.
    """
    if _eval_report is None:
        return

    if not _should_record_report(report):
        return

    node_id = report.nodeid
    class_name, full_test_name = _split_node_id(node_id)

    # For parametrized tests the case id doubles as the description.
    case_id = _parse_parametrize_id(full_test_name)
    test_name = full_test_name.split("[")[0]
    description = _get_test_description(test_name, case_id)

    # Reports that carry ``wasxfail`` come from xfail-marked tests; fold them
    # into the two xfail outcomes.
    outcome = report.outcome
    if hasattr(report, "wasxfail"):
        outcome = "xpassed" if report.passed else "xfailed"

    # For skips, the third element of a tuple-shaped longrepr is used as the
    # reason.
    reason = None
    if outcome == "skipped" and hasattr(report, "longrepr"):
        if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3:
            reason = str(report.longrepr[2])

    # Captured stdout, parsed for judge notes
    stdout = None
    judge_notes = None
    if hasattr(report, "capstdout") and report.capstdout:
        stdout = report.capstdout
        judge_notes = _extract_judge_notes(stdout)

    # Fall back to the first report section whose name mentions stdout.
    if not stdout:
        for section_name, section_content in report.sections:
            if "stdout" in section_name.lower():
                stdout = section_content
                judge_notes = _extract_judge_notes(stdout)
                break

    _eval_report.add_result(TestResult(
        name=node_id,
        outcome=outcome,
        duration=report.duration,
        class_name=class_name,
        test_name=test_name,
        case_id=case_id,
        description=description,
        reason=reason,
        stdout=stdout,
        judge_notes=judge_notes,
    ))
|
|
|
|
|
|
def pytest_sessionfinish(session, exitstatus):
    """Generate the markdown report at session end.

    Does nothing when report generation is disabled. The report is written
    to the path in the ``EVAL_REPORT_PATH`` environment variable when set,
    otherwise to ``EVALS.md`` in the repository root. Missing parent
    directories of the target path are created first.
    """
    if _eval_report is None:
        return

    _eval_report.end_time = datetime.now()

    # Support a custom report path via environment variable
    custom_path = os.environ.get("EVAL_REPORT_PATH")
    report_path = Path(custom_path) if custom_path else ROOT / "EVALS.md"

    markdown = _eval_report.generate_markdown()

    # A custom path may point into a directory that does not exist yet;
    # create it so the report is not lost at the very end of the session.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # UTF-8 is required: the report contains emoji and other non-ASCII text.
    report_path.write_text(markdown, encoding="utf-8")

    try:
        print(f"\n📄 Eval report saved to: {report_path}")
    except UnicodeEncodeError:
        # Consoles that cannot encode the emoji get a plain-text message.
        print(f"\nEval report saved to: {report_path}")
|
|
|
|
|
|
# =============================================================================
|
|
# Fixtures
|
|
# =============================================================================
|
|
|
|
@pytest.fixture
def mock_config():
    """Hand each eval test a freshly constructed MockConfig."""
    config = MockConfig()
    return config
|
|
|
|
|
|
@pytest.fixture
def eval_db():
    """Yield a Database opened on ``:memory:`` and close it afterwards."""
    from jarvis.memory.db import Database

    database = Database(":memory:", sqlite_vss_path=None)
    yield database
    # Teardown: runs once the test using the fixture has finished.
    database.close()
|
|
|
|
|
|
@pytest.fixture
def eval_dialogue_memory():
    """Build a DialogueMemory for eval tests.

    Configured with ``inactivity_timeout=300`` and ``max_interactions=20``.
    """
    from jarvis.memory.conversation import DialogueMemory

    memory = DialogueMemory(inactivity_timeout=300, max_interactions=20)
    return memory
|
|
|
|
|
|
@pytest.fixture
def graph_store(tmp_path):
    """Yield a GraphMemoryStore on a temp SQLite file; close it on teardown.

    The store must be closed before ``tmp_path`` is cleaned up: Windows
    cannot unlink a file that still has an open handle, whereas POSIX would
    tolerate it.
    """
    from jarvis.memory.graph import GraphMemoryStore

    db_file = tmp_path / "test.db"
    graph = GraphMemoryStore(str(db_file))
    try:
        yield graph
    finally:
        graph.close()
|
|
|