#!/usr/bin/env python3
"""
Merge multiple eval reports into a single combined EVALS.md.

This script takes pairs of (report_path, model_name) arguments and generates
a combined report showing results from all models side by side.

Usage:
    python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
"""

import sys
import re
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple


@dataclass
class TestResult:
    """Result for a single test case (aggregated across multiple runs)."""

    name: str
    outcome: str  # passed, failed, skipped, xfailed, xpassed, partial
    duration: float
    pass_rate: str = ""  # e.g., "3/3 (100%)" or "2/3 (67%)"
    class_name: str = ""  # The test class this result belongs to


@dataclass
class ModelReport:
    """Parsed report for a single model."""

    model_name: str
    results: Dict[str, TestResult] = field(default_factory=dict)
    total: int = 0
    passed: int = 0
    failed: int = 0
    skipped: int = 0
    duration: float = 0.0


# Emoji may be written with or without U+FE0F (variation selector-16), e.g.
# "⏭️" vs "⏭". Markers below are the bare code points so both spellings match.
_VARIATION_SELECTOR = "\ufe0f"

# (marker, outcome) pairs, checked in order; the first marker found wins.
_OUTCOME_MARKERS: Tuple[Tuple[str, str], ...] = (
    ("\u2705", "passed"),       # ✅
    ("\u274c", "failed"),       # ❌
    ("\u23ed", "skipped"),      # ⏭ (with or without variation selector)
    ("\U0001f538", "xfailed"),  # 🔸
    ("\U0001f389", "xpassed"),  # 🎉
    ("\u26a0", "partial"),      # ⚠ (with or without variation selector)
)

# A duration such as "1.23s"; the captured text is validated by float().
_SECONDS_RE = re.compile(r"([\d.]+)s")

# Single source of truth for which tests are pinned to a fixed model.
# Used by both is_fixed_model_test() and _classify_fixed_model().
_INTENT_JUDGE_CLASS_PATTERNS: Tuple[str, ...] = (
    "IntentJudge",  # TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment, etc.
    "ProcessedSegmentFiltering",  # Intent judge processed segment filtering
)
_INTENT_JUDGE_NAME_PATTERNS: Tuple[str, ...] = (
    "test_hot_window_mode_indicated_in_prompt",
    "test_tts_text_included_for_echo_detection",
    "test_system_prompt_has_echo_guidance",
    "test_returns_none_when_ollama_unavailable",
)
# Exact match so TestToolSelectionFilteringLLM isn't bucketed here.
_TOOL_SELECTION_EXACT_CLASSES = frozenset({"TestToolSelectionFiltering"})


def _outcome_from_marker(text: str) -> str:
    """Map the status emoji found in *text* to an outcome name.

    Returns "unknown" when none of the known markers is present.
    """
    for marker, outcome in _OUTCOME_MARKERS:
        if marker in text:
            return outcome
    return "unknown"


def _parse_seconds(text: str) -> Optional[float]:
    """Return the first "<number>s" duration in *text*, or None.

    None is returned both when no duration is present and when the captured
    digits/dots are not a valid float (e.g. "1.2.3s"), so one malformed cell
    cannot abort the whole merge.
    """
    match = _SECONDS_RE.search(text)
    if not match:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        return None


def _summary_count(line: str, label: str) -> Optional[int]:
    """Return the integer in the table cell following *label* on *line*.

    The caller guarantees that *label* occurs in *line*.
    """
    match = re.search(r"\|\s*(\d+)\s*\|", line.split(label)[1])
    return int(match.group(1)) if match else None


def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
    """Parse a markdown eval report into a ModelReport.

    Args:
        report_path: Path of the markdown report to read (UTF-8).
        model_name: Name stored on the returned report.

    Returns:
        The parsed report, or None (with a warning on stderr) when
        *report_path* is not an existing regular file.
    """
    path = Path(report_path)
    # is_file() rather than exists(): a directory cannot be read as a report.
    if not path.is_file():
        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
        return None

    content = path.read_text(encoding="utf-8")
    report = ModelReport(model_name=model_name)
    lines = content.split("\n")

    # Parse summary stats
    for line in lines:
        # Drop variation selectors so "⏭️" and "⏭" are treated alike.
        plain = line.replace(_VARIATION_SELECTOR, "")
        if "| \u2705 Passed |" in plain:  # ✅
            count = _summary_count(plain, "Passed")
            if count is not None:
                report.passed = count
        elif "| \u274c Failed |" in plain:  # ❌
            count = _summary_count(plain, "Failed")
            if count is not None:
                report.failed = count
        elif "| \u23ed Skipped |" in plain:  # ⏭
            count = _summary_count(plain, "Skipped")
            if count is not None:
                report.skipped = count
        elif "| **Total** |" in line:
            match = re.search(r"\|\s*\*\*(\d+)\*\*\s*\|", line)
            if match:
                report.total = int(match.group(1))
        elif "**Duration:**" in line:
            seconds = _parse_seconds(line)
            if seconds is not None:
                report.duration = seconds

    # Parse individual test results from:
    # 1. Table format: | Test Case | Pass Rate | Status | Avg Duration |
    # 2. Detailed format: #### ✅ test_name (used for judge tests with notes)
    # Track current class name from section headers like "### ✅ TestClassName"
    in_table = False
    table_format = "old"  # "old" or "new"
    current_class = ""
    current_detailed_test = None  # Track test name for detailed format parsing

    for line in lines:
        # Detect class section headers (e.g., "### ✅ TestIntentJudgeAccuracy")
        # Use a lenient pattern that handles multi-byte emoji characters
        class_header_match = re.match(r'^###\s+\S+\s+(Test\w+)', line)
        if class_header_match:
            current_class = class_header_match.group(1)
            in_table = False  # Reset table state for new section
            current_detailed_test = None
            continue

        # Detect detailed test headers (e.g., "#### ✅ wake_word_simple_question")
        detailed_test_match = re.match(r'^####\s+(\S+)\s+(.+)$', line)
        if detailed_test_match:
            in_table = False
            test_name = detailed_test_match.group(2).strip()
            current_detailed_test = test_name
            # Initialize with placeholder values, will be updated below
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(detailed_test_match.group(1)),
                duration=0.0,
                pass_rate="",
                class_name=current_class,
            )
            continue

        # Parse pass rate and duration for detailed format
        if current_detailed_test and current_detailed_test in report.results:
            # Pass rate line: "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
            if line.startswith("**Pass Rate:**"):
                pass_rate_match = re.search(r'\*\*Pass Rate:\*\*\s*(.+)', line)
                if pass_rate_match:
                    report.results[current_detailed_test].pass_rate = (
                        pass_rate_match.group(1).strip()
                    )
            # Duration line: "*Avg Duration: 1.23s*"
            elif line.startswith("*Avg Duration:"):
                seconds = _parse_seconds(line)
                if seconds is not None:
                    report.results[current_detailed_test].duration = seconds
                current_detailed_test = None  # Done parsing this test

        # Table format parsing
        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
            in_table = True
            table_format = "new"
            current_detailed_test = None
            continue
        if "| Test Case | Status | Duration |" in line:
            in_table = True
            table_format = "old"
            current_detailed_test = None
            continue

        if in_table and line.startswith("|") and "---" not in line:
            parts = [p.strip() for p in line.split("|")[1:-1]]
            if table_format == "new" and len(parts) >= 4:
                # New format: | Test Case | Pass Rate | Status | Avg Duration |
                test_name, pass_rate, status_cell, duration_cell = parts[:4]
            elif len(parts) >= 3:
                # Old format: | Test Case | Status | Duration |
                test_name, status_cell, duration_cell = parts[:3]
                pass_rate = ""
            else:
                continue

            seconds = _parse_seconds(duration_cell)
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(status_cell),
                duration=seconds if seconds is not None else 0.0,
                pass_rate=pass_rate,
                class_name=current_class,
            )
        elif in_table and not line.startswith("|"):
            # Any non-table line ends the current table.
            in_table = False

    return report


def is_fixed_model_test(result: TestResult) -> bool:
    """Check if a test uses a fixed model, independent of the judge model.

    Some tests are pinned to specific models regardless of EVAL_JUDGE_MODEL:
    - Intent judge tests use gemma4 (the intent classification model)
    - Tool selection tests use nomic-embed-text (the embedding model)

    These shouldn't be compared across judge models since they always use
    the same model — they belong in their own section.

    NOTE: When you add a new test class or file whose model is pinned (not
    controlled by EVAL_JUDGE_MODEL), add its class-name substring or
    test-name pattern to the module-level ``_INTENT_JUDGE_*`` /
    ``_TOOL_SELECTION_*`` constants; this function and
    ``_classify_fixed_model`` both read them.
    """
    return _classify_fixed_model(result) is not None


# Backwards-compatible alias
is_intent_judge_test = is_fixed_model_test


def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
    """Parse a pass rate string like '2/3 (67%)' into (passes, total).

    Returns None for non-standard formats (SKIPPED, XFAIL, N/A, etc.).
    """
    match = re.match(r'(\d+)/(\d+)', pass_rate)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None


def _run_level_counts(result: TestResult) -> Tuple[int, int]:
    """Return (passes, runs) contributed by one test result.

    xfailed/skipped results are not countable and contribute (0, 0). When the
    pass_rate string has no "N/M" fraction, a passed test counts as 1/1 and a
    failed test as 0/1; any other outcome contributes nothing.
    """
    if result.outcome in ("xfailed", "skipped"):
        return 0, 0
    fraction = _parse_pass_rate_fraction(result.pass_rate) if result.pass_rate else None
    if fraction:
        return fraction
    if result.outcome == "passed":
        return 1, 1
    if result.outcome == "failed":
        return 0, 1
    return 0, 0


def _calc_run_level_pass_rate(
    report: ModelReport, main_llm_tests: set
) -> Tuple[int, int]:
    """Calculate pass rate from individual run results across all main LLM tests.

    Returns (total_passes, total_runs) by parsing each test's pass_rate string.
    Falls back to counting fully-passed/failed tests when pass_rate data is
    missing. Test names absent from the report are ignored.
    """
    total_passes = 0
    total_runs = 0
    for test_name in main_llm_tests:
        result = report.results.get(test_name)
        if not result:
            continue
        passes, runs = _run_level_counts(result)
        total_passes += passes
        total_runs += runs
    return total_passes, total_runs


STATUS_EMOJI = {
    "passed": "\u2705",         # ✅
    "failed": "\u274c",         # ❌
    "skipped": "\u23ed\ufe0f",  # ⏭️
    "xfailed": "\U0001f538",    # 🔸
    "xpassed": "\U0001f389",    # 🎉
    "partial": "\u26a0\ufe0f",  # ⚠️
    "unknown": "\u2753",        # ❓
}


def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
    """Return (category_key, pinned_model) for fixed-model tests, else None."""
    cls = result.class_name or ""
    name = result.name or ""
    if any(p in cls for p in _INTENT_JUDGE_CLASS_PATTERNS) or any(
        p in name for p in _INTENT_JUDGE_NAME_PATTERNS
    ):
        return ("intent_judge", "gemma4:e2b")
    if cls in _TOOL_SELECTION_EXACT_CLASSES:
        return ("tool_selection", "nomic-embed-text")
    return None


def _rate_emoji(rate: float) -> str:
    """Return a traffic-light emoji for a percentage (>=80 green, >=50 yellow)."""
    return "🟢" if rate >= 80 else "🟡" if rate >= 50 else "🔴"


def _count_outcomes(results: Iterable[TestResult]) -> Dict[str, float]:
    """Count outcome buckets (run-level: uses pass_rate fractions where available).

    Returns a dict with per-outcome test counts ("passed", "failed",
    "skipped", "xfailed", "partial"), their sum ("total"), the run-level
    totals ("run_passes", "run_total") and the run-level pass percentage
    ("rate", a float; 0.0 when there are no countable runs). Outcomes outside
    the five buckets (e.g. "xpassed", "unknown") are not included in "total".
    """
    buckets = {"passed": 0, "failed": 0, "skipped": 0, "xfailed": 0, "partial": 0}
    total_passes = 0
    total_runs = 0
    for r in results:
        if r.outcome in buckets:
            buckets[r.outcome] += 1
        passes, runs = _run_level_counts(r)
        total_passes += passes
        total_runs += runs

    rate = (total_passes / total_runs * 100) if total_runs > 0 else 0.0
    return {
        "passed": buckets["passed"],
        "failed": buckets["failed"],
        "skipped": buckets["skipped"],
        "xfailed": buckets["xfailed"],
        "partial": buckets["partial"],
        "total": sum(buckets.values()),
        "run_passes": total_passes,
        "run_total": total_runs,
        "rate": rate,
    }


def generate_combined_report(reports: List[ModelReport]) -> str:
    """Generate a combined markdown report grouped by test category.

    Args:
        reports: One parsed report per judge model, in display order.

    Returns:
        The full markdown document as a single string.
    """
    lines: List[str] = []
    now = datetime.now()

    # Bucket results into three categories:
    #   judge_compared: run once per judge model, compared side-by-side
    #   intent_judge:   pinned to gemma4:e2b, shown once
    #   tool_selection: pinned to nomic-embed-text, shown once
    judge_compared: Set[str] = set()
    intent_judge_results: Dict[str, TestResult] = {}
    tool_selection_results: Dict[str, TestResult] = {}

    for report in reports:
        for test_name, result in report.results.items():
            fm = _classify_fixed_model(result)
            if fm is None:
                judge_compared.add(test_name)
                continue
            bucket = intent_judge_results if fm[0] == "intent_judge" else tool_selection_results
            existing = bucket.get(test_name)
            # Keep the first result seen, but let a real result replace a skip.
            if existing is None or (existing.outcome == "skipped" and result.outcome != "skipped"):
                bucket[test_name] = result

    # Per-model stats for the judge-compared bucket
    per_model_stats: Dict[str, Dict[str, float]] = {}
    for report in reports:
        results = [r for n, r in report.results.items() if n in judge_compared]
        per_model_stats[report.model_name] = _count_outcomes(results)

    intent_stats = _count_outcomes(list(intent_judge_results.values()))
    tool_stats = _count_outcomes(list(tool_selection_results.values()))

    # Overall aggregate (sum of runs across all categories)
    overall_passes = (
        sum(s["run_passes"] for s in per_model_stats.values())
        + intent_stats["run_passes"]
        + tool_stats["run_passes"]
    )
    overall_runs = (
        sum(s["run_total"] for s in per_model_stats.values())
        + intent_stats["run_total"]
        + tool_stats["run_total"]
    )
    overall_rate = (overall_passes / overall_runs * 100) if overall_runs > 0 else 0.0

    # Header
    lines.append("# 🧪 Jarvis Evaluation Report")
    lines.append("")
    lines.append(f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")

    # TL;DR
    lines.append("## 📊 TL;DR")
    lines.append("")
    lines.append(
        f"**Overall:** {_rate_emoji(overall_rate)} "
        f"**{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories"
    )
    lines.append("")
    lines.append("| Category | Model | Passed | Failed | Skipped | Pass Rate |")
    lines.append("|----------|-------|-------:|-------:|--------:|----------:|")

    def _fmt_row(label: str, model_note: str, stats: Dict[str, float]) -> str:
        """Format one TL;DR table row; the rate shows "➖" when nothing ran."""
        emoji = _rate_emoji(stats["rate"]) if stats["run_total"] else "➖"
        rate_str = f"{emoji} {stats['rate']:.1f}%" if stats["run_total"] else "➖"
        return (
            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
            f"{stats['skipped']} | {rate_str} |"
        )

    for report in reports:
        lines.append(
            _fmt_row("🤖 Agent behaviour", f"`{report.model_name}`", per_model_stats[report.model_name])
        )
    if intent_judge_results:
        lines.append(_fmt_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
    if tool_selection_results:
        lines.append(_fmt_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
    lines.append("")

    # Model selection guide (only when comparing judges)
    if len(reports) > 1:
        lines.append("### 💡 Model Selection Guide")
        lines.append("")
        lines.append("| Model | Best For | Trade-offs |")
        lines.append("|-------|----------|------------|")
        lines.append("| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |")
        lines.append("| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |")
        lines.append("")

    # Agent behaviour: per-test comparison across judge models
    lines.append("---")
    lines.append("")
    lines.append("## 🤖 Agent behaviour")
    lines.append("")
    lines.append("> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.")
    lines.append("")

    header = "| Test Case |"
    separator = "|-----------|"
    for report in reports:
        header += f" {report.model_name} |"
        separator += "----------:|"
    lines.append(header)
    lines.append(separator)

    for test_name in sorted(judge_compared):
        row = f"| {test_name} |"
        for report in reports:
            result = report.results.get(test_name)
            if result:
                emoji = STATUS_EMOJI.get(result.outcome, "❓")
                row += f" {emoji} {result.pass_rate} |" if result.pass_rate else f" {emoji} |"
            else:
                row += " ➖ |"
        lines.append(row)
    lines.append("")

    def _render_fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> None:
        """Append a section for a fixed-model category; no-op when it is empty."""
        if not results:
            return
        lines.append("---")
        lines.append("")
        lines.append(f"## {title}")
        lines.append("")
        lines.append(f"> {blurb}")
        lines.append("")
        lines.append("| Test Case | Pass Rate | Status |")
        lines.append("|-----------|-----------|:------:|")
        for test_name in sorted(results.keys()):
            result = results[test_name]
            emoji = STATUS_EMOJI.get(result.outcome, "❓")
            pass_rate_str = result.pass_rate if result.pass_rate else "N/A"
            lines.append(f"| {test_name} | {pass_rate_str} | {emoji} |")
        lines.append("")

    _render_fixed_section(
        "🎤 Intent judge",
        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
        intent_judge_results,
    )
    _render_fixed_section(
        "🔍 Tool selection",
        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
        tool_selection_results,
    )

    # Legend (symbols come from STATUS_EMOJI so the table and legend agree)
    lines.append("---")
    lines.append("")
    lines.append("### 📖 Legend")
    lines.append("")
    lines.append("| Symbol | Meaning |")
    lines.append("|--------|---------|")
    lines.append(f"| {STATUS_EMOJI['passed']} | Fully passed (100% pass rate) |")
    lines.append(f"| {STATUS_EMOJI['partial']} | Partial pass (some runs failed) |")
    lines.append(f"| {STATUS_EMOJI['failed']} | Fully failed (0% pass rate) |")
    lines.append(f"| {STATUS_EMOJI['skipped']} | Skipped (missing dependencies) |")
    lines.append(f"| {STATUS_EMOJI['xfailed']} | Expected failure (known limitation) |")
    lines.append(f"| {STATUS_EMOJI['xpassed']} | Unexpectedly passed (bug fixed!) |")
    lines.append("| ➖ | Not run for this model |")
    lines.append("")
    lines.append("*Report generated by Jarvis eval suite*")

    return "\n".join(lines)


def main() -> None:
    """CLI entry point: parse (report, model) pairs and write the merged report.

    Exits with status 1 on a malformed argument list or when none of the
    given reports could be read. The report is written to stdout as UTF-8
    bytes so the output does not depend on the console encoding.
    """
    if len(sys.argv) < 5 or len(sys.argv) % 2 != 1:
        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
        sys.exit(1)

    # Parse arguments into pairs
    reports = []
    args = sys.argv[1:]
    for report_path, model_name in zip(args[0::2], args[1::2]):
        report = parse_report(report_path, model_name)
        if report:
            reports.append(report)

    if not reports:
        print("Error: No valid reports found", file=sys.stderr)
        sys.exit(1)

    # Generate combined report
    combined = generate_combined_report(reports)
    sys.stdout.buffer.write(combined.encode("utf-8"))


if __name__ == "__main__":
    main()