Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
540 lines
20 KiB
Python
Executable File
540 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Merge multiple eval reports into a single combined EVALS.md.
|
||
|
||
This script takes pairs of (report_path, model_name) arguments and generates
|
||
a combined report showing results from all models side by side.
|
||
|
||
Usage:
|
||
python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
|
||
"""
|
||
|
||
import sys
|
||
import re
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from dataclasses import dataclass, field
|
||
from typing import Dict, List, Optional, Tuple
|
||
|
||
|
||
@dataclass
class TestResult:
    """One test case's result, aggregated over all of its runs."""

    # Test case name, used as the lookup key for the result.
    name: str
    # One of: passed, failed, skipped, xfailed, xpassed, partial.
    outcome: str
    # Duration as a plain float (0.0 when none was recorded).
    duration: float
    # Human-readable pass rate, e.g. "3/3 (100%)" or "2/3 (67%)"; empty when unknown.
    pass_rate: str = ""
    # Name of the test class this result belongs to; empty when unknown.
    class_name: str = ""
|
||
|
||
|
||
@dataclass
class ModelReport:
    """Everything extracted from one model's eval report."""

    # Label of the model this report was produced with.
    model_name: str
    # Per-test results keyed by test name; each instance gets its own dict.
    results: Dict[str, TestResult] = field(default_factory=dict)
    # Summary counters; all stay at zero until a value is assigned.
    total: int = 0
    passed: int = 0
    failed: int = 0
    skipped: int = 0
    # Overall duration figure for the report.
    duration: float = 0.0
|
||
|
||
|
||
# Status emoji -> outcome name, checked in this order. The skip and warning
# signs are matched by their base code points (U+23ED, U+26A0) so they are
# recognised both with and without a trailing U+FE0F variation selector.
_OUTCOME_MARKERS = (
    ("✅", "passed"),
    ("❌", "failed"),
    ("\u23ed", "skipped"),
    ("🔸", "xfailed"),
    ("🎉", "xpassed"),
    ("\u26a0", "partial"),
)

# Summary rows that carry a plain integer count:
# (accepted row markers, label to split the line on, ModelReport attribute).
_SUMMARY_COUNT_ROWS = (
    (("| ✅ Passed |",), "Passed", "passed"),
    (("| ❌ Failed |",), "Failed", "failed"),
    (("| \u23ed\ufe0f Skipped |", "| \u23ed Skipped |"), "Skipped", "skipped"),
)

_COUNT_CELL_RE = re.compile(r"\|\s*(\d+)\s*\|")
_TOTAL_CELL_RE = re.compile(r"\|\s*\*\*(\d+)\*\*\s*\|")
# Only captures text that float() accepts ("12", "12.5", "12.", ".5"), so a
# stray "." or "1.2.3" in front of an "s" can no longer raise ValueError.
_DURATION_RE = re.compile(r"(\d+\.?\d*|\.\d+)s")
_PASS_RATE_RE = re.compile(r"\*\*Pass Rate:\*\*\s*(.+)")
# "\S+" stands in for the status emoji, which may span several code points.
_CLASS_HEADER_RE = re.compile(r"^###\s+\S+\s+(Test\w+)")
_DETAILED_HEADER_RE = re.compile(r"^####\s+(\S+)\s+(.+)$")
_SEPARATOR_CELL_RE = re.compile(r":?-+:?")


def _outcome_from_status(text: str) -> str:
    """Return the outcome name for the first known status emoji in *text*."""
    for marker, outcome in _OUTCOME_MARKERS:
        if marker in text:
            return outcome
    return "unknown"


def _is_separator_row(cells: List[str]) -> bool:
    """Return True when every cell is a markdown delimiter such as '---' or ':--:'."""
    return bool(cells) and all(_SEPARATOR_CELL_RE.fullmatch(cell) for cell in cells)


def _parse_summary_stats(lines: List[str], report: ModelReport) -> None:
    """Fill the passed/failed/skipped/total/duration fields of *report*."""
    for line in lines:
        for markers, label, attr in _SUMMARY_COUNT_ROWS:
            if any(marker in line for marker in markers):
                # The count sits in the cell that follows the label.
                match = _COUNT_CELL_RE.search(line.split(label)[1])
                if match:
                    setattr(report, attr, int(match.group(1)))
                break
        else:
            if "| **Total** |" in line:
                match = _TOTAL_CELL_RE.search(line)
                if match:
                    report.total = int(match.group(1))
            elif "**Duration:**" in line:
                match = _DURATION_RE.search(line)
                if match:
                    report.duration = float(match.group(1))


def _parse_test_results(lines: List[str], report: ModelReport) -> None:
    """Fill ``report.results`` from the table and detailed ("####") sections."""
    in_table = False
    table_format = "old"  # "old": 3 columns, "new": 4 columns (adds Pass Rate)
    current_class = ""
    current_detailed_test = None  # name of the "####" entry being completed

    for line in lines:
        # Class section header, e.g. "### ✅ TestIntentJudgeAccuracy".
        class_header = _CLASS_HEADER_RE.match(line)
        if class_header:
            current_class = class_header.group(1)
            in_table = False
            current_detailed_test = None
            continue

        # Detailed test header, e.g. "#### ✅ wake_word_simple_question".
        detailed_header = _DETAILED_HEADER_RE.match(line)
        if detailed_header:
            in_table = False
            test_name = detailed_header.group(2).strip()
            current_detailed_test = test_name
            # Pass rate and duration are placeholders until their lines show up.
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_status(detailed_header.group(1)),
                duration=0.0,
                pass_rate="",
                class_name=current_class,
            )
            continue

        if current_detailed_test and current_detailed_test in report.results:
            detailed = report.results[current_detailed_test]
            if line.startswith("**Pass Rate:**"):
                # e.g. "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
                match = _PASS_RATE_RE.search(line)
                if match:
                    detailed.pass_rate = match.group(1).strip()
            elif line.startswith("*Avg Duration:"):
                # e.g. "*Avg Duration: 1.23s*"; this line closes the entry.
                match = _DURATION_RE.search(line)
                if match:
                    detailed.duration = float(match.group(1))
                current_detailed_test = None

        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
            in_table = True
            table_format = "new"
            current_detailed_test = None
            continue
        if "| Test Case | Status | Duration |" in line:
            in_table = True
            table_format = "old"
            current_detailed_test = None
            continue

        if in_table and line.startswith("|"):
            cells = [cell.strip() for cell in line.split("|")[1:-1]]
            # Skip only genuine delimiter rows; a data row that merely
            # contains "---" somewhere is still parsed.
            if _is_separator_row(cells):
                continue
            if table_format == "new" and len(cells) >= 4:
                test_name, pass_rate, status_cell, duration_cell = cells[:4]
            elif len(cells) >= 3:
                test_name, status_cell, duration_cell = cells[:3]
                pass_rate = ""
            else:
                continue

            duration_match = _DURATION_RE.search(duration_cell)
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_status(status_cell),
                duration=float(duration_match.group(1)) if duration_match else 0.0,
                pass_rate=pass_rate,
                class_name=current_class,
            )
        elif in_table:
            # First non-table line ends the table.
            in_table = False


def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
    """Parse a markdown eval report into a ModelReport.

    Two kinds of information are extracted:

    * summary statistics (passed / failed / skipped / total / duration), and
    * per-test results, read either from result tables
      (``| Test Case | Pass Rate | Status | Avg Duration |`` or the older
      ``| Test Case | Status | Duration |``) or from detailed ``#### <emoji>
      <test name>`` entries followed by ``**Pass Rate:**`` and
      ``*Avg Duration:`` lines. Results are tagged with the most recent
      ``### <emoji> Test...`` class header.

    Args:
        report_path: Path of the markdown report to read (UTF-8).
        model_name: Label stored on the returned report.

    Returns:
        The parsed report, or None (after a warning on stderr) when the file
        does not exist.
    """
    path = Path(report_path)
    if not path.exists():
        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
        return None

    lines = path.read_text(encoding="utf-8").split("\n")
    report = ModelReport(model_name=model_name)
    _parse_summary_stats(lines, report)
    _parse_test_results(lines, report)
    return report
|
||
|
||
|
||
def is_fixed_model_test(result: TestResult) -> bool:
    """Tell whether a test runs on a pinned model rather than the judge model.

    Certain tests ignore EVAL_JUDGE_MODEL and always use the same model:
    - Intent judge tests use gemma4 (the intent classification model)
    - Tool selection tests use nomic-embed-text (the embedding model)

    Comparing them across judge models is meaningless because the model never
    changes, so they are reported in a section of their own.

    NOTE: These lists are maintained by hand. Whenever a new test class or
    file with a pinned model (not controlled by EVAL_JUDGE_MODEL) is added,
    register its class-name substring, or its test-name pattern in the
    fallback tuple, here.
    """
    # Matched exactly so that TestToolSelectionFilteringLLM is not bucketed
    # here; this class uses the embedding strategy pinned to nomic-embed-text.
    pinned_exact_classes = {"TestToolSelectionFiltering"}
    # Matched as substrings of the class name.
    pinned_class_fragments = (
        "IntentJudge",  # TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment, etc.
        "ProcessedSegmentFiltering",  # Intent judge processed segment filtering
    )
    # Fallback: matched as substrings of the test name.
    pinned_test_names = (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    )

    owner = result.class_name
    if owner and (
        owner in pinned_exact_classes
        or any(fragment in owner for fragment in pinned_class_fragments)
    ):
        return True

    return any(marker in result.name for marker in pinned_test_names)


# Backwards-compatible alias
is_intent_judge_test = is_fixed_model_test
|
||
|
||
|
||
def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
    """Split a pass rate such as '2/3 (67%)' into ``(passes, total)``.

    Only a leading ``<digits>/<digits>`` is recognised; anything else
    (SKIPPED, XFAIL, N/A, ...) yields None.
    """
    found = re.match(r'(\d+)/(\d+)', pass_rate)
    if found is None:
        return None
    passes, total = found.groups()
    return int(passes), int(total)
|
||
|
||
|
||
def _calc_run_level_pass_rate(
    report: ModelReport, main_llm_tests: set
) -> Tuple[int, int]:
    """Sum individual run results over the given main LLM tests.

    Returns ``(total_passes, total_runs)``. Each test contributes the fraction
    parsed from its pass_rate string; when that is missing or unparsable, a
    passed test counts as 1/1 and a failed test as 0/1. Tests absent from the
    report, and xfailed/skipped tests, contribute nothing.
    """
    passes = 0
    runs = 0

    for name in main_llm_tests:
        entry = report.results.get(name)
        # Missing, xfailed and skipped entries are not countable.
        if not entry or entry.outcome in ("xfailed", "skipped"):
            continue

        parsed = _parse_pass_rate_fraction(entry.pass_rate) if entry.pass_rate else None
        if parsed:
            passes += parsed[0]
            runs += parsed[1]
            continue

        # No usable fraction: fall back to the aggregate outcome.
        if entry.outcome == "passed":
            passes += 1
            runs += 1
        elif entry.outcome == "failed":
            runs += 1

    return passes, runs
|
||
|
||
|
||
# Emoji shown for each test outcome in the generated tables.
STATUS_EMOJI = dict(
    passed="✅",
    failed="❌",
    skipped="⏭️",
    xfailed="🔸",
    xpassed="🎉",
    partial="⚠️",
    unknown="❓",
)
|
||
|
||
|
||
def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
    """Classify a test that runs on a pinned model.

    Returns ``(category_key, pinned_model)`` for such tests and None for
    everything else. A missing class name or test name is treated as empty.
    """
    class_name = result.class_name or ""
    test_name = result.name or ""

    # Test-name fragments that mark an intent-judge test regardless of class.
    intent_name_markers = (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    )
    is_intent_judge = (
        "IntentJudge" in class_name
        or "ProcessedSegmentFiltering" in class_name
        or any(marker in test_name for marker in intent_name_markers)
    )
    if is_intent_judge:
        return ("intent_judge", "gemma4:e2b")

    # Exact comparison: classes that merely start with this name do not match.
    if class_name == "TestToolSelectionFiltering":
        return ("tool_selection", "nomic-embed-text")

    return None
|
||
|
||
|
||
def _rate_emoji(rate: float) -> str:
    """Pick a traffic-light emoji for a rate: green from 80, yellow from 50, else red."""
    if rate >= 80:
        return "🟢"
    if rate >= 50:
        return "🟡"
    return "🔴"
|
||
|
||
|
||
def _count_outcomes(results) -> Dict[str, int]:
    """Tally outcomes per bucket and compute a run-level pass rate.

    Bucket counts are per test. The run-level figures use each test's
    pass_rate fraction when it parses, otherwise a passed test counts as 1/1
    and a failed test as 0/1; xfailed and skipped tests are left out of the
    run-level figures. Outcomes outside the five buckets are not tallied.

    Returns a dict with the keys passed, failed, skipped, xfailed, partial,
    total, run_passes, run_total and rate (a percentage, 0.0 with no runs).
    """
    tally = {"passed": 0, "failed": 0, "skipped": 0, "xfailed": 0, "partial": 0}
    run_passes = 0
    run_total = 0

    for item in results:
        if item.outcome in tally:
            tally[item.outcome] += 1

        # Not countable at run level.
        if item.outcome in ("xfailed", "skipped"):
            continue

        parsed = _parse_pass_rate_fraction(item.pass_rate) if item.pass_rate else None
        if parsed:
            run_passes += parsed[0]
            run_total += parsed[1]
        elif item.outcome == "passed":
            run_passes += 1
            run_total += 1
        elif item.outcome == "failed":
            run_total += 1

    if run_total > 0:
        rate = run_passes / run_total * 100
    else:
        rate = 0.0

    return {
        **tally,
        "total": sum(tally.values()),
        "run_passes": run_passes,
        "run_total": run_total,
        "rate": rate,
    }
|
||
|
||
|
||
def generate_combined_report(reports: List[ModelReport]) -> str:
    """Build the combined markdown report, organised by test category.

    Results fall into three buckets:
      - judge-compared tests: run once per judge model, shown side by side;
      - intent judge tests: pinned to gemma4:e2b, shown once;
      - tool selection tests: pinned to nomic-embed-text, shown once.

    Returns the whole document as one newline-joined string.
    """
    generated_at = datetime.now()
    out: List[str] = []

    # ---- Bucket every result -------------------------------------------
    compared_names: set = set()
    intent_bucket: Dict[str, TestResult] = {}
    tool_bucket: Dict[str, TestResult] = {}

    for model_report in reports:
        for name, entry in model_report.results.items():
            category = _classify_fixed_model(entry)
            if category is None:
                compared_names.add(name)
                continue
            target = intent_bucket if category[0] == "intent_judge" else tool_bucket
            held = target.get(name)
            # First result wins, except that a skipped one gives way to a
            # later non-skipped one.
            if held is None or (held.outcome == "skipped" and entry.outcome != "skipped"):
                target[name] = entry

    # ---- Statistics ------------------------------------------------------
    stats_by_model = {
        model_report.model_name: _count_outcomes(
            [entry for name, entry in model_report.results.items() if name in compared_names]
        )
        for model_report in reports
    }
    intent_stats = _count_outcomes(list(intent_bucket.values()))
    tool_stats = _count_outcomes(list(tool_bucket.values()))

    # Overall aggregate: runs summed over every category.
    overall_passes = (
        sum(stats["run_passes"] for stats in stats_by_model.values())
        + intent_stats["run_passes"]
        + tool_stats["run_passes"]
    )
    overall_runs = (
        sum(stats["run_total"] for stats in stats_by_model.values())
        + intent_stats["run_total"]
        + tool_stats["run_total"]
    )
    if overall_runs > 0:
        overall_rate = overall_passes / overall_runs * 100
    else:
        overall_rate = 0.0

    def summary_row(label: str, model_note: str, stats: Dict[str, int]) -> str:
        """Format one TL;DR table row; the rate cell is a dash with no runs."""
        if stats["run_total"]:
            rate_cell = f"{_rate_emoji(stats['rate'])} {stats['rate']:.1f}%"
        else:
            rate_cell = "➖"
        return (
            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
            f"{stats['skipped']} | {rate_cell} |"
        )

    def fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> List[str]:
        """Return the lines of a pinned-model section (nothing when it is empty)."""
        if not results:
            return []
        section = [
            "---",
            "",
            f"## {title}",
            "",
            f"> {blurb}",
            "",
            "| Test Case | Pass Rate | Status |",
            "|-----------|-----------|:------:|",
        ]
        for name in sorted(results.keys()):
            entry = results[name]
            mark = STATUS_EMOJI.get(entry.outcome, "❓")
            shown_rate = entry.pass_rate if entry.pass_rate else "N/A"
            section.append(f"| {name} | {shown_rate} | {mark} |")
        section.append("")
        return section

    # ---- Header and TL;DR ------------------------------------------------
    out.extend([
        "# 🧪 Jarvis Evaluation Report",
        "",
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## 📊 TL;DR",
        "",
        f"**Overall:** {_rate_emoji(overall_rate)} **{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories",
        "",
        "| Category | Model | Passed | Failed | Skipped | Pass Rate |",
        "|----------|-------|-------:|-------:|--------:|----------:|",
    ])
    out.extend(
        summary_row("🤖 Agent behaviour", f"`{model_report.model_name}`", stats_by_model[model_report.model_name])
        for model_report in reports
    )
    if intent_bucket:
        out.append(summary_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
    if tool_bucket:
        out.append(summary_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
    out.append("")

    # ---- Model selection guide (only when comparing judges) --------------
    if len(reports) > 1:
        out.extend([
            "### 💡 Model Selection Guide",
            "",
            "| Model | Best For | Trade-offs |",
            "|-------|----------|------------|",
            "| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |",
            "| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |",
            "",
        ])

    # ---- Agent behaviour: per-test comparison across judge models --------
    out.extend([
        "---",
        "",
        "## 🤖 Agent behaviour",
        "",
        "> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.",
        "",
        "| Test Case |" + "".join(f" {model_report.model_name} |" for model_report in reports),
        "|-----------|" + "----------:|" * len(reports),
    ])
    for name in sorted(compared_names):
        cells = []
        for model_report in reports:
            entry = model_report.results.get(name)
            if not entry:
                cells.append(" ➖ |")
                continue
            mark = STATUS_EMOJI.get(entry.outcome, "❓")
            cells.append(f" {mark} {entry.pass_rate} |" if entry.pass_rate else f" {mark} |")
        out.append(f"| {name} |" + "".join(cells))
    out.append("")

    # ---- Pinned-model sections -------------------------------------------
    out.extend(fixed_section(
        "🎤 Intent judge",
        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
        intent_bucket,
    ))
    out.extend(fixed_section(
        "🔍 Tool selection",
        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
        tool_bucket,
    ))

    # ---- Legend ------------------------------------------------------------
    out.extend([
        "---",
        "",
        "### 📖 Legend",
        "",
        "| Symbol | Meaning |",
        "|--------|---------|",
        "| ✅ | Fully passed (100% pass rate) |",
        "| ⚠️ | Partial pass (some runs failed) |",
        "| ❌ | Fully failed (0% pass rate) |",
        "| ⏭️ | Skipped (missing dependencies) |",
        "| 🔸 | Expected failure (known limitation) |",
        "| 🎉 | Unexpectedly passed (bug fixed!) |",
        "| ➖ | Not run for this model |",
        "",
        "*Report generated by Jarvis eval suite*",
    ])

    return "\n".join(out)
|
||
|
||
|
||
def main():
    """Command-line entry point: merge the given reports and write the result to stdout.

    Expects at least two ``report_path model_name`` pairs on the command line.
    Exits with status 1 on a malformed argument list or when none of the
    reports could be parsed.
    """
    argc = len(sys.argv)
    # argv[0] plus an even number (>= 4) of arguments.
    if argc < 5 or argc % 2 != 1:
        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
        sys.exit(1)

    # Walk the arguments as (report path, model name) pairs.
    cli_args = sys.argv[1:]
    reports = []
    for path_arg, model_arg in zip(cli_args[0::2], cli_args[1::2]):
        parsed = parse_report(path_arg, model_arg)
        if parsed:
            reports.append(parsed)

    if not reports:
        print("Error: No valid reports found", file=sys.stderr)
        sys.exit(1)

    # Write raw UTF-8 bytes so the emoji survive any stdout encoding.
    markdown = generate_combined_report(reports)
    sys.stdout.buffer.write(markdown.encode("utf-8"))


if __name__ == "__main__":
    main()
|