Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join plus voice receive/playback, and pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps the Jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md, and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer), because Discord blocks video from bot accounts; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
539
scripts/merge_eval_reports.py
Executable file
539
scripts/merge_eval_reports.py
Executable file
@@ -0,0 +1,539 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge multiple eval reports into a single combined EVALS.md.
|
||||
|
||||
This script takes pairs of (report_path, model_name) arguments and generates
|
||||
a combined report showing results from all models side by side.
|
||||
|
||||
Usage:
|
||||
python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Aggregated result of one test case across all of its runs.

    Attributes:
        name: Test case name as it appears in the report.
        outcome: One of passed, failed, skipped, xfailed, xpassed, partial
            (the parser may also store "unknown").
        duration: Duration in seconds as parsed from the report.
        pass_rate: Human-readable pass rate such as "3/3 (100%)" or
            "2/3 (67%)"; empty when the report did not provide one.
        class_name: Name of the test class the result belongs to; empty
            when no class header preceded it.
    """

    name: str
    outcome: str
    duration: float
    pass_rate: str = ""
    class_name: str = ""
|
||||
|
||||
|
||||
@dataclass
class ModelReport:
    """Everything parsed out of one model's eval report.

    Attributes:
        model_name: Label of the model the report was produced with.
        results: Per-test results keyed by test name.
        total: Total count taken from the report's summary table.
        passed: Passed count taken from the report's summary table.
        failed: Failed count taken from the report's summary table.
        skipped: Skipped count taken from the report's summary table.
        duration: Overall duration in seconds taken from the report.
    """

    model_name: str
    # default_factory gives every report its own dict instead of a shared one.
    results: Dict[str, TestResult] = field(default_factory=dict)
    total: int = 0
    passed: int = 0
    failed: int = 0
    skipped: int = 0
    duration: float = 0.0
|
||||
|
||||
|
||||
# Status markers in the order they are checked. They are stored WITHOUT the
# U+FE0F emoji variation selector so that both the "emoji presentation" form
# (e.g. U+23ED U+FE0F) and the bare character are recognised everywhere.
_OUTCOME_MARKERS = (
    ("\u2705", "passed"),       # check mark
    ("\u274c", "failed"),       # cross mark
    ("\u23ed", "skipped"),      # next-track button, with or without U+FE0F
    ("\U0001f538", "xfailed"),  # small orange diamond
    ("\U0001f389", "xpassed"),  # party popper
    ("\u26a0", "partial"),      # warning sign, with or without U+FE0F
)

# (needle to look for, word to split the line on, ModelReport attribute).
# Needles are compared against the line with U+FE0F removed.
_SUMMARY_COUNT_ROWS = (
    ("| \u2705 Passed |", "Passed", "passed"),
    ("| \u274c Failed |", "Failed", "failed"),
    ("| \u23ed Skipped |", "Skipped", "skipped"),
)


def _outcome_from_marker(text: str) -> str:
    """Map the first status emoji found in *text* to an outcome name."""
    for marker, outcome in _OUTCOME_MARKERS:
        if marker in text:
            return outcome
    return "unknown"


def _parse_seconds(text: str) -> Optional[float]:
    """Return the first "<number>s" value in *text*, or None if absent/invalid."""
    match = re.search(r"([\d.]+)s", text)
    if not match:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also matches things like "1.2.3" or "."; treat
        # those as "no duration" rather than aborting the whole merge.
        return None


def _apply_summary_line(report: "ModelReport", line: str) -> None:
    """Update the summary counters of *report* if *line* is a summary row."""
    plain = line.replace("\ufe0f", "")
    for needle, label, attr in _SUMMARY_COUNT_ROWS:
        if needle in plain:
            match = re.search(r"\|\s*(\d+)\s*\|", plain.split(label)[1])
            if match:
                setattr(report, attr, int(match.group(1)))
            return
    if "| **Total** |" in line:
        match = re.search(r"\|\s*\*\*(\d+)\*\*\s*\|", line)
        if match:
            report.total = int(match.group(1))
        return
    if "**Duration:**" in line:
        seconds = _parse_seconds(line)
        if seconds is not None:
            report.duration = seconds


def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
    """Parse a markdown eval report into a ModelReport.

    Two per-test layouts are understood:

    1. Table rows, either ``| Test Case | Pass Rate | Status | Avg Duration |``
       ("new") or ``| Test Case | Status | Duration |`` ("old").
    2. Detailed entries introduced by ``#### <emoji> <test name>`` and
       followed by ``**Pass Rate:** ...`` and ``*Avg Duration: ...*`` lines.

    Section headers of the form ``### <emoji> TestClassName`` set the class
    name recorded on the results that follow. Results are keyed by test name,
    so a later entry with the same name replaces an earlier one.

    Args:
        report_path: Path of the markdown report to read (UTF-8).
        model_name: Label stored on the returned report.

    Returns:
        The parsed report, or None (after a warning on stderr) when the file
        does not exist.
    """
    path = Path(report_path)
    if not path.exists():
        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
        return None

    lines = path.read_text(encoding="utf-8").split("\n")
    report = ModelReport(model_name=model_name)

    # Pass 1: summary statistics.
    for line in lines:
        _apply_summary_line(report, line)

    # Pass 2: individual test results.
    in_table = False
    table_format = "old"  # "old" or "new"
    current_class = ""
    current_detailed_test = None  # name of the detailed entry being filled in

    for line in lines:
        # Class section header, e.g. "### <emoji> TestIntentJudgeAccuracy".
        # \S+ is used for the emoji so multi-code-point emoji are accepted.
        class_header_match = re.match(r'^###\s+\S+\s+(Test\w+)', line)
        if class_header_match:
            current_class = class_header_match.group(1)
            in_table = False  # a new section ends any open table
            current_detailed_test = None
            continue

        # Detailed test header, e.g. "#### <emoji> wake_word_simple_question".
        detailed_test_match = re.match(r'^####\s+(\S+)\s+(.+)$', line)
        if detailed_test_match:
            in_table = False
            test_name = detailed_test_match.group(2).strip()
            current_detailed_test = test_name
            # Placeholder duration/pass rate; filled in by the lines below.
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(detailed_test_match.group(1)),
                duration=0.0,
                pass_rate="",
                class_name=current_class,
            )
            continue

        # Follow-up lines of a detailed entry.
        if current_detailed_test and current_detailed_test in report.results:
            if line.startswith("**Pass Rate:**"):
                # "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
                pass_rate_match = re.search(r'\*\*Pass Rate:\*\*\s*(.+)', line)
                if pass_rate_match:
                    report.results[current_detailed_test].pass_rate = (
                        pass_rate_match.group(1).strip()
                    )
            elif line.startswith("*Avg Duration:"):
                # "*Avg Duration: 1.23s*" is the last line of an entry.
                seconds = _parse_seconds(line)
                if seconds is not None:
                    report.results[current_detailed_test].duration = seconds
                current_detailed_test = None

        # Table headers switch table mode on and select the column layout.
        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
            in_table = True
            table_format = "new"
            current_detailed_test = None
            continue
        if "| Test Case | Status | Duration |" in line:
            in_table = True
            table_format = "old"
            current_detailed_test = None
            continue

        if in_table and line.startswith("|") and "---" not in line:
            parts = [p.strip() for p in line.split("|")[1:-1]]

            if table_format == "new" and len(parts) >= 4:
                test_name, pass_rate, status_cell, duration_cell = parts[:4]
            elif len(parts) >= 3:
                test_name, status_cell, duration_cell = parts[:3]
                pass_rate = ""
            else:
                continue

            seconds = _parse_seconds(duration_cell)
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(status_cell),
                duration=seconds if seconds is not None else 0.0,
                pass_rate=pass_rate,
                class_name=current_class,
            )
        elif in_table and not line.startswith("|"):
            # First non-table line closes the table.
            in_table = False

    return report
|
||||
|
||||
|
||||
def is_fixed_model_test(result: TestResult) -> bool:
    """Tell whether a test runs on a pinned model rather than the judge model.

    Some tests are pinned to specific models regardless of EVAL_JUDGE_MODEL:
    - Intent judge tests use gemma4 (the intent classification model)
    - Tool selection tests use nomic-embed-text (the embedding model)

    Comparing them across judge models is meaningless because the model never
    changes, so they are reported in their own section.

    NOTE: This list is kept in sync manually. When you add a new test class or
    file whose model is pinned (not controlled by EVAL_JUDGE_MODEL), add its
    class-name substring below or its test-name pattern to the fallback list.
    """
    owner = result.class_name
    if owner:
        # Exact match only, so TestToolSelectionFilteringLLM isn't bucketed
        # with the embedding strategy pinned to nomic-embed-text.
        if owner in {"TestToolSelectionFiltering"}:
            return True
        # Substring match: TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment,
        # ... and the intent judge processed-segment filtering tests.
        for fragment in ("IntentJudge", "ProcessedSegmentFiltering"):
            if fragment in owner:
                return True

    # Fallback on the test name when the class did not identify the test.
    for marker in (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    ):
        if marker in result.name:
            return True
    return False


# Backwards-compatible alias
is_intent_judge_test = is_fixed_model_test
|
||||
|
||||
|
||||
def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
    """Extract (passes, total) from the start of a string like '2/3 (67%)'.

    Strings that do not begin with ``<digits>/<digits>`` (SKIPPED, XFAIL,
    N/A, ...) yield None.
    """
    found = re.match(r'(\d+)/(\d+)', pass_rate)
    if found is None:
        return None
    passes, total = found.groups()
    return int(passes), int(total)
|
||||
|
||||
|
||||
def _calc_run_level_pass_rate(
    report: ModelReport, main_llm_tests: set
) -> Tuple[int, int]:
    """Sum individual run results over the given main LLM tests.

    Each test contributes the fraction parsed from its pass_rate string. When
    no fraction is available, a passed test counts as 1/1 and a failed test as
    0/1; other outcomes contribute nothing. Tests missing from the report and
    tests whose outcome is xfailed or skipped are ignored.

    Returns:
        (total_passes, total_runs)
    """
    passes_sum = 0
    runs_sum = 0

    for name in main_llm_tests:
        entry = report.results.get(name)
        if not entry:
            continue
        # xfailed/skipped are not countable
        if entry.outcome in ("xfailed", "skipped"):
            continue

        parsed = _parse_pass_rate_fraction(entry.pass_rate) if entry.pass_rate else None
        if parsed:
            passes_sum += parsed[0]
            runs_sum += parsed[1]
            continue

        # No usable fraction: fall back to the coarse outcome.
        if entry.outcome == "passed":
            passes_sum += 1
            runs_sum += 1
        elif entry.outcome == "failed":
            runs_sum += 1

    return passes_sum, runs_sum
|
||||
|
||||
|
||||
# Emoji shown for each outcome name in the generated tables.
STATUS_EMOJI = {
    "passed": "✅",
    "failed": "❌",
    "skipped": "⏭️",
    "xfailed": "🔸",
    "xpassed": "🎉",
    "partial": "⚠️",
    "unknown": "❓",
}
|
||||
|
||||
|
||||
def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
    """Classify a fixed-model test as (category_key, pinned_model).

    Returns None when the test is not pinned to a specific model. A missing
    class name or test name is treated as an empty string.
    """
    owner = result.class_name or ""
    label = result.name or ""

    intent_name_markers = (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    )
    pinned_by_class = "IntentJudge" in owner or "ProcessedSegmentFiltering" in owner
    if pinned_by_class or any(marker in label for marker in intent_name_markers):
        return ("intent_judge", "gemma4:e2b")

    # Exact comparison: only this class, not classes that merely start with it.
    if owner == "TestToolSelectionFiltering":
        return ("tool_selection", "nomic-embed-text")

    return None
|
||||
|
||||
|
||||
def _rate_emoji(rate: float) -> str:
    """Return a traffic-light emoji for a percentage: >=80 green, >=50 yellow, else red."""
    if rate >= 80:
        return "🟢"
    if rate >= 50:
        return "🟡"
    return "🔴"
|
||||
|
||||
|
||||
def _count_outcomes(results) -> Dict[str, int]:
|
||||
"""Count outcome buckets (run-level: uses pass_rate fractions where available)."""
|
||||
passed = failed = skipped = xfailed = partial = 0
|
||||
total_passes = total_runs = 0
|
||||
for r in results:
|
||||
if r.outcome == "passed":
|
||||
passed += 1
|
||||
elif r.outcome == "failed":
|
||||
failed += 1
|
||||
elif r.outcome == "skipped":
|
||||
skipped += 1
|
||||
elif r.outcome == "xfailed":
|
||||
xfailed += 1
|
||||
elif r.outcome == "partial":
|
||||
partial += 1
|
||||
if r.outcome in ("xfailed", "skipped"):
|
||||
continue
|
||||
fraction = _parse_pass_rate_fraction(r.pass_rate) if r.pass_rate else None
|
||||
if fraction:
|
||||
total_passes += fraction[0]
|
||||
total_runs += fraction[1]
|
||||
elif r.outcome == "passed":
|
||||
total_passes += 1
|
||||
total_runs += 1
|
||||
elif r.outcome == "failed":
|
||||
total_runs += 1
|
||||
rate = (total_passes / total_runs * 100) if total_runs > 0 else 0.0
|
||||
return {
|
||||
"passed": passed, "failed": failed, "skipped": skipped,
|
||||
"xfailed": xfailed, "partial": partial,
|
||||
"total": passed + failed + skipped + xfailed + partial,
|
||||
"run_passes": total_passes, "run_total": total_runs, "rate": rate,
|
||||
}
|
||||
|
||||
|
||||
def generate_combined_report(reports: List[ModelReport]) -> str:
    """Build one markdown report from all model reports, grouped by category.

    Results fall into three categories:
      - agent behaviour: tests run once per judge model, compared side by side
      - intent judge: tests pinned to gemma4:e2b, shown once
      - tool selection: tests pinned to nomic-embed-text, shown once

    Args:
        reports: Parsed reports, one per model, in the column order to use.

    Returns:
        The complete markdown document as a single string (no trailing
        newline).
    """
    generated_at = datetime.now()
    out: List[str] = []

    # --- Categorise every result -------------------------------------------
    compared_names = set()
    intent_bucket: Dict[str, TestResult] = {}
    tool_bucket: Dict[str, TestResult] = {}

    for model_report in reports:
        for name, res in model_report.results.items():
            category = _classify_fixed_model(res)
            if category is None:
                compared_names.add(name)
                continue
            target = intent_bucket if category[0] == "intent_judge" else tool_bucket
            kept = target.get(name)
            # The first report to provide a result wins, except that a skipped
            # entry is replaced by a later non-skipped one.
            if kept is None or (kept.outcome == "skipped" and res.outcome != "skipped"):
                target[name] = res

    # --- Statistics ----------------------------------------------------------
    stats_by_model = {
        model_report.model_name: _count_outcomes(
            [res for name, res in model_report.results.items() if name in compared_names]
        )
        for model_report in reports
    }
    intent_stats = _count_outcomes(list(intent_bucket.values()))
    tool_stats = _count_outcomes(list(tool_bucket.values()))

    # Overall aggregate: runs summed across every category.
    every_bucket = [*stats_by_model.values(), intent_stats, tool_stats]
    overall_passes = sum(bucket["run_passes"] for bucket in every_bucket)
    overall_runs = sum(bucket["run_total"] for bucket in every_bucket)
    overall_rate = (overall_passes / overall_runs * 100) if overall_runs > 0 else 0.0

    # --- Header and TL;DR ----------------------------------------------------
    out.extend([
        "# 🧪 Jarvis Evaluation Report",
        "",
        f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## 📊 TL;DR",
        "",
        f"**Overall:** {_rate_emoji(overall_rate)} **{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories",
        "",
        "| Category | Model | Passed | Failed | Skipped | Pass Rate |",
        "|----------|-------|-------:|-------:|--------:|----------:|",
    ])

    def _summary_row(label: str, model_note: str, stats: Dict[str, int]) -> str:
        # A category without any countable run shows a dash instead of a rate.
        if stats["run_total"]:
            rate_cell = f"{_rate_emoji(stats['rate'])} {stats['rate']:.1f}%"
        else:
            rate_cell = "➖"
        return (
            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
            f"{stats['skipped']} | {rate_cell} |"
        )

    out.extend(
        _summary_row("🤖 Agent behaviour", f"`{model_report.model_name}`", stats_by_model[model_report.model_name])
        for model_report in reports
    )
    if intent_bucket:
        out.append(_summary_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
    if tool_bucket:
        out.append(_summary_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
    out.append("")

    # --- Model selection guide (only when comparing judges) ------------------
    if len(reports) > 1:
        out.extend([
            "### 💡 Model Selection Guide",
            "",
            "| Model | Best For | Trade-offs |",
            "|-------|----------|------------|",
            "| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |",
            "| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |",
            "",
        ])

    # --- Agent behaviour: one column per judge model -------------------------
    out.extend([
        "---",
        "",
        "## 🤖 Agent behaviour",
        "",
        "> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.",
        "",
        "| Test Case |" + "".join(f" {model_report.model_name} |" for model_report in reports),
        "|-----------|" + "----------:|" * len(reports),
    ])
    for name in sorted(compared_names):
        cells = []
        for model_report in reports:
            res = model_report.results.get(name)
            if not res:
                cells.append(" ➖ |")
                continue
            icon = STATUS_EMOJI.get(res.outcome, "❓")
            cells.append(f" {icon} {res.pass_rate} |" if res.pass_rate else f" {icon} |")
        out.append(f"| {name} |" + "".join(cells))
    out.append("")

    # --- Fixed-model sections ------------------------------------------------
    def _emit_fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> None:
        # Sections without results are omitted entirely.
        if not results:
            return
        out.extend([
            "---",
            "",
            f"## {title}",
            "",
            f"> {blurb}",
            "",
            "| Test Case | Pass Rate | Status |",
            "|-----------|-----------|:------:|",
        ])
        for name in sorted(results):
            res = results[name]
            icon = STATUS_EMOJI.get(res.outcome, "❓")
            shown_rate = res.pass_rate or "N/A"
            out.append(f"| {name} | {shown_rate} | {icon} |")
        out.append("")

    _emit_fixed_section(
        "🎤 Intent judge",
        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
        intent_bucket,
    )
    _emit_fixed_section(
        "🔍 Tool selection",
        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
        tool_bucket,
    )

    # --- Legend --------------------------------------------------------------
    out.extend([
        "---",
        "",
        "### 📖 Legend",
        "",
        "| Symbol | Meaning |",
        "|--------|---------|",
        "| ✅ | Fully passed (100% pass rate) |",
        "| ⚠️ | Partial pass (some runs failed) |",
        "| ❌ | Fully failed (0% pass rate) |",
        "| ⏭️ | Skipped (missing dependencies) |",
        "| 🔸 | Expected failure (known limitation) |",
        "| 🎉 | Unexpectedly passed (bug fixed!) |",
        "| ➖ | Not run for this model |",
        "",
        "*Report generated by Jarvis eval suite*",
    ])

    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: merge the given reports and print the result.

    Expects at least two ``report_path model_name`` pairs on the command line.
    Prints a usage message to stderr and exits with status 1 when the
    arguments are malformed or when none of the reports could be parsed.
    The combined report is written to stdout as UTF-8 bytes.
    """
    argv = sys.argv
    if len(argv) < 5 or len(argv) % 2 != 1:
        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
        sys.exit(1)

    # Walk the arguments two at a time: (report path, model name).
    cli_args = argv[1:]
    reports = []
    for report_path, model_name in zip(cli_args[0::2], cli_args[1::2]):
        parsed = parse_report(report_path, model_name)
        if parsed:
            reports.append(parsed)

    if not reports:
        print("Error: No valid reports found", file=sys.stderr)
        sys.exit(1)

    # Write raw UTF-8 bytes so the emoji survive regardless of the console's
    # text encoding.
    sys.stdout.buffer.write(generate_combined_report(reports).encode("utf-8"))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user