Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/scripts/merge_eval_reports.py
+++ b/scripts/merge_eval_reports.py
@@ -0,0 +1,539 @@
+#!/usr/bin/env python3
+"""
+Merge multiple eval reports into a single combined EVALS.md.
+
+This script takes pairs of (report_path, model_name) arguments and generates
+a combined report showing results from all models side by side.
+
+Usage:
+    python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
+"""
+
+import sys
+import re
+from datetime import datetime
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+
+@dataclass
+class TestResult:
+    """Result for a single test case (aggregated across multiple runs)."""
+    name: str
+    outcome: str  # passed, failed, skipped, xfailed, xpassed, partial
+    duration: float
+    pass_rate: str = ""  # e.g., "3/3 (100%)" or "2/3 (67%)"
+    class_name: str = ""  # The test class this result belongs to
+
+
+@dataclass
+class ModelReport:
+    """Parsed report for a single model."""
+    model_name: str
+    results: Dict[str, TestResult] = field(default_factory=dict)
+    total: int = 0
+    passed: int = 0
+    failed: int = 0
+    skipped: int = 0
+    duration: float = 0.0
+
+
+def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
+    """Parse a markdown eval report into a ModelReport."""
+    path = Path(report_path)
+    if not path.exists():
+        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
+        return None
+
+    content = path.read_text(encoding="utf-8")
+    report = ModelReport(model_name=model_name)
+
+    # Parse summary stats
+    for line in content.split("\n"):
+        if "| ✅ Passed |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Passed")[1])
+            if match:
+                report.passed = int(match.group(1))
+        elif "| ❌ Failed |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Failed")[1])
+            if match:
+                report.failed = int(match.group(1))
+        elif "| ⏭️ Skipped |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Skipped")[1])
+            if match:
+                report.skipped = int(match.group(1))
+        elif "| **Total** |" in line:
+            match = re.search(r"\|\s*\*\*(\d+)\*\*\s*\|", line)
+            if match:
+                report.total = int(match.group(1))
+        elif "**Duration:**" in line:
+            match = re.search(r"([\d.]+)s", line)
+            if match:
+                report.duration = float(match.group(1))
+
+    # Parse individual test results from:
+    # 1. Table format: | Test Case | Pass Rate | Status | Avg Duration |
+    # 2. Detailed format: #### ✅ test_name (used for judge tests with notes)
+    # Track current class name from section headers like "### ✅ TestClassName"
+    in_table = False
+    table_format = "old"  # "old" or "new"
+    current_class = ""
+    current_detailed_test = None  # Track test name for detailed format parsing
+    lines = content.split("\n")
+
+    for i, line in enumerate(lines):
+        # Detect class section headers (e.g., "### ✅ TestIntentJudgeAccuracy")
+        # Use a more lenient pattern that handles multi-byte emoji characters
+        class_header_match = re.match(r'^###\s+\S+\s+(Test\w+)', line)
+        if class_header_match:
+            current_class = class_header_match.group(1)
+            in_table = False  # Reset table state for new section
+            current_detailed_test = None
+            continue
+
+        # Detect detailed test headers (e.g., "#### ✅ wake_word_simple_question")
+        # Use a more lenient pattern that handles multi-byte emoji characters
+        detailed_test_match = re.match(r'^####\s+(\S+)\s+(.+)$', line)
+        if detailed_test_match:
+            in_table = False
+            emoji_str = detailed_test_match.group(1)
+            test_name = detailed_test_match.group(2).strip()
+
+            # Determine outcome from emoji (check for emoji presence)
+            outcome = "unknown"
+            if "✅" in emoji_str:
+                outcome = "passed"
+            elif "❌" in emoji_str:
+                outcome = "failed"
+            elif "⏭" in emoji_str:  # May be ⏭️ or just ⏭
+                outcome = "skipped"
+            elif "🔸" in emoji_str:
+                outcome = "xfailed"
+            elif "🎉" in emoji_str:
+                outcome = "xpassed"
+            elif "⚠" in emoji_str:  # May be ⚠️ or just ⚠
+                outcome = "partial"
+
+            current_detailed_test = test_name
+            # Initialize with placeholder values, will be updated below
+            report.results[test_name] = TestResult(
+                name=test_name,
+                outcome=outcome,
+                duration=0.0,
+                pass_rate="",
+                class_name=current_class
+            )
+            continue
+
+        # Parse pass rate and duration for detailed format
+        if current_detailed_test and current_detailed_test in report.results:
+            # Parse pass rate line: "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
+            if line.startswith("**Pass Rate:**"):
+                pass_rate_match = re.search(r'\*\*Pass Rate:\*\*\s*(.+)', line)
+                if pass_rate_match:
+                    report.results[current_detailed_test].pass_rate = pass_rate_match.group(1).strip()
+            # Parse duration line: "*Avg Duration: 1.23s*"
+            elif line.startswith("*Avg Duration:"):
+                duration_match = re.search(r'([\d.]+)s', line)
+                if duration_match:
+                    report.results[current_detailed_test].duration = float(duration_match.group(1))
+                current_detailed_test = None  # Done parsing this test
+
+        # Table format parsing
+        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
+            in_table = True
+            table_format = "new"
+            current_detailed_test = None
+            continue
+        if "| Test Case | Status | Duration |" in line:
+            in_table = True
+            table_format = "old"
+            current_detailed_test = None
+            continue
+        if in_table and line.startswith("|") and "---" not in line:
+            parts = [p.strip() for p in line.split("|")[1:-1]]
+
+            if table_format == "new" and len(parts) >= 4:
+                # Parse new format: | Test Case | Pass Rate | Status | Avg Duration |
+                test_name = parts[0]
+                pass_rate = parts[1]
+                status_cell = parts[2]
+                duration_cell = parts[3]
+            elif len(parts) >= 3:
+                # Parse old format: | Test Case | Status | Duration |
+                test_name = parts[0]
+                pass_rate = ""
+                status_cell = parts[1]
+                duration_cell = parts[2]
+            else:
+                continue
+
+            # Extract outcome from status cell
+            outcome = "unknown"
+            if "✅" in status_cell:
+                outcome = "passed"
+            elif "❌" in status_cell:
+                outcome = "failed"
+            elif "⏭️" in status_cell:
+                outcome = "skipped"
+            elif "🔸" in status_cell:
+                outcome = "xfailed"
+            elif "🎉" in status_cell:
+                outcome = "xpassed"
+            elif "⚠️" in status_cell:
+                outcome = "partial"
+
+            # Extract duration
+            duration_match = re.search(r"([\d.]+)s", duration_cell)
+            duration = float(duration_match.group(1)) if duration_match else 0.0
+
+            report.results[test_name] = TestResult(
+                name=test_name,
+                outcome=outcome,
+                duration=duration,
+                pass_rate=pass_rate,
+                class_name=current_class
+            )
+        elif in_table and not line.startswith("|"):
+            in_table = False
+
+    return report
+
+
+def is_fixed_model_test(result: TestResult) -> bool:
+    """Check if a test uses a fixed model, independent of the judge model.
+
+    Some tests are pinned to specific models regardless of EVAL_JUDGE_MODEL:
+    - Intent judge tests use gemma4 (the intent classification model)
+    - Tool selection tests use nomic-embed-text (the embedding model)
+
+    These shouldn't be compared across judge models since they always use the
+    same model — they belong in their own section.
+
+    NOTE: This list is kept in sync manually. When you add a new test class or
+    file whose model is pinned (not controlled by EVAL_JUDGE_MODEL), add its
+    class-name substring below or its test-name pattern to the fallback list.
+    """
+    fixed_model_classes = [
+        "IntentJudge",  # TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment, etc.
+        "ProcessedSegmentFiltering",  # Intent judge processed segment filtering
+    ]
+    fixed_model_exact_classes = {
+        "TestToolSelectionFiltering",  # Embedding strategy, pinned to nomic-embed-text (exact match so TestToolSelectionFilteringLLM isn't bucketed here)
+    }
+
+    if result.class_name:
+        if result.class_name in fixed_model_exact_classes:
+            return True
+        for class_pattern in fixed_model_classes:
+            if class_pattern in result.class_name:
+                return True
+
+    fixed_model_name_patterns = [
+        "test_hot_window_mode_indicated_in_prompt",
+        "test_tts_text_included_for_echo_detection",
+        "test_system_prompt_has_echo_guidance",
+        "test_returns_none_when_ollama_unavailable",
+    ]
+    return any(pattern in result.name for pattern in fixed_model_name_patterns)
+
+
+# Backwards-compatible alias
+is_intent_judge_test = is_fixed_model_test
+
+
+def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
+    """Parse a pass rate string like '2/3 (67%)' into (passes, total).
+
+    Returns None for non-standard formats (SKIPPED, XFAIL, N/A, etc.).
+    """
+    match = re.match(r'(\d+)/(\d+)', pass_rate)
+    if match:
+        return int(match.group(1)), int(match.group(2))
+    return None
+
+
+def _calc_run_level_pass_rate(
+    report: ModelReport, main_llm_tests: set
+) -> Tuple[int, int]:
+    """Calculate pass rate from individual run results across all main LLM tests.
+
+    Returns (total_passes, total_runs) by parsing each test's pass_rate string.
+    Falls back to counting fully-passed/failed tests when pass_rate data is missing.
+    """
+    total_passes = 0
+    total_runs = 0
+
+    for test_name in main_llm_tests:
+        result = report.results.get(test_name)
+        if not result:
+            continue
+
+        # Skip xfailed/skipped — not countable
+        if result.outcome in ("xfailed", "skipped"):
+            continue
+
+        fraction = _parse_pass_rate_fraction(result.pass_rate) if result.pass_rate else None
+        if fraction:
+            total_passes += fraction[0]
+            total_runs += fraction[1]
+        else:
+            # Fallback: treat passed as 1/1, failed as 0/1
+            if result.outcome == "passed":
+                total_passes += 1
+                total_runs += 1
+            elif result.outcome == "failed":
+                total_runs += 1
+
+    return total_passes, total_runs
+
+
+STATUS_EMOJI = {
+    "passed": "✅",
+    "failed": "❌",
+    "skipped": "⏭️",
+    "xfailed": "🔸",
+    "xpassed": "🎉",
+    "partial": "⚠️",
+    "unknown": "❓",
+}
+
+
+def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
+    """Return (category_key, pinned_model) for fixed-model tests, else None."""
+    cls = result.class_name or ""
+    name = result.name or ""
+    if "IntentJudge" in cls or "ProcessedSegmentFiltering" in cls or any(
+        p in name
+        for p in (
+            "test_hot_window_mode_indicated_in_prompt",
+            "test_tts_text_included_for_echo_detection",
+            "test_system_prompt_has_echo_guidance",
+            "test_returns_none_when_ollama_unavailable",
+        )
+    ):
+        return ("intent_judge", "gemma4:e2b")
+    if cls == "TestToolSelectionFiltering":
+        return ("tool_selection", "nomic-embed-text")
+    return None
+
+
+def _rate_emoji(rate: float) -> str:
+    return "🟢" if rate >= 80 else "🟡" if rate >= 50 else "🔴"
+
+
+def _count_outcomes(results) -> Dict[str, int]:
+    """Count outcome buckets (run-level: uses pass_rate fractions where available)."""
+    passed = failed = skipped = xfailed = partial = 0
+    total_passes = total_runs = 0
+    for r in results:
+        if r.outcome == "passed":
+            passed += 1
+        elif r.outcome == "failed":
+            failed += 1
+        elif r.outcome == "skipped":
+            skipped += 1
+        elif r.outcome == "xfailed":
+            xfailed += 1
+        elif r.outcome == "partial":
+            partial += 1
+        if r.outcome in ("xfailed", "skipped"):
+            continue
+        fraction = _parse_pass_rate_fraction(r.pass_rate) if r.pass_rate else None
+        if fraction:
+            total_passes += fraction[0]
+            total_runs += fraction[1]
+        elif r.outcome == "passed":
+            total_passes += 1
+            total_runs += 1
+        elif r.outcome == "failed":
+            total_runs += 1
+    rate = (total_passes / total_runs * 100) if total_runs > 0 else 0.0
+    return {
+        "passed": passed, "failed": failed, "skipped": skipped,
+        "xfailed": xfailed, "partial": partial,
+        "total": passed + failed + skipped + xfailed + partial,
+        "run_passes": total_passes, "run_total": total_runs, "rate": rate,
+    }
+
+
+def generate_combined_report(reports: List[ModelReport]) -> str:
+    """Generate a combined markdown report grouped by test category."""
+    lines: List[str] = []
+    now = datetime.now()
+
+    # Bucket results into three categories:
+    #   judge_compared: run once per judge model, compared side-by-side
+    #   intent_judge:   pinned to gemma4:e2b, shown once
+    #   tool_selection: pinned to nomic-embed-text, shown once
+    judge_compared: set[str] = set()
+    intent_judge_results: Dict[str, TestResult] = {}
+    tool_selection_results: Dict[str, TestResult] = {}
+
+    for report in reports:
+        for test_name, result in report.results.items():
+            fm = _classify_fixed_model(result)
+            if fm is None:
+                judge_compared.add(test_name)
+                continue
+            bucket = intent_judge_results if fm[0] == "intent_judge" else tool_selection_results
+            existing = bucket.get(test_name)
+            if existing is None or (existing.outcome == "skipped" and result.outcome != "skipped"):
+                bucket[test_name] = result
+
+    # Per-model stats for the judge-compared bucket
+    per_model_stats: Dict[str, Dict[str, int]] = {}
+    for report in reports:
+        results = [r for n, r in report.results.items() if n in judge_compared]
+        per_model_stats[report.model_name] = _count_outcomes(results)
+
+    intent_stats = _count_outcomes(list(intent_judge_results.values()))
+    tool_stats = _count_outcomes(list(tool_selection_results.values()))
+
+    # Overall aggregate (sum of runs across all categories)
+    overall_passes = sum(s["run_passes"] for s in per_model_stats.values()) + intent_stats["run_passes"] + tool_stats["run_passes"]
+    overall_runs = sum(s["run_total"] for s in per_model_stats.values()) + intent_stats["run_total"] + tool_stats["run_total"]
+    overall_rate = (overall_passes / overall_runs * 100) if overall_runs > 0 else 0.0
+
+    # Header
+    lines.append("# 🧪 Jarvis Evaluation Report")
+    lines.append("")
+    lines.append(f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}")
+    lines.append("")
+
+    # TL;DR
+    lines.append("## 📊 TL;DR")
+    lines.append("")
+    lines.append(f"**Overall:** {_rate_emoji(overall_rate)} **{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories")
+    lines.append("")
+    lines.append("| Category | Model | Passed | Failed | Skipped | Pass Rate |")
+    lines.append("|----------|-------|-------:|-------:|--------:|----------:|")
+
+    def _fmt_row(label: str, model_note: str, stats: Dict[str, int]) -> str:
+        emoji = _rate_emoji(stats["rate"]) if stats["run_total"] else "➖"
+        rate_str = f"{emoji} {stats['rate']:.1f}%" if stats["run_total"] else "➖"
+        return (
+            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
+            f"{stats['skipped']} | {rate_str} |"
+        )
+
+    for report in reports:
+        lines.append(_fmt_row("🤖 Agent behaviour", f"`{report.model_name}`", per_model_stats[report.model_name]))
+    if intent_judge_results:
+        lines.append(_fmt_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
+    if tool_selection_results:
+        lines.append(_fmt_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
+    lines.append("")
+
+    # Model selection guide (only when comparing judges)
+    if len(reports) > 1:
+        lines.append("### 💡 Model Selection Guide")
+        lines.append("")
+        lines.append("| Model | Best For | Trade-offs |")
+        lines.append("|-------|----------|------------|")
+        lines.append("| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |")
+        lines.append("| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |")
+        lines.append("")
+
+    # Agent behaviour: per-test comparison across judge models
+    lines.append("---")
+    lines.append("")
+    lines.append("## 🤖 Agent behaviour")
+    lines.append("")
+    lines.append("> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.")
+    lines.append("")
+    header = "| Test Case |"
+    separator = "|-----------|"
+    for report in reports:
+        header += f" {report.model_name} |"
+        separator += "----------:|"
+    lines.append(header)
+    lines.append(separator)
+    for test_name in sorted(judge_compared):
+        row = f"| {test_name} |"
+        for report in reports:
+            result = report.results.get(test_name)
+            if result:
+                emoji = STATUS_EMOJI.get(result.outcome, "❓")
+                row += f" {emoji} {result.pass_rate} |" if result.pass_rate else f" {emoji} |"
+            else:
+                row += " ➖ |"
+        lines.append(row)
+    lines.append("")
+
+    def _render_fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> None:
+        if not results:
+            return
+        lines.append("---")
+        lines.append("")
+        lines.append(f"## {title}")
+        lines.append("")
+        lines.append(f"> {blurb}")
+        lines.append("")
+        lines.append("| Test Case | Pass Rate | Status |")
+        lines.append("|-----------|-----------|:------:|")
+        for test_name in sorted(results.keys()):
+            result = results[test_name]
+            emoji = STATUS_EMOJI.get(result.outcome, "❓")
+            pass_rate_str = result.pass_rate if result.pass_rate else "N/A"
+            lines.append(f"| {test_name} | {pass_rate_str} | {emoji} |")
+        lines.append("")
+
+    _render_fixed_section(
+        "🎤 Intent judge",
+        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
+        intent_judge_results,
+    )
+    _render_fixed_section(
+        "🔍 Tool selection",
+        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
+        tool_selection_results,
+    )
+
+    # Legend
+    lines.append("---")
+    lines.append("")
+    lines.append("### 📖 Legend")
+    lines.append("")
+    lines.append("| Symbol | Meaning |")
+    lines.append("|--------|---------|")
+    lines.append("| ✅ | Fully passed (100% pass rate) |")
+    lines.append("| ⚠️ | Partial pass (some runs failed) |")
+    lines.append("| ❌ | Fully failed (0% pass rate) |")
+    lines.append("| ⏭️ | Skipped (missing dependencies) |")
+    lines.append("| 🔸 | Expected failure (known limitation) |")
+    lines.append("| 🎉 | Unexpectedly passed (bug fixed!) |")
+    lines.append("| ➖ | Not run for this model |")
+    lines.append("")
+    lines.append("*Report generated by Jarvis eval suite*")
+
+    return "\n".join(lines)
+
+
+def main():
+    if len(sys.argv) < 5 or len(sys.argv) % 2 != 1:
+        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
+        sys.exit(1)
+
+    # Parse arguments into pairs
+    reports = []
+    args = sys.argv[1:]
+    for i in range(0, len(args), 2):
+        report_path = args[i]
+        model_name = args[i + 1]
+        report = parse_report(report_path, model_name)
+        if report:
+            reports.append(report)
+
+    if not reports:
+        print("Error: No valid reports found", file=sys.stderr)
+        sys.exit(1)
+
+    # Generate combined report
+    combined = generate_combined_report(reports)
+    sys.stdout.buffer.write(combined.encode("utf-8"))
+
+
+if __name__ == "__main__":
+    main()