Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

539
scripts/merge_eval_reports.py Executable file
View File

@@ -0,0 +1,539 @@
#!/usr/bin/env python3
"""
Merge multiple eval reports into a single combined EVALS.md.
This script takes pairs of (report_path, model_name) arguments and generates
a combined report showing results from all models side by side.
Usage:
python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
"""
import sys
import re
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
@dataclass
class TestResult:
    """Aggregated outcome of one test case across all of its runs.

    Attributes:
        name: Test case name as it appears in the report.
        outcome: One of ``passed``, ``failed``, ``skipped``, ``xfailed``,
            ``xpassed`` or ``partial`` (the parser may also store
            ``unknown`` when no status marker is recognised).
        duration: Duration parsed from the report, in seconds.
        pass_rate: Pass-rate text such as ``"3/3 (100%)"`` or
            ``"2/3 (67%)"``; empty when the report does not provide one.
        class_name: Name of the test class this result belongs to; empty
            when no class section header preceded the result.
    """

    name: str
    outcome: str
    duration: float
    pass_rate: str = ""
    class_name: str = ""
@dataclass
class ModelReport:
    """Everything parsed out of one model's markdown eval report.

    Attributes:
        model_name: Label of the model the report was produced with.
        results: Per-test results keyed by test name.
        total: Total test count from the report's summary table.
        passed: Passed count from the summary table.
        failed: Failed count from the summary table.
        skipped: Skipped count from the summary table.
        duration: Overall duration from the summary, in seconds.
    """

    model_name: str
    results: Dict[str, TestResult] = field(default_factory=dict)
    total: int = 0
    passed: int = 0
    failed: int = 0
    skipped: int = 0
    duration: float = 0.0
# Status markers in the order they are tested. Code points are spelled as
# escapes so the bare forms (without the U+FE0F variation selector) are
# unambiguous; a bare marker is a substring of its emoji-presentation variant,
# so "\u23ed" matches both "⏭" and "⏭️".
_OUTCOME_MARKERS = (
    ("\u2705", "passed"),       # ✅
    ("\u274c", "failed"),       # ❌
    ("\u23ed", "skipped"),      # ⏭ / ⏭️
    ("\U0001f538", "xfailed"),  # 🔸
    ("\U0001f389", "xpassed"),  # 🎉
    ("\u26a0", "partial"),      # ⚠ / ⚠️
)


def _outcome_from_marker(text: str) -> str:
    """Return the outcome named by the first known status marker in *text*.

    Returns "unknown" when *text* contains none of the markers.
    """
    for marker, outcome in _OUTCOME_MARKERS:
        if marker in text:
            return outcome
    return "unknown"


def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
    """Parse a markdown eval report into a ModelReport.

    Two passes are made over the file's lines. The first reads the summary
    counters (passed / failed / skipped / total / duration). The second
    collects individual test results, which may appear either as table rows
    (``| Test Case | Pass Rate | Status | Avg Duration |`` or the older
    ``| Test Case | Status | Duration |``) or as detailed ``#### <emoji> name``
    sections followed by ``**Pass Rate:**`` and ``*Avg Duration:`` lines.
    ``### <emoji> TestClassName`` headers set the class name attached to the
    results that follow.

    Args:
        report_path: Path of the markdown report to read.
        model_name: Model label stored on the returned report.

    Returns:
        The parsed report, or None (after printing a warning to stderr) when
        *report_path* is not an existing file.
    """
    path = Path(report_path)
    # is_file() rather than exists(): a directory path would otherwise crash
    # in read_text() instead of producing the "not found" warning.
    if not path.is_file():
        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
        return None

    content = path.read_text(encoding="utf-8")
    report = ModelReport(model_name=model_name)
    lines = content.split("\n")

    # Pass 1: summary stats.
    for line in lines:
        if "| ✅ Passed |" in line:
            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Passed")[1])
            if match:
                report.passed = int(match.group(1))
        elif "| ❌ Failed |" in line:
            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Failed")[1])
            if match:
                report.failed = int(match.group(1))
        elif "| ⏭️ Skipped |" in line:
            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Skipped")[1])
            if match:
                report.skipped = int(match.group(1))
        elif "| **Total** |" in line:
            match = re.search(r"\|\s*\*\*(\d+)\*\*\s*\|", line)
            if match:
                report.total = int(match.group(1))
        elif "**Duration:**" in line:
            match = re.search(r"([\d.]+)s", line)
            if match:
                report.duration = float(match.group(1))

    # Pass 2: individual test results.
    in_table = False
    table_format = "old"  # "old" or "new"
    current_class = ""
    current_detailed_test = None  # name of the detailed test being filled in

    for line in lines:
        # Class section header, e.g. "### ✅ TestIntentJudgeAccuracy".
        # \S+ is used for the emoji so multi-code-point emoji still match.
        class_header_match = re.match(r'^###\s+\S+\s+(Test\w+)', line)
        if class_header_match:
            current_class = class_header_match.group(1)
            in_table = False  # a new section ends any table in progress
            current_detailed_test = None
            continue

        # Detailed test header, e.g. "#### ✅ wake_word_simple_question".
        detailed_test_match = re.match(r'^####\s+(\S+)\s+(.+)$', line)
        if detailed_test_match:
            in_table = False
            test_name = detailed_test_match.group(2).strip()
            current_detailed_test = test_name
            # Pass rate and duration are placeholders until the lines that
            # follow the header are seen.
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(detailed_test_match.group(1)),
                duration=0.0,
                pass_rate="",
                class_name=current_class,
            )
            continue

        # Follow-up lines of a detailed test.
        if current_detailed_test and current_detailed_test in report.results:
            # "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
            if line.startswith("**Pass Rate:**"):
                pass_rate_match = re.search(r'\*\*Pass Rate:\*\*\s*(.+)', line)
                if pass_rate_match:
                    report.results[current_detailed_test].pass_rate = (
                        pass_rate_match.group(1).strip()
                    )
            # "*Avg Duration: 1.23s*" is the last line of a detailed entry.
            elif line.startswith("*Avg Duration:"):
                duration_match = re.search(r'([\d.]+)s', line)
                if duration_match:
                    report.results[current_detailed_test].duration = float(
                        duration_match.group(1)
                    )
                current_detailed_test = None  # done with this test

        # Table headers switch table parsing on and select the column layout.
        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
            in_table = True
            table_format = "new"
            current_detailed_test = None
            continue
        if "| Test Case | Status | Duration |" in line:
            in_table = True
            table_format = "old"
            current_detailed_test = None
            continue

        if in_table and line.startswith("|") and "---" not in line:
            parts = [p.strip() for p in line.split("|")[1:-1]]
            if table_format == "new" and len(parts) >= 4:
                # | Test Case | Pass Rate | Status | Avg Duration |
                test_name, pass_rate, status_cell, duration_cell = parts[:4]
            elif len(parts) >= 3:
                # | Test Case | Status | Duration |
                test_name, status_cell, duration_cell = parts[:3]
                pass_rate = ""
            else:
                continue

            duration_match = re.search(r"([\d.]+)s", duration_cell)
            report.results[test_name] = TestResult(
                name=test_name,
                outcome=_outcome_from_marker(status_cell),
                duration=float(duration_match.group(1)) if duration_match else 0.0,
                pass_rate=pass_rate,
                class_name=current_class,
            )
        elif in_table and not line.startswith("|"):
            # First non-table line ends the table.
            in_table = False

    return report
def is_fixed_model_test(result: TestResult) -> bool:
    """Tell whether a test runs on a pinned model rather than the judge model.

    A few tests ignore EVAL_JUDGE_MODEL and always use the same model:
    - intent judge tests use gemma4 (the intent classification model);
    - tool selection tests use nomic-embed-text (the embedding model).
    Comparing them across judge models is meaningless, so they are reported
    in their own section.

    NOTE: The lists below are maintained by hand. When a new test class or
    file is pinned to a model (not controlled by EVAL_JUDGE_MODEL), add its
    class-name substring or its test-name pattern here.
    """
    # Substring matches: TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment,
    # and the intent judge's processed-segment filtering tests.
    class_substrings = ("IntentJudge", "ProcessedSegmentFiltering")
    # Exact match only, so TestToolSelectionFilteringLLM is not bucketed with
    # the embedding strategy that is pinned to nomic-embed-text.
    exact_classes = frozenset({"TestToolSelectionFiltering"})
    # Fallback for results that carry no (matching) class name.
    name_substrings = (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    )

    owner = result.class_name
    if owner and (
        owner in exact_classes
        or any(fragment in owner for fragment in class_substrings)
    ):
        return True
    return any(fragment in result.name for fragment in name_substrings)


# Backwards-compatible alias
is_intent_judge_test = is_fixed_model_test
def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
    """Extract ``(passes, total)`` from a pass rate such as ``'2/3 (67%)'``.

    The fraction must be at the very start of the string. Anything else
    (SKIPPED, XFAIL, N/A, ...) yields None.
    """
    found = re.match(r'(\d+)/(\d+)', pass_rate)
    if found is None:
        return None
    passes, total = found.groups()
    return int(passes), int(total)
def _calc_run_level_pass_rate(
    report: ModelReport, main_llm_tests: set
) -> Tuple[int, int]:
    """Sum individual run results over the given main LLM tests.

    Each test contributes the ``passes/total`` fraction parsed from its
    pass_rate string. A test without a parsable fraction counts as 1/1 when
    its outcome is "passed" and 0/1 when it is "failed". Tests missing from
    the report, and xfailed/skipped tests, contribute nothing.

    Returns:
        ``(total_passes, total_runs)``.
    """
    passes_sum = 0
    runs_sum = 0
    for name in main_llm_tests:
        entry = report.results.get(name)
        # Absent, xfailed and skipped tests are not countable.
        if not entry or entry.outcome in ("xfailed", "skipped"):
            continue

        parsed = None
        if entry.pass_rate:
            parsed = _parse_pass_rate_fraction(entry.pass_rate)

        if parsed:
            run_passes, run_count = parsed
            passes_sum += run_passes
            runs_sum += run_count
            continue

        # No usable fraction: fall back to one run judged by the outcome.
        if entry.outcome == "passed":
            passes_sum += 1
            runs_sum += 1
        elif entry.outcome == "failed":
            runs_sum += 1
    return passes_sum, runs_sum
# Symbol shown for each test outcome in the generated tables. The passed,
# failed, skipped, xfailed, xpassed and partial symbols match the report
# legend; "unknown" is used when no status marker was recognised.
STATUS_EMOJI = {
    "passed": "✅",
    "failed": "❌",
    "skipped": "⏭️",
    "xfailed": "🔸",
    "xpassed": "🎉",
    "partial": "⚠️",
    "unknown": "❓",
}
def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
    """Classify a result as a fixed-model test.

    Returns:
        ``(category_key, pinned_model)`` for tests pinned to a model, or None
        for tests that follow the judge model. A missing class name or test
        name is treated as an empty string.
    """
    class_name = result.class_name or ""
    test_name = result.name or ""

    intent_class_markers = ("IntentJudge", "ProcessedSegmentFiltering")
    intent_name_markers = (
        "test_hot_window_mode_indicated_in_prompt",
        "test_tts_text_included_for_echo_detection",
        "test_system_prompt_has_echo_guidance",
        "test_returns_none_when_ollama_unavailable",
    )

    matches_intent_class = any(m in class_name for m in intent_class_markers)
    if matches_intent_class or any(m in test_name for m in intent_name_markers):
        return ("intent_judge", "gemma4:e2b")

    # Exact comparison: subclasses-by-name such as ...FilteringLLM are not pinned.
    if class_name == "TestToolSelectionFiltering":
        return ("tool_selection", "nomic-embed-text")

    return None
def _rate_emoji(rate: float) -> str:
    """Pick a traffic-light symbol for a pass rate given in percent.

    Green for 80 and above, yellow for 50 up to (not including) 80, red
    otherwise.
    """
    if rate >= 80:
        return "🟢"
    if rate >= 50:
        return "🟡"
    return "🔴"
def _count_outcomes(results) -> Dict[str, int]:
    """Tally outcome buckets and the run-level pass rate for *results*.

    Args:
        results: Iterable of result objects exposing ``outcome`` and
            ``pass_rate``. It is traversed exactly once.

    Returns:
        A dict with:
        - one test-level count per outcome: "passed", "failed", "skipped",
          "xfailed", "xpassed", "partial";
        - "total": the sum of those counts (results with any other outcome
          are not counted);
        - "run_passes" / "run_total": run-level sums, taken from each
          result's pass_rate fraction where one can be parsed, otherwise
          1/1 for "passed" and 0/1 for "failed"; xfailed and skipped results
          never contribute;
        - "rate": ``run_passes / run_total`` in percent as a float, 0.0 when
          there are no countable runs.
    """
    # "xpassed" is tracked alongside the other outcomes so such results are
    # not silently missing from "total".
    buckets = {
        "passed": 0, "failed": 0, "skipped": 0,
        "xfailed": 0, "xpassed": 0, "partial": 0,
    }
    total_passes = total_runs = 0

    for r in results:
        if r.outcome in buckets:
            buckets[r.outcome] += 1

        # Expected failures and skips are not countable runs.
        if r.outcome in ("xfailed", "skipped"):
            continue

        fraction = _parse_pass_rate_fraction(r.pass_rate) if r.pass_rate else None
        if fraction:
            total_passes += fraction[0]
            total_runs += fraction[1]
        elif r.outcome == "passed":
            total_passes += 1
            total_runs += 1
        elif r.outcome == "failed":
            total_runs += 1

    rate = (total_passes / total_runs * 100) if total_runs > 0 else 0.0
    return {
        **buckets,
        "total": sum(buckets.values()),
        "run_passes": total_passes,
        "run_total": total_runs,
        "rate": rate,
    }
def generate_combined_report(reports: List[ModelReport]) -> str:
    """Generate a combined markdown report grouped by test category.

    Every result is placed in one of three buckets:
    - judge-compared tests, shown side by side with one column per report;
    - intent judge tests, pinned to ``gemma4:e2b`` and shown once;
    - tool selection tests, pinned to ``nomic-embed-text`` and shown once.
    For a pinned test present in several reports the first result seen is
    kept, unless it was skipped and a later report has a non-skipped result.

    Args:
        reports: Parsed reports, in the column order used for the
            side-by-side table.

    Returns:
        The complete markdown document as one newline-joined string.
    """
    # Symbols for cells that have no result / no recognised outcome. The
    # not-run symbol is also printed in the legend so the two stay in step.
    not_run_marker = "➖"
    unknown_marker = "❓"

    lines: List[str] = []
    now = datetime.now()

    # Bucket results into three categories:
    #   judge_compared: run once per judge model, compared side-by-side
    #   intent_judge:   pinned to gemma4:e2b, shown once
    #   tool_selection: pinned to nomic-embed-text, shown once
    judge_compared: set[str] = set()
    intent_judge_results: Dict[str, TestResult] = {}
    tool_selection_results: Dict[str, TestResult] = {}

    for report in reports:
        for test_name, result in report.results.items():
            fm = _classify_fixed_model(result)
            if fm is None:
                judge_compared.add(test_name)
                continue
            bucket = intent_judge_results if fm[0] == "intent_judge" else tool_selection_results
            existing = bucket.get(test_name)
            # Prefer a result that actually ran over a skipped one.
            if existing is None or (existing.outcome == "skipped" and result.outcome != "skipped"):
                bucket[test_name] = result

    # Per-model stats for the judge-compared bucket
    per_model_stats: Dict[str, Dict[str, int]] = {}
    for report in reports:
        results = [r for n, r in report.results.items() if n in judge_compared]
        per_model_stats[report.model_name] = _count_outcomes(results)

    intent_stats = _count_outcomes(list(intent_judge_results.values()))
    tool_stats = _count_outcomes(list(tool_selection_results.values()))

    # Overall aggregate (sum of runs across all categories)
    overall_passes = (
        sum(s["run_passes"] for s in per_model_stats.values())
        + intent_stats["run_passes"]
        + tool_stats["run_passes"]
    )
    overall_runs = (
        sum(s["run_total"] for s in per_model_stats.values())
        + intent_stats["run_total"]
        + tool_stats["run_total"]
    )
    overall_rate = (overall_passes / overall_runs * 100) if overall_runs > 0 else 0.0

    # Header
    lines.append("# 🧪 Jarvis Evaluation Report")
    lines.append("")
    lines.append(f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")

    # TL;DR
    lines.append("## 📊 TL;DR")
    lines.append("")
    lines.append(f"**Overall:** {_rate_emoji(overall_rate)} **{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories")
    lines.append("")
    lines.append("| Category | Model | Passed | Failed | Skipped | Pass Rate |")
    lines.append("|----------|-------|-------:|-------:|--------:|----------:|")

    def _fmt_row(label: str, model_note: str, stats: Dict[str, int]) -> str:
        """Format one TL;DR row; the rate cell is N/A without countable runs."""
        if stats["run_total"]:
            rate_str = f"{_rate_emoji(stats['rate'])} {stats['rate']:.1f}%"
        else:
            rate_str = "N/A"
        return (
            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
            f"{stats['skipped']} | {rate_str} |"
        )

    for report in reports:
        lines.append(_fmt_row("🤖 Agent behaviour", f"`{report.model_name}`", per_model_stats[report.model_name]))
    if intent_judge_results:
        lines.append(_fmt_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
    if tool_selection_results:
        lines.append(_fmt_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
    lines.append("")

    # Model selection guide (only when comparing judges)
    if len(reports) > 1:
        lines.append("### 💡 Model Selection Guide")
        lines.append("")
        lines.append("| Model | Best For | Trade-offs |")
        lines.append("|-------|----------|------------|")
        lines.append("| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |")
        lines.append("| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |")
        lines.append("")

    # Agent behaviour: per-test comparison across judge models
    lines.append("---")
    lines.append("")
    lines.append("## 🤖 Agent behaviour")
    lines.append("")
    lines.append("> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.")
    lines.append("")

    header = "| Test Case |"
    separator = "|-----------|"
    for report in reports:
        header += f" {report.model_name} |"
        separator += "----------:|"
    lines.append(header)
    lines.append(separator)

    for test_name in sorted(judge_compared):
        row = f"| {test_name} |"
        for report in reports:
            result = report.results.get(test_name)
            if result:
                emoji = STATUS_EMOJI.get(result.outcome, unknown_marker)
                row += f" {emoji} {result.pass_rate} |" if result.pass_rate else f" {emoji} |"
            else:
                # This model's report has no entry for the test.
                row += f" {not_run_marker} |"
        lines.append(row)
    lines.append("")

    def _render_fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> None:
        """Append one pinned-model section; nothing is added when it is empty."""
        if not results:
            return
        lines.append("---")
        lines.append("")
        lines.append(f"## {title}")
        lines.append("")
        lines.append(f"> {blurb}")
        lines.append("")
        lines.append("| Test Case | Pass Rate | Status |")
        lines.append("|-----------|-----------|:------:|")
        for test_name in sorted(results.keys()):
            result = results[test_name]
            emoji = STATUS_EMOJI.get(result.outcome, unknown_marker)
            pass_rate_str = result.pass_rate if result.pass_rate else "N/A"
            lines.append(f"| {test_name} | {pass_rate_str} | {emoji} |")
        lines.append("")

    _render_fixed_section(
        "🎤 Intent judge",
        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
        intent_judge_results,
    )
    _render_fixed_section(
        "🔍 Tool selection",
        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
        tool_selection_results,
    )

    # Legend
    lines.append("---")
    lines.append("")
    lines.append("### 📖 Legend")
    lines.append("")
    lines.append("| Symbol | Meaning |")
    lines.append("|--------|---------|")
    lines.append("| ✅ | Fully passed (100% pass rate) |")
    lines.append("| ⚠️ | Partial pass (some runs failed) |")
    lines.append("| ❌ | Fully failed (0% pass rate) |")
    lines.append("| ⏭️ | Skipped (missing dependencies) |")
    lines.append("| 🔸 | Expected failure (known limitation) |")
    lines.append("| 🎉 | Unexpectedly passed (bug fixed!) |")
    lines.append(f"| {not_run_marker} | Not run for this model |")
    lines.append("")
    lines.append("*Report generated by Jarvis eval suite*")

    return "\n".join(lines)
def main():
    """Command-line entry point: merge the given reports and print the result.

    Expects ``report_path model_name`` pairs as arguments (at least two
    pairs). Exits with status 1 on a malformed argument list or when none of
    the reports could be parsed. The combined report is written to stdout as
    UTF-8 bytes.
    """
    argv = sys.argv
    # Script name plus complete pairs gives an odd count; two pairs make 5.
    if len(argv) < 5 or len(argv) % 2 == 0:
        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
        sys.exit(1)

    # Walk the arguments as (path, model) pairs, keeping reports that parsed.
    pair_args = argv[1:]
    reports = []
    for report_path, model_name in zip(pair_args[0::2], pair_args[1::2]):
        parsed = parse_report(report_path, model_name)
        if parsed:
            reports.append(parsed)

    if not reports:
        print("Error: No valid reports found", file=sys.stderr)
        sys.exit(1)

    # Bytes go to the underlying buffer so the output is UTF-8 regardless of
    # the console's text encoding.
    merged_markdown = generate_combined_report(reports)
    sys.stdout.buffer.write(merged_markdown.encode("utf-8"))


if __name__ == "__main__":
    main()