Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/scripts/build_installer.bat
+++ b/scripts/build_installer.bat
@@ -0,0 +1,99 @@
+@echo off
+REM Build the Windows installer (Jarvis-Setup-x64.exe) for manual testing.
+REM PyInstaller produces dist\Jarvis\, then Inno Setup wraps that into the
+REM installer at dist\Jarvis-Setup-x64.exe. The resulting installer is the
+REM artefact CI ships, so manual runs of it exercise the same code paths
+REM as a real release including install_cuda.ps1 and the VerifyCudaInstall hook.
+REM
+REM Exit code: 0 on success, 1 on the first failing step.
+
+REM Navigate to project root (use for-loop to resolve .. reliably across shells)
+for %%I in ("%~dp0..") do set "PROJECT_ROOT=%%~fI"
+cd /d "%PROJECT_ROOT%"
+
+REM Resolve mamba env: prefer this checkout's own, fall back to the main
+REM repo's when running from a git worktree (worktrees share one env).
+set "MAMBA_ENV=%PROJECT_ROOT%\.mamba_env"
+if not exist "%MAMBA_ENV%\python.exe" call :resolve_mamba_from_worktree
+
+if not exist "%MAMBA_ENV%\python.exe" (
+    echo [build_installer] ERROR: Mamba environment not found.
+    echo                   Looked in: %PROJECT_ROOT%\.mamba_env
+    echo                   And the main repo's .mamba_env ^(if this is a git worktree^).
+    echo                   Run the setup script first.
+    exit /b 1
+)
+
+REM ---- Stamp a dev version file so jarvis.get_version() works in the bundle.
+echo [build_installer] Stamping dev _version.py...
+REM Clear GIT_SHA first: the for-loop body never runs when git fails, so a
+REM value inherited from the calling environment would otherwise be stamped.
+set "GIT_SHA="
+for /f "delims=" %%i in ('git rev-parse --short=7 HEAD 2^>nul') do set "GIT_SHA=%%i"
+if "%GIT_SHA%"=="" set "GIT_SHA=local"
+set "DEV_VERSION=dev-%GIT_SHA%"
+> "%PROJECT_ROOT%\src\jarvis\_version.py" (
+    echo # Auto-generated by scripts/build_installer.bat
+    echo VERSION = "%DEV_VERSION%"
+    echo RELEASE_CHANNEL = "develop"
+)
+
+REM ---- Generate icons (idempotent; cheap to re-run).
+echo [build_installer] Generating icons...
+"%MAMBA_ENV%\python.exe" src\desktop_app\desktop_assets\generate_icons.py
+if errorlevel 1 (
+    echo [build_installer] ERROR: icon generation failed
+    exit /b 1
+)
+
+REM ---- Clean previous build outputs.
+echo [build_installer] Cleaning previous builds...
+if exist "build" rmdir /s /q build
+if exist "dist"  rmdir /s /q dist
+
+REM ---- PyInstaller produces dist\Jarvis\.
+echo [build_installer] Running PyInstaller...
+"%MAMBA_ENV%\python.exe" -m PyInstaller jarvis_desktop.spec
+REM Check the exit code as well as the output file: if the clean step above
+REM could not remove an old dist\, a stale Jarvis.exe would pass the
+REM existence test even though this build failed.
+if errorlevel 1 (
+    echo [build_installer] ERROR: PyInstaller failed
+    exit /b 1
+)
+if not exist "dist\Jarvis\Jarvis.exe" (
+    echo [build_installer] ERROR: PyInstaller did not produce dist\Jarvis\Jarvis.exe
+    exit /b 1
+)
+
+REM ---- Locate ISCC.exe. Try common install paths first, then PATH.
+set "ISCC="
+if exist "C:\Program Files (x86)\Inno Setup 6\ISCC.exe" set "ISCC=C:\Program Files (x86)\Inno Setup 6\ISCC.exe"
+if not defined ISCC if exist "C:\Program Files\Inno Setup 6\ISCC.exe" set "ISCC=C:\Program Files\Inno Setup 6\ISCC.exe"
+if not defined ISCC for /f "delims=" %%i in ('where iscc 2^>nul') do set "ISCC=%%i"
+
+if not defined ISCC (
+    echo [build_installer] ERROR: ISCC.exe not found.
+    echo                   Install Inno Setup 6 from https://jrsoftware.org/isdl.php
+    echo                   or run: choco install innosetup -y
+    exit /b 1
+)
+
+REM ---- Build the installer. /DMyAppVersion is what the .iss file expects.
+echo [build_installer] Running Inno Setup with version %DEV_VERSION%...
+"%ISCC%" /DMyAppVersion="%DEV_VERSION%" installer\windows\jarvis_setup.iss
+if errorlevel 1 (
+    echo [build_installer] ERROR: Inno Setup failed
+    exit /b 1
+)
+
+if not exist "dist\Jarvis-Setup-x64.exe" (
+    echo [build_installer] ERROR: Installer was not produced at dist\Jarvis-Setup-x64.exe
+    exit /b 1
+)
+
+echo.
+echo [build_installer] SUCCESS
+echo                   Installer:  %PROJECT_ROOT%\dist\Jarvis-Setup-x64.exe
+echo                   Frozen app: %PROJECT_ROOT%\dist\Jarvis\Jarvis.exe
+echo.
+echo [build_installer] To test the CUDA install flow, run the installer with the
+echo                   "Download NVIDIA CUDA libraries" task ticked, then check
+echo                   "%%LOCALAPPDATA%%\Programs\Jarvis\cuda\install.log".
+
+goto :eof
+
+REM ---- Subroutine: when this checkout has no .mamba_env of its own, look for
+REM      one next to git's common directory (the main repo of a worktree) and
+REM      point MAMBA_ENV at it. Leaves MAMBA_ENV untouched if nothing is found.
+:resolve_mamba_from_worktree
+REM Clear both variables first so values inherited from the calling
+REM environment are never mistaken for git's answer when git is missing.
+set "GIT_COMMON_DIR="
+set "MAIN_REPO="
+for /f "usebackq delims=" %%G in (`git -C "%PROJECT_ROOT%" rev-parse --git-common-dir 2^>nul`) do set "GIT_COMMON_DIR=%%G"
+if not defined GIT_COMMON_DIR goto :eof
+REM The parent of the common git dir is the main repository's working tree.
+for %%I in ("%GIT_COMMON_DIR%\..") do set "MAIN_REPO=%%~fI"
+if exist "%MAIN_REPO%\.mamba_env\python.exe" set "MAMBA_ENV=%MAIN_REPO%\.mamba_env"
+goto :eof
--- a/scripts/build_installer.sh
+++ b/scripts/build_installer.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Build the frozen app for manual testing.
+#
+# Output: dist/Jarvis.app on macOS, dist/Jarvis/ on Linux. Neither platform
+# has an Inno-style installer step, so the bundle directory itself is the
+# artefact you'd ship.
+
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+PROJECT_ROOT="$(pwd)"
+
+# Stamp a dev version file so jarvis.get_version() works in the bundle.
+# Falls back to "local" when git cannot report a commit.
+GIT_SHA="$(git rev-parse --short=7 HEAD 2>/dev/null || echo local)"
+DEV_VERSION="dev-${GIT_SHA}"
+echo "[build_installer] Stamping dev _version.py (${DEV_VERSION})..."
+printf '%s\n' \
+  '# Auto-generated by scripts/build_installer.sh' \
+  "VERSION = \"${DEV_VERSION}\"" \
+  'RELEASE_CHANNEL = "develop"' \
+  > "${PROJECT_ROOT}/src/jarvis/_version.py"
+
+echo "[build_installer] 🎨 Generating icons..."
+python src/desktop_app/desktop_assets/generate_icons.py
+
+echo "[build_installer] 🧹 Cleaning previous builds..."
+rm -rf build dist
+
+echo "[build_installer] 📦 Running PyInstaller..."
+python -m PyInstaller jarvis_desktop.spec
+
+# Pick the expected bundle path and the platform label used in the messages.
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  bundle="dist/Jarvis.app"
+  platform="macOS"
+else
+  bundle="dist/Jarvis"
+  platform="Linux"
+fi
+
+# The bundle directory is the only success signal checked here.
+if [[ ! -d "$bundle" ]]; then
+  echo "[build_installer] ❌ Bundle missing at ${bundle}" >&2
+  exit 1
+fi
+
+echo
+echo "[build_installer] ✅ SUCCESS"
+echo "                  Bundle: ${PROJECT_ROOT}/${bundle}"
+echo "[build_installer] ℹ️  No installer is produced on ${platform}."
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Run brain bridge + bot together for local development.
+# The bridge expects the VNC desktop on DISPLAY :1 for screen capture.
+set -euo pipefail
+cd "$(dirname "$0")/.."
+
+./scripts/start_bridge.sh &
+BRIDGE_PID=$!
+trap 'kill $BRIDGE_PID 2>/dev/null || true' EXIT
+
+# Give the bridge a moment to bind its port before the bot queries /health.
+sleep 2
+./scripts/start_bot.sh
--- a/scripts/generate_config_examples.py
+++ b/scripts/generate_config_examples.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Script to generate example configuration files from the default values in config.py.
+This ensures config examples stay in sync with the actual defaults.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+# Add src to path so we can import jarvis modules
+script_dir = Path(__file__).parent
+project_root = script_dir.parent
+src_dir = project_root / "src"
+sys.path.insert(0, str(src_dir))
+
+from jarvis.config import export_example_config
+
+
+def generate_config_example() -> None:
+    """Generate examples/config.json from defaults."""
+    config = export_example_config(include_db_path=False)
+    
+    # Generate the config file
+    config_path = project_root / "examples" / "config.json"
+    with config_path.open("w", encoding="utf-8") as f:
+        json.dump(config, f, indent=2)
+        f.write("\n")  # Add trailing newline
+    
+    print(f"Generated {config_path}")
+
+
+def main() -> None:
+    """Generate all example configuration files."""
+    print("Generating configuration examples from defaults...")
+    
+    generate_config_example()
+    
+    print("\nDone! Example files are now in sync with config.py defaults.")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/launch.py
+++ b/scripts/launch.py
@@ -0,0 +1,56 @@
+"""Cross-platform launcher for Claude Code preview_start.
+
+Detects the OS and delegates to the appropriate platform-specific script
+(bat on Windows, sh on macOS/Linux). Can be invoked with any Python 3.x.
+
+Usage:
+    python scripts/launch.py <script_name> [args...]
+
+Examples:
+    python scripts/launch.py run_desktop_app
+    python scripts/launch.py run_desktop_app --voice-debug
+    python scripts/launch.py run_evals
+"""
+
+import os
+import platform
+import subprocess
+import sys
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python scripts/launch.py <script_name> [args...]")
+        sys.exit(1)
+
+    script_name = sys.argv[1]
+    extra_args = sys.argv[2:]
+
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    scripts_dir = os.path.join(project_root, "scripts")
+
+    if platform.system() == "Windows":
+        script_path = os.path.join(scripts_dir, f"{script_name}.bat")
+        if not os.path.isfile(script_path):
+            print(f"ERROR: {script_path} not found")
+            sys.exit(1)
+        result = subprocess.run(
+            [script_path] + extra_args,
+            cwd=project_root,
+            shell=True,
+        )
+    else:
+        script_path = os.path.join(scripts_dir, f"{script_name}.sh")
+        if not os.path.isfile(script_path):
+            print(f"ERROR: {script_path} not found")
+            sys.exit(1)
+        result = subprocess.run(
+            ["bash", script_path] + extra_args,
+            cwd=project_root,
+        )
+
+    sys.exit(result.returncode)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/merge_eval_reports.py
+++ b/scripts/merge_eval_reports.py
@@ -0,0 +1,539 @@
+#!/usr/bin/env python3
+"""
+Merge multiple eval reports into a single combined EVALS.md.
+
+This script takes pairs of (report_path, model_name) arguments and generates
+a combined report showing results from all models side by side.
+
+Usage:
+    python merge_eval_reports.py report1.md model1 report2.md model2 > EVALS.md
+"""
+
+import sys
+import re
+from datetime import datetime
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+
+@dataclass
+class TestResult:
+    """Result for a single test case (aggregated across multiple runs)."""
+    name: str
+    outcome: str  # passed, failed, skipped, xfailed, xpassed, partial
+    duration: float
+    pass_rate: str = ""  # e.g., "3/3 (100%)" or "2/3 (67%)"
+    class_name: str = ""  # The test class this result belongs to
+
+
+@dataclass
+class ModelReport:
+    """Parsed report for a single model."""
+    model_name: str
+    results: Dict[str, TestResult] = field(default_factory=dict)
+    total: int = 0
+    passed: int = 0
+    failed: int = 0
+    skipped: int = 0
+    duration: float = 0.0
+
+
+def parse_report(report_path: str, model_name: str) -> Optional[ModelReport]:
+    """Parse a markdown eval report into a ModelReport."""
+    path = Path(report_path)
+    if not path.exists():
+        print(f"Warning: Report not found: {report_path}", file=sys.stderr)
+        return None
+
+    content = path.read_text(encoding="utf-8")
+    report = ModelReport(model_name=model_name)
+
+    # Parse summary stats
+    for line in content.split("\n"):
+        if "| ✅ Passed |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Passed")[1])
+            if match:
+                report.passed = int(match.group(1))
+        elif "| ❌ Failed |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Failed")[1])
+            if match:
+                report.failed = int(match.group(1))
+        elif "| ⏭️ Skipped |" in line:
+            match = re.search(r"\|\s*(\d+)\s*\|", line.split("Skipped")[1])
+            if match:
+                report.skipped = int(match.group(1))
+        elif "| **Total** |" in line:
+            match = re.search(r"\|\s*\*\*(\d+)\*\*\s*\|", line)
+            if match:
+                report.total = int(match.group(1))
+        elif "**Duration:**" in line:
+            match = re.search(r"([\d.]+)s", line)
+            if match:
+                report.duration = float(match.group(1))
+
+    # Parse individual test results from:
+    # 1. Table format: | Test Case | Pass Rate | Status | Avg Duration |
+    # 2. Detailed format: #### ✅ test_name (used for judge tests with notes)
+    # Track current class name from section headers like "### ✅ TestClassName"
+    in_table = False
+    table_format = "old"  # "old" or "new"
+    current_class = ""
+    current_detailed_test = None  # Track test name for detailed format parsing
+    lines = content.split("\n")
+
+    for i, line in enumerate(lines):
+        # Detect class section headers (e.g., "### ✅ TestIntentJudgeAccuracy")
+        # Use a more lenient pattern that handles multi-byte emoji characters
+        class_header_match = re.match(r'^###\s+\S+\s+(Test\w+)', line)
+        if class_header_match:
+            current_class = class_header_match.group(1)
+            in_table = False  # Reset table state for new section
+            current_detailed_test = None
+            continue
+
+        # Detect detailed test headers (e.g., "#### ✅ wake_word_simple_question")
+        # Use a more lenient pattern that handles multi-byte emoji characters
+        detailed_test_match = re.match(r'^####\s+(\S+)\s+(.+)$', line)
+        if detailed_test_match:
+            in_table = False
+            emoji_str = detailed_test_match.group(1)
+            test_name = detailed_test_match.group(2).strip()
+
+            # Determine outcome from emoji (check for emoji presence)
+            outcome = "unknown"
+            if "✅" in emoji_str:
+                outcome = "passed"
+            elif "❌" in emoji_str:
+                outcome = "failed"
+            elif "⏭" in emoji_str:  # May be ⏭️ or just ⏭
+                outcome = "skipped"
+            elif "🔸" in emoji_str:
+                outcome = "xfailed"
+            elif "🎉" in emoji_str:
+                outcome = "xpassed"
+            elif "⚠" in emoji_str:  # May be ⚠️ or just ⚠
+                outcome = "partial"
+
+            current_detailed_test = test_name
+            # Initialize with placeholder values, will be updated below
+            report.results[test_name] = TestResult(
+                name=test_name,
+                outcome=outcome,
+                duration=0.0,
+                pass_rate="",
+                class_name=current_class
+            )
+            continue
+
+        # Parse pass rate and duration for detailed format
+        if current_detailed_test and current_detailed_test in report.results:
+            # Parse pass rate line: "**Pass Rate:** 1/1 (100%)" or "**Pass Rate:** 1/1 XFAIL"
+            if line.startswith("**Pass Rate:**"):
+                pass_rate_match = re.search(r'\*\*Pass Rate:\*\*\s*(.+)', line)
+                if pass_rate_match:
+                    report.results[current_detailed_test].pass_rate = pass_rate_match.group(1).strip()
+            # Parse duration line: "*Avg Duration: 1.23s*"
+            elif line.startswith("*Avg Duration:"):
+                duration_match = re.search(r'([\d.]+)s', line)
+                if duration_match:
+                    report.results[current_detailed_test].duration = float(duration_match.group(1))
+                current_detailed_test = None  # Done parsing this test
+
+        # Table format parsing
+        if "| Test Case | Pass Rate | Status | Avg Duration |" in line:
+            in_table = True
+            table_format = "new"
+            current_detailed_test = None
+            continue
+        if "| Test Case | Status | Duration |" in line:
+            in_table = True
+            table_format = "old"
+            current_detailed_test = None
+            continue
+        if in_table and line.startswith("|") and "---" not in line:
+            parts = [p.strip() for p in line.split("|")[1:-1]]
+
+            if table_format == "new" and len(parts) >= 4:
+                # Parse new format: | Test Case | Pass Rate | Status | Avg Duration |
+                test_name = parts[0]
+                pass_rate = parts[1]
+                status_cell = parts[2]
+                duration_cell = parts[3]
+            elif len(parts) >= 3:
+                # Parse old format: | Test Case | Status | Duration |
+                test_name = parts[0]
+                pass_rate = ""
+                status_cell = parts[1]
+                duration_cell = parts[2]
+            else:
+                continue
+
+            # Extract outcome from status cell
+            outcome = "unknown"
+            if "✅" in status_cell:
+                outcome = "passed"
+            elif "❌" in status_cell:
+                outcome = "failed"
+            elif "⏭️" in status_cell:
+                outcome = "skipped"
+            elif "🔸" in status_cell:
+                outcome = "xfailed"
+            elif "🎉" in status_cell:
+                outcome = "xpassed"
+            elif "⚠️" in status_cell:
+                outcome = "partial"
+
+            # Extract duration
+            duration_match = re.search(r"([\d.]+)s", duration_cell)
+            duration = float(duration_match.group(1)) if duration_match else 0.0
+
+            report.results[test_name] = TestResult(
+                name=test_name,
+                outcome=outcome,
+                duration=duration,
+                pass_rate=pass_rate,
+                class_name=current_class
+            )
+        elif in_table and not line.startswith("|"):
+            in_table = False
+
+    return report
+
+
+def is_fixed_model_test(result: TestResult) -> bool:
+    """Check if a test uses a fixed model, independent of the judge model.
+
+    Some tests are pinned to specific models regardless of EVAL_JUDGE_MODEL:
+    - Intent judge tests use gemma4 (the intent classification model)
+    - Tool selection tests use nomic-embed-text (the embedding model)
+
+    These shouldn't be compared across judge models since they always use the
+    same model — they belong in their own section.
+
+    NOTE: This list is kept in sync manually. When you add a new test class or
+    file whose model is pinned (not controlled by EVAL_JUDGE_MODEL), add its
+    class-name substring below or its test-name pattern to the fallback list.
+    """
+    fixed_model_classes = [
+        "IntentJudge",  # TestIntentJudgeAccuracy, TestIntentJudgeMultiSegment, etc.
+        "ProcessedSegmentFiltering",  # Intent judge processed segment filtering
+    ]
+    fixed_model_exact_classes = {
+        "TestToolSelectionFiltering",  # Embedding strategy, pinned to nomic-embed-text (exact match so TestToolSelectionFilteringLLM isn't bucketed here)
+    }
+
+    if result.class_name:
+        if result.class_name in fixed_model_exact_classes:
+            return True
+        for class_pattern in fixed_model_classes:
+            if class_pattern in result.class_name:
+                return True
+
+    fixed_model_name_patterns = [
+        "test_hot_window_mode_indicated_in_prompt",
+        "test_tts_text_included_for_echo_detection",
+        "test_system_prompt_has_echo_guidance",
+        "test_returns_none_when_ollama_unavailable",
+    ]
+    return any(pattern in result.name for pattern in fixed_model_name_patterns)
+
+
+# Backwards-compatible alias
+is_intent_judge_test = is_fixed_model_test
+
+
+def _parse_pass_rate_fraction(pass_rate: str) -> Optional[Tuple[int, int]]:
+    """Parse a pass rate string like '2/3 (67%)' into (passes, total).
+
+    Returns None for non-standard formats (SKIPPED, XFAIL, N/A, etc.).
+    """
+    match = re.match(r'(\d+)/(\d+)', pass_rate)
+    if match:
+        return int(match.group(1)), int(match.group(2))
+    return None
+
+
+def _calc_run_level_pass_rate(
+    report: ModelReport, main_llm_tests: set
+) -> Tuple[int, int]:
+    """Calculate pass rate from individual run results across all main LLM tests.
+
+    Returns (total_passes, total_runs) by parsing each test's pass_rate string.
+    Falls back to counting fully-passed/failed tests when pass_rate data is missing.
+    """
+    total_passes = 0
+    total_runs = 0
+
+    for test_name in main_llm_tests:
+        result = report.results.get(test_name)
+        if not result:
+            continue
+
+        # Skip xfailed/skipped — not countable
+        if result.outcome in ("xfailed", "skipped"):
+            continue
+
+        fraction = _parse_pass_rate_fraction(result.pass_rate) if result.pass_rate else None
+        if fraction:
+            total_passes += fraction[0]
+            total_runs += fraction[1]
+        else:
+            # Fallback: treat passed as 1/1, failed as 0/1
+            if result.outcome == "passed":
+                total_passes += 1
+                total_runs += 1
+            elif result.outcome == "failed":
+                total_runs += 1
+
+    return total_passes, total_runs
+
+
+STATUS_EMOJI = {
+    "passed": "✅",
+    "failed": "❌",
+    "skipped": "⏭️",
+    "xfailed": "🔸",
+    "xpassed": "🎉",
+    "partial": "⚠️",
+    "unknown": "❓",
+}
+
+
+def _classify_fixed_model(result: TestResult) -> Optional[Tuple[str, str]]:
+    """Return (category_key, pinned_model) for fixed-model tests, else None."""
+    cls = result.class_name or ""
+    name = result.name or ""
+    if "IntentJudge" in cls or "ProcessedSegmentFiltering" in cls or any(
+        p in name
+        for p in (
+            "test_hot_window_mode_indicated_in_prompt",
+            "test_tts_text_included_for_echo_detection",
+            "test_system_prompt_has_echo_guidance",
+            "test_returns_none_when_ollama_unavailable",
+        )
+    ):
+        return ("intent_judge", "gemma4:e2b")
+    if cls == "TestToolSelectionFiltering":
+        return ("tool_selection", "nomic-embed-text")
+    return None
+
+
+def _rate_emoji(rate: float) -> str:
+    return "🟢" if rate >= 80 else "🟡" if rate >= 50 else "🔴"
+
+
+def _count_outcomes(results) -> Dict[str, int]:
+    """Count outcome buckets (run-level: uses pass_rate fractions where available)."""
+    passed = failed = skipped = xfailed = partial = 0
+    total_passes = total_runs = 0
+    for r in results:
+        if r.outcome == "passed":
+            passed += 1
+        elif r.outcome == "failed":
+            failed += 1
+        elif r.outcome == "skipped":
+            skipped += 1
+        elif r.outcome == "xfailed":
+            xfailed += 1
+        elif r.outcome == "partial":
+            partial += 1
+        if r.outcome in ("xfailed", "skipped"):
+            continue
+        fraction = _parse_pass_rate_fraction(r.pass_rate) if r.pass_rate else None
+        if fraction:
+            total_passes += fraction[0]
+            total_runs += fraction[1]
+        elif r.outcome == "passed":
+            total_passes += 1
+            total_runs += 1
+        elif r.outcome == "failed":
+            total_runs += 1
+    rate = (total_passes / total_runs * 100) if total_runs > 0 else 0.0
+    return {
+        "passed": passed, "failed": failed, "skipped": skipped,
+        "xfailed": xfailed, "partial": partial,
+        "total": passed + failed + skipped + xfailed + partial,
+        "run_passes": total_passes, "run_total": total_runs, "rate": rate,
+    }
+
+
+def generate_combined_report(reports: List[ModelReport]) -> str:
+    """Generate a combined markdown report grouped by test category.
+
+    Args:
+        reports: One parsed report per model, in the order the columns and
+            summary rows should appear.
+
+    Returns:
+        The full markdown document as a single newline-joined string (no
+        trailing newline). Sections, in order: header with generation time,
+        TL;DR summary table, model selection guide (only with more than one
+        report), per-test agent-behaviour comparison, the two fixed-model
+        sections (each only when it has results), and the legend.
+    """
+    lines: List[str] = []
+    now = datetime.now()
+
+    # Bucket results into three categories:
+    #   judge_compared: run once per judge model, compared side-by-side
+    #   intent_judge:   pinned to gemma4:e2b, shown once
+    #   tool_selection: pinned to nomic-embed-text, shown once
+    judge_compared: set[str] = set()
+    intent_judge_results: Dict[str, TestResult] = {}
+    tool_selection_results: Dict[str, TestResult] = {}
+
+    for report in reports:
+        for test_name, result in report.results.items():
+            fm = _classify_fixed_model(result)
+            if fm is None:
+                judge_compared.add(test_name)
+                continue
+            bucket = intent_judge_results if fm[0] == "intent_judge" else tool_selection_results
+            existing = bucket.get(test_name)
+            # Keep the first result seen per test, but let a non-skipped result
+            # from a later report replace a skipped one.
+            if existing is None or (existing.outcome == "skipped" and result.outcome != "skipped"):
+                bucket[test_name] = result
+
+    # Per-model stats for the judge-compared bucket
+    # (keyed by model name; the "rate" entry of each stats dict is a float).
+    per_model_stats: Dict[str, Dict[str, int]] = {}
+    for report in reports:
+        results = [r for n, r in report.results.items() if n in judge_compared]
+        per_model_stats[report.model_name] = _count_outcomes(results)
+
+    intent_stats = _count_outcomes(list(intent_judge_results.values()))
+    tool_stats = _count_outcomes(list(tool_selection_results.values()))
+
+    # Overall aggregate (sum of runs across all categories)
+    overall_passes = sum(s["run_passes"] for s in per_model_stats.values()) + intent_stats["run_passes"] + tool_stats["run_passes"]
+    overall_runs = sum(s["run_total"] for s in per_model_stats.values()) + intent_stats["run_total"] + tool_stats["run_total"]
+    overall_rate = (overall_passes / overall_runs * 100) if overall_runs > 0 else 0.0
+
+    # Header
+    lines.append("# 🧪 Jarvis Evaluation Report")
+    lines.append("")
+    lines.append(f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}")
+    lines.append("")
+
+    # TL;DR
+    lines.append("## 📊 TL;DR")
+    lines.append("")
+    lines.append(f"**Overall:** {_rate_emoji(overall_rate)} **{overall_passes}/{overall_runs} passed ({overall_rate:.1f}%)** across all categories")
+    lines.append("")
+    lines.append("| Category | Model | Passed | Failed | Skipped | Pass Rate |")
+    lines.append("|----------|-------|-------:|-------:|--------:|----------:|")
+
+    def _fmt_row(label: str, model_note: str, stats: Dict[str, int]) -> str:
+        # One TL;DR table row; the pass-rate cell is "➖" when no runs were counted.
+        emoji = _rate_emoji(stats["rate"]) if stats["run_total"] else "➖"
+        rate_str = f"{emoji} {stats['rate']:.1f}%" if stats["run_total"] else "➖"
+        return (
+            f"| {label} | {model_note} | {stats['passed']} | {stats['failed']} | "
+            f"{stats['skipped']} | {rate_str} |"
+        )
+
+    # One agent-behaviour row per model, then one row per non-empty fixed-model category.
+    for report in reports:
+        lines.append(_fmt_row("🤖 Agent behaviour", f"`{report.model_name}`", per_model_stats[report.model_name]))
+    if intent_judge_results:
+        lines.append(_fmt_row("🎤 Intent judge", "`gemma4:e2b` (fixed)", intent_stats))
+    if tool_selection_results:
+        lines.append(_fmt_row("🔍 Tool selection", "`nomic-embed-text` (fixed)", tool_stats))
+    lines.append("")
+
+    # Model selection guide (only when comparing judges)
+    # The two rows are fixed text; they are not derived from the reports passed in.
+    if len(reports) > 1:
+        lines.append("### 💡 Model Selection Guide")
+        lines.append("")
+        lines.append("| Model | Best For | Trade-offs |")
+        lines.append("|-------|----------|------------|")
+        lines.append("| `gemma4:e2b` | Quick responses, lower RAM usage | May struggle with complex reasoning |")
+        lines.append("| `gpt-oss:20b` | Best accuracy, complex tasks | Slower, requires more RAM |")
+        lines.append("")
+
+    # Agent behaviour: per-test comparison across judge models
+    lines.append("---")
+    lines.append("")
+    lines.append("## 🤖 Agent behaviour")
+    lines.append("")
+    lines.append("> Runs the full agent pipeline against each judge model. Tests are compared side-by-side.")
+    lines.append("")
+    # Build the table header with one right-aligned column per model.
+    header = "| Test Case |"
+    separator = "|-----------|"
+    for report in reports:
+        header += f" {report.model_name} |"
+        separator += "----------:|"
+    lines.append(header)
+    lines.append(separator)
+    # Rows are sorted by test name; "➖" marks a test missing from a model's report.
+    for test_name in sorted(judge_compared):
+        row = f"| {test_name} |"
+        for report in reports:
+            result = report.results.get(test_name)
+            if result:
+                emoji = STATUS_EMOJI.get(result.outcome, "❓")
+                row += f" {emoji} {result.pass_rate} |" if result.pass_rate else f" {emoji} |"
+            else:
+                row += " ➖ |"
+        lines.append(row)
+    lines.append("")
+
+    def _render_fixed_section(title: str, blurb: str, results: Dict[str, TestResult]) -> None:
+        # Append one fixed-model section (heading, blurb, results table sorted
+        # by test name) to `lines`; emits nothing when there are no results.
+        if not results:
+            return
+        lines.append("---")
+        lines.append("")
+        lines.append(f"## {title}")
+        lines.append("")
+        lines.append(f"> {blurb}")
+        lines.append("")
+        lines.append("| Test Case | Pass Rate | Status |")
+        lines.append("|-----------|-----------|:------:|")
+        for test_name in sorted(results.keys()):
+            result = results[test_name]
+            emoji = STATUS_EMOJI.get(result.outcome, "❓")
+            pass_rate_str = result.pass_rate if result.pass_rate else "N/A"
+            lines.append(f"| {test_name} | {pass_rate_str} | {emoji} |")
+        lines.append("")
+
+    _render_fixed_section(
+        "🎤 Intent judge",
+        "Pinned to `gemma4:e2b` (the voice intent classifier). Not affected by the judge model.",
+        intent_judge_results,
+    )
+    _render_fixed_section(
+        "🔍 Tool selection",
+        "Pinned to `nomic-embed-text` (embedding-based filter). Not affected by the judge model.",
+        tool_selection_results,
+    )
+
+    # Legend
+    lines.append("---")
+    lines.append("")
+    lines.append("### 📖 Legend")
+    lines.append("")
+    lines.append("| Symbol | Meaning |")
+    lines.append("|--------|---------|")
+    lines.append("| ✅ | Fully passed (100% pass rate) |")
+    lines.append("| ⚠️ | Partial pass (some runs failed) |")
+    lines.append("| ❌ | Fully failed (0% pass rate) |")
+    lines.append("| ⏭️ | Skipped (missing dependencies) |")
+    lines.append("| 🔸 | Expected failure (known limitation) |")
+    lines.append("| 🎉 | Unexpectedly passed (bug fixed!) |")
+    lines.append("| ➖ | Not run for this model |")
+    lines.append("")
+    lines.append("*Report generated by Jarvis eval suite*")
+
+    return "\n".join(lines)
+
+
+def main():
+    if len(sys.argv) < 5 or len(sys.argv) % 2 != 1:
+        print("Usage: merge_eval_reports.py report1.md model1 report2.md model2 ...", file=sys.stderr)
+        sys.exit(1)
+
+    # Parse arguments into pairs
+    reports = []
+    args = sys.argv[1:]
+    for i in range(0, len(args), 2):
+        report_path = args[i]
+        model_name = args[i + 1]
+        report = parse_report(report_path, model_name)
+        if report:
+            reports.append(report)
+
+    if not reports:
+        print("Error: No valid reports found", file=sys.stderr)
+        sys.exit(1)
+
+    # Generate combined report
+    combined = generate_combined_report(reports)
+    sys.stdout.buffer.write(combined.encode("utf-8"))
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/run_desktop_app.bat
+++ b/scripts/run_desktop_app.bat
@@ -0,0 +1,84 @@
+@echo off
+REM Run script for the Jarvis Desktop App on Windows
+REM Uses the project's mamba environment
+REM Usage: run_desktop_app.bat [--voice-debug]
+
+REM Keep every variable set below (PYTHONPATH, JARVIS_VOICE_DEBUG, ...) and the
+REM directory change local to this run. Without this, launching from an
+REM interactive prompt leaves JARVIS_VOICE_DEBUG=1 behind for later runs that
+REM omit --voice-debug, and prepends src to PYTHONPATH again on every run.
+setlocal
+
+REM Parse arguments: only --voice-debug is recognised, anything else is ignored
+set "VOICE_DEBUG=0"
+:parse_args
+if "%~1"=="" goto done_args
+if "%~1"=="--voice-debug" (
+    set "VOICE_DEBUG=1"
+    shift
+    goto parse_args
+)
+shift
+goto parse_args
+:done_args
+
+echo Testing Jarvis Desktop App locally...
+if "%VOICE_DEBUG%"=="1" (
+    echo    Voice debug: ENABLED
+)
+echo.
+
+REM Navigate to project root (use for-loop to resolve .. reliably across shells)
+for %%I in ("%~dp0..") do set "PROJECT_ROOT=%%~fI"
+cd /d "%PROJECT_ROOT%"
+set "PYTHONPATH=%PROJECT_ROOT%\src;%PYTHONPATH%"
+
+REM Resolve mamba env: prefer this checkout's own, fall back to the main
+REM repo's when running from a git worktree (worktrees share one env).
+set "MAMBA_ENV=%PROJECT_ROOT%\.mamba_env"
+if not exist "%MAMBA_ENV%\python.exe" call :resolve_mamba_from_worktree
+
+REM Check if mamba environment exists
+if not exist "%MAMBA_ENV%\python.exe" (
+    echo ERROR: Mamba environment not found.
+    echo    Looked in: %PROJECT_ROOT%\.mamba_env
+    echo    And the main repo's .mamba_env ^(if this is a git worktree^).
+    echo Please run the setup script first.
+    pause
+    exit /b 1
+)
+
+REM Check Python version in mamba env
+echo Checking Python version...
+"%MAMBA_ENV%\python.exe" --version
+echo.
+
+REM Install/update dependencies from requirements.txt
+REM A pip failure is reported but does not stop the launch.
+echo Installing dependencies...
+"%MAMBA_ENV%\python.exe" -m pip install -q -r requirements.txt
+if errorlevel 1 (
+    echo WARNING: Some dependencies may have failed to install
+)
+echo.
+
+REM Generate icons
+echo Generating icons...
+"%MAMBA_ENV%\python.exe" src\desktop_app\desktop_assets\generate_icons.py
+echo.
+
+REM Run the desktop app
+echo Starting desktop app...
+echo    Click the system tray icon to open menu
+echo    Select 'Start Listening' from menu to begin
+echo    Or press Ctrl+C to quit
+echo.
+
+REM Set voice debug environment variable if requested
+if "%VOICE_DEBUG%"=="1" (
+    set "JARVIS_VOICE_DEBUG=1"
+)
+
+"%MAMBA_ENV%\python.exe" -m desktop_app
+goto :eof
+
+REM Subroutine: point MAMBA_ENV at the main repository's .mamba_env.
+REM Asks git for the common git dir, takes its parent as the main repo, and
+REM switches MAMBA_ENV only if that repo has .mamba_env\python.exe.
+REM Leaves MAMBA_ENV untouched when git prints nothing.
+:resolve_mamba_from_worktree
+for /f "usebackq delims=" %%G in (`git -C "%PROJECT_ROOT%" rev-parse --git-common-dir 2^>nul`) do set "GIT_COMMON_DIR=%%G"
+if not defined GIT_COMMON_DIR goto :eof
+for %%I in ("%GIT_COMMON_DIR%\..") do set "MAIN_REPO=%%~fI"
+if exist "%MAIN_REPO%\.mamba_env\python.exe" set "MAMBA_ENV=%MAIN_REPO%\.mamba_env"
+goto :eof
+
--- a/scripts/run_desktop_app.sh
+++ b/scripts/run_desktop_app.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+# Test script for the Jarvis Desktop App
+
+# Parse arguments
+VOICE_DEBUG=0
+for arg in "$@"; do
+    case $arg in
+        --voice-debug)
+            VOICE_DEBUG=1
+            shift
+            ;;
+    esac
+done
+
+# Navigate to project root first
+cd "$(dirname "$0")/.." || exit
+
+echo "🔧 Testing Jarvis Desktop App locally..."
+if [ "$VOICE_DEBUG" = "1" ]; then
+    echo "   📋 Voice debug: ENABLED"
+fi
+echo ""
+
+# Find a suitable Python (3.10+)
+# Check both PATH and common install locations (homebrew, deadsnakes, etc.)
+PYTHON=""
+SEARCH_PATHS=(
+    ""                          # PATH lookup
+    "/opt/homebrew/bin/"        # macOS Homebrew (Apple Silicon)
+    "/usr/local/bin/"           # macOS Homebrew (Intel) / Linux manual installs
+)
+for candidate in python3.12 python3.11 python3.10; do
+    for prefix in "${SEARCH_PATHS[@]}"; do
+        if [ -x "${prefix}${candidate}" ] 2>/dev/null || command -v "${prefix}${candidate}" &>/dev/null; then
+            PYTHON="${prefix}${candidate}"
+            break 2
+        fi
+    done
+done
+if [ -z "$PYTHON" ]; then
+    # Fall back to python3 and hope it's new enough
+    PYTHON="python3"
+fi
+
+# Set up / activate virtual environment
+if [ ! -d .venv ]; then
+    echo "📦 Creating virtual environment..."
+    "$PYTHON" -m venv .venv
+fi
+source .venv/bin/activate
+
+# Check Python version
+echo "📋 Checking Python version..."
+python --version
+PY_MINOR=$(python -c 'import sys; print(sys.version_info.minor)')
+if [ "$PY_MINOR" -lt 10 ]; then
+    echo "⚠️  Python 3.10+ is required. Found $(python --version)."
+    echo "   Recreating .venv with $PYTHON..."
+    deactivate 2>/dev/null
+    rm -rf .venv
+    "$PYTHON" -m venv .venv
+    source .venv/bin/activate
+    echo "   Now using: $(python --version)"
+fi
+echo ""
+
+# Install dependencies from requirements.txt
+echo "📦 Installing dependencies..."
+pip install -q -r requirements.txt
+echo ""
+
+# Generate icons
+echo "🎨 Generating icons..."
+python src/desktop_app/desktop_assets/generate_icons.py
+echo ""
+
+# Run the desktop app
+echo "🚀 Starting desktop app..."
+echo "   Click the system tray icon to open menu"
+echo "   Select 'Start Listening' from menu to begin"
+echo "   Or press Ctrl+C to quit"
+echo ""
+
+# Set PYTHONPATH to include src directory (already at project root)
+export PYTHONPATH="$(pwd)/src:$PYTHONPATH"
+
+# Set voice debug environment variable if requested
+if [ "$VOICE_DEBUG" = "1" ]; then
+    export JARVIS_VOICE_DEBUG=1
+fi
+
+python -m desktop_app
+
--- a/scripts/run_evals.bat
+++ b/scripts/run_evals.bat
@@ -0,0 +1,252 @@
+@echo off
+setlocal EnableDelayedExpansion
+REM Run Jarvis evaluation suite on Windows
+REM
+REM Usage:
+REM   run_evals.bat              Run all evals with both models (live + judge enabled)
+REM   run_evals.bat weather      Run only weather-related evals
+REM   run_evals.bat -v           Verbose output
+REM   run_evals.bat --no-live    Exclude live LLM tests
+REM   run_evals.bat --no-judge   Exclude LLM-as-judge tests
+REM   run_evals.bat --no-report  Skip EVALS.md generation
+REM   run_evals.bat --single     Run with single model only (EVAL_JUDGE_MODEL)
+REM
+REM Environment variables:
+REM   EVAL_JUDGE_MODEL    - Model to use for LLM-as-judge (default: gpt-oss:20b)
+REM   EVAL_JUDGE_BASE_URL - Ollama base URL (default: http://localhost:11434)
+REM   EVAL_REPEAT_COUNT   - Number of times to run each test (default: 3)
+REM
+REM NOTE(review): scripts/run_evals.sh documents a default repeat count of 1;
+REM confirm whether the two launchers are meant to differ.
+REM NOTE(review): INCLUDE_JUDGE is set by --judge/--no-judge but never read in
+REM this script. As a cmd variable it is inherited by child processes; TODO
+REM confirm whether the eval suite consumes it.
+
+REM Navigate to project root
+for %%I in ("%~dp0..") do set "PROJECT_ROOT=%%~fI"
+set "SCRIPT_DIR=%~dp0"
+cd /d "%PROJECT_ROOT%"
+
+REM Resolve mamba env: prefer this checkout's own, fall back to the main
+REM repo's when running from a git worktree (worktrees share one env).
+set "MAMBA_ENV=%PROJECT_ROOT%\.mamba_env"
+if not exist "!MAMBA_ENV!\python.exe" (
+    for /f "usebackq delims=" %%G in (`git -C "%PROJECT_ROOT%" rev-parse --git-common-dir 2^>nul`) do (
+        for %%I in ("%%G\..") do (
+            if exist "%%~fI\.mamba_env\python.exe" set "MAMBA_ENV=%%~fI\.mamba_env"
+        )
+    )
+)
+
+if not exist "!MAMBA_ENV!\python.exe" (
+    echo ERROR: Mamba environment not found.
+    echo    Looked in: %PROJECT_ROOT%\.mamba_env
+    echo    And the main repo's .mamba_env ^(if this is a git worktree^).
+    echo Please run the setup script first.
+    pause
+    exit /b 1
+)
+
+set "PYTHON=!MAMBA_ENV!\python.exe"
+set "PYTHONPATH=%PROJECT_ROOT%\src;%PYTHONPATH%"
+
+REM Officially supported models (from config.py)
+set "MODEL_SMALL=gemma4:e2b"
+set "MODEL_LARGE=gpt-oss:20b"
+
+echo.
+echo +------------------------------------------------------------+
+echo ^|                  Jarvis Evaluation Suite                   ^|
+echo +------------------------------------------------------------+
+echo.
+
+REM Check if Ollama is available
+REM A successful request to /api/tags counts as "available"; the response
+REM body is discarded.
+set "OLLAMA_AVAILABLE=false"
+if defined EVAL_JUDGE_BASE_URL (
+    set "OLLAMA_URL=!EVAL_JUDGE_BASE_URL!"
+) else (
+    set "OLLAMA_URL=http://localhost:11434"
+)
+curl -s "!OLLAMA_URL!/api/tags" >nul 2>&1
+if not errorlevel 1 (
+    set "OLLAMA_AVAILABLE=true"
+    echo   Ollama detected at !OLLAMA_URL!
+) else (
+    echo   WARNING: Ollama not detected at !OLLAMA_URL!
+    echo      LLM-as-judge tests will be skipped
+)
+echo.
+
+REM Parse arguments
+REM Known switches set the flags below. Any other argument starting with "--"
+REM is appended to PYTEST_ARGS unchanged; any remaining argument becomes the
+REM -k FILTER (the last one wins).
+set "PYTEST_ARGS=-v"
+set "FILTER="
+set "INCLUDE_LIVE=true"
+set "INCLUDE_JUDGE=true"
+set "GENERATE_REPORT=true"
+set "MULTI_MODEL=true"
+
+:parse_args
+if "%~1"=="" goto done_args
+if /i "%~1"=="--no-live" (
+    set "INCLUDE_LIVE=false"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--no-judge" (
+    set "INCLUDE_JUDGE=false"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--no-report" (
+    set "GENERATE_REPORT=false"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--single" (
+    set "MULTI_MODEL=false"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--live" (
+    set "INCLUDE_LIVE=true"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--judge" (
+    set "INCLUDE_JUDGE=true"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="-v" (
+    set "PYTEST_ARGS=!PYTEST_ARGS! -v"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="--verbose" (
+    set "PYTEST_ARGS=!PYTEST_ARGS! -v"
+    shift
+    goto parse_args
+)
+if /i "%~1"=="-vv" (
+    set "PYTEST_ARGS=!PYTEST_ARGS! -vv"
+    shift
+    goto parse_args
+)
+REM Despite its name, _FIRST_CHAR holds the whole argument; only its first
+REM two characters are compared below.
+set "_FIRST_CHAR=%~1"
+if "!_FIRST_CHAR:~0,2!"=="--" (
+    set "PYTEST_ARGS=!PYTEST_ARGS! %~1"
+    shift
+    goto parse_args
+)
+set "FILTER=%~1"
+shift
+goto parse_args
+:done_args
+
+REM --no-live is implemented as a pytest -k exclusion of the keyword "Live"
+set "EXCLUDE_PATTERNS="
+if "!INCLUDE_LIVE!"=="false" (
+    set "EXCLUDE_PATTERNS=Live"
+    echo   Skipping live LLM tests ^(remove --no-live to include^)
+)
+
+if "!GENERATE_REPORT!"=="true" (
+    echo   Report will be saved to EVALS.md
+)
+
+REM Multi-model mode needs both the default (no --single) and a reachable
+REM Ollama; otherwise a single run is done with EVAL_JUDGE_MODEL.
+set "FINAL_EXIT_CODE=0"
+set "RUN_MULTI=false"
+if "!MULTI_MODEL!"=="true" if "!OLLAMA_AVAILABLE!"=="true" set "RUN_MULTI=true"
+
+if "!RUN_MULTI!"=="true" (
+    echo   Running evals with both supported models for comparison
+
+    set "TEMP_DIR=%TEMP%\jarvis_evals_%RANDOM%_%RANDOM%"
+    mkdir "!TEMP_DIR!" >nul 2>&1
+
+    set "EVAL_REPORT_PATH=!TEMP_DIR!\evals_small.md"
+    call :run_evals_for_model "!MODEL_SMALL!" "_small"
+    if errorlevel 1 set "FINAL_EXIT_CODE=1"
+
+    echo   Unloading models before switching...
+    curl -s "!OLLAMA_URL!/api/generate" -d "{\"model\":\"!MODEL_SMALL!\",\"keep_alive\":0}" >nul 2>&1
+    timeout /t 2 /nobreak >nul
+
+    set "EVAL_REPORT_PATH=!TEMP_DIR!\evals_large.md"
+    call :run_evals_for_model "!MODEL_LARGE!" "_large"
+    if errorlevel 1 set "FINAL_EXIT_CODE=1"
+
+    if "!GENERATE_REPORT!"=="true" (
+        "!PYTHON!" "!SCRIPT_DIR!merge_eval_reports.py" ^
+            "!TEMP_DIR!\evals_small.md" "!MODEL_SMALL!" ^
+            "!TEMP_DIR!\evals_large.md" "!MODEL_LARGE!" ^
+            > "!PROJECT_ROOT!\EVALS.md"
+        echo.
+        echo   Combined report saved to EVALS.md
+    )
+
+    rmdir /s /q "!TEMP_DIR!" >nul 2>&1
+) else (
+    if not defined EVAL_JUDGE_MODEL set "EVAL_JUDGE_MODEL=!MODEL_LARGE!"
+    set "EVAL_REPORT_PATH=!PROJECT_ROOT!\EVALS.md"
+    call :run_evals_for_model "!EVAL_JUDGE_MODEL!" ""
+    if errorlevel 1 set "FINAL_EXIT_CODE=1"
+)
+
+REM Summary: FINAL_EXIT_CODE is 1 if any model run returned a non-zero status
+echo.
+echo ----------------------------------------------------------------
+if "!FINAL_EXIT_CODE!"=="0" (
+    echo   All evaluations passed!
+) else (
+    echo   WARNING: Some evaluations failed ^(exit code: !FINAL_EXIT_CODE!^)
+)
+echo.
+echo   Legend:
+echo      PASSED  -^> Test passed
+echo      FAILED  -^> Test failed
+echo      SKIPPED -^> Test skipped ^(missing dependencies^)
+echo      XFAIL   -^> Expected failure ^(documents known limitation^)
+echo      XPASS   -^> Bug fixed! ^(expected failure now passes^)
+echo.
+if "!GENERATE_REPORT!"=="true" (
+    echo   Full report: EVALS.md
+    echo.
+)
+echo ----------------------------------------------------------------
+
+exit /b !FINAL_EXIT_CODE!
+
+
+REM Subroutine: run the pytest eval suite once for a single judge model.
+REM Reads PYTHON, PYTEST_ARGS, FILTER, EXCLUDE_PATTERNS, GENERATE_REPORT and
+REM EVAL_REPEAT_COUNT; sets EVAL_JUDGE_MODEL and, when a report is requested,
+REM EVAL_GENERATE_REPORT / EVAL_REPORT_SUFFIX before invoking pytest.
+REM Returns the errorlevel of the pytest command.
+:run_evals_for_model
+REM %~1 = model, %~2 = report suffix
+set "_MODEL=%~1"
+set "_REPORT_SUFFIX=%~2"
+set "EVAL_JUDGE_MODEL=!_MODEL!"
+
+echo.
+echo ================================================================
+echo   Running evals with model: !_MODEL!
+echo ================================================================
+echo.
+
+REM Repeat count defaults to 3 here.
+REM NOTE(review): scripts/run_evals.sh defaults to 1; confirm which is intended.
+if defined EVAL_REPEAT_COUNT (
+    set "_REPEAT_COUNT=!EVAL_REPEAT_COUNT!"
+) else (
+    set "_REPEAT_COUNT=3"
+)
+
+set "_CMD="!PYTHON!" -m pytest evals/ !PYTEST_ARGS! --tb=short --count=!_REPEAT_COUNT!"
+
+REM Build the -k expression: "<filter> and not <exclude>", "<filter>" or
+REM "not <exclude>", depending on which of the two are set.
+if not "!FILTER!"=="" (
+    if not "!EXCLUDE_PATTERNS!"=="" (
+        set "_CMD=!_CMD! -k "!FILTER! and not !EXCLUDE_PATTERNS!""
+    ) else (
+        set "_CMD=!_CMD! -k "!FILTER!""
+    )
+) else if not "!EXCLUDE_PATTERNS!"=="" (
+    set "_CMD=!_CMD! -k "not !EXCLUDE_PATTERNS!""
+)
+
+echo   Command: !_CMD!
+echo.
+
+if "!GENERATE_REPORT!"=="true" (
+    set "EVAL_GENERATE_REPORT=1"
+    set "EVAL_REPORT_SUFFIX=!_REPORT_SUFFIX!"
+)
+
+REM Run the assembled command line and hand its status back to the caller
+call !_CMD!
+exit /b !errorlevel!
--- a/scripts/run_evals.sh
+++ b/scripts/run_evals.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Jarvis evaluation suite runner.
+#
+# Usage:
+#   ./scripts/run_evals.sh              # All evals, both models (live + judge enabled)
+#   ./scripts/run_evals.sh weather      # Only weather-related evals
+#   ./scripts/run_evals.sh -v           # Verbose output
+#   ./scripts/run_evals.sh --no-live    # Exclude live LLM tests
+#   ./scripts/run_evals.sh --no-judge   # Exclude LLM-as-judge tests
+#   ./scripts/run_evals.sh --no-report  # Skip EVALS.md generation
+#   ./scripts/run_evals.sh --single     # Single model only (EVAL_JUDGE_MODEL)
+#
+# Environment variables:
+#   EVAL_JUDGE_MODEL    - Model to use for LLM-as-judge (default: gpt-oss:20b)
+#   EVAL_JUDGE_BASE_URL - Ollama base URL (default: http://localhost:11434)
+#   EVAL_REPEAT_COUNT   - Number of times to run each test (default: 1; use 3 when tuning prompts to surface flakiness)
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+cd "$PROJECT_ROOT"
+
+# Officially supported models (from config.py)
+MODEL_SMALL="gemma4:e2b"
+MODEL_LARGE="gpt-oss:20b"
+
+# Banner (blank line, box, blank line)
+cat <<'BANNER'
+
+┌────────────────────────────────────────────────────────────┐
+│                  🧪 Jarvis Evaluation Suite                │
+└────────────────────────────────────────────────────────────┘
+
+BANNER
+
+# Probe Ollama: a successful request to /api/tags counts as "available"
+OLLAMA_URL="${EVAL_JUDGE_BASE_URL:-http://localhost:11434}"
+OLLAMA_AVAILABLE=false
+if ! curl -s "${OLLAMA_URL}/api/tags" > /dev/null 2>&1; then
+    echo "  ⚠️  Ollama not detected at ${OLLAMA_URL}"
+    echo "     LLM-as-judge tests will be skipped"
+else
+    OLLAMA_AVAILABLE=true
+    echo "  ✅ Ollama detected at ${OLLAMA_URL}"
+fi
+echo ""
+
+# Defaults: live=true, judge=true, report=true, multi_model=true
+PYTEST_ARGS="-v"
+FILTER=""
+INCLUDE_LIVE=true
+INCLUDE_JUDGE=true
+GENERATE_REPORT=true
+MULTI_MODEL=true
+
+# Known switches flip the flags above. Any other "--" option is passed on to
+# pytest unchanged; any other word becomes the -k filter (last one wins).
+for arg; do
+    case "$arg" in
+        --no-live)    INCLUDE_LIVE=false ;;
+        --live)       INCLUDE_LIVE=true ;;
+        --no-judge)   INCLUDE_JUDGE=false ;;
+        --judge)      INCLUDE_JUDGE=true ;;
+        --no-report)  GENERATE_REPORT=false ;;
+        --single)     MULTI_MODEL=false ;;
+        -v|--verbose) PYTEST_ARGS="$PYTEST_ARGS -v" ;;
+        -vv)          PYTEST_ARGS="$PYTEST_ARGS -vv" ;;
+        --*)          PYTEST_ARGS="$PYTEST_ARGS $arg" ;;
+        *)            FILTER="$arg" ;;
+    esac
+done
+
+# --no-live is implemented as a pytest -k exclusion of the keyword "Live"
+EXCLUDE_PATTERNS=""
+if [[ "$INCLUDE_LIVE" == false ]]; then
+    EXCLUDE_PATTERNS="Live"
+    echo "  ⏭️  Skipping live LLM tests (remove --no-live to include)"
+fi
+
+# Function to run evals for a specific model
+run_evals_for_model() {
+    local model="$1"
+    local report_suffix="$2"
+
+    export EVAL_JUDGE_MODEL="$model"
+
+    echo ""
+    echo "╔════════════════════════════════════════════════════════════╗"
+    echo "  🤖 Running evals with model: $model"
+    echo "╚════════════════════════════════════════════════════════════╝"
+    echo ""
+
+    # Build the pytest command (--tb=short for cleaner tracebacks, -s to capture stdout for judge notes)
+    # Each test runs REPEAT_COUNT times for pass rate calculation
+    local REPEAT_COUNT="${EVAL_REPEAT_COUNT:-1}"
+    local CMD="python -m pytest evals/ $PYTEST_ARGS --tb=short --count=$REPEAT_COUNT"
+
+    if [ -n "$FILTER" ]; then
+        if [ -n "$EXCLUDE_PATTERNS" ]; then
+            CMD="$CMD -k '$FILTER and not $EXCLUDE_PATTERNS'"
+        else
+            CMD="$CMD -k '$FILTER'"
+        fi
+    elif [ -n "$EXCLUDE_PATTERNS" ]; then
+        CMD="$CMD -k 'not $EXCLUDE_PATTERNS'"
+    fi
+
+    echo "  🚀 Command: $CMD"
+    echo ""
+
+    # Run with report generation if enabled
+    if [ "$GENERATE_REPORT" = true ]; then
+        export EVAL_GENERATE_REPORT=1
+        export EVAL_REPORT_SUFFIX="$report_suffix"
+    fi
+
+    # Run and capture exit code (don't exit on failure)
+    set +e
+    eval $CMD
+    local exit_code=$?
+    set -e
+
+    return $exit_code
+}
+
+# Run evals
+if [ "$GENERATE_REPORT" = true ]; then
+    echo "  📄 Report will be saved to EVALS.md"
+fi
+
+FINAL_EXIT_CODE=0
+
+if [ "$MULTI_MODEL" = true ] && [ "$OLLAMA_AVAILABLE" = true ]; then
+    echo "  🔄 Running evals with both supported models for comparison"
+
+    # Temp directory for the individual model reports; the trap removes it on
+    # every exit path, including an abort caused by `set -e`.
+    TEMP_DIR=$(mktemp -d)
+    trap 'rm -rf -- "$TEMP_DIR"' EXIT
+
+    # Run with small model
+    export EVAL_REPORT_PATH="${TEMP_DIR}/evals_small.md"
+    run_evals_for_model "$MODEL_SMALL" "_small" || FINAL_EXIT_CODE=$?
+
+    # Unload all models to avoid VRAM corruption when switching.
+    # Best effort: a failed request must not abort the run under `set -e`.
+    echo "  🔄 Unloading models before switching..."
+    curl -s "${OLLAMA_URL}/api/generate" -d "{\"model\":\"$MODEL_SMALL\",\"keep_alive\":0}" > /dev/null 2>&1 || true
+    sleep 2
+
+    # Run with large model
+    export EVAL_REPORT_PATH="${TEMP_DIR}/evals_large.md"
+    run_evals_for_model "$MODEL_LARGE" "_large" || FINAL_EXIT_CODE=$?
+
+    # Merge reports into final EVALS.md. The merge writes to a scratch file
+    # first so that a failed merge neither truncates an existing EVALS.md nor
+    # aborts the script before the summary is printed.
+    if [ "$GENERATE_REPORT" = true ]; then
+        if python "${SCRIPT_DIR}/merge_eval_reports.py" \
+            "${TEMP_DIR}/evals_small.md" "$MODEL_SMALL" \
+            "${TEMP_DIR}/evals_large.md" "$MODEL_LARGE" \
+            > "${TEMP_DIR}/EVALS.md"; then
+            mv -f -- "${TEMP_DIR}/EVALS.md" "${PROJECT_ROOT}/EVALS.md"
+            echo ""
+            echo "  📄 Combined report saved to EVALS.md"
+        else
+            echo "  ⚠️  Failed to merge model reports; EVALS.md left unchanged" >&2
+            if [ "$FINAL_EXIT_CODE" -eq 0 ]; then
+                FINAL_EXIT_CODE=1
+            fi
+        fi
+    fi
+else
+    # Single model mode
+    export EVAL_JUDGE_MODEL="${EVAL_JUDGE_MODEL:-$MODEL_LARGE}"
+    export EVAL_REPORT_PATH="${PROJECT_ROOT}/EVALS.md"
+    run_evals_for_model "$EVAL_JUDGE_MODEL" "" || FINAL_EXIT_CODE=$?
+fi
+
+echo ""
+echo "────────────────────────────────────────────────────────────────"
+if [ "$FINAL_EXIT_CODE" -eq 0 ]; then
+    echo "  ✅ All evaluations passed!"
+else
+    echo "  ⚠️  Some evaluations failed (exit code: $FINAL_EXIT_CODE)"
+fi
+echo ""
+echo "  📖 Legend:"
+echo "     PASSED  → Test passed"
+echo "     FAILED  → Test failed"
+echo "     SKIPPED → Test skipped (missing dependencies)"
+echo "     XFAIL   → Expected failure (documents known limitation)"
+echo "     XPASS   → Bug fixed! (expected failure now passes)"
+echo ""
+if [ "$GENERATE_REPORT" = true ]; then
+    echo "  📄 Full report: EVALS.md"
+    echo ""
+fi
+echo "────────────────────────────────────────────────────────────────"
+
+exit "$FINAL_EXIT_CODE"
--- a/scripts/run_linux.sh
+++ b/scripts/run_linux.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(dirname "$SCRIPT_DIR")"
+cd "$REPO_ROOT"
+
+if [ ! -d .venv ]; then
+  python3 -m venv .venv
+fi
+source .venv/bin/activate
+pip install -r requirements.txt
+
+export PYTHONPATH="$REPO_ROOT/src"
+# Allow override via JARVIS_CONFIG_PATH; otherwise use default search path in code
+export JARVIS_VOICE_DEBUG=${JARVIS_VOICE_DEBUG:-0}
+python -m jarvis.daemon
--- a/scripts/run_macos.sh
+++ b/scripts/run_macos.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(dirname "$SCRIPT_DIR")"
+cd "$REPO_ROOT"
+
+if [ ! -d .venv ]; then
+  python3 -m venv .venv
+fi
+source .venv/bin/activate
+pip install -r requirements.txt
+
+# Build Swift capture helper (scaffold)
+if [ -d mac/CaptureCLI ]; then
+  (cd mac/CaptureCLI && swift build -c release)
+fi
+
+export PYTHONPATH="$REPO_ROOT/src"
+# Allow override via JARVIS_CONFIG_PATH; otherwise use default search path in code
+export JARVIS_VOICE_DEBUG=${JARVIS_VOICE_DEBUG:-0}
+python -m jarvis.daemon
--- a/scripts/run_windows.ps1
+++ b/scripts/run_windows.ps1
@@ -0,0 +1,63 @@
+# Start the Jarvis daemon on Windows.
+#
+# Prefers a micromamba environment at <repo>\.mamba_env (created on demand,
+# python 3.12 + PyAV from conda-forge). When micromamba is not on PATH it
+# falls back to a plain venv at <repo>\.venv. In every case requirements.txt
+# is installed with pip, `python -m jarvis.daemon` is run, and the script
+# exits with the daemon's exit code.
+Param()
+
+$ErrorActionPreference = 'Stop'
+
+function Write-Info($msg) { Write-Host "[jarvis] $msg" }
+
+# Repo root
+$SCRIPT_DIR = Split-Path -Parent $MyInvocation.MyCommand.Path
+$REPO_ROOT = Resolve-Path (Join-Path $SCRIPT_DIR '..')
+Set-Location $REPO_ROOT
+
+# Helper to set env vars for the current process
+$env:PYTHONPATH = Join-Path $REPO_ROOT 'src'
+if (-not $env:JARVIS_VOICE_DEBUG) { $env:JARVIS_VOICE_DEBUG = '0' }
+
+# Prefer micromamba for pre-built dependencies (webrtcvad, av, etc.)
+$micromamba = Get-Command micromamba -ErrorAction SilentlyContinue
+if ($micromamba) {
+  $envPrefix = Join-Path $REPO_ROOT '.mamba_env'
+  Write-Info "Using Micromamba environment at '$envPrefix' (avoids compilation issues)"
+
+  if (-not (Test-Path $envPrefix)) {
+    Write-Info 'Creating environment (python 3.12)...'
+    micromamba create -y -p $envPrefix python=3.12 -c conda-forge
+  }
+
+  Write-Info 'Installing PyAV (FFmpeg bindings) from conda-forge...'
+  micromamba install -y -p $envPrefix -c conda-forge av
+
+  Write-Info 'Installing Python requirements with pip...'
+  micromamba run -p $envPrefix pip install -r requirements.txt
+
+  # Prefer launching python.exe directly so Ctrl+C propagates to the child on Windows
+  $envPython = Join-Path $envPrefix 'python.exe'
+  if (Test-Path $envPython) {
+    Write-Info 'Starting daemon...'
+    & $envPython -m jarvis.daemon
+    exit $LASTEXITCODE
+  } else {
+    # Fallback to micromamba run if python.exe is not found for some reason
+    Write-Info 'Starting daemon (fallback via micromamba run)...'
+    micromamba run -p $envPrefix python -m jarvis.daemon
+    exit $LASTEXITCODE
+  }
+}
+
+# Fallback: venv + pip (may require Visual C++ Build Tools for compilation)
+$venvPath = Join-Path $REPO_ROOT '.venv'
+$venvPython = Join-Path $venvPath 'Scripts/python.exe'
+Write-Info "Micromamba not found, using regular Python (may need Visual C++ Build Tools for native deps)"
+
+if (-not (Test-Path $venvPython)) {
+  Write-Info 'Creating virtual environment (.venv)...'
+  python -m venv $venvPath
+}
+
+Write-Info 'Installing Python requirements with pip...'
+& $venvPython -m pip install -r requirements.txt
+
+Write-Info 'Starting daemon...'
+& $venvPython -m jarvis.daemon
+# Propagate the daemon's exit code, as both micromamba paths above do;
+# without an explicit exit the script reports success regardless of it.
+exit $LASTEXITCODE
+
+
--- a/scripts/setup_geolocation.py
+++ b/scripts/setup_geolocation.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""
+Setup script for GeoLite2 geolocation database.
+
+This script helps users set up the MaxMind GeoLite2 database required for
+location-based features in Jarvis.
+
+Since MaxMind requires registration for free access to GeoLite2 data (as of 2019),
+this script provides instructions and utilities to help with the setup process.
+"""
+
+import os
+import sys
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+# Add the src directory to path for imports
+script_dir = Path(__file__).parent
+src_dir = script_dir.parent / "src"
+sys.path.insert(0, str(src_dir))
+
+try:
+    # Location utilities live under utils.location after refactor.
+    from jarvis.utils.location import (
+        _get_database_path,
+        is_location_available,
+        get_location_info,
+        setup_location_database,
+        _get_local_network_ip,
+        _get_external_ip_automatically,
+    )
+    from jarvis.config import load_settings
+    SETTINGS = load_settings()
+    JARVIS_AVAILABLE = True
+except ImportError as e:
+    print(
+        "Warning: Could not import Jarvis location utilities from 'jarvis.utils.location'.\n"
+        f"  Import error: {e}\n"
+        "  Make sure you're running from the repository root and that 'src' is on PYTHONPATH.\n"
+        "  Example (zsh/bash): export PYTHONPATH=\"$(pwd)/src:$PYTHONPATH\"\n"
+        "  Or install the project in editable mode once packaging is set up (pip install -e .)."
+    )
+    JARVIS_AVAILABLE = False
+
+
+def check_dependencies() -> bool:
+    """Check if required dependencies are installed."""
+    try:
+        import geoip2
+        return True
+    except ImportError:
+        return False
+
+
+def install_dependencies() -> bool:
+    """Install required dependencies."""
+    print("Installing geoip2 dependency...")
+    try:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "geoip2==4.8.0"])
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+def get_database_info() -> dict:
+    """Get information about the database location and status."""
+    if not JARVIS_AVAILABLE:
+        base_dir = Path.home() / ".local" / "share" / "jarvis" / "geoip"
+        db_path = base_dir / "GeoLite2-City.mmdb"
+    else:
+        db_path = _get_database_path()
+
+    return {
+        "path": db_path,
+        "directory": db_path.parent,
+        "exists": db_path.exists(),
+        "size": db_path.stat().st_size if db_path.exists() else 0,
+    }
+
+
+def print_setup_instructions():
+    """Print instructions for setting up the GeoLite2 database."""
+    db_info = get_database_info()
+
+    print("\n" + "="*60)
+    print("📍 JARVIS GEOLOCATION SETUP")
+    print("="*60)
+
+    print(f"Database location: {db_info['path']}")
+    print(f"Database exists: {'✅ Yes' if db_info['exists'] else '❌ No'}")
+
+    if db_info['exists']:
+        size_mb = db_info['size'] / (1024 * 1024)
+        print(f"Database size: {size_mb:.1f} MB")
+
+        if JARVIS_AVAILABLE:
+            print("\n🧪 Testing location detection...")
+            try:
+                location = get_location_info(settings=SETTINGS)
+                if "error" in location:
+                    print(f"❌ Location test failed: {location['error']}")
+                else:
+                    print("✅ Location detection working!")
+                    print(f"   Detected: {location.get('city', 'Unknown')}, {location.get('country', 'Unknown')}")
+            except Exception as e:
+                print(f"❌ Location test error: {e}")
+    else:
+        print("\n📋 SETUP INSTRUCTIONS:")
+        print("1. Register for a free MaxMind account:")
+        print("   https://www.maxmind.com/en/geolite2/signup")
+        print()
+        print("2. Generate a license key in your account dashboard")
+        print()
+        print("3. Download GeoLite2 City database:")
+        print("   - Go to: https://www.maxmind.com/en/accounts/current/geoip/downloads")
+        print("   - Download: GeoLite2 City (MMDB format)")
+        print("   - Extract the .tar.gz file")
+        print()
+        print("4. Copy the database file:")
+        print(f"   cp GeoLite2-City_*/GeoLite2-City.mmdb {db_info['path']}")
+        print()
+        print("5. Location detection is automatic!")
+        print("   Jarvis will attempt to detect your external IP using:")
+        print("   - UPnP (queries your local router)")
+        print("   - Socket routing (minimal external contact)")
+        print("   - Optional single DNS query (OpenDNS) if behind CGNAT (config: location_cgnat_resolve_public_ip=true)")
+        print()
+        print("   If automatic detection fails, manually configure:")
+        print("   Add to ~/.config/jarvis/config.json:")
+        print('   {')
+        print('     "location_auto_detect": false,')
+        print('     "location_ip_address": "YOUR_PUBLIC_IP_HERE"')
+        print('   }')
+        print()
+        print("   💡 To find your public IP: https://whatismyipaddress.com")
+        print()
+        print("6. Run this script again to test the setup")
+
+        # Create directory if it doesn't exist
+        db_info['directory'].mkdir(parents=True, exist_ok=True)
+        print(f"\n✅ Created directory: {db_info['directory']}")
+
+
+def test_location_features():
+    """Test the location detection features."""
+    if not JARVIS_AVAILABLE:
+        print("❌ Cannot test: Jarvis modules not available")
+        return False
+
+    print("\n🔍 Testing location features...")
+
+    # Test if location is available
+    if not is_location_available():
+        print("❌ Location database not available")
+        return False
+
+    # Test automatic external IP detection
+    print("Testing automatic external IP detection...")
+    external_ip = _get_external_ip_automatically()
+    if external_ip:
+        print(f"✅ External IP automatically detected: {external_ip}")
+    else:
+        print("⚠️  Automatic IP detection failed")
+        print("💡 You may need to manually configure 'location_ip_address'")
+
+    # Test local IP detection (fallback)
+    print("\nTesting local IP detection (fallback)...")
+    local_ip = _get_local_network_ip()
+    if local_ip:
+        print(f"✅ Local IP detected: {local_ip}")
+    else:
+        print("⚠️  Could not detect local IP")
+
+    # Test location detection
+    try:
+        location = get_location_info(settings=SETTINGS)
+        if "error" in location:
+            print(f"⚠️  Location detection result: {location['error']}")
+            reason = location.get("reason")
+            advice = location.get("advice")
+            if reason == "cgnat_not_found":
+                print("💡 Carrier-grade NAT (100.64.0.0/10) and IP not in GeoLite2. Cannot derive precise location.")
+                print("   Configure a real public IP in ~/.config/jarvis/config.json:")
+                print("   { 'location_ip_address': 'YOUR_PUBLIC_IP', 'location_auto_detect': false }")
+            elif reason == "not_found":
+                print("💡 IP not found in free GeoLite2 dataset. It may be new or CGNAT.")
+            elif "No IP address available" in location['error']:
+                print("💡 No IP available. Provide 'location_ip_address' in config.")
+            if advice:
+                print(f"   Advice: {advice}")
+            return False
+
+        print("✅ Location detection working!")
+        print(f"   IP: {location.get('ip', 'Unknown')}")
+        print(f"   Location: {location.get('city', 'Unknown')}, {location.get('region', '')}, {location.get('country', 'Unknown')}")
+
+        if location.get('latitude') and location.get('longitude'):
+            print(f"   Coordinates: {location['latitude']}, {location['longitude']}")
+
+        if location.get('timezone'):
+            print(f"   Timezone: {location['timezone']}")
+
+        return True
+
+    except Exception as e:
+        print(f"❌ Location test error: {e}")
+        return False
+
+
+def create_test_config():
+    """Create a test configuration file with location enabled."""
+    config_path = Path.home() / ".config" / "jarvis" / "config.json"
+
+    if config_path.exists():
+        print(f"✅ Config file already exists: {config_path}")
+        print("To enable location features, add to your config:")
+        print('  "location_ip_address": "YOUR_PUBLIC_IP_HERE"')
+        return
+
+    config_path.parent.mkdir(parents=True, exist_ok=True)
+
+    test_config = {
+        "location_enabled": True,
+        "location_cache_minutes": 60,
+        "location_ip_address": None,
+        "location_auto_detect": True,
+        "voice_debug": True
+    }
+
+    import json
+    with open(config_path, 'w') as f:
+        json.dump(test_config, f, indent=2)
+
+    print(f"✅ Created test config: {config_path}")
+    print("💡 Location features will auto-detect your IP address")
+    print("   If auto-detection fails, manually set 'location_ip_address'")
+
+
+def main():
+    """Main setup function."""
+    print("🌍 Jarvis Geolocation Setup")
+
+    # Check dependencies
+    if not check_dependencies():
+        print("❌ geoip2 library not found")
+        print("Installing dependencies...")
+        if not install_dependencies():
+            print("❌ Failed to install dependencies")
+            sys.exit(1)
+        print("✅ Dependencies installed")
+    else:
+        print("✅ Dependencies available")
+
+    # Print setup instructions
+    print_setup_instructions()
+
+    # Test if everything is working
+    db_info = get_database_info()
+    if db_info['exists']:
+        test_success = test_location_features()
+
+        if test_success:
+            print("\n🎉 Geolocation setup complete!")
+            print("Location metadata will now be included in agent context.")
+        else:
+            print("\n⚠️  Database exists but testing failed")
+            print("Please check the database file is valid.")
+    else:
+        print("\n⏳ Database not found - follow the instructions above")
+
+    print("\n💡 Privacy Note: Jarvis respects your privacy by:")
+    print("   - Using UPnP (local router) and socket routing instead of third-party services")
+    print("   - Working entirely with local databases")
+    print("   - Giving you full control over IP detection methods")
+    print("\n💡 Tip: Set JARVIS_VOICE_DEBUG=1 to see location info in debug output")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/start_bot.sh
+++ b/scripts/start_bot.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Launch the Discord bot under bun; slash commands get registered before startup.
+set -o errexit -o nounset -o pipefail
+cd "$(dirname "$0")/../bot"
+bun install
+bun run register
+exec bun run start
--- a/scripts/start_bridge.sh
+++ b/scripts/start_bridge.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Start the Python brain bridge (STT + reply engine + TTS) from the repository root.
+set -euo pipefail
+cd "$(dirname "$0")/.."
+# Export what .env defines (set -a), then run `python`, or `python3` where no bare `python` exists.
+if [ -f .env ]; then set -a; . ./.env; set +a; fi
+exec "$(command -v python || command -v python3 || echo python)" -m bridge.server
--- a/scripts/test_bundled_app.bat
+++ b/scripts/test_bundled_app.bat
@@ -0,0 +1,59 @@
+@echo off
+REM Build the bundled Windows app with PyInstaller and run it for a local smoke test.
+REM setlocal keeps the variables and directory change below out of the calling console.
+setlocal
+echo.
+echo === Building Jarvis Desktop App with PyInstaller ===
+echo.
+
+REM Get to project root
+cd /d "%~dp0\.."
+
+REM Set up paths
+set "PROJECT_ROOT=%cd%"
+set "MAMBA_ENV=%PROJECT_ROOT%\.mamba_env"
+set "PYTHONPATH=%PROJECT_ROOT%\src;%PYTHONPATH%"
+
+REM Stop early when the mamba environment is missing. Paths are echoed outside of
+REM parenthesised blocks so a ")" in the project path cannot end a block early.
+if not exist "%MAMBA_ENV%\python.exe" goto :no_env
+
+REM Clean previous builds
+echo Cleaning previous builds...
+if exist "build" rmdir /s /q build
+if exist "dist" rmdir /s /q dist
+echo.
+
+REM Build with PyInstaller; check its exit code too, so a stale exe cannot pass as a build.
+echo Building app bundle...
+"%MAMBA_ENV%\python.exe" -m PyInstaller jarvis_desktop.spec
+if %errorlevel% neq 0 goto :build_failed
+if not exist "dist\Jarvis.exe" goto :build_failed
+echo.
+echo Build successful!
+echo.
+echo App location: %cd%\dist\Jarvis.exe
+echo.
+REM Show file info
+echo File info:
+dir dist\Jarvis.exe
+echo.
+REM Run the app
+echo Launching app...
+echo    Press Ctrl+C in this window to stop the app
+echo.
+dist\Jarvis.exe
+echo.
+echo App exited.
+goto :eof
+
+:no_env
+echo ERROR: Mamba environment not found at %MAMBA_ENV%
+echo    Please run the setup script first.
+pause
+exit /b 1
+
+:build_failed
+echo.
+echo Build failed! Check the output above for errors.
+exit /b 1
--- a/scripts/test_bundled_app.sh
+++ b/scripts/test_bundled_app.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Test script to build and run the bundled macOS app locally
+
+set -e
+
+echo "🔨 Building Jarvis Desktop App with PyInstaller..."
+echo ""
+
+# Get to project root
+cd "$(dirname "$0")/.." || exit
+
+# Clean previous builds
+echo "🧹 Cleaning previous builds..."
+rm -rf build dist
+echo ""
+
+# Build with PyInstaller
+echo "📦 Building app bundle..."
+python -m PyInstaller jarvis_desktop.spec
+echo ""
+
+# Check if build succeeded
+if [ -d "dist/Jarvis.app" ]; then
+    echo "✅ Build successful!"
+    echo ""
+    echo "📍 App location: $(pwd)/dist/Jarvis.app"
+    echo ""
+
+    # Show app contents for debugging
+    echo "📂 App structure:"
+    ls -lh dist/Jarvis.app/Contents/MacOS/
+    echo ""
+
+    # Make the app executable
+    chmod +x dist/Jarvis.app/Contents/MacOS/Jarvis
+
+    # Run the app in terminal to see output
+    echo "🚀 Launching app (console mode enabled for debugging)..."
+    echo "   This should open a Terminal window showing the app's output"
+    echo "   If successful, you'll see the Jarvis icon in the menu bar"
+    echo ""
+
+    open -a Terminal dist/Jarvis.app
+
+    echo ""
+    echo "📝 If the app crashes or fails:"
+    echo "   1. Check the Terminal window that opened for error messages"
+    echo "   2. Check ~/Library/Logs/jarvis_desktop_crash.log"
+    echo "   3. Run manually: ./dist/Jarvis.app/Contents/MacOS/Jarvis"
+    echo ""
+else
+    echo "❌ Build failed! Check the output above for errors."
+    exit 1
+fi
+