Files
javis_bot/scripts/run_evals.bat
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

253 lines
6.9 KiB
Batchfile

@echo off
setlocal EnableDelayedExpansion
REM Run Jarvis evaluation suite on Windows
REM
REM Usage:
REM   run_evals.bat              Run all evals with both models (live + judge enabled)
REM   run_evals.bat weather      Run only weather-related evals
REM   run_evals.bat -v           Verbose output (--verbose is accepted as well)
REM   run_evals.bat -vv          Extra verbose output
REM   run_evals.bat --no-live    Exclude live LLM tests
REM   run_evals.bat --no-judge   Exclude LLM-as-judge tests
REM   run_evals.bat --live       Re-enable live LLM tests (already the default)
REM   run_evals.bat --judge      Re-enable LLM-as-judge tests (already the default)
REM   run_evals.bat --no-report  Skip EVALS.md generation
REM   run_evals.bat --single     Run with single model only (EVAL_JUDGE_MODEL)
REM   Any other argument starting with two dashes is forwarded to pytest unchanged.
REM
REM Environment variables:
REM   EVAL_JUDGE_MODEL    - Model to use for LLM-as-judge (default: gpt-oss:20b)
REM   EVAL_JUDGE_BASE_URL - Ollama base URL (default: http://localhost:11434)
REM   EVAL_REPEAT_COUNT   - Number of times to run each test (default: 3)

REM Navigate to project root (the parent of the directory holding this script)
for %%I in ("%~dp0..") do set "PROJECT_ROOT=%%~fI"
set "SCRIPT_DIR=%~dp0"
REM Stop early when the directory change fails; later steps use paths relative
REM to the project root and would otherwise run from the caller's directory.
cd /d "%PROJECT_ROOT%" || (
    echo ERROR: Could not change directory to project root: !PROJECT_ROOT!
    exit /b 1
)
REM Resolve mamba env: prefer this checkout's own, fall back to the main
REM repo's when running from a git worktree (worktrees share one env).
set "MAMBA_ENV=%PROJECT_ROOT%\.mamba_env"
if not exist "!MAMBA_ENV!\python.exe" (
    REM Ask git for the common git directory and look for an env next to it.
    for /f "usebackq delims=" %%G in (`git -C "%PROJECT_ROOT%" rev-parse --git-common-dir 2^>nul`) do (
        for %%I in ("%%G\..") do (
            if exist "%%~fI\.mamba_env\python.exe" set "MAMBA_ENV=%%~fI\.mamba_env"
        )
    )
)
if not exist "!MAMBA_ENV!\python.exe" (
    echo ERROR: Mamba environment not found.
    REM Delayed expansion keeps a closing parenthesis in the path from ending this block early.
    echo Looked in: !PROJECT_ROOT!\.mamba_env
    echo And the main repo's .mamba_env ^(if this is a git worktree^).
    echo Please run the setup script first.
    pause
    exit /b 1
)
set "PYTHON=!MAMBA_ENV!\python.exe"
set "PYTHONPATH=%PROJECT_ROOT%\src;%PYTHONPATH%"
REM Officially supported models (from config.py)
set "MODEL_SMALL=gemma4:e2b"
set "MODEL_LARGE=gpt-oss:20b"

echo.
echo +------------------------------------------------------------+
echo ^| Jarvis Evaluation Suite ^|
echo +------------------------------------------------------------+
echo.

REM Pick the Ollama endpoint: built-in default unless EVAL_JUDGE_BASE_URL overrides it
set "OLLAMA_URL=http://localhost:11434"
if defined EVAL_JUDGE_BASE_URL set "OLLAMA_URL=!EVAL_JUDGE_BASE_URL!"

REM Probe the endpoint; assume unavailable until curl reports success
set "OLLAMA_AVAILABLE=false"
curl -s "!OLLAMA_URL!/api/tags" >nul 2>&1
if errorlevel 1 (
    echo WARNING: Ollama not detected at !OLLAMA_URL!
    echo LLM-as-judge tests will be skipped
) else (
    set "OLLAMA_AVAILABLE=true"
    echo Ollama detected at !OLLAMA_URL!
)
echo.
REM Parse arguments
REM Defaults: verbose pytest, no name filter, live and judge tests on,
REM report generation on, both models compared.
set "PYTEST_ARGS=-v"
set "FILTER="
set "INCLUDE_LIVE=true"
set "INCLUDE_JUDGE=true"
set "GENERATE_REPORT=true"
set "MULTI_MODEL=true"

REM Each pass looks at the first remaining argument, records it, then
REM falls into next_arg which shifts and loops until none are left.
:parse_args
if "%~1"=="" goto done_args

REM Switches that toggle script behaviour
if /i "%~1"=="--no-live" (set "INCLUDE_LIVE=false" & goto next_arg)
if /i "%~1"=="--no-judge" (set "INCLUDE_JUDGE=false" & goto next_arg)
if /i "%~1"=="--no-report" (set "GENERATE_REPORT=false" & goto next_arg)
if /i "%~1"=="--single" (set "MULTI_MODEL=false" & goto next_arg)
if /i "%~1"=="--live" (set "INCLUDE_LIVE=true" & goto next_arg)
if /i "%~1"=="--judge" (set "INCLUDE_JUDGE=true" & goto next_arg)

REM Verbosity switches are appended to the pytest argument list
if /i "%~1"=="-v" (set "PYTEST_ARGS=!PYTEST_ARGS! -v" & goto next_arg)
if /i "%~1"=="--verbose" (set "PYTEST_ARGS=!PYTEST_ARGS! -v" & goto next_arg)
if /i "%~1"=="-vv" (set "PYTEST_ARGS=!PYTEST_ARGS! -vv" & goto next_arg)

REM Any other double-dash option is handed to pytest as-is
set "_OPT=%~1"
if "!_OPT:~0,2!"=="--" (set "PYTEST_ARGS=!PYTEST_ARGS! %~1" & goto next_arg)

REM Everything else is the test-name filter; the last one given wins
set "FILTER=%~1"

:next_arg
shift
goto parse_args
:done_args
REM Build the list of test-name patterns to exclude from the run.
REM NOTE(review): INCLUDE_JUDGE is set by --no-judge and --judge but nothing in
REM this script reads it, so --no-judge excludes nothing here. Confirm whether
REM the evals consume it some other way or whether a pattern is missing.
set "EXCLUDE_PATTERNS="
if "!INCLUDE_LIVE!"=="false" (
    set "EXCLUDE_PATTERNS=Live"
    echo Skipping live LLM tests ^(remove --no-live to include^)
)
if "!GENERATE_REPORT!"=="true" (
    echo Report will be saved to EVALS.md
)

REM FINAL_EXIT_CODE becomes 1 as soon as any model run reports a failure.
set "FINAL_EXIT_CODE=0"

REM Compare both models only when requested and when Ollama answered the probe.
set "RUN_MULTI=false"
if "!MULTI_MODEL!"=="true" if "!OLLAMA_AVAILABLE!"=="true" set "RUN_MULTI=true"

if "!RUN_MULTI!"=="true" (
    echo Running evals with both supported models for comparison
    REM Per-model reports go to a scratch directory and are merged afterwards.
    set "TEMP_DIR=%TEMP%\jarvis_evals_%RANDOM%_%RANDOM%"
    mkdir "!TEMP_DIR!" >nul 2>&1

    set "EVAL_REPORT_PATH=!TEMP_DIR!\evals_small.md"
    call :run_evals_for_model "!MODEL_SMALL!" "_small"
    if errorlevel 1 set "FINAL_EXIT_CODE=1"

    REM Send a keep_alive 0 request for the small model, then pause briefly.
    echo Unloading models before switching...
    curl -s "!OLLAMA_URL!/api/generate" -d "{\"model\":\"!MODEL_SMALL!\",\"keep_alive\":0}" >nul 2>&1
    timeout /t 2 /nobreak >nul

    set "EVAL_REPORT_PATH=!TEMP_DIR!\evals_large.md"
    call :run_evals_for_model "!MODEL_LARGE!" "_large"
    if errorlevel 1 set "FINAL_EXIT_CODE=1"

    if "!GENERATE_REPORT!"=="true" (
        REM Merge into a scratch file first so a failed merge cannot truncate
        REM an existing EVALS.md or be reported as a success.
        set "MERGED_REPORT=!TEMP_DIR!\evals_merged.md"
        "!PYTHON!" "!SCRIPT_DIR!merge_eval_reports.py" ^
            "!TEMP_DIR!\evals_small.md" "!MODEL_SMALL!" ^
            "!TEMP_DIR!\evals_large.md" "!MODEL_LARGE!" ^
            > "!MERGED_REPORT!"
        if errorlevel 1 (
            echo.
            echo ERROR: Could not merge the per-model reports. Any existing EVALS.md was left unchanged.
        ) else (
            move /y "!MERGED_REPORT!" "!PROJECT_ROOT!\EVALS.md" >nul
            echo.
            echo Combined report saved to EVALS.md
        )
    )
    rmdir /s /q "!TEMP_DIR!" >nul 2>&1
) else (
    REM Single-model run: honour a caller-supplied EVAL_JUDGE_MODEL, else use the large model.
    if not defined EVAL_JUDGE_MODEL set "EVAL_JUDGE_MODEL=!MODEL_LARGE!"
    set "EVAL_REPORT_PATH=!PROJECT_ROOT!\EVALS.md"
    call :run_evals_for_model "!EVAL_JUDGE_MODEL!" ""
    if errorlevel 1 set "FINAL_EXIT_CODE=1"
)

REM Final summary and legend
echo.
echo ----------------------------------------------------------------
if "!FINAL_EXIT_CODE!"=="0" (
    echo All evaluations passed!
) else (
    echo WARNING: Some evaluations failed ^(exit code: !FINAL_EXIT_CODE!^)
)
echo.
echo Legend:
echo PASSED -^> Test passed
echo FAILED -^> Test failed
echo SKIPPED -^> Test skipped ^(missing dependencies^)
echo XFAIL -^> Expected failure ^(documents known limitation^)
echo XPASS -^> Bug fixed! ^(expected failure now passes^)
echo.
if "!GENERATE_REPORT!"=="true" (
    echo Full report: EVALS.md
    echo.
)
echo ----------------------------------------------------------------
exit /b !FINAL_EXIT_CODE!
:run_evals_for_model
REM Run the pytest eval suite once for a single model.
REM   Argument 1: model name, exported to the test run as EVAL_JUDGE_MODEL
REM   Argument 2: report suffix, exported as EVAL_REPORT_SUFFIX when reports are on
REM Reads PYTHON, PYTEST_ARGS, FILTER, EXCLUDE_PATTERNS, GENERATE_REPORT and
REM EVAL_REPEAT_COUNT. Exits the subroutine with the exit code of the pytest run.
set "_MODEL=%~1"
set "_REPORT_SUFFIX=%~2"
set "EVAL_JUDGE_MODEL=!_MODEL!"
echo.
echo ================================================================
echo Running evals with model: !_MODEL!
echo ================================================================
echo.

REM Repeat count: default 3, overridable through EVAL_REPEAT_COUNT
set "_REPEAT_COUNT=3"
if defined EVAL_REPEAT_COUNT set "_REPEAT_COUNT=!EVAL_REPEAT_COUNT!"

set "_CMD="!PYTHON!" -m pytest evals/ !PYTEST_ARGS! --tb=short --count=!_REPEAT_COUNT!"

REM Build the pytest -k expression. The user filter is wrapped in parentheses
REM before the exclusion is added, so a filter such as "a or b" cannot bind
REM the exclusion to its last term only.
set "_KEXPR=!FILTER!"
if defined _KEXPR if defined EXCLUDE_PATTERNS set "_KEXPR=(!_KEXPR!) and not !EXCLUDE_PATTERNS!"
if not defined _KEXPR if defined EXCLUDE_PATTERNS set "_KEXPR=not !EXCLUDE_PATTERNS!"
if defined _KEXPR set "_CMD=!_CMD! -k "!_KEXPR!""

echo Command: !_CMD!
echo.

REM Export the report switches, or clear them so values inherited from the
REM calling environment cannot turn reporting back on under --no-report.
if "!GENERATE_REPORT!"=="true" (
    set "EVAL_GENERATE_REPORT=1"
    set "EVAL_REPORT_SUFFIX=!_REPORT_SUFFIX!"
) else (
    set "EVAL_GENERATE_REPORT="
    set "EVAL_REPORT_SUFFIX="
)

call !_CMD!
exit /b !errorlevel!