Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
210 lines
7.0 KiB
Bash
Executable File
210 lines
7.0 KiB
Bash
Executable File
#!/bin/bash
# Run Jarvis evaluation suite
#
# Usage:
#   ./scripts/run_evals.sh              # Run all evals with both models (live + judge enabled)
#   ./scripts/run_evals.sh weather      # Run only weather-related evals
#   ./scripts/run_evals.sh -v           # Verbose output
#   ./scripts/run_evals.sh --no-live    # Exclude live LLM tests
#   ./scripts/run_evals.sh --no-judge   # Exclude LLM-as-judge tests
#   ./scripts/run_evals.sh --no-report  # Skip EVALS.md generation
#   ./scripts/run_evals.sh --single     # Run with single model only (EVAL_JUDGE_MODEL)
#
# Environment variables:
#   EVAL_JUDGE_MODEL    - Model to use for LLM-as-judge (default: gpt-oss:20b)
#   EVAL_JUDGE_BASE_URL - Ollama base URL (default: http://localhost:11434)
#   EVAL_REPEAT_COUNT   - Number of times to run each test (default: 1; use 3 when tuning prompts to surface flakiness)

# Abort on the first unhandled command failure.
set -e

# Directory containing this script, and its parent (treated as the project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
readonly SCRIPT_DIR PROJECT_ROOT

# Everything below runs from the project root so that relative paths
# (for example the evals/ test directory) resolve regardless of the caller's cwd.
cd "$PROJECT_ROOT"

# Officially supported models (from config.py)
readonly MODEL_SMALL="gemma4:e2b"
readonly MODEL_LARGE="gpt-oss:20b"

# Banner
echo ""
echo "┌────────────────────────────────────────────────────────────┐"
echo "│ 🧪 Jarvis Evaluation Suite │"
echo "└────────────────────────────────────────────────────────────┘"
echo ""

# Check if Ollama is available.
# Sets OLLAMA_URL (from EVAL_JUDGE_BASE_URL, falling back to the local default)
# and OLLAMA_AVAILABLE (true/false) based on a probe of the /api/tags endpoint.
OLLAMA_AVAILABLE=false
OLLAMA_URL="${EVAL_JUDGE_BASE_URL:-http://localhost:11434}"

# -f makes HTTP error responses count as "not detected" (plain -s exits 0 on a
# 404/500), and the timeouts keep an unresponsive host from hanging the script.
if curl -sf --connect-timeout 5 --max-time 10 "${OLLAMA_URL}/api/tags" > /dev/null 2>&1; then
  OLLAMA_AVAILABLE=true
  echo " ✅ Ollama detected at ${OLLAMA_URL}"
else
  echo " ⚠️ Ollama not detected at ${OLLAMA_URL}"
  echo " LLM-as-judge tests will be skipped"
fi
echo ""

# Parse arguments (defaults: live=true, judge=true, report=true, multi_model=true)
PYTEST_ARGS="-v"
FILTER=""
INCLUDE_LIVE=true
INCLUDE_JUDGE=true
GENERATE_REPORT=true
MULTI_MODEL=true

#######################################
# Translate command-line arguments into the option globals.
# Globals:
#   INCLUDE_LIVE, INCLUDE_JUDGE, GENERATE_REPORT, MULTI_MODEL (written)
#   PYTEST_ARGS (appended to), FILTER (written; the last positional wins)
# Arguments:
#   The script's command-line arguments.
# Notes:
#   - Any other "--xyz" option is appended verbatim to PYTEST_ARGS.
#   - Anything else (including unknown single-dash options) becomes FILTER.
#   - NOTE(review): INCLUDE_JUDGE is recorded here but nothing in the visible
#     script reads it, so --no-judge/--judge currently have no effect on the
#     pytest invocation — confirm whether the test suite consumes it elsewhere.
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --no-live)
        INCLUDE_LIVE=false
        ;;
      --no-judge)
        INCLUDE_JUDGE=false
        ;;
      --no-report)
        GENERATE_REPORT=false
        ;;
      --single)
        MULTI_MODEL=false
        ;;
      --live)
        INCLUDE_LIVE=true
        ;;
      --judge)
        INCLUDE_JUDGE=true
        ;;
      -v | --verbose)
        PYTEST_ARGS="$PYTEST_ARGS -v"
        ;;
      -vv)
        PYTEST_ARGS="$PYTEST_ARGS -vv"
        ;;
      --*)
        PYTEST_ARGS="$PYTEST_ARGS $arg"
        ;;
      *)
        FILTER="$arg"
        ;;
    esac
  done
}

parse_args "$@"

# Build exclusion filter: the pattern is later negated in pytest's -k expression.
EXCLUDE_PATTERNS=""
if [[ "$INCLUDE_LIVE" == false ]]; then
  EXCLUDE_PATTERNS="Live"
  echo " ⏭️ Skipping live LLM tests (remove --no-live to include)"
fi

#######################################
# Run the pytest eval suite once with the given judge model.
# Globals:
#   PYTEST_ARGS        (read)  whitespace-separated extra pytest options
#   FILTER             (read)  optional pytest -k expression from the user
#   EXCLUDE_PATTERNS   (read)  optional pattern to negate in the -k expression
#   GENERATE_REPORT    (read)  "true" to enable report generation
#   EVAL_REPEAT_COUNT  (read)  value passed to --count (default 1)
#   EVAL_JUDGE_MODEL, EVAL_GENERATE_REPORT, EVAL_REPORT_SUFFIX (exported)
# Arguments:
#   $1 - model name to export as EVAL_JUDGE_MODEL
#   $2 - report suffix to export as EVAL_REPORT_SUFFIX (may be empty)
# Returns:
#   The exit status of the pytest invocation.
#######################################
run_evals_for_model() {
  local model="$1"
  local report_suffix="$2"

  export EVAL_JUDGE_MODEL="$model"

  echo ""
  echo "╔════════════════════════════════════════════════════════════╗"
  echo " 🤖 Running evals with model: $model"
  echo "╚════════════════════════════════════════════════════════════╝"
  echo ""

  # Each test runs repeat_count times for pass rate calculation.
  # (--count is presumably provided by the pytest-repeat plugin — confirm.)
  local repeat_count="${EVAL_REPEAT_COUNT:-1}"

  # Split the accumulated option string into words WITHOUT re-evaluating it as
  # shell code; the command is built as an argv array instead of an eval'd string.
  local -a extra_args=()
  read -r -a extra_args <<< "${PYTEST_ARGS:-}"

  # --tb=short gives cleaner tracebacks.
  local -a cmd=(python -m pytest evals/)
  cmd+=(${extra_args[@]+"${extra_args[@]}"})
  cmd+=(--tb=short "--count=${repeat_count}")

  # Combine the user filter and the exclusion into one -k expression. The user
  # filter is parenthesised so an "a or b" filter is not mis-grouped with "and not".
  local k_expr=""
  if [[ -n "$FILTER" && -n "$EXCLUDE_PATTERNS" ]]; then
    k_expr="($FILTER) and not $EXCLUDE_PATTERNS"
  elif [[ -n "$FILTER" ]]; then
    k_expr="$FILTER"
  elif [[ -n "$EXCLUDE_PATTERNS" ]]; then
    k_expr="not $EXCLUDE_PATTERNS"
  fi
  if [[ -n "$k_expr" ]]; then
    cmd+=(-k "$k_expr")
  fi

  # Show a copy-pasteable (shell-quoted) rendering of the command.
  local shown
  printf -v shown '%q ' "${cmd[@]}"
  echo " 🚀 Command: ${shown% }"
  echo ""

  # Run with report generation if enabled
  if [[ "$GENERATE_REPORT" == true ]]; then
    export EVAL_GENERATE_REPORT=1
    export EVAL_REPORT_SUFFIX="$report_suffix"
  fi

  # Capture the exit code without tripping `set -e`, so the caller decides
  # what to do with a failing run.
  local exit_code=0
  "${cmd[@]}" || exit_code=$?

  return "$exit_code"
}

# Run evals: either once per supported model (merging both reports into
# EVALS.md) or once with a single model. FINAL_EXIT_CODE ends up non-zero if
# any run failed.
if [[ "$GENERATE_REPORT" == true ]]; then
  echo " 📄 Report will be saved to EVALS.md"
fi

FINAL_EXIT_CODE=0

if [[ "$MULTI_MODEL" == true && "$OLLAMA_AVAILABLE" == true ]]; then
  echo " 🔄 Running evals with both supported models for comparison"

  # Create a temp directory for the individual model reports and make sure it
  # is removed on every exit path (including an abort under `set -e`).
  TEMP_DIR="$(mktemp -d)" || {
    echo " ❌ Could not create a temporary directory" >&2
    exit 1
  }
  trap 'rm -rf -- "$TEMP_DIR"' EXIT

  # Run with small model
  export EVAL_REPORT_PATH="${TEMP_DIR}/evals_small.md"
  run_evals_for_model "$MODEL_SMALL" "_small" || FINAL_EXIT_CODE=$?

  # Ask Ollama to unload the small model (keep_alive=0) before switching; the
  # original rationale is avoiding VRAM corruption. This is best-effort, so a
  # failed request must not abort the run under `set -e`.
  echo " 🔄 Unloading models before switching..."
  curl -s "${OLLAMA_URL}/api/generate" -d "{\"model\":\"$MODEL_SMALL\",\"keep_alive\":0}" > /dev/null 2>&1 || true
  sleep 2

  # Run with large model
  export EVAL_REPORT_PATH="${TEMP_DIR}/evals_large.md"
  run_evals_for_model "$MODEL_LARGE" "_large" || FINAL_EXIT_CODE=$?

  # Merge reports into final EVALS.md. The merged output is staged in the temp
  # directory first so a failing merge cannot truncate an existing EVALS.md.
  if [[ "$GENERATE_REPORT" == true ]]; then
    python "${SCRIPT_DIR}/merge_eval_reports.py" \
      "${TEMP_DIR}/evals_small.md" "$MODEL_SMALL" \
      "${TEMP_DIR}/evals_large.md" "$MODEL_LARGE" \
      > "${TEMP_DIR}/EVALS.merged.md"
    mv -f -- "${TEMP_DIR}/EVALS.merged.md" "${PROJECT_ROOT}/EVALS.md"
    echo ""
    echo " 📄 Combined report saved to EVALS.md"
  fi

  # Cleanup temp directory
  rm -rf -- "$TEMP_DIR"
  trap - EXIT
else
  # Single model mode: honour a caller-provided EVAL_JUDGE_MODEL, otherwise
  # fall back to the large model, and write the report straight to EVALS.md.
  export EVAL_JUDGE_MODEL="${EVAL_JUDGE_MODEL:-$MODEL_LARGE}"
  export EVAL_REPORT_PATH="${PROJECT_ROOT}/EVALS.md"
  run_evals_for_model "$EVAL_JUDGE_MODEL" "" || FINAL_EXIT_CODE=$?
fi

# Final summary: overall verdict, a legend for pytest outcome labels, the
# report location (when enabled), then exit with the aggregated status.
echo ""
echo "────────────────────────────────────────────────────────────────"
if [[ "${FINAL_EXIT_CODE:-0}" -eq 0 ]]; then
  echo " ✅ All evaluations passed!"
else
  echo " ⚠️ Some evaluations failed (exit code: $FINAL_EXIT_CODE)"
fi
echo ""
echo " 📖 Legend:"
echo " PASSED → Test passed"
echo " FAILED → Test failed"
echo " SKIPPED → Test skipped (missing dependencies)"
echo " XFAIL → Expected failure (documents known limitation)"
echo " XPASS → Bug fixed! (expected failure now passes)"
echo ""
if [[ "$GENERATE_REPORT" == true ]]; then
  echo " 📄 Full report: EVALS.md"
  echo ""
fi
echo "────────────────────────────────────────────────────────────────"

# Propagate the aggregated eval status to the caller (e.g. CI).
exit "${FINAL_EXIT_CODE:-0}"