#!/bin/bash
# Run Jarvis evaluation suite
#
# Usage:
#   ./scripts/run_evals.sh              # Run all evals with both models (live + judge enabled)
#   ./scripts/run_evals.sh weather      # Run only weather-related evals
#   ./scripts/run_evals.sh -v           # Verbose output
#   ./scripts/run_evals.sh --no-live    # Exclude live LLM tests
#   ./scripts/run_evals.sh --no-judge   # Exclude LLM-as-judge tests
#   ./scripts/run_evals.sh --no-report  # Skip EVALS.md generation
#   ./scripts/run_evals.sh --single     # Run with single model only (EVAL_JUDGE_MODEL)
#
# Any other option starting with '-' is forwarded to pytest unchanged (use the
# --opt=value form for options that take a value). The last non-option
# argument is used as the pytest -k filter.
#
# Environment variables:
#   EVAL_JUDGE_MODEL    - Model to use for LLM-as-judge (default: gpt-oss:20b)
#   EVAL_JUDGE_BASE_URL - Ollama base URL (default: http://localhost:11434)
#   EVAL_REPEAT_COUNT   - Number of times to run each test (default: 1; use 3 when tuning prompts to surface flakiness)

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"

# Officially supported models (from config.py)
MODEL_SMALL="gemma4:e2b"
MODEL_LARGE="gpt-oss:20b"

# Temp directory for per-model reports; removed on every exit path
# (including failures and Ctrl-C) by the EXIT trap below.
TEMP_DIR=""
cleanup() {
  if [[ -n "$TEMP_DIR" ]]; then
    rm -rf -- "$TEMP_DIR"
  fi
}
trap cleanup EXIT

echo ""
echo "┌────────────────────────────────────────────────────────────┐"
echo "│ 🧪 Jarvis Evaluation Suite │"
echo "└────────────────────────────────────────────────────────────┘"
echo ""

# Check if Ollama is available
OLLAMA_AVAILABLE=false
OLLAMA_URL="${EVAL_JUDGE_BASE_URL:-http://localhost:11434}"
if curl -s "${OLLAMA_URL}/api/tags" > /dev/null 2>&1; then
  OLLAMA_AVAILABLE=true
  echo " ✅ Ollama detected at ${OLLAMA_URL}"
else
  echo " ⚠️ Ollama not detected at ${OLLAMA_URL}"
  echo " LLM-as-judge tests will be skipped"
fi
echo ""

# Parse arguments (defaults: live=true, judge=true, report=true, multi_model=true)
# PYTEST_ARGS is an array so forwarded options keep their exact word boundaries.
PYTEST_ARGS=(-v)
FILTER=""
INCLUDE_LIVE=true
# NOTE(review): INCLUDE_JUDGE is set by --no-judge/--judge but nothing in this
# script reads or exports it, so --no-judge currently has no effect here.
# Confirm how judge tests are meant to be excluded before wiring it up.
INCLUDE_JUDGE=true
GENERATE_REPORT=true
MULTI_MODEL=true

for arg in "$@"; do
  case "$arg" in
    --no-live)
      INCLUDE_LIVE=false
      ;;
    --no-judge)
      INCLUDE_JUDGE=false
      ;;
    --no-report)
      GENERATE_REPORT=false
      ;;
    --single)
      MULTI_MODEL=false
      ;;
    --live)
      INCLUDE_LIVE=true
      ;;
    --judge)
      INCLUDE_JUDGE=true
      ;;
    -v|--verbose)
      PYTEST_ARGS+=(-v)
      ;;
    -vv)
      PYTEST_ARGS+=(-vv)
      ;;
    -*)
      # Forward any other option (long or short) to pytest instead of
      # letting short flags such as -x be mistaken for the -k filter.
      PYTEST_ARGS+=("$arg")
      ;;
    *)
      FILTER="$arg"
      ;;
  esac
done

# Build exclusion filter
EXCLUDE_PATTERNS=""
if [[ "$INCLUDE_LIVE" == false ]]; then
  EXCLUDE_PATTERNS="Live"
  echo " ⏭️ Skipping live LLM tests (remove --no-live to include)"
fi

#######################################
# Run the pytest eval suite once against a specific model.
# Globals:
#   PYTEST_ARGS, FILTER, EXCLUDE_PATTERNS, GENERATE_REPORT (read)
#   EVAL_REPEAT_COUNT (read, default 1)
#   EVAL_JUDGE_MODEL (exported)
#   EVAL_GENERATE_REPORT, EVAL_REPORT_SUFFIX (exported when reports are enabled)
# Arguments:
#   $1 - model name to export as EVAL_JUDGE_MODEL
#   $2 - report suffix to export as EVAL_REPORT_SUFFIX
# Returns:
#   pytest's exit status
#######################################
run_evals_for_model() {
  local model="$1"
  local report_suffix="$2"

  export EVAL_JUDGE_MODEL="$model"

  echo ""
  echo "╔════════════════════════════════════════════════════════════╗"
  echo " 🤖 Running evals with model: $model"
  echo "╚════════════════════════════════════════════════════════════╝"
  echo ""

  # Build the pytest command as an argv array (--tb=short for cleaner
  # tracebacks). Each test runs repeat_count times for pass rate calculation.
  local repeat_count="${EVAL_REPEAT_COUNT:-1}"
  local -a cmd=(python -m pytest evals/ "${PYTEST_ARGS[@]}" --tb=short "--count=${repeat_count}")

  # Assemble the -k keyword expression. The user filter is parenthesised so
  # that "a or b" combined with an exclusion means "(a or b) and not X"
  # rather than "a or (b and not X)".
  local keyword_expr=""
  if [[ -n "$FILTER" && -n "$EXCLUDE_PATTERNS" ]]; then
    keyword_expr="(${FILTER}) and not ${EXCLUDE_PATTERNS}"
  elif [[ -n "$FILTER" ]]; then
    keyword_expr="$FILTER"
  elif [[ -n "$EXCLUDE_PATTERNS" ]]; then
    keyword_expr="not ${EXCLUDE_PATTERNS}"
  fi
  if [[ -n "$keyword_expr" ]]; then
    cmd+=(-k "$keyword_expr")
  fi

  # Log the command in shell-quoted form so it can be copy-pasted.
  printf ' 🚀 Command:'
  printf ' %q' "${cmd[@]}"
  printf '\n'
  echo ""

  # Run with report generation if enabled
  if [[ "$GENERATE_REPORT" == true ]]; then
    export EVAL_GENERATE_REPORT=1
    export EVAL_REPORT_SUFFIX="$report_suffix"
  fi

  # Run and capture exit code (don't exit on failure). Executing the array
  # directly avoids eval, so filters containing quotes or shell
  # metacharacters are passed to pytest verbatim.
  local exit_code=0
  "${cmd[@]}" || exit_code=$?
  return "$exit_code"
}

# Run evals
if [[ "$GENERATE_REPORT" == true ]]; then
  echo " 📄 Report will be saved to EVALS.md"
fi

FINAL_EXIT_CODE=0

if [[ "$MULTI_MODEL" == true && "$OLLAMA_AVAILABLE" == true ]]; then
  echo " 🔄 Running evals with both supported models for comparison"

  # Create temp directory for individual model reports
  TEMP_DIR=$(mktemp -d)

  # Run with small model
  export EVAL_REPORT_PATH="${TEMP_DIR}/evals_small.md"
  run_evals_for_model "$MODEL_SMALL" "_small" || FINAL_EXIT_CODE=$?

  # Ask Ollama to unload the small model (keep_alive=0) before switching, to
  # avoid VRAM corruption. Best-effort: a failed request must not abort the run.
  echo " 🔄 Unloading models before switching..."
  curl -s "${OLLAMA_URL}/api/generate" \
    -d "{\"model\":\"$MODEL_SMALL\",\"keep_alive\":0}" > /dev/null 2>&1 \
    || echo " ⚠️ Could not unload ${MODEL_SMALL} (continuing)"
  sleep 2

  # Run with large model
  export EVAL_REPORT_PATH="${TEMP_DIR}/evals_large.md"
  run_evals_for_model "$MODEL_LARGE" "_large" || FINAL_EXIT_CODE=$?

  # Merge reports into final EVALS.md
  if [[ "$GENERATE_REPORT" == true ]]; then
    # Merge into a temp file first so a failing merge does not truncate the
    # existing EVALS.md; under set -e a merge failure still aborts the script.
    MERGED_REPORT="${TEMP_DIR}/evals_merged.md"
    python "${SCRIPT_DIR}/merge_eval_reports.py" \
      "${TEMP_DIR}/evals_small.md" "$MODEL_SMALL" \
      "${TEMP_DIR}/evals_large.md" "$MODEL_LARGE" \
      > "$MERGED_REPORT"
    cp -- "$MERGED_REPORT" "${PROJECT_ROOT}/EVALS.md"
    echo ""
    echo " 📄 Combined report saved to EVALS.md"
  fi
  # Temp directory is removed by the EXIT trap.
else
  # Single model mode
  export EVAL_JUDGE_MODEL="${EVAL_JUDGE_MODEL:-$MODEL_LARGE}"
  export EVAL_REPORT_PATH="${PROJECT_ROOT}/EVALS.md"
  run_evals_for_model "$EVAL_JUDGE_MODEL" "" || FINAL_EXIT_CODE=$?
fi

echo ""
echo "────────────────────────────────────────────────────────────────"
if [[ "$FINAL_EXIT_CODE" -eq 0 ]]; then
  echo " ✅ All evaluations passed!"
else
  echo " ⚠️ Some evaluations failed (exit code: $FINAL_EXIT_CODE)"
fi
echo ""
echo " 📖 Legend:"
echo " PASSED → Test passed"
echo " FAILED → Test failed"
echo " SKIPPED → Test skipped (missing dependencies)"
echo " XFAIL → Expected failure (documents known limitation)"
echo " XPASS → Bug fixed! (expected failure now passes)"
echo ""
if [[ "$GENERATE_REPORT" == true ]]; then
  echo " 📄 Full report: EVALS.md"
  echo ""
fi
echo "────────────────────────────────────────────────────────────────"

exit "$FINAL_EXIT_CODE"