Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
507
evals/test_nutrition_extraction.py
Normal file
507
evals/test_nutrition_extraction.py
Normal file
@@ -0,0 +1,507 @@
|
||||
"""
|
||||
Nutrition Extraction Evaluations
|
||||
|
||||
Tests the LLM's ability to extract accurate nutritional information from meal descriptions.
|
||||
This is critical for smaller models like gemma4 which may struggle with nutrition estimation.
|
||||
|
||||
Run with specific model:
|
||||
EVAL_JUDGE_MODEL=gemma4 ./scripts/run_evals.sh nutrition
|
||||
EVAL_JUDGE_MODEL=gpt-oss:20b ./scripts/run_evals.sh nutrition
|
||||
|
||||
For EVALS.md generation (always use gpt-oss:20b):
|
||||
./scripts/run_evals.sh
|
||||
"""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import (
|
||||
MockConfig,
|
||||
JUDGE_MODEL,
|
||||
JUDGE_BASE_URL,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Data - Meals with Expected Nutritional Ranges
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
|
||||
class MealTestCase:
|
||||
"""A meal test case with expected nutritional ranges."""
|
||||
description: str
|
||||
# Expected ranges as (min, max) - None means any value is acceptable
|
||||
calories_range: Tuple[int, int]
|
||||
protein_range: Tuple[int, int]
|
||||
carbs_range: Tuple[int, int]
|
||||
fat_range: Tuple[int, int]
|
||||
# Whether we expect micronutrients to be populated
|
||||
expect_micros: bool = False
|
||||
|
||||
|
||||
# Representative meals across the macro-estimation range (lean, calorie-dense, carb-heavy)
|
||||
MEAL_TEST_CASES = [
|
||||
pytest.param(
|
||||
MealTestCase(
|
||||
description="a grilled chicken breast with steamed broccoli",
|
||||
calories_range=(200, 400),
|
||||
protein_range=(25, 50),
|
||||
carbs_range=(0, 20),
|
||||
fat_range=(3, 15),
|
||||
),
|
||||
id="Nutrition: chicken with broccoli"
|
||||
),
|
||||
pytest.param(
|
||||
MealTestCase(
|
||||
description="a cheeseburger with fries",
|
||||
calories_range=(700, 1200),
|
||||
protein_range=(25, 45),
|
||||
carbs_range=(60, 120),
|
||||
fat_range=(35, 70),
|
||||
),
|
||||
id="Nutrition: cheeseburger with fries"
|
||||
),
|
||||
pytest.param(
|
||||
MealTestCase(
|
||||
description="a bowl of oatmeal with banana and honey",
|
||||
calories_range=(300, 500),
|
||||
protein_range=(6, 15),
|
||||
carbs_range=(50, 90),
|
||||
fat_range=(3, 12),
|
||||
),
|
||||
id="Nutrition: oatmeal with banana"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Evaluation Helpers
|
||||
# =============================================================================
|
||||
|
||||
def call_nutrition_extraction(
|
||||
cfg: MockConfig,
|
||||
meal_text: str
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Call the nutrition extraction prompt directly and parse the response.
|
||||
Returns the parsed JSON or None if extraction failed.
|
||||
"""
|
||||
from jarvis.tools.builtin.nutrition.log_meal import NUTRITION_SYS
|
||||
from jarvis.llm import call_llm_direct
|
||||
|
||||
user_prompt = (
|
||||
"User said (redacted):\n" + meal_text[:1200] + "\n\n"
|
||||
"Return ONLY JSON or the exact string NONE."
|
||||
)
|
||||
|
||||
raw = call_llm_direct(
|
||||
cfg.ollama_base_url,
|
||||
cfg.ollama_chat_model,
|
||||
NUTRITION_SYS,
|
||||
user_prompt,
|
||||
timeout_sec=cfg.llm_chat_timeout_sec
|
||||
) or ""
|
||||
|
||||
text = raw.strip()
|
||||
if text.upper() == "NONE":
|
||||
return None
|
||||
|
||||
try:
|
||||
# Handle markdown code blocks
|
||||
if "```" in text:
|
||||
# Extract JSON from code block
|
||||
start = text.find("```")
|
||||
end = text.rfind("```")
|
||||
if start != end:
|
||||
inner = text[start:end]
|
||||
# Remove ```json or ``` prefix
|
||||
if inner.startswith("```json"):
|
||||
inner = inner[7:]
|
||||
elif inner.startswith("```"):
|
||||
inner = inner[3:]
|
||||
text = inner.strip()
|
||||
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
|
||||
|
||||
def validate_nutrition_data(
|
||||
data: Optional[Dict[str, Any]],
|
||||
case: MealTestCase
|
||||
) -> Tuple[bool, List[str]]:
|
||||
"""
|
||||
Validate extracted nutrition data against expected ranges.
|
||||
Returns (passed, list of issues).
|
||||
"""
|
||||
issues = []
|
||||
|
||||
if data is None:
|
||||
return False, ["Extraction returned None or invalid JSON"]
|
||||
|
||||
# Check required fields exist
|
||||
required_fields = ["calories_kcal", "protein_g", "carbs_g", "fat_g"]
|
||||
for field in required_fields:
|
||||
if field not in data or data[field] is None:
|
||||
issues.append(f"Missing required field: {field}")
|
||||
|
||||
if issues:
|
||||
return False, issues
|
||||
|
||||
# Validate ranges
|
||||
def check_range(value: Any, field_name: str, expected_range: Tuple[int, int]) -> Optional[str]:
|
||||
try:
|
||||
v = float(value)
|
||||
min_val, max_val = expected_range
|
||||
if v < min_val * 0.5: # Allow 50% below minimum
|
||||
return f"{field_name}={v:.0f} too low (expected {min_val}-{max_val})"
|
||||
if v > max_val * 2.0: # Allow 100% above maximum
|
||||
return f"{field_name}={v:.0f} too high (expected {min_val}-{max_val})"
|
||||
except (TypeError, ValueError):
|
||||
return f"{field_name} is not a valid number: {value}"
|
||||
return None
|
||||
|
||||
# Check each macro
|
||||
cal_issue = check_range(data.get("calories_kcal"), "calories", case.calories_range)
|
||||
if cal_issue:
|
||||
issues.append(cal_issue)
|
||||
|
||||
prot_issue = check_range(data.get("protein_g"), "protein", case.protein_range)
|
||||
if prot_issue:
|
||||
issues.append(prot_issue)
|
||||
|
||||
carb_issue = check_range(data.get("carbs_g"), "carbs", case.carbs_range)
|
||||
if carb_issue:
|
||||
issues.append(carb_issue)
|
||||
|
||||
fat_issue = check_range(data.get("fat_g"), "fat", case.fat_range)
|
||||
if fat_issue:
|
||||
issues.append(fat_issue)
|
||||
|
||||
# Check confidence is present and reasonable
|
||||
confidence = data.get("confidence")
|
||||
if confidence is None:
|
||||
issues.append("Missing confidence score")
|
||||
elif not isinstance(confidence, (int, float)) or not (0 <= float(confidence) <= 1):
|
||||
issues.append(f"Invalid confidence: {confidence} (should be 0-1)")
|
||||
|
||||
return len(issues) == 0, issues
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Nutrition Extraction Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestNutritionExtraction:
|
||||
"""
|
||||
Tests for LLM nutrition extraction accuracy.
|
||||
|
||||
These tests verify that the model can:
|
||||
1. Parse meal descriptions correctly
|
||||
2. Return valid JSON with required fields
|
||||
3. Provide reasonable nutritional estimates
|
||||
"""
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
@pytest.mark.parametrize("case", MEAL_TEST_CASES)
|
||||
def test_meal_extraction_accuracy(self, case: MealTestCase, mock_config):
|
||||
"""
|
||||
Test that the model extracts reasonable nutrition data for common meals.
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[MEAL] Testing meal: {case.description}")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
# Call the extraction
|
||||
data = call_nutrition_extraction(mock_config, f"I had {case.description}")
|
||||
|
||||
print(f" Extracted: {json.dumps(data, indent=2) if data else 'None'}")
|
||||
|
||||
# Validate
|
||||
passed, issues = validate_nutrition_data(data, case)
|
||||
|
||||
if data:
|
||||
print(f" Calories: {data.get('calories_kcal')} (expected {case.calories_range[0]}-{case.calories_range[1]})")
|
||||
print(f" Protein: {data.get('protein_g')}g (expected {case.protein_range[0]}-{case.protein_range[1]})")
|
||||
print(f" Carbs: {data.get('carbs_g')}g (expected {case.carbs_range[0]}-{case.carbs_range[1]})")
|
||||
print(f" Fat: {data.get('fat_g')}g (expected {case.fat_range[0]}-{case.fat_range[1]})")
|
||||
print(f" Confidence: {data.get('confidence')}")
|
||||
|
||||
if issues:
|
||||
print(f" FAIL Issues: {issues}")
|
||||
else:
|
||||
print(f" PASS All values within expected ranges")
|
||||
|
||||
assert passed, f"Nutrition extraction failed: {issues}"
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_extraction_returns_valid_json_structure(self, mock_config):
|
||||
"""
|
||||
Test that extraction returns properly structured JSON with all expected fields.
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[JSON] Testing JSON structure")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
data = call_nutrition_extraction(mock_config, "I ate a sandwich for lunch")
|
||||
|
||||
print(f" Response: {json.dumps(data, indent=2) if data else 'None'}")
|
||||
|
||||
assert data is not None, "Should return valid JSON, not None"
|
||||
|
||||
# Check all expected fields
|
||||
expected_fields = [
|
||||
"description", "calories_kcal", "protein_g", "carbs_g", "fat_g",
|
||||
"fiber_g", "sugar_g", "sodium_mg", "potassium_mg", "confidence"
|
||||
]
|
||||
|
||||
missing = [f for f in expected_fields if f not in data]
|
||||
print(f" Missing fields: {missing if missing else 'None'}")
|
||||
|
||||
# Core fields are mandatory
|
||||
core_fields = ["description", "calories_kcal", "protein_g", "carbs_g", "fat_g", "confidence"]
|
||||
core_missing = [f for f in core_fields if f not in data]
|
||||
|
||||
assert not core_missing, f"Missing core fields: {core_missing}"
|
||||
print(f" PASS All core fields present")
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_extraction_handles_ambiguous_portions(self, mock_config):
|
||||
"""
|
||||
Test that model provides reasonable estimates for ambiguous portion descriptions.
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[AMBIGUOUS] Testing ambiguous portions")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
# Ambiguous description - should still get reasonable defaults
|
||||
data = call_nutrition_extraction(mock_config, "I had some rice with chicken")
|
||||
|
||||
print(f" Response: {json.dumps(data, indent=2) if data else 'None'}")
|
||||
|
||||
assert data is not None, "Should handle ambiguous portions"
|
||||
|
||||
# Should have a lower confidence for ambiguous descriptions
|
||||
confidence = data.get("confidence")
|
||||
print(f" Confidence: {confidence}")
|
||||
|
||||
# Calories should be reasonable for rice + chicken (300-800 typical)
|
||||
calories = data.get("calories_kcal")
|
||||
if calories:
|
||||
assert 150 <= float(calories) <= 1200, f"Calories {calories} outside reasonable range"
|
||||
print(f" PASS Calories {calories} within reasonable range")
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_extraction_rejects_non_food(self, mock_config):
|
||||
"""
|
||||
Test that extraction returns NONE for non-food inputs.
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[NON-FOOD] Testing non-food rejection")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
# Non-food input
|
||||
data = call_nutrition_extraction(mock_config, "I went for a walk in the park")
|
||||
|
||||
print(f" Response: {data}")
|
||||
|
||||
# Should return None (NONE response)
|
||||
assert data is None, f"Should return None for non-food input, got: {data}"
|
||||
print(f" PASS Correctly returned None")
|
||||
|
||||
|
||||
class TestNutritionToolIntegration:
|
||||
"""
|
||||
Tests for the full meal logging tool integration.
|
||||
|
||||
These test the complete flow from user input through tool execution.
|
||||
"""
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_log_meal_tool_extracts_macros(self, mock_config, eval_db):
|
||||
"""
|
||||
Test that LogMealTool properly extracts and stores macros.
|
||||
"""
|
||||
from jarvis.tools.builtin.nutrition.log_meal import LogMealTool
|
||||
from jarvis.tools.base import ToolContext
|
||||
from jarvis.memory.db import Database
|
||||
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
mock_config.use_stdin = True
|
||||
|
||||
print(f"\n[TOOL] Testing LogMealTool integration")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
tool = LogMealTool()
|
||||
|
||||
# Retry up to 3 times since smaller models can be flaky
|
||||
result = None
|
||||
for attempt in range(3):
|
||||
# Fresh DB for each attempt
|
||||
test_db = Database(":memory:", sqlite_vss_path=None)
|
||||
|
||||
messages_printed = []
|
||||
|
||||
def capture_print(msg):
|
||||
messages_printed.append(msg)
|
||||
|
||||
context = ToolContext(
|
||||
db=test_db,
|
||||
cfg=mock_config,
|
||||
system_prompt="You are a helpful assistant.",
|
||||
original_prompt="I had a grilled chicken salad for lunch",
|
||||
redacted_text="I had a grilled chicken salad for lunch",
|
||||
max_retries=0,
|
||||
user_print=capture_print,
|
||||
)
|
||||
|
||||
# Run with incomplete args to trigger extraction
|
||||
result = tool.run({}, context)
|
||||
if result.success:
|
||||
eval_db = test_db # Use the successful DB for assertions
|
||||
break
|
||||
print(f" Attempt {attempt + 1} failed, retrying...")
|
||||
|
||||
print(f" Success: {result.success}")
|
||||
print(f" Reply: {result.reply_text[:200] if result.reply_text else 'None'}...")
|
||||
|
||||
assert result.success, f"Tool should succeed after retries, got: {result.reply_text}"
|
||||
|
||||
# Check that macros are in the reply
|
||||
reply_lower = result.reply_text.lower() if result.reply_text else ""
|
||||
has_macros = any(term in reply_lower for term in ["kcal", "protein", "carb", "fat"])
|
||||
|
||||
print(f" Has macros in reply: {has_macros}")
|
||||
assert has_macros, "Reply should include macro information"
|
||||
|
||||
# Verify meal was stored in DB
|
||||
from datetime import datetime, timezone, timedelta
|
||||
now = datetime.now(timezone.utc)
|
||||
meals = test_db.get_meals_between(
|
||||
(now - timedelta(minutes=5)).isoformat(),
|
||||
(now + timedelta(minutes=5)).isoformat()
|
||||
)
|
||||
|
||||
print(f" Meals in DB: {len(meals)}")
|
||||
assert len(meals) >= 1, "Should have stored at least one meal"
|
||||
|
||||
# Check the stored meal has nutrition data
|
||||
meal = meals[0]
|
||||
# sqlite3.Row needs index or column name access
|
||||
calories = meal["calories_kcal"] if "calories_kcal" in meal.keys() else None
|
||||
print(f" Stored meal calories: {calories}")
|
||||
|
||||
has_stored_macros = calories is not None
|
||||
print(f" Has stored macros: {has_stored_macros}")
|
||||
|
||||
assert has_stored_macros, f"Stored meal should have macros"
|
||||
print(f" PASS Meal logged with macros: {calories} kcal")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Comparison Tests (for debugging model differences)
|
||||
# =============================================================================
|
||||
|
||||
class TestNutritionModelComparison:
|
||||
"""
|
||||
Tests specifically designed to compare nutrition extraction between models.
|
||||
|
||||
These help diagnose why smaller models may perform worse.
|
||||
"""
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_simple_meal_extraction(self, mock_config):
|
||||
"""
|
||||
Simple meal that any model should handle correctly.
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[SIMPLE] Simple meal test (baseline)")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
# Very simple, common meal
|
||||
data = call_nutrition_extraction(mock_config, "I had 2 boiled eggs")
|
||||
|
||||
print(f" Response: {json.dumps(data, indent=2) if data else 'None'}")
|
||||
|
||||
assert data is not None, "Should extract simple meal"
|
||||
|
||||
# 2 boiled eggs: ~140-160 kcal, 12-14g protein, 0-2g carbs, 10-12g fat
|
||||
# Note: Smaller models may sometimes parse as 1 egg (~78 kcal), so we use a loose range
|
||||
calories = data.get("calories_kcal")
|
||||
protein = data.get("protein_g")
|
||||
|
||||
if calories:
|
||||
# Loose range: 1-2 eggs worth (some models miss quantity)
|
||||
assert 60 <= float(calories) <= 350, f"Calories {calories} way off for eggs"
|
||||
|
||||
if protein:
|
||||
assert 5 <= float(protein) <= 20, f"Protein {protein}g way off for eggs"
|
||||
|
||||
print(f" PASS Simple extraction succeeded")
|
||||
|
||||
@pytest.mark.eval
|
||||
@requires_judge_llm
|
||||
def test_extraction_with_quantities(self, mock_config):
|
||||
"""
|
||||
Test extraction with explicit quantities (should improve accuracy).
|
||||
"""
|
||||
mock_config.ollama_base_url = JUDGE_BASE_URL
|
||||
mock_config.ollama_chat_model = JUDGE_MODEL
|
||||
mock_config.llm_chat_timeout_sec = 120.0
|
||||
|
||||
print(f"\n[QUANTITY] Quantity extraction test")
|
||||
print(f" Model: {JUDGE_MODEL}")
|
||||
|
||||
# Explicit quantities should help smaller models
|
||||
data = call_nutrition_extraction(
|
||||
mock_config,
|
||||
"I had 100g of cooked white rice and 150g of grilled chicken breast"
|
||||
)
|
||||
|
||||
print(f" Response: {json.dumps(data, indent=2) if data else 'None'}")
|
||||
|
||||
assert data is not None, "Should extract meal with quantities"
|
||||
|
||||
# 100g rice: ~130 kcal, 2.7g protein, 28g carbs, 0.3g fat
|
||||
# 150g chicken: ~248 kcal, 46g protein, 0g carbs, 5.4g fat
|
||||
# Total: ~378 kcal, ~49g protein, ~28g carbs, ~6g fat
|
||||
# Note: Models can vary significantly; some may overestimate if assuming larger portions
|
||||
|
||||
calories = data.get("calories_kcal")
|
||||
protein = data.get("protein_g")
|
||||
|
||||
if calories:
|
||||
assert 200 <= float(calories) <= 800, f"Calories {calories} off for rice+chicken"
|
||||
|
||||
if protein:
|
||||
# Wider range to accommodate model variance (some assume larger chicken portions)
|
||||
assert 20 <= float(protein) <= 120, f"Protein {protein}g off for rice+chicken"
|
||||
|
||||
print(f" PASS Quantity-based extraction succeeded")
|
||||
Reference in New Issue
Block a user