""" Nutrition Extraction Evaluations Tests the LLM's ability to extract accurate nutritional information from meal descriptions. This is critical for smaller models like gemma4 which may struggle with nutrition estimation. Run with specific model: EVAL_JUDGE_MODEL=gemma4 ./scripts/run_evals.sh nutrition EVAL_JUDGE_MODEL=gpt-oss:20b ./scripts/run_evals.sh nutrition For EVALS.md generation (always use gpt-oss:20b): ./scripts/run_evals.sh """ import json from dataclasses import dataclass from typing import Dict, Any, Optional, List, Tuple import pytest from conftest import requires_judge_llm from helpers import ( MockConfig, JUDGE_MODEL, JUDGE_BASE_URL, ) # ============================================================================= # Test Data - Meals with Expected Nutritional Ranges # ============================================================================= @dataclass class MealTestCase: """A meal test case with expected nutritional ranges.""" description: str # Expected ranges as (min, max) - None means any value is acceptable calories_range: Tuple[int, int] protein_range: Tuple[int, int] carbs_range: Tuple[int, int] fat_range: Tuple[int, int] # Whether we expect micronutrients to be populated expect_micros: bool = False # Representative meals across the macro-estimation range (lean, calorie-dense, carb-heavy) MEAL_TEST_CASES = [ pytest.param( MealTestCase( description="a grilled chicken breast with steamed broccoli", calories_range=(200, 400), protein_range=(25, 50), carbs_range=(0, 20), fat_range=(3, 15), ), id="Nutrition: chicken with broccoli" ), pytest.param( MealTestCase( description="a cheeseburger with fries", calories_range=(700, 1200), protein_range=(25, 45), carbs_range=(60, 120), fat_range=(35, 70), ), id="Nutrition: cheeseburger with fries" ), pytest.param( MealTestCase( description="a bowl of oatmeal with banana and honey", calories_range=(300, 500), protein_range=(6, 15), carbs_range=(50, 90), fat_range=(3, 12), ), id="Nutrition: oatmeal with banana" ), ] # ============================================================================= # Evaluation Helpers # ============================================================================= def call_nutrition_extraction( cfg: MockConfig, meal_text: str ) -> Optional[Dict[str, Any]]: """ Call the nutrition extraction prompt directly and parse the response. Returns the parsed JSON or None if extraction failed. """ from jarvis.tools.builtin.nutrition.log_meal import NUTRITION_SYS from jarvis.llm import call_llm_direct user_prompt = ( "User said (redacted):\n" + meal_text[:1200] + "\n\n" "Return ONLY JSON or the exact string NONE." ) raw = call_llm_direct( cfg.ollama_base_url, cfg.ollama_chat_model, NUTRITION_SYS, user_prompt, timeout_sec=cfg.llm_chat_timeout_sec ) or "" text = raw.strip() if text.upper() == "NONE": return None try: # Handle markdown code blocks if "```" in text: # Extract JSON from code block start = text.find("```") end = text.rfind("```") if start != end: inner = text[start:end] # Remove ```json or ``` prefix if inner.startswith("```json"): inner = inner[7:] elif inner.startswith("```"): inner = inner[3:] text = inner.strip() return json.loads(text) except json.JSONDecodeError: return None def validate_nutrition_data( data: Optional[Dict[str, Any]], case: MealTestCase ) -> Tuple[bool, List[str]]: """ Validate extracted nutrition data against expected ranges. Returns (passed, list of issues). """ issues = [] if data is None: return False, ["Extraction returned None or invalid JSON"] # Check required fields exist required_fields = ["calories_kcal", "protein_g", "carbs_g", "fat_g"] for field in required_fields: if field not in data or data[field] is None: issues.append(f"Missing required field: {field}") if issues: return False, issues # Validate ranges def check_range(value: Any, field_name: str, expected_range: Tuple[int, int]) -> Optional[str]: try: v = float(value) min_val, max_val = expected_range if v < min_val * 0.5: # Allow 50% below minimum return f"{field_name}={v:.0f} too low (expected {min_val}-{max_val})" if v > max_val * 2.0: # Allow 100% above maximum return f"{field_name}={v:.0f} too high (expected {min_val}-{max_val})" except (TypeError, ValueError): return f"{field_name} is not a valid number: {value}" return None # Check each macro cal_issue = check_range(data.get("calories_kcal"), "calories", case.calories_range) if cal_issue: issues.append(cal_issue) prot_issue = check_range(data.get("protein_g"), "protein", case.protein_range) if prot_issue: issues.append(prot_issue) carb_issue = check_range(data.get("carbs_g"), "carbs", case.carbs_range) if carb_issue: issues.append(carb_issue) fat_issue = check_range(data.get("fat_g"), "fat", case.fat_range) if fat_issue: issues.append(fat_issue) # Check confidence is present and reasonable confidence = data.get("confidence") if confidence is None: issues.append("Missing confidence score") elif not isinstance(confidence, (int, float)) or not (0 <= float(confidence) <= 1): issues.append(f"Invalid confidence: {confidence} (should be 0-1)") return len(issues) == 0, issues # ============================================================================= # Nutrition Extraction Tests # ============================================================================= class TestNutritionExtraction: """ Tests for LLM nutrition extraction accuracy. These tests verify that the model can: 1. Parse meal descriptions correctly 2. Return valid JSON with required fields 3. Provide reasonable nutritional estimates """ @pytest.mark.eval @requires_judge_llm @pytest.mark.parametrize("case", MEAL_TEST_CASES) def test_meal_extraction_accuracy(self, case: MealTestCase, mock_config): """ Test that the model extracts reasonable nutrition data for common meals. """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[MEAL] Testing meal: {case.description}") print(f" Model: {JUDGE_MODEL}") # Call the extraction data = call_nutrition_extraction(mock_config, f"I had {case.description}") print(f" Extracted: {json.dumps(data, indent=2) if data else 'None'}") # Validate passed, issues = validate_nutrition_data(data, case) if data: print(f" Calories: {data.get('calories_kcal')} (expected {case.calories_range[0]}-{case.calories_range[1]})") print(f" Protein: {data.get('protein_g')}g (expected {case.protein_range[0]}-{case.protein_range[1]})") print(f" Carbs: {data.get('carbs_g')}g (expected {case.carbs_range[0]}-{case.carbs_range[1]})") print(f" Fat: {data.get('fat_g')}g (expected {case.fat_range[0]}-{case.fat_range[1]})") print(f" Confidence: {data.get('confidence')}") if issues: print(f" FAIL Issues: {issues}") else: print(f" PASS All values within expected ranges") assert passed, f"Nutrition extraction failed: {issues}" @pytest.mark.eval @requires_judge_llm def test_extraction_returns_valid_json_structure(self, mock_config): """ Test that extraction returns properly structured JSON with all expected fields. """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[JSON] Testing JSON structure") print(f" Model: {JUDGE_MODEL}") data = call_nutrition_extraction(mock_config, "I ate a sandwich for lunch") print(f" Response: {json.dumps(data, indent=2) if data else 'None'}") assert data is not None, "Should return valid JSON, not None" # Check all expected fields expected_fields = [ "description", "calories_kcal", "protein_g", "carbs_g", "fat_g", "fiber_g", "sugar_g", "sodium_mg", "potassium_mg", "confidence" ] missing = [f for f in expected_fields if f not in data] print(f" Missing fields: {missing if missing else 'None'}") # Core fields are mandatory core_fields = ["description", "calories_kcal", "protein_g", "carbs_g", "fat_g", "confidence"] core_missing = [f for f in core_fields if f not in data] assert not core_missing, f"Missing core fields: {core_missing}" print(f" PASS All core fields present") @pytest.mark.eval @requires_judge_llm def test_extraction_handles_ambiguous_portions(self, mock_config): """ Test that model provides reasonable estimates for ambiguous portion descriptions. """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[AMBIGUOUS] Testing ambiguous portions") print(f" Model: {JUDGE_MODEL}") # Ambiguous description - should still get reasonable defaults data = call_nutrition_extraction(mock_config, "I had some rice with chicken") print(f" Response: {json.dumps(data, indent=2) if data else 'None'}") assert data is not None, "Should handle ambiguous portions" # Should have a lower confidence for ambiguous descriptions confidence = data.get("confidence") print(f" Confidence: {confidence}") # Calories should be reasonable for rice + chicken (300-800 typical) calories = data.get("calories_kcal") if calories: assert 150 <= float(calories) <= 1200, f"Calories {calories} outside reasonable range" print(f" PASS Calories {calories} within reasonable range") @pytest.mark.eval @requires_judge_llm def test_extraction_rejects_non_food(self, mock_config): """ Test that extraction returns NONE for non-food inputs. """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[NON-FOOD] Testing non-food rejection") print(f" Model: {JUDGE_MODEL}") # Non-food input data = call_nutrition_extraction(mock_config, "I went for a walk in the park") print(f" Response: {data}") # Should return None (NONE response) assert data is None, f"Should return None for non-food input, got: {data}" print(f" PASS Correctly returned None") class TestNutritionToolIntegration: """ Tests for the full meal logging tool integration. These test the complete flow from user input through tool execution. """ @pytest.mark.eval @requires_judge_llm def test_log_meal_tool_extracts_macros(self, mock_config, eval_db): """ Test that LogMealTool properly extracts and stores macros. """ from jarvis.tools.builtin.nutrition.log_meal import LogMealTool from jarvis.tools.base import ToolContext from jarvis.memory.db import Database mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 mock_config.use_stdin = True print(f"\n[TOOL] Testing LogMealTool integration") print(f" Model: {JUDGE_MODEL}") tool = LogMealTool() # Retry up to 3 times since smaller models can be flaky result = None for attempt in range(3): # Fresh DB for each attempt test_db = Database(":memory:", sqlite_vss_path=None) messages_printed = [] def capture_print(msg): messages_printed.append(msg) context = ToolContext( db=test_db, cfg=mock_config, system_prompt="You are a helpful assistant.", original_prompt="I had a grilled chicken salad for lunch", redacted_text="I had a grilled chicken salad for lunch", max_retries=0, user_print=capture_print, ) # Run with incomplete args to trigger extraction result = tool.run({}, context) if result.success: eval_db = test_db # Use the successful DB for assertions break print(f" Attempt {attempt + 1} failed, retrying...") print(f" Success: {result.success}") print(f" Reply: {result.reply_text[:200] if result.reply_text else 'None'}...") assert result.success, f"Tool should succeed after retries, got: {result.reply_text}" # Check that macros are in the reply reply_lower = result.reply_text.lower() if result.reply_text else "" has_macros = any(term in reply_lower for term in ["kcal", "protein", "carb", "fat"]) print(f" Has macros in reply: {has_macros}") assert has_macros, "Reply should include macro information" # Verify meal was stored in DB from datetime import datetime, timezone, timedelta now = datetime.now(timezone.utc) meals = test_db.get_meals_between( (now - timedelta(minutes=5)).isoformat(), (now + timedelta(minutes=5)).isoformat() ) print(f" Meals in DB: {len(meals)}") assert len(meals) >= 1, "Should have stored at least one meal" # Check the stored meal has nutrition data meal = meals[0] # sqlite3.Row needs index or column name access calories = meal["calories_kcal"] if "calories_kcal" in meal.keys() else None print(f" Stored meal calories: {calories}") has_stored_macros = calories is not None print(f" Has stored macros: {has_stored_macros}") assert has_stored_macros, f"Stored meal should have macros" print(f" PASS Meal logged with macros: {calories} kcal") # ============================================================================= # Comparison Tests (for debugging model differences) # ============================================================================= class TestNutritionModelComparison: """ Tests specifically designed to compare nutrition extraction between models. These help diagnose why smaller models may perform worse. """ @pytest.mark.eval @requires_judge_llm def test_simple_meal_extraction(self, mock_config): """ Simple meal that any model should handle correctly. """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[SIMPLE] Simple meal test (baseline)") print(f" Model: {JUDGE_MODEL}") # Very simple, common meal data = call_nutrition_extraction(mock_config, "I had 2 boiled eggs") print(f" Response: {json.dumps(data, indent=2) if data else 'None'}") assert data is not None, "Should extract simple meal" # 2 boiled eggs: ~140-160 kcal, 12-14g protein, 0-2g carbs, 10-12g fat # Note: Smaller models may sometimes parse as 1 egg (~78 kcal), so we use a loose range calories = data.get("calories_kcal") protein = data.get("protein_g") if calories: # Loose range: 1-2 eggs worth (some models miss quantity) assert 60 <= float(calories) <= 350, f"Calories {calories} way off for eggs" if protein: assert 5 <= float(protein) <= 20, f"Protein {protein}g way off for eggs" print(f" PASS Simple extraction succeeded") @pytest.mark.eval @requires_judge_llm def test_extraction_with_quantities(self, mock_config): """ Test extraction with explicit quantities (should improve accuracy). """ mock_config.ollama_base_url = JUDGE_BASE_URL mock_config.ollama_chat_model = JUDGE_MODEL mock_config.llm_chat_timeout_sec = 120.0 print(f"\n[QUANTITY] Quantity extraction test") print(f" Model: {JUDGE_MODEL}") # Explicit quantities should help smaller models data = call_nutrition_extraction( mock_config, "I had 100g of cooked white rice and 150g of grilled chicken breast" ) print(f" Response: {json.dumps(data, indent=2) if data else 'None'}") assert data is not None, "Should extract meal with quantities" # 100g rice: ~130 kcal, 2.7g protein, 28g carbs, 0.3g fat # 150g chicken: ~248 kcal, 46g protein, 0g carbs, 5.4g fat # Total: ~378 kcal, ~49g protein, ~28g carbs, ~6g fat # Note: Models can vary significantly; some may overestimate if assuming larger portions calories = data.get("calories_kcal") protein = data.get("protein_g") if calories: assert 200 <= float(calories) <= 800, f"Calories {calories} off for rice+chicken" if protein: # Wider range to accommodate model variance (some assume larger chicken portions) assert 20 <= float(protein) <= 120, f"Protein {protein}g off for rice+chicken" print(f" PASS Quantity-based extraction succeeded")