# javis_bot/evals/conftest.py

"""
Shared fixtures and configuration for evals.

Evals test end-to-end quality of the reply engine with real or mock LLM responses.
"""

import sys
import os
import re
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import pytest

# Find the repository root: the nearest ancestor directory of this file that
# contains ``src/jarvis``. If none does, fall back to the directory two levels
# above this file.
_this_file = Path(__file__).resolve()
ROOT = next(
    (
        ancestor
        for ancestor in _this_file.parents
        if (ancestor / "src" / "jarvis").exists()
    ),
    None,
)
if ROOT is None:
    ROOT = _this_file.parent.parent

SRC = ROOT / "src"
EVALS = ROOT / "evals"

# Each missing entry is pushed to the front in turn, so a freshly added EVALS
# ends up ahead of SRC, which ends up ahead of ROOT.
for _import_root in map(str, (ROOT, SRC, EVALS)):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

from helpers import MockConfig, JUDGE_MODEL, is_judge_llm_available


# =============================================================================
# Shared Markers
# =============================================================================

# Probe judge availability once, when this module is imported, so the marker
# below is built from a single stored answer.
_JUDGE_LLM_AVAILABLE = is_judge_llm_available()
# Marker for tests that need the judge model; they are skipped when the probe
# above reported it unavailable.
requires_judge_llm = pytest.mark.skipif(
    not _JUDGE_LLM_AVAILABLE,
    reason="Judge LLM not available"
)


# =============================================================================
# Test Case Descriptions
# =============================================================================

# Human-readable descriptions for test classes, keyed by class name.
# The markdown report prints the matching entry as a quote line under each
# class heading; classes without an entry simply get no description line.
CLASS_DESCRIPTIONS = {
    "TestResponseQuality": "LLM-as-judge evaluations for response quality",
    "TestContextUtilization": "Tests that agent uses location/time/memory context",
    "TestToolUsage": "Validates tool selection and argument quality",
    "TestMultiStepReasoning": "Complex scenarios requiring tool chaining and synthesis",
    "TestMemoryEnrichment": "Tests automatic memory enrichment keyword extraction",
    "TestLiveEndToEnd": "End-to-end tests against real LLM inference",
    "TestNutritionExtraction": "Tests LLM nutrition extraction accuracy for meal logging",
    "TestNutritionToolIntegration": "Tests full meal logging tool with macro extraction",
    "TestNutritionModelComparison": "Baseline tests for comparing nutrition extraction across models",
    "TestIntentJudgeAccuracy": "Intent judge accuracy for voice command classification",
    "TestIntentJudgePromptQuality": "Intent judge prompt construction quality",
    "TestIntentJudgeFallback": "Intent judge fallback behaviour when unavailable",
    "TestIntentJudgeMultiSegment": "Intent judge with multi-segment buffers and multi-person conversations",
    "TestWakeWordValidationSafetyNet": "Integration: listener rejects judge hallucinations when no wake word present",
    "TestEchoReasoningDistrust": "Integration: listener overrides judge echo claims when EchoDetector cleared",
    "TestHotWindowHeuristicAccuracy": "Integration: could_be_hot_window heuristic passes correct mode to judge",
    "TestProcessedSegmentFilteringIntegration": "Integration: processed segments excluded from judge prompt",
    "TestHotWindowUsesRawText": "Integration: hot window preserves full user text, wake word uses judge extraction",
    "TestMultiSegmentBufferIntegration": "Integration: multi-segment buffer with TTS echoes handled correctly",
    "TestStopCommandBypassesJudge": "Integration: stop commands during TTS bypass judge entirely",
    "TestKnowledgeExtractionQuality": "Tests that novel knowledge is correctly extracted from summaries",
    "TestKnowledgeExtractionRejection": "Tests that noise, stale data, and common knowledge are rejected",
    "TestKnowledgeExtractionReframing": "Tests that interaction descriptions are reframed as knowledge",
    "TestKnowledgeExtractionJudge": "LLM-as-judge evaluations of extraction quality",
    "TestTopicSwitching": "Tests correct tool selection when conversation topic changes",
    "TestFollowUpContext": "Tests context retention for follow-up questions",
    "TestMultiTurnExtended": "Extended multi-turn scenarios with longer conversations",
    "TestGreetingNoToolsLive": "Tests that greetings don't trigger tool calls",
    "TestHelpfulness": "Tests that agent uses tools proactively instead of deflecting",
    "TestDiaryRecencyOrder": "Tests that diary search returns newer entries before older ones",
    "TestGraphRecencySuperseding": "Tests that graph handles contradicting facts with date context",
    "TestRecencyJudge": "LLM judge evaluates whether newer information is preferred over older",
    "TestMalformedResponseAfterTools": "Tests that malformed LLM output after tool results is not surfaced",
    "TestCelebrityIdentityThenFollowUp": "Two-turn celebrity flow: identity query then pronoun follow-up",
    "TestSearchFailureWikipediaRescue": "Wikipedia-rescue payload is consumed correctly, not confabulated over",
    "TestMultiStepEntityQuery": "Single query requiring two sequential webSearch calls (director + filmography)",
}

# Descriptions for non-parametrized tests, keyed by bare test function name.
# Consulted only when a test has no parametrize id; a test missing from this
# table falls back to a humanised version of its function name.
TEST_DESCRIPTIONS = {
    "test_weather_response_quality": "Judge evaluates weather response quality",
    "test_location_context_in_search": "Location context flows to search queries",
    "test_simple_search_flow": "Agent calls webSearch for info queries",
    "test_tool_chaining_search_then_fetch": "Agent chains search → fetch for details",
    "test_nutrition_advice_uses_memory_and_data": "Agent uses memory + nutrition data",
    "test_enrichment_extracts_correct_keywords": "Enrichment extracts personalization keywords",
    "test_enrichment_provides_context_to_llm": "Enrichment results appear in system message",
    "test_llm_uses_enrichment_for_personalised_queries": "LLM uses enrichment-surfaced interests for personalised search",
    "test_weather_query_live": "Weather query is answered with current conditions",
    "test_personalized_query_recalls_memory_live": "Assistant checks memory before asking about interests",
    "test_interest_flavoured_query_live": "Interest-flavoured phrasings surface seeded interests in the reply",
    # Nutrition extraction tests
    "test_meal_extraction_accuracy": "Extracts accurate macros for common meals",
    "test_extraction_returns_valid_json_structure": "Returns valid JSON with all required fields",
    "test_extraction_handles_ambiguous_portions": "Handles ambiguous portion descriptions",
    "test_extraction_rejects_non_food": "Returns NONE for non-food inputs",
    "test_log_meal_tool_extracts_macros": "LogMealTool stores meals with macros",
    "test_simple_meal_extraction": "Simple meal baseline (2 boiled eggs)",
    "test_extraction_with_quantities": "Extraction with explicit quantities",
    # Multi-turn context tests
    "test_weather_then_store_hours": "Topic switch: weather → store hours uses webSearch",
    "test_weather_then_restaurant_search": "Topic switch: weather → restaurant uses webSearch",
    "test_search_then_weather": "Topic switch: search → weather uses getWeather",
    "test_follow_up_references_previous_context": "Follow-up references previous turn context",
    "test_three_turn_topic_changes": "3-turn conversation with topic changes",
    "test_rapid_topic_switching": "Rapid back-and-forth topic switching",
    # Greeting no-tools live tests
    "test_greeting_no_tools_live": "Greetings do not trigger tool calls",
    "test_user_instructions_no_tools_live": "User instructions do not trigger tool calls",
    "test_weather_still_triggers_tools_live": "Weather query still triggers tools after a greeting",
    # Helpfulness / anti-deflection tests
    "test_no_deflection_for_weather_forecast_live": "No deflection on weather forecast questions",
    "test_no_deflection_for_answerable_queries_live": "No deflection on answerable questions",
    "test_tool_retry_after_failure_live": "Assistant retries a tool after the first attempt fails",
    "test_graph_knowledge_surfaced_in_reply_live": "Graph-enriched facts surface in the reply, no denial",
    "test_does_not_deny_long_term_memory_live": "Assistant does not deny having long-term memory",
    # Multi-step entity / complex flow tests
    "test_chained_research_possessor_director": "Chained research: who directed Possessor and what else have they made",
    "test_parallel_comparison_paris_vs_london": "Parallel weather lookup: compare Paris and London",
    "test_director_then_filmography_requires_two_searches": "Director-then-filmography needs two searches",
    "test_two_turn_celebrity_flow": "Two-turn celebrity flow: identity then pronoun follow-up",
    "test_single_weather_call_terminates": "Single weather query ends after one tool call",
    "test_max_turn_triggers_digest": "Max-turn cap delivers a digest reply, never silence",
    # Knowledge extraction
    "test_judge_mixed_summary_filters_noise": "Mixed summary: keep novel facts, drop stale weather/recommendations",
    "test_judge_empty_conversation_returns_empty": "Trivial conversations produce no extracted facts",
    "test_open_ended_prompt_grounds_in_graph_context_live": "Open-ended prompt grounds in stored knowledge",
}


def _parse_parametrize_id(node_id: str) -> Optional[str]:
    """Extract the parametrize case ID from a node_id like 'test_foo[case-name]'.

    Returns None if the bracket content is just a pytest-repeat suffix like '1-3'.
    """
    match = re.search(r'\[(.+)\]$', node_id)
    if not match:
        return None

    case_id = match.group(1)

    # Check if this is just a pytest-repeat suffix (e.g., "1-3", "2-3")
    # These have format "N-M" where N is run number and M is total runs
    if re.match(r'^\d+-\d+$', case_id):
        return None

    # Strip pytest-repeat suffix from the end of case IDs (e.g., "greeting-1-3" -> "greeting")
    case_id = re.sub(r'-\d+-\d+$', '', case_id)

    return case_id


def _extract_judge_notes(stdout: Optional[str]) -> Optional[Dict[str, str]]:
    """Parse judge evaluation output from stdout."""
    if not stdout:
        return None

    notes = {}

    # Extract score
    score_match = re.search(r'Score:\s*([\d.]+)', stdout)
    if score_match:
        notes["score"] = score_match.group(1)

    # Extract reasoning
    reasoning_match = re.search(r'Reasoning:\s*(.+?)(?:\n|$)', stdout)
    if reasoning_match:
        notes["reasoning"] = reasoning_match.group(1).strip()

    # Extract response being evaluated
    response_match = re.search(r'Response:\s*(.+?)(?:\.\.\.|$)', stdout)
    if response_match:
        notes["response"] = response_match.group(1).strip()

    return notes if notes else None


def _humanise_test_name(test_name: str) -> str:
    """Turn ``test_some_thing_does_X`` into ``Some thing does X``.

    Last-resort fallback used when a test has no entry in TEST_DESCRIPTIONS
    and no parametrize id. Keeps the report readable for non-technical
    readers — they shouldn't have to parse Python identifiers.
    """
    name = test_name
    if name.startswith("test_"):
        name = name[5:]
    name = name.replace("_", " ").strip()
    if not name:
        return test_name
    return name[0].upper() + name[1:]


def _strip_redundant_prefix(label: str) -> str:
    """Drop noisy prefixes from human-readable case labels.

    Every eval is live by design (the suite drives a real model), so the
    ``Live:`` / ``Live `` prefix is uninformative. Same for trailing model
    suffixes like ``-gpt-oss:20b`` that pytest cross-products into
    parametrize ids — the Model column already shows that.
    """
    s = label.strip()
    # Trailing "-<model>" suffix injected by pytest parametrize cross-product.
    for suffix in ("-gpt-oss:20b", "-gemma4:e2b", "-gemma4:e4b"):
        if s.endswith(suffix):
            s = s[: -len(suffix)].rstrip()
            break
    # Leading "Live:" / "Live " prefix is redundant — the suite is live.
    lower = s.lower()
    for prefix in ("live: ", "live: ", "live "):
        if lower.startswith(prefix):
            s = s[len(prefix):].lstrip()
            if s:
                s = s[0].upper() + s[1:]
            break
    return s


def _get_test_description(test_name: str, case_id: Optional[str]) -> str:
    """Choose the report label for a test case.

    Args:
        test_name: Bare test function name, used for the TEST_DESCRIPTIONS
            lookup and for the humanised fallback.
        case_id: Parametrize id, if any. For parametrized tests this id IS
            the description (set via ``pytest.param(id=...)``).

    Returns:
        The cleaned ``case_id`` when one is given; otherwise the cleaned
        TEST_DESCRIPTIONS entry for ``test_name``; otherwise a humanised form
        of ``test_name`` so the report never shows raw Python identifiers.
    """
    label = case_id or TEST_DESCRIPTIONS.get(test_name)
    if label is None:
        # No parametrize id and no table entry: derive something readable.
        return _humanise_test_name(test_name)
    return _strip_redundant_prefix(label)


# =============================================================================
# Markdown Report Generation
# =============================================================================

@dataclass
class TestResult:
    """Captured result from a single test run.

    One instance is created per recorded pytest report; repeated runs of the
    same test each get their own instance and are grouped later.
    """
    # Full pytest node id of the run.
    name: str
    outcome: str  # passed, failed, skipped, xfailed, xpassed
    # Duration taken from the pytest report for the recorded phase.
    duration: float
    # Class component parsed from the node id.
    class_name: str
    # Test function name with any "[...]" suffix removed.
    test_name: str
    # Parametrize id with pytest-repeat suffixes stripped; None if absent.
    case_id: Optional[str] = None
    # Human-readable label shown in the report.
    description: str = ""
    # Skip reason, when the run was skipped and a reason was available.
    reason: Optional[str] = None
    # Captured stdout of the run, if any.
    stdout: Optional[str] = None
    # Score/reasoning/response parsed from stdout, if any were found.
    judge_notes: Optional[Dict[str, str]] = None


@dataclass
class AggregatedTestResult:
    """Aggregated results from multiple runs of the same test.

    ``runs`` holds the individual run results; every statistic below is
    derived from it on access. Each run is expected to expose ``outcome``,
    ``duration``, ``reason`` and ``judge_notes``.
    """
    name: str
    class_name: str
    test_name: str
    description: str
    runs: List["TestResult"] = field(default_factory=list)

    @property
    def pass_count(self) -> int:
        """Number of runs that passed, including unexpected passes (xpassed)."""
        return sum(1 for r in self.runs if r.outcome in ("passed", "xpassed"))

    @property
    def fail_count(self) -> int:
        """Number of runs whose outcome is ``failed``."""
        return sum(1 for r in self.runs if r.outcome == "failed")

    @property
    def skip_count(self) -> int:
        """Number of runs whose outcome is ``skipped``."""
        return sum(1 for r in self.runs if r.outcome == "skipped")

    @property
    def xfail_count(self) -> int:
        """Number of runs whose outcome is ``xfailed``."""
        return sum(1 for r in self.runs if r.outcome == "xfailed")

    @property
    def total_runs(self) -> int:
        """Total number of recorded runs, whatever their outcome."""
        return len(self.runs)

    @property
    def pass_rate(self) -> float:
        """Percentage of passes among passed+failed runs (0.0 if there are none)."""
        countable = self.pass_count + self.fail_count
        return (self.pass_count / countable * 100) if countable > 0 else 0.0

    @property
    def total_duration(self) -> float:
        """Sum of the durations of all runs."""
        return sum(r.duration for r in self.runs)

    @property
    def avg_duration(self) -> float:
        """Mean duration per run (0.0 when there are no runs)."""
        return self.total_duration / len(self.runs) if self.runs else 0.0

    @property
    def overall_outcome(self) -> str:
        """Classify the test across all of its runs.

        Skipped runs carry no pass/fail signal, so they are ignored unless
        every run was skipped. This keeps the label consistent with
        ``pass_rate_str``: a test whose executed runs all passed is "passed"
        even if some other runs were skipped.

        Returns:
            "skipped" when no run was executed, "xfailed" / "passed" /
            "failed" when every executed run had that outcome, otherwise
            "partial".
        """
        executed = self.total_runs - self.skip_count
        if executed == 0:
            return "skipped"
        if self.xfail_count == executed:
            return "xfailed"
        if self.pass_count == executed:
            return "passed"
        if self.fail_count == executed:
            return "failed"
        return "partial"

    @property
    def pass_rate_str(self) -> str:
        """Format pass rate as 'X/Y (Z%)'.

        When no run passed or failed, returns "SKIPPED" if any run was
        skipped, else an "N/M XFAIL" count if any run was an expected
        failure, else "N/A".
        """
        countable = self.pass_count + self.fail_count
        if countable == 0:
            if self.skip_count > 0:
                return "SKIPPED"
            if self.xfail_count > 0:
                return f"{self.xfail_count}/{self.total_runs} XFAIL"
            return "N/A"
        return f"{self.pass_count}/{countable} ({self.pass_rate:.0f}%)"

    @property
    def judge_notes(self) -> Optional[Dict[str, str]]:
        """Return judge notes from first run that has them."""
        for run in self.runs:
            if run.judge_notes:
                return run.judge_notes
        return None

    @property
    def reason(self) -> Optional[str]:
        """Return reason from first run that has it."""
        for run in self.runs:
            if run.reason:
                return run.reason
        return None


def _strip_repeat_suffix(node_id: str) -> str:
    """
    Strip pytest-repeat iteration suffix from node ID.

    pytest-repeat adds suffixes like [1-3], [2-3], [3-3] to repeated tests.
    This strips those suffixes to get the base test identifier for aggregation.
    """
    # Match patterns like [1-3], [2-3], [3-3] at the end of node ID
    # But preserve parametrize IDs like [greeting-en], [weather-query], etc.
    return re.sub(r'\[(\d+)-(\d+)\]$', '', node_id)


def _get_aggregation_key(result: TestResult) -> str:
    """Build the key under which repeated runs of one test are grouped.

    Args:
        result: A single run result.

    Returns:
        ``class_name::test_name``, extended with ``::case_id`` when the run
        has a parametrize id. The id is expected to have had its repeat
        suffix removed already by _parse_parametrize_id.
    """
    base_key = "::".join((result.class_name, result.test_name))
    if not result.case_id:
        return base_key
    return "::".join((base_key, result.case_id))


@dataclass
class EvalReport:
    """Aggregated eval results for markdown generation.

    Individual run results are appended with ``add_result``. Repeated runs of
    the same test are folded together by ``get_aggregated_results``, and
    ``generate_markdown`` renders the summary plus per-class details.
    """
    results: List[TestResult] = field(default_factory=list)
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    judge_model: str = ""

    def add_result(self, result: TestResult):
        """Record the result of one test run."""
        self.results.append(result)

    def get_aggregated_results(self) -> List[AggregatedTestResult]:
        """Aggregate results from multiple runs of the same test.

        Returns:
            One AggregatedTestResult per aggregation key, in the order the
            keys were first seen.
        """
        aggregated: Dict[str, AggregatedTestResult] = {}

        for result in self.results:
            key = _get_aggregation_key(result)
            if key not in aggregated:
                # Name and description are taken from the first run seen for
                # this key; the description should already have repeat
                # suffixes stripped.
                aggregated[key] = AggregatedTestResult(
                    name=_strip_repeat_suffix(result.name),
                    class_name=result.class_name,
                    test_name=result.test_name,
                    description=result.description,
                )
            aggregated[key].runs.append(result)

        return list(aggregated.values())

    @property
    def total_unique_tests(self) -> int:
        """Number of distinct tests after aggregating repeated runs."""
        return len(self.get_aggregated_results())

    @property
    def total_runs(self) -> int:
        """Number of individual run results recorded."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of runs with outcome ``passed`` (xpassed is counted separately)."""
        return sum(1 for r in self.results if r.outcome == "passed")

    @property
    def failed(self) -> int:
        """Number of runs with outcome ``failed``."""
        return sum(1 for r in self.results if r.outcome == "failed")

    @property
    def skipped(self) -> int:
        """Number of runs with outcome ``skipped``."""
        return sum(1 for r in self.results if r.outcome == "skipped")

    @property
    def xfailed(self) -> int:
        """Number of runs with outcome ``xfailed``."""
        return sum(1 for r in self.results if r.outcome == "xfailed")

    @property
    def xpassed(self) -> int:
        """Number of runs with outcome ``xpassed``."""
        return sum(1 for r in self.results if r.outcome == "xpassed")

    @property
    def pass_rate(self) -> float:
        """Percentage of passed+xpassed runs among passed+failed+xpassed runs."""
        countable = self.passed + self.failed + self.xpassed
        return (self.passed + self.xpassed) / countable * 100 if countable > 0 else 0.0

    @property
    def duration(self) -> float:
        """Sum of the durations of all recorded runs."""
        return sum(r.duration for r in self.results)

    @staticmethod
    def _table_cell(text: str) -> str:
        """Make ``text`` safe for one markdown table cell.

        Line breaks become spaces and pipes are backslash-escaped, so free
        text such as a parametrize id or a skip reason cannot break the row.
        """
        return " ".join(text.splitlines()).replace("|", "\\|")

    @staticmethod
    def _format_score(raw_score: str) -> str:
        """Render a judge score as a ten-dot bar followed by the raw value.

        The bar assumes a 0-1 scale and is clamped to ten dots for values
        outside it. A score that cannot be parsed as a number is returned
        as-is, without a bar, so one odd judge output cannot abort the
        whole report.
        """
        try:
            filled = int(float(raw_score) * 10)
        except (TypeError, ValueError, OverflowError):
            return f"{raw_score}"
        filled = max(0, min(10, filled))
        return f"{'●' * filled}{'○' * (10 - filled)} ({raw_score})"

    def generate_markdown(self) -> str:
        """Generate a pretty markdown report with pass rates from multiple runs.

        Returns:
            The full report as one string: header, summary table, overall
            pass-rate bar, then one section per test class (a detailed
            listing for judge classes that have judge notes, a table
            otherwise).
        """
        lines: List[str] = []
        aggregated_results = self.get_aggregated_results()

        status_emojis = {
            "passed": "✅",
            "failed": "❌",
            "skipped": "⏭️",
            "xfailed": "🔸",
            "partial": "⚠️",
        }

        # Calculate overall stats from aggregated results
        total_tests = len(aggregated_results)
        fully_passed = sum(1 for r in aggregated_results if r.overall_outcome == "passed")
        fully_failed = sum(1 for r in aggregated_results if r.overall_outcome == "failed")
        partial = sum(1 for r in aggregated_results if r.overall_outcome == "partial")
        skipped = sum(1 for r in aggregated_results if r.overall_outcome == "skipped")
        xfailed = sum(1 for r in aggregated_results if r.overall_outcome == "xfailed")

        # Header
        lines.append("# 🧪 Jarvis Evaluation Report")
        lines.append("")
        lines.append(f"**Generated:** {self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else 'N/A'}")
        lines.append(f"**Judge Model:** `{self.judge_model}`")
        lines.append(f"**Duration:** {self.duration:.2f}s")
        lines.append(f"**Runs per test:** {self.total_runs // total_tests if total_tests > 0 else 0}")
        lines.append("")

        # Summary stats
        lines.append("## 📊 Summary")
        lines.append("")
        lines.append("| Metric | Count |")
        lines.append("|--------|-------|")
        lines.append(f"| ✅ Fully Passed (100%) | {fully_passed} |")
        lines.append(f"| ⚠️ Partial Pass | {partial} |")
        lines.append(f"| ❌ Fully Failed (0%) | {fully_failed} |")
        lines.append(f"| ⏭️ Skipped | {skipped} |")
        lines.append(f"| 🔸 Expected Fail | {xfailed} |")
        lines.append(f"| **Unique Tests** | **{total_tests}** |")
        lines.append(f"| **Total Runs** | **{self.total_runs}** |")
        lines.append("")

        # Pass rate bar (based on individual runs)
        pass_rate = self.pass_rate
        bar_filled = int(pass_rate / 5)  # 20 chars max
        bar_empty = 20 - bar_filled
        bar = "█" * bar_filled + "░" * bar_empty
        emoji = "🟢" if pass_rate >= 80 else "🟡" if pass_rate >= 50 else "🔴"
        # The fraction counts the same runs as the percentage (xpassed runs
        # included), so the two figures always agree.
        passing_runs = self.passed + self.xpassed
        countable_runs = passing_runs + self.failed
        lines.append(f"**Overall Pass Rate:** {emoji} `{bar}` **{pass_rate:.1f}%** ({passing_runs}/{countable_runs} runs)")
        lines.append("")

        # Group aggregated results by class
        by_class: Dict[str, List[AggregatedTestResult]] = {}
        for result in aggregated_results:
            by_class.setdefault(result.class_name, []).append(result)

        # Detailed results
        lines.append("---")
        lines.append("")
        lines.append("## 📋 Detailed Results")
        lines.append("")

        for class_name, class_results in by_class.items():
            class_fully_passed = sum(1 for r in class_results if r.overall_outcome == "passed")
            class_total = sum(1 for r in class_results if r.overall_outcome != "skipped")
            if class_total == 0:
                # Nothing in this class ran; a red cross would wrongly
                # suggest a failure.
                class_emoji = "⏭️"
            elif class_fully_passed == class_total:
                class_emoji = "✅"
            elif class_fully_passed > 0:
                class_emoji = "⚠️"
            else:
                class_emoji = "❌"

            # Class header with description
            lines.append(f"### {class_emoji} {class_name}")
            if class_name in CLASS_DESCRIPTIONS:
                lines.append(f"> {CLASS_DESCRIPTIONS[class_name]}")
            lines.append("")

            # The detailed layout is used for any class whose name contains
            # "Judge" and that has judge notes on at least one test.
            is_judge_class = "Judge" in class_name
            has_judge_notes = is_judge_class and any(r.judge_notes for r in class_results)

            if has_judge_notes:
                # Detailed format for judge tests
                for result in class_results:
                    status_emoji = status_emojis.get(result.overall_outcome, "❓")

                    lines.append(f"#### {status_emoji} {result.description}")
                    lines.append("")
                    lines.append(f"**Pass Rate:** {result.pass_rate_str}")

                    notes = result.judge_notes
                    if notes:
                        if "response" in notes:
                            lines.append(f"**Input:** `{notes['response']}`")
                        if "score" in notes:
                            lines.append(f"**Score:** {self._format_score(notes['score'])}")
                        if "reasoning" in notes:
                            lines.append(f"**Judge notes:** {notes['reasoning']}")
                        lines.append("")

                    lines.append(f"*Avg Duration: {result.avg_duration:.2f}s*")
                    lines.append("")
            else:
                # Table format for non-judge tests with pass rates
                lines.append("| Test Case | Pass Rate | Status | Avg Duration |")
                lines.append("|-----------|-----------|--------|--------------|")

                for result in class_results:
                    status_emoji = status_emojis.get(result.overall_outcome, "❓")

                    status_text = result.overall_outcome.upper()
                    if result.reason:
                        # Keep the status column narrow: long reasons are cut
                        # to 30 characters.
                        reason_short = result.reason[:30] + "..." if len(result.reason) > 30 else result.reason
                        status_text += f" ({reason_short})"

                    lines.append(
                        f"| {self._table_cell(result.description)} | {result.pass_rate_str} "
                        f"| {status_emoji} {self._table_cell(status_text)} | {result.avg_duration:.2f}s |"
                    )

                lines.append("")

        # Footer
        lines.append("---")
        lines.append("")
        lines.append("*Report generated by Jarvis eval suite*")

        return "\n".join(lines)


# Global report instance. Stays None unless pytest_configure sees
# EVAL_GENERATE_REPORT=1; the reporting hooks do nothing while it is None.
_eval_report: Optional[EvalReport] = None


def pytest_configure(config):
    """Create the session-wide eval report when report generation is enabled.

    Reporting is opt-in: the report is only created when the environment
    variable ``EVAL_GENERATE_REPORT`` is exactly ``"1"``. Otherwise the
    global stays ``None``.

    Args:
        config: The pytest config object (unused).
    """
    global _eval_report
    report_requested = os.environ.get("EVAL_GENERATE_REPORT") == "1"
    if not report_requested:
        return
    _eval_report = EvalReport(start_time=datetime.now(), judge_model=JUDGE_MODEL)


def pytest_runtest_logreport(report):
    """Capture each test result into the session report.

    Does nothing when report generation is disabled. A report is recorded
    when it is one of:

    * the ``call`` phase (normal pass / fail / xfail / in-test skip);
    * a ``setup`` phase with outcome ``skipped`` — this is how skip markers
      (``skipif``) and non-run xfails are reported, since such tests never
      reach a ``call`` phase;
    * a failed ``setup`` or ``teardown`` phase (fixture errors).

    Args:
        report: The pytest test report for one phase of one test.
    """
    if _eval_report is None:
        return

    is_call = report.when == "call"
    is_setup_skip = report.when == "setup" and report.outcome == "skipped"
    is_phase_error = report.when in ("setup", "teardown") and report.outcome == "failed"
    if not (is_call or is_setup_skip or is_phase_error):
        return

    # Parse the node ID to extract class and test name. The "[...]" part is
    # split off first so a "::" inside a parametrize id cannot be mistaken
    # for a node-id separator.
    node_id = report.nodeid
    path_part, bracket, param_part = node_id.partition("[")
    parts = path_part.split("::")
    # "file::Class::test" has a class segment; "file::test" (module-level
    # function) does not, and must not use the test name as its class.
    class_name = parts[1] if len(parts) > 2 else "Unknown"
    full_test_name = parts[-1] + bracket + param_part

    # Extract parametrize case ID (which is the description for parametrized tests)
    case_id = _parse_parametrize_id(full_test_name)
    test_name = full_test_name.split("[")[0]

    # Get description: for parametrized tests, it's the case_id; otherwise from lookup
    description = _get_test_description(test_name, case_id)

    # Determine outcome; reports carrying "wasxfail" are xfail-marked tests.
    outcome = report.outcome
    if hasattr(report, "wasxfail"):
        outcome = "xpassed" if report.passed else "xfailed"

    # Get skip reason if applicable (third element of the longrepr tuple).
    reason = None
    if outcome == "skipped" and hasattr(report, "longrepr"):
        if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3:
            reason = str(report.longrepr[2])

    # Capture stdout and parse judge notes
    stdout = None
    judge_notes = None
    if hasattr(report, "capstdout") and report.capstdout:
        stdout = report.capstdout
        judge_notes = _extract_judge_notes(stdout)

    # Also check sections for captured stdout
    if not stdout:
        for section_name, section_content in report.sections:
            if "stdout" in section_name.lower():
                stdout = section_content
                judge_notes = _extract_judge_notes(stdout)
                break

    _eval_report.add_result(TestResult(
        name=node_id,
        outcome=outcome,
        duration=report.duration,
        class_name=class_name,
        test_name=test_name,
        case_id=case_id,
        description=description,
        reason=reason,
        stdout=stdout,
        judge_notes=judge_notes,
    ))


def pytest_sessionfinish(session, exitstatus):
    """Generate the markdown report at session end.

    Does nothing when report generation is disabled. The report is written
    to the path in the ``EVAL_REPORT_PATH`` environment variable when set,
    otherwise to ``EVALS.md`` under ROOT.

    Args:
        session: The pytest session (unused).
        exitstatus: The session exit status (unused).
    """
    if _eval_report is None:
        return

    _eval_report.end_time = datetime.now()

    # Support custom report path via environment variable
    report_path_str = os.environ.get("EVAL_REPORT_PATH")
    if report_path_str:
        report_path = Path(report_path_str)
    else:
        report_path = ROOT / "EVALS.md"

    markdown = _eval_report.generate_markdown()
    # A custom path may point into a directory that does not exist yet;
    # create it so a whole eval run is not lost to a missing folder.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # Write the markdown report (ensure UTF-8 encoding for emojis/unicode)
    report_path.write_text(markdown, encoding="utf-8")
    try:
        print(f"\n📄 Eval report saved to: {report_path}")
    except UnicodeEncodeError:
        # Consoles that cannot encode the emoji get a plain message.
        print(f"\nEval report saved to: {report_path}")


# =============================================================================
# Fixtures
# =============================================================================

@pytest.fixture
def mock_config():
    """Return a fresh MockConfig instance for an eval test."""
    config = MockConfig()
    return config


@pytest.fixture
def eval_db():
    """Yield a Database opened on ``:memory:`` and close it afterwards."""
    # Imported here rather than at module top, so the package is only needed
    # by tests that request this fixture.
    from jarvis.memory.db import Database

    database = Database(":memory:", sqlite_vss_path=None)
    yield database
    database.close()


@pytest.fixture
def eval_dialogue_memory():
    """Return a DialogueMemory built with the eval suite's fixed settings."""
    from jarvis.memory.conversation import DialogueMemory

    memory = DialogueMemory(inactivity_timeout=300, max_interactions=20)
    return memory


@pytest.fixture
def graph_store(tmp_path):
    """Yield a GraphMemoryStore on a temp database file, closed on teardown.

    The store is always closed once the test is done so that cleanup of
    `tmp_path` can unlink the file on Windows. POSIX would tolerate a
    still-open handle, Windows would not.
    """
    from jarvis.memory.graph import GraphMemoryStore

    db_file = tmp_path / "test.db"
    store = GraphMemoryStore(str(db_file))
    try:
        yield store
    finally:
        store.close()