Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/conftest.py
+++ b/evals/conftest.py
@@ -0,0 +1,716 @@
+"""
+Shared fixtures and configuration for evals.
+
+Evals test end-to-end quality of the reply engine with real or mock LLM responses.
+"""
+
+import sys
+import os
+import re
+from pathlib import Path
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+import pytest
+
+# Find the repository root: the closest ancestor directory that holds src/jarvis.
+_this_file = Path(__file__).resolve()
+ROOT = None
+for parent in _this_file.parents:
+    if not (parent / "src" / "jarvis").exists():
+        continue
+    ROOT = parent
+    break
+if ROOT is None:
+    # No ancestor matched; fall back to the directory above this file's folder.
+    ROOT = _this_file.parent.parent
+
+SRC = ROOT / "src"
+EVALS = ROOT / "evals"
+
+# Each missing entry is pushed to the front in turn, so when none were present
+# the resulting search order is EVALS, SRC, ROOT.
+for _import_dir in (str(ROOT), str(SRC), str(EVALS)):
+    if _import_dir not in sys.path:
+        sys.path.insert(0, _import_dir)
+
+from helpers import MockConfig, JUDGE_MODEL, is_judge_llm_available
+
+
+# =============================================================================
+# Shared Markers
+# =============================================================================
+
+# Evaluated once, when this module is imported, so the marker below uses a
+# single availability result for the whole session.
+_JUDGE_LLM_AVAILABLE = is_judge_llm_available()
+# Skip marker for tests that need the judge LLM; skips with the reason
+# "Judge LLM not available" when the check above returned a falsy value.
+requires_judge_llm = pytest.mark.skipif(
+    not _JUDGE_LLM_AVAILABLE,
+    reason="Judge LLM not available"
+)
+
+
+# =============================================================================
+# Test Case Descriptions
+# =============================================================================
+
+# Human-readable descriptions for test classes, keyed by class name.
+# The markdown report prints the matching entry as a blockquote under each
+# class heading; classes without an entry get no blockquote.
+CLASS_DESCRIPTIONS = {
+    "TestResponseQuality": "LLM-as-judge evaluations for response quality",
+    "TestContextUtilization": "Tests that agent uses location/time/memory context",
+    "TestToolUsage": "Validates tool selection and argument quality",
+    "TestMultiStepReasoning": "Complex scenarios requiring tool chaining and synthesis",
+    "TestMemoryEnrichment": "Tests automatic memory enrichment keyword extraction",
+    "TestLiveEndToEnd": "End-to-end tests against real LLM inference",
+    "TestNutritionExtraction": "Tests LLM nutrition extraction accuracy for meal logging",
+    "TestNutritionToolIntegration": "Tests full meal logging tool with macro extraction",
+    "TestNutritionModelComparison": "Baseline tests for comparing nutrition extraction across models",
+    "TestIntentJudgeAccuracy": "Intent judge accuracy for voice command classification",
+    "TestIntentJudgePromptQuality": "Intent judge prompt construction quality",
+    "TestIntentJudgeFallback": "Intent judge fallback behaviour when unavailable",
+    "TestIntentJudgeMultiSegment": "Intent judge with multi-segment buffers and multi-person conversations",
+    "TestWakeWordValidationSafetyNet": "Integration: listener rejects judge hallucinations when no wake word present",
+    "TestEchoReasoningDistrust": "Integration: listener overrides judge echo claims when EchoDetector cleared",
+    "TestHotWindowHeuristicAccuracy": "Integration: could_be_hot_window heuristic passes correct mode to judge",
+    "TestProcessedSegmentFilteringIntegration": "Integration: processed segments excluded from judge prompt",
+    "TestHotWindowUsesRawText": "Integration: hot window preserves full user text, wake word uses judge extraction",
+    "TestMultiSegmentBufferIntegration": "Integration: multi-segment buffer with TTS echoes handled correctly",
+    "TestStopCommandBypassesJudge": "Integration: stop commands during TTS bypass judge entirely",
+    "TestKnowledgeExtractionQuality": "Tests that novel knowledge is correctly extracted from summaries",
+    "TestKnowledgeExtractionRejection": "Tests that noise, stale data, and common knowledge are rejected",
+    "TestKnowledgeExtractionReframing": "Tests that interaction descriptions are reframed as knowledge",
+    "TestKnowledgeExtractionJudge": "LLM-as-judge evaluations of extraction quality",
+    "TestTopicSwitching": "Tests correct tool selection when conversation topic changes",
+    "TestFollowUpContext": "Tests context retention for follow-up questions",
+    "TestMultiTurnExtended": "Extended multi-turn scenarios with longer conversations",
+    "TestGreetingNoToolsLive": "Tests that greetings don't trigger tool calls",
+    "TestHelpfulness": "Tests that agent uses tools proactively instead of deflecting",
+    "TestDiaryRecencyOrder": "Tests that diary search returns newer entries before older ones",
+    "TestGraphRecencySuperseding": "Tests that graph handles contradicting facts with date context",
+    "TestRecencyJudge": "LLM judge evaluates whether newer information is preferred over older",
+    "TestMalformedResponseAfterTools": "Tests that malformed LLM output after tool results is not surfaced",
+    "TestCelebrityIdentityThenFollowUp": "Two-turn celebrity flow: identity query then pronoun follow-up",
+    "TestSearchFailureWikipediaRescue": "Wikipedia-rescue payload is consumed correctly, not confabulated over",
+    "TestMultiStepEntityQuery": "Single query requiring two sequential webSearch calls (director + filmography)",
+}
+
+# Descriptions for non-parametrized tests, keyed by the bare test function
+# name (no bracket suffix). Parametrized tests use their case id instead, and
+# tests missing from this table fall back to a humanised form of their name.
+TEST_DESCRIPTIONS = {
+    "test_weather_response_quality": "Judge evaluates weather response quality",
+    "test_location_context_in_search": "Location context flows to search queries",
+    "test_simple_search_flow": "Agent calls webSearch for info queries",
+    "test_tool_chaining_search_then_fetch": "Agent chains search → fetch for details",
+    "test_nutrition_advice_uses_memory_and_data": "Agent uses memory + nutrition data",
+    "test_enrichment_extracts_correct_keywords": "Enrichment extracts personalization keywords",
+    "test_enrichment_provides_context_to_llm": "Enrichment results appear in system message",
+    "test_llm_uses_enrichment_for_personalised_queries": "LLM uses enrichment-surfaced interests for personalised search",
+    "test_weather_query_live": "Weather query is answered with current conditions",
+    "test_personalized_query_recalls_memory_live": "Assistant checks memory before asking about interests",
+    "test_interest_flavoured_query_live": "Interest-flavoured phrasings surface seeded interests in the reply",
+    # Nutrition extraction tests
+    "test_meal_extraction_accuracy": "Extracts accurate macros for common meals",
+    "test_extraction_returns_valid_json_structure": "Returns valid JSON with all required fields",
+    "test_extraction_handles_ambiguous_portions": "Handles ambiguous portion descriptions",
+    "test_extraction_rejects_non_food": "Returns NONE for non-food inputs",
+    "test_log_meal_tool_extracts_macros": "LogMealTool stores meals with macros",
+    "test_simple_meal_extraction": "Simple meal baseline (2 boiled eggs)",
+    "test_extraction_with_quantities": "Extraction with explicit quantities",
+    # Multi-turn context tests
+    "test_weather_then_store_hours": "Topic switch: weather → store hours uses webSearch",
+    "test_weather_then_restaurant_search": "Topic switch: weather → restaurant uses webSearch",
+    "test_search_then_weather": "Topic switch: search → weather uses getWeather",
+    "test_follow_up_references_previous_context": "Follow-up references previous turn context",
+    "test_three_turn_topic_changes": "3-turn conversation with topic changes",
+    "test_rapid_topic_switching": "Rapid back-and-forth topic switching",
+    # Greeting no-tools live tests
+    "test_greeting_no_tools_live": "Greetings do not trigger tool calls",
+    "test_user_instructions_no_tools_live": "User instructions do not trigger tool calls",
+    "test_weather_still_triggers_tools_live": "Weather query still triggers tools after a greeting",
+    # Helpfulness / anti-deflection tests
+    "test_no_deflection_for_weather_forecast_live": "No deflection on weather forecast questions",
+    "test_no_deflection_for_answerable_queries_live": "No deflection on answerable questions",
+    "test_tool_retry_after_failure_live": "Assistant retries a tool after the first attempt fails",
+    "test_graph_knowledge_surfaced_in_reply_live": "Graph-enriched facts surface in the reply, no denial",
+    "test_does_not_deny_long_term_memory_live": "Assistant does not deny having long-term memory",
+    # Multi-step entity / complex flow tests
+    "test_chained_research_possessor_director": "Chained research: who directed Possessor and what else have they made",
+    "test_parallel_comparison_paris_vs_london": "Parallel weather lookup: compare Paris and London",
+    "test_director_then_filmography_requires_two_searches": "Director-then-filmography needs two searches",
+    "test_two_turn_celebrity_flow": "Two-turn celebrity flow: identity then pronoun follow-up",
+    "test_single_weather_call_terminates": "Single weather query ends after one tool call",
+    "test_max_turn_triggers_digest": "Max-turn cap delivers a digest reply, never silence",
+    # Knowledge extraction
+    "test_judge_mixed_summary_filters_noise": "Mixed summary: keep novel facts, drop stale weather/recommendations",
+    "test_judge_empty_conversation_returns_empty": "Trivial conversations produce no extracted facts",
+    "test_open_ended_prompt_grounds_in_graph_context_live": "Open-ended prompt grounds in stored knowledge",
+}
+
+
+def _parse_parametrize_id(node_id: str) -> Optional[str]:
+    """Return the parametrize case id held in the trailing ``[...]`` of *node_id*.
+
+    Yields None when there is no trailing bracket, or when the bracket holds
+    nothing but a pytest-repeat counter such as ``1-3`` (run N of M). A repeat
+    counter appended to a real case id (``greeting-1-3``) is trimmed off, so
+    every repetition of one case reports the same id.
+    """
+    bracketed = re.search(r'\[(.+)\]$', node_id)
+    if bracketed is None:
+        return None
+
+    inner = bracketed.group(1)
+
+    # A bare "N-M" is only the repeat counter, not a case id.
+    if re.match(r'^\d+-\d+$', inner):
+        return None
+
+    # Otherwise drop a trailing "-N-M" repeat counter, if one is present.
+    return re.sub(r'-\d+-\d+$', '', inner)
+
+
+def _extract_judge_notes(stdout: Optional[str]) -> Optional[Dict[str, str]]:
+    """Parse judge evaluation output from captured stdout.
+
+    Looks for ``Score:``, ``Reasoning:`` and ``Response:`` labels and returns
+    whichever were found under the keys ``"score"``, ``"reasoning"`` and
+    ``"response"``.
+
+    Args:
+        stdout: Captured test output, or None/empty when nothing was captured.
+
+    Returns:
+        A dict of the fields found, or None when *stdout* is empty or contains
+        none of the labels.
+    """
+    if not stdout:
+        return None
+
+    notes = {}
+
+    # Capture a well-formed decimal only. A permissive character class would
+    # also accept text such as "0.9." or "...", which float() rejects later.
+    score_match = re.search(r'Score:\s*(\d+(?:\.\d+)?|\.\d+)', stdout)
+    if score_match:
+        notes["score"] = score_match.group(1)
+
+    # Reasoning runs to the end of its line.
+    reasoning_match = re.search(r'Reasoning:\s*(.+?)(?:\n|$)', stdout)
+    if reasoning_match:
+        notes["reasoning"] = reasoning_match.group(1).strip()
+
+    # The response ends at a truncation ellipsis or at the end of its line, so
+    # a response with no ellipsis is still found when more output follows it.
+    response_match = re.search(r'Response:\s*(.+?)(?:\.\.\.|\n|$)', stdout)
+    if response_match:
+        notes["response"] = response_match.group(1).strip()
+
+    return notes if notes else None
+
+
+def _humanise_test_name(test_name: str) -> str:
+    """Convert ``test_some_thing_does_X`` to ``Some thing does X``.
+
+    Fallback label for tests that have neither a TEST_DESCRIPTIONS entry nor a
+    parametrize id, so the report shows readable text rather than a Python
+    identifier. If nothing is left once the ``test_`` prefix and underscores
+    are removed, the original name is returned untouched.
+    """
+    words = test_name[5:] if test_name.startswith("test_") else test_name
+    words = words.replace("_", " ").strip()
+    # Upper-case the first character only; the rest keeps its original casing.
+    return words[0].upper() + words[1:] if words else test_name
+
+
+def _strip_redundant_prefix(label: str) -> str:
+    """Drop noisy prefixes and suffixes from human-readable case labels.
+
+    Every eval is live by design (the suite drives a real model), so a leading
+    ``Live:`` / ``Live `` is uninformative. The same goes for trailing model
+    suffixes like ``-gpt-oss:20b`` that pytest cross-products into parametrize
+    ids — the Model column already shows that.
+
+    Args:
+        label: Raw case label or description.
+
+    Returns:
+        The trimmed label. After a prefix is removed the first remaining
+        character is upper-cased. A label consisting of nothing but the
+        prefix keeps the prefix, so the result is never emptied by it.
+    """
+    model_suffixes = ("-gpt-oss:20b", "-gemma4:e2b", "-gemma4:e4b")
+    # "live:" covers both "Live: x" and "Live:x"; the whitespace after the
+    # colon is removed by the lstrip() below.
+    live_prefixes = ("live:", "live ")
+
+    s = label.strip()
+    # Trailing "-<model>" suffix injected by pytest parametrize cross-product.
+    for suffix in model_suffixes:
+        if s.endswith(suffix):
+            s = s[: -len(suffix)].rstrip()
+            break
+    # Leading "Live:" / "Live " prefix is redundant — the suite is live.
+    lower = s.lower()
+    for prefix in live_prefixes:
+        if lower.startswith(prefix):
+            remainder = s[len(prefix):].lstrip()
+            if remainder:
+                s = remainder[0].upper() + remainder[1:]
+            break
+    return s
+
+
+def _get_test_description(test_name: str, case_id: Optional[str]) -> str:
+    """
+    Resolve the report label for one test case.
+
+    Order of preference: a non-empty parametrize ``case_id`` (set via
+    pytest.param id=), then the TEST_DESCRIPTIONS entry for ``test_name``, and
+    finally a humanised form of the test name. The first two are passed
+    through _strip_redundant_prefix; the humanised fallback is returned as is.
+    """
+    label = case_id or TEST_DESCRIPTIONS.get(test_name)
+    if label is None:
+        # Nothing curated: derive readable text from the identifier so the
+        # report does not expose raw Python names.
+        return _humanise_test_name(test_name)
+    return _strip_redundant_prefix(label)
+
+
+# =============================================================================
+# Markdown Report Generation
+# =============================================================================
+
+@dataclass
+class TestResult:
+    """Captured result from a single test run.
+
+    One instance is built per recorded pytest report by the logreport hook in
+    this file and stored on the session's EvalReport.
+    """
+    # Full pytest node id of the run.
+    name: str
+    outcome: str  # passed, failed, skipped, xfailed, xpassed
+    # Duration taken from the pytest report.
+    duration: float
+    # Class segment parsed from the node id.
+    class_name: str
+    # Test function name with any bracket suffix removed.
+    test_name: str
+    # Parametrize case id, or None when the test has none.
+    case_id: Optional[str] = None
+    # Human-readable label shown in the report.
+    description: str = ""
+    # Skip reason, when one was available on the report.
+    reason: Optional[str] = None
+    # Captured stdout of the run, if any.
+    stdout: Optional[str] = None
+    # Fields parsed from judge output in stdout (score / reasoning / response).
+    judge_notes: Optional[Dict[str, str]] = None
+
+
+@dataclass
+class AggregatedTestResult:
+    """All recorded runs of one test case, with derived pass/fail statistics."""
+    name: str
+    class_name: str
+    test_name: str
+    description: str
+    runs: List[TestResult] = field(default_factory=list)
+
+    def _tally(self, *outcomes: str) -> int:
+        """Count the runs whose outcome is one of *outcomes*."""
+        return len([run for run in self.runs if run.outcome in outcomes])
+
+    @property
+    def pass_count(self) -> int:
+        # Unexpected passes are counted as passes.
+        return self._tally("passed", "xpassed")
+
+    @property
+    def fail_count(self) -> int:
+        return self._tally("failed")
+
+    @property
+    def skip_count(self) -> int:
+        return self._tally("skipped")
+
+    @property
+    def xfail_count(self) -> int:
+        return self._tally("xfailed")
+
+    @property
+    def total_runs(self) -> int:
+        return len(self.runs)
+
+    @property
+    def pass_rate(self) -> float:
+        """Percentage of passes among runs that passed or failed (0.0 if none)."""
+        decided = self.pass_count + self.fail_count
+        if decided > 0:
+            return self.pass_count / decided * 100
+        return 0.0
+
+    @property
+    def total_duration(self) -> float:
+        return sum(run.duration for run in self.runs)
+
+    @property
+    def avg_duration(self) -> float:
+        if not self.runs:
+            return 0.0
+        return self.total_duration / len(self.runs)
+
+    @property
+    def overall_outcome(self) -> str:
+        """Single outcome for the case: uniform across all runs, else 'partial'."""
+        run_total = self.total_runs
+        # Checked in this order; the first outcome shared by every run wins.
+        uniform_checks = (
+            (self.skip_count, "skipped"),
+            (self.xfail_count, "xfailed"),
+            (self.pass_count, "passed"),
+            (self.fail_count, "failed"),
+        )
+        for count, label in uniform_checks:
+            if count == run_total:
+                return label
+        return "partial"
+
+    @property
+    def pass_rate_str(self) -> str:
+        """Format pass rate as 'X/Y (Z%)', or a placeholder when nothing ran."""
+        decided = self.pass_count + self.fail_count
+        if decided != 0:
+            return f"{self.pass_count}/{decided} ({self.pass_rate:.0f}%)"
+        if self.skip_count > 0:
+            return "SKIPPED"
+        if self.xfail_count > 0:
+            return f"{self.xfail_count}/{self.total_runs} XFAIL"
+        return "N/A"
+
+    @property
+    def judge_notes(self) -> Optional[Dict[str, str]]:
+        """Judge notes of the earliest run that has any, else None."""
+        return next((run.judge_notes for run in self.runs if run.judge_notes), None)
+
+    @property
+    def reason(self) -> Optional[str]:
+        """Reason of the earliest run that has one, else None."""
+        return next((run.reason for run in self.runs if run.reason), None)
+
+
+def _strip_repeat_suffix(node_id: str) -> str:
+    """
+    Remove a trailing pytest-repeat counter from a node id.
+
+    pytest-repeat tags repeated tests with ``[1-3]``, ``[2-3]``, ``[3-3]``.
+    Only a bracket holding exactly ``N-M`` is removed; parametrize ids such as
+    ``[greeting-en]`` or ``[weather-query]`` are left alone, which gives a
+    stable base identifier for aggregation.
+    """
+    counter = re.search(r'\[(\d+)-(\d+)\]$', node_id)
+    if counter is None:
+        return node_id
+    # Cut out just the matched bracket, keeping whatever surrounds it.
+    return node_id[:counter.start()] + node_id[counter.end():]
+
+
+def _get_aggregation_key(result: TestResult) -> str:
+    """Build the key under which repeated runs of one test case are grouped.
+
+    The key is ``class_name::test_name``, with ``::case_id`` appended when the
+    result has a non-empty case id.
+    """
+    # case_id is expected to arrive with repeat suffixes already removed by
+    # _parse_parametrize_id, so repetitions of one case share a key.
+    optional_case = [result.case_id] if result.case_id else []
+    return "::".join([result.class_name, result.test_name, *optional_case])
+
+
+@dataclass
+class EvalReport:
+    """Collects every recorded test run and renders the markdown eval report.
+
+    Attributes:
+        results: One ``TestResult`` per recorded run, in arrival order.
+        start_time: When collection started, if known.
+        end_time: When the session finished; printed as the "Generated"
+            timestamp (``N/A`` when unset).
+        judge_model: Judge model name printed in the report header.
+    """
+    results: List[TestResult] = field(default_factory=list)
+    start_time: Optional[datetime] = None
+    end_time: Optional[datetime] = None
+    judge_model: str = ""
+
+    # Plain class attribute (unannotated, so not a dataclass field): the emoji
+    # shown for each aggregated outcome. Unknown outcomes render as "❓".
+    _STATUS_EMOJI = {
+        "passed": "✅",
+        "failed": "❌",
+        "skipped": "⏭️",
+        "xfailed": "🔸",
+        "partial": "⚠️",
+    }
+
+    def add_result(self, result: TestResult):
+        """Record one test run."""
+        self.results.append(result)
+
+    def get_aggregated_results(self) -> List[AggregatedTestResult]:
+        """Aggregate results from multiple runs of the same test.
+
+        Runs are grouped by ``_get_aggregation_key``; groups keep the order in
+        which their first run was recorded.
+        """
+        aggregated: Dict[str, AggregatedTestResult] = {}
+
+        for result in self.results:
+            key = _get_aggregation_key(result)
+            if key not in aggregated:
+                # Description should already have repeat suffixes stripped
+                aggregated[key] = AggregatedTestResult(
+                    name=_strip_repeat_suffix(result.name),
+                    class_name=result.class_name,
+                    test_name=result.test_name,
+                    description=result.description,
+                )
+            aggregated[key].runs.append(result)
+
+        return list(aggregated.values())
+
+    @property
+    def total_unique_tests(self) -> int:
+        return len(self.get_aggregated_results())
+
+    @property
+    def total_runs(self) -> int:
+        return len(self.results)
+
+    def _count_outcome(self, outcome: str) -> int:
+        """Count individual runs with exactly this outcome."""
+        return sum(1 for r in self.results if r.outcome == outcome)
+
+    @property
+    def passed(self) -> int:
+        return self._count_outcome("passed")
+
+    @property
+    def failed(self) -> int:
+        return self._count_outcome("failed")
+
+    @property
+    def skipped(self) -> int:
+        return self._count_outcome("skipped")
+
+    @property
+    def xfailed(self) -> int:
+        return self._count_outcome("xfailed")
+
+    @property
+    def xpassed(self) -> int:
+        return self._count_outcome("xpassed")
+
+    @property
+    def pass_rate(self) -> float:
+        """Percentage of passed + xpassed runs among passed/failed/xpassed runs."""
+        countable = self.passed + self.failed + self.xpassed
+        return (self.passed + self.xpassed) / countable * 100 if countable > 0 else 0.0
+
+    @property
+    def duration(self) -> float:
+        return sum(r.duration for r in self.results)
+
+    @staticmethod
+    def _table_cell(text: str) -> str:
+        """Make *text* safe for one markdown table cell.
+
+        Line breaks would end the row and a bare pipe would start a new
+        column, so line breaks become spaces and pipes are backslash-escaped.
+        """
+        return text.replace("\r\n", " ").replace("\n", " ").replace("|", "\\|")
+
+    @staticmethod
+    def _score_line(raw_score: str) -> str:
+        """Render the judge score as a ten-dot bar followed by the raw value.
+
+        The bar is clamped to 0-10 dots so a score outside 0..1 cannot produce
+        an over-long bar. A score that cannot be converted to a number is
+        printed without a bar instead of aborting report generation.
+        """
+        try:
+            filled = int(float(raw_score) * 10)
+        except (ValueError, OverflowError):
+            return f"**Score:** {raw_score}"
+        filled = max(0, min(10, filled))
+        score_bar = "●" * filled + "○" * (10 - filled)
+        return f"**Score:** {score_bar} ({raw_score})"
+
+    def generate_markdown(self) -> str:
+        """Generate a pretty markdown report with pass rates from multiple runs.
+
+        Returns:
+            The full report as one string: header, summary table, overall
+            pass-rate bar, then one section per test class. Classes whose name
+            contains "Judge" and that have judge notes get a detailed
+            per-test layout; all others get a table.
+        """
+        lines = []
+        aggregated_results = self.get_aggregated_results()
+
+        # Calculate overall stats from aggregated results
+        total_tests = len(aggregated_results)
+        outcome_totals: Dict[str, int] = {}
+        for aggregated in aggregated_results:
+            outcome = aggregated.overall_outcome
+            outcome_totals[outcome] = outcome_totals.get(outcome, 0) + 1
+        fully_passed = outcome_totals.get("passed", 0)
+        fully_failed = outcome_totals.get("failed", 0)
+        partial = outcome_totals.get("partial", 0)
+        skipped = outcome_totals.get("skipped", 0)
+        xfailed = outcome_totals.get("xfailed", 0)
+
+        # Header
+        lines.append("# 🧪 Jarvis Evaluation Report")
+        lines.append("")
+        lines.append(f"**Generated:** {self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else 'N/A'}")
+        lines.append(f"**Judge Model:** `{self.judge_model}`")
+        lines.append(f"**Duration:** {self.duration:.2f}s")
+        lines.append(f"**Runs per test:** {self.total_runs // total_tests if total_tests > 0 else 0}")
+        lines.append("")
+
+        # Summary stats
+        lines.append("## 📊 Summary")
+        lines.append("")
+        lines.append("| Metric | Count |")
+        lines.append("|--------|-------|")
+        lines.append(f"| ✅ Fully Passed (100%) | {fully_passed} |")
+        lines.append(f"| ⚠️ Partial Pass | {partial} |")
+        lines.append(f"| ❌ Fully Failed (0%) | {fully_failed} |")
+        lines.append(f"| ⏭️ Skipped | {skipped} |")
+        lines.append(f"| 🔸 Expected Fail | {xfailed} |")
+        lines.append(f"| **Unique Tests** | **{total_tests}** |")
+        lines.append(f"| **Total Runs** | **{self.total_runs}** |")
+        lines.append("")
+
+        # Pass rate bar (based on individual runs)
+        pass_rate = self.pass_rate
+        bar_filled = int(pass_rate / 5)  # 20 chars max
+        bar_empty = 20 - bar_filled
+        bar = "█" * bar_filled + "░" * bar_empty
+        emoji = "🟢" if pass_rate >= 80 else "🟡" if pass_rate >= 50 else "🔴"
+        # Use the same numerator/denominator as pass_rate (xpassed included)
+        # so the fraction always agrees with the percentage next to it.
+        runs_passed = self.passed + self.xpassed
+        runs_counted = self.passed + self.failed + self.xpassed
+        lines.append(f"**Overall Pass Rate:** {emoji} `{bar}` **{pass_rate:.1f}%** ({runs_passed}/{runs_counted} runs)")
+        lines.append("")
+
+        # Group aggregated results by class
+        by_class: Dict[str, List[AggregatedTestResult]] = {}
+        for result in aggregated_results:
+            by_class.setdefault(result.class_name, []).append(result)
+
+        # Detailed results
+        lines.append("---")
+        lines.append("")
+        lines.append("## 📋 Detailed Results")
+        lines.append("")
+
+        for class_name, class_results in by_class.items():
+            class_fully_passed = sum(1 for r in class_results if r.overall_outcome == "passed")
+            class_total = sum(1 for r in class_results if r.overall_outcome != "skipped")
+            if class_total == 0:
+                # Nothing in this class ran; mark it skipped rather than failed.
+                class_emoji = "⏭️"
+            elif class_fully_passed == class_total:
+                class_emoji = "✅"
+            elif class_fully_passed > 0:
+                class_emoji = "⚠️"
+            else:
+                class_emoji = "❌"
+
+            # Class header with description
+            lines.append(f"### {class_emoji} {class_name}")
+            if class_name in CLASS_DESCRIPTIONS:
+                lines.append(f"> {CLASS_DESCRIPTIONS[class_name]}")
+            lines.append("")
+
+            # Detailed layout only for judge classes that captured judge notes
+            is_judge_class = "Judge" in class_name
+            has_judge_notes = is_judge_class and any(r.judge_notes for r in class_results)
+
+            if has_judge_notes:
+                # Detailed format for judge tests
+                for result in class_results:
+                    status_emoji = self._STATUS_EMOJI.get(result.overall_outcome, "❓")
+
+                    lines.append(f"#### {status_emoji} {result.description}")
+                    lines.append("")
+                    lines.append(f"**Pass Rate:** {result.pass_rate_str}")
+
+                    notes = result.judge_notes
+                    if notes:
+                        if "response" in notes:
+                            lines.append(f"**Input:** `{notes['response']}`")
+                        if "score" in notes:
+                            lines.append(self._score_line(notes["score"]))
+                        if "reasoning" in notes:
+                            lines.append(f"**Judge notes:** {notes['reasoning']}")
+                        lines.append("")
+
+                    lines.append(f"*Avg Duration: {result.avg_duration:.2f}s*")
+                    lines.append("")
+            else:
+                # Table format for non-judge tests with pass rates
+                lines.append("| Test Case | Pass Rate | Status | Avg Duration |")
+                lines.append("|-----------|-----------|--------|--------------|")
+
+                for result in class_results:
+                    status_emoji = self._STATUS_EMOJI.get(result.overall_outcome, "❓")
+
+                    status_text = result.overall_outcome.upper()
+                    reason = result.reason
+                    if reason:
+                        # Keep the status column narrow: show at most 30 characters.
+                        reason_short = reason[:30] + "..." if len(reason) > 30 else reason
+                        status_text += f" ({reason_short})"
+
+                    description_cell = self._table_cell(result.description)
+                    status_cell = self._table_cell(status_text)
+                    lines.append(f"| {description_cell} | {result.pass_rate_str} | {status_emoji} {status_cell} | {result.avg_duration:.2f}s |")
+
+                lines.append("")
+
+        # Footer
+        lines.append("---")
+        lines.append("")
+        lines.append("*Report generated by Jarvis eval suite*")
+
+        return "\n".join(lines)
+
+
+# Global report instance. Stays None unless pytest_configure sees
+# EVAL_GENERATE_REPORT=1; the reporting hooks return early while it is None.
+_eval_report: Optional[EvalReport] = None
+
+
+def pytest_configure(config):
+    """Create the session's eval report when report generation is switched on.
+
+    Reporting is opt-in: the global report is only created when the
+    EVAL_GENERATE_REPORT environment variable equals "1". Otherwise the global
+    is left untouched.
+    """
+    global _eval_report
+    if os.environ.get("EVAL_GENERATE_REPORT") != "1":
+        return
+    session_start = datetime.now()
+    _eval_report = EvalReport(start_time=session_start, judge_model=JUDGE_MODEL)
+
+
+def pytest_runtest_logreport(report):
+    """Capture each test result into the global eval report.
+
+    Does nothing when report generation is disabled. Otherwise records one
+    ``TestResult`` for:
+
+    * every call-phase report;
+    * setup/teardown reports that failed (fixture errors);
+    * setup-phase reports that were skipped. Skip markers such as
+      ``requires_judge_llm`` are resolved during setup and never produce a
+      call-phase report, so they would otherwise be missing from the report.
+
+    Args:
+        report: The pytest report object for one phase of one test.
+    """
+    if _eval_report is None:
+        return
+
+    if report.when != "call":
+        phase_error = report.when in ("setup", "teardown") and report.outcome == "failed"
+        setup_skip = report.when == "setup" and report.outcome == "skipped"
+        if not (phase_error or setup_skip):
+            return
+
+    # Parse the node ID to extract class and test name. A node id has the form
+    # "path::Class::test" for class-based tests and "path::test" for
+    # module-level tests; only the former has a class segment.
+    node_id = report.nodeid
+    parts = node_id.split("::")
+    class_name = parts[1] if len(parts) > 2 else "Unknown"
+    full_test_name = parts[-1]
+
+    # Extract parametrize case ID (which is the description for parametrized tests)
+    case_id = _parse_parametrize_id(full_test_name)
+    test_name = full_test_name.split("[")[0]
+
+    # Get description: for parametrized tests, it's the case_id; otherwise from lookup
+    description = _get_test_description(test_name, case_id)
+
+    # Determine outcome; reports carrying "wasxfail" come from xfail-marked tests
+    outcome = report.outcome
+    if hasattr(report, "wasxfail"):
+        outcome = "xpassed" if report.passed else "xfailed"
+
+    # Get skip reason if applicable (third element of the longrepr tuple)
+    reason = None
+    if outcome == "skipped" and hasattr(report, "longrepr"):
+        if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3:
+            reason = str(report.longrepr[2])
+
+    # Capture stdout: prefer capstdout, else the first section whose name
+    # mentions stdout.
+    stdout = getattr(report, "capstdout", None) or None
+    if not stdout:
+        for section_name, section_content in report.sections:
+            if "stdout" in section_name.lower():
+                stdout = section_content
+                break
+    # _extract_judge_notes returns None for missing or empty output.
+    judge_notes = _extract_judge_notes(stdout)
+
+    _eval_report.add_result(TestResult(
+        name=node_id,
+        outcome=outcome,
+        duration=report.duration,
+        class_name=class_name,
+        test_name=test_name,
+        case_id=case_id,
+        description=description,
+        reason=reason,
+        stdout=stdout,
+        judge_notes=judge_notes,
+    ))
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """Generate the markdown report at session end.
+
+    Does nothing when report generation is disabled. The report is written to
+    the path in the EVAL_REPORT_PATH environment variable when set, otherwise
+    to ``EVALS.md`` in the repository root.
+    """
+    if _eval_report is None:
+        return
+
+    _eval_report.end_time = datetime.now()
+
+    # Support custom report path via environment variable
+    report_path_str = os.environ.get("EVAL_REPORT_PATH")
+    report_path = Path(report_path_str) if report_path_str else ROOT / "EVALS.md"
+
+    markdown = _eval_report.generate_markdown()
+    # A custom path may point into a directory that does not exist yet; create
+    # it so a finished run does not lose its report to a missing folder.
+    report_path.parent.mkdir(parents=True, exist_ok=True)
+    # Write the markdown report (ensure UTF-8 encoding for emojis/unicode)
+    report_path.write_text(markdown, encoding="utf-8")
+    try:
+        print(f"\n📄 Eval report saved to: {report_path}")
+    except UnicodeEncodeError:
+        # Console encoding cannot render the emoji; print a plain variant.
+        print(f"\nEval report saved to: {report_path}")
+
+
+# =============================================================================
+# Fixtures
+# =============================================================================
+
+@pytest.fixture
+def mock_config():
+    """Hand each eval test its own freshly constructed MockConfig."""
+    config = MockConfig()
+    return config
+
+
+@pytest.fixture
+def eval_db():
+    """Provide an in-memory database for eval tests, closed on teardown.
+
+    The database is opened on ``:memory:`` with ``sqlite_vss_path=None``. The
+    close sits in a ``finally`` block, matching the graph_store fixture, so
+    the connection is released however the fixture generator is finished.
+    """
+    from jarvis.memory.db import Database
+    db = Database(":memory:", sqlite_vss_path=None)
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+@pytest.fixture
+def eval_dialogue_memory():
+    """Build a DialogueMemory for eval tests.
+
+    Constructed with ``inactivity_timeout=300`` and ``max_interactions=20``.
+    """
+    from jarvis.memory.conversation import DialogueMemory
+    memory = DialogueMemory(inactivity_timeout=300, max_interactions=20)
+    return memory
+
+
+@pytest.fixture
+def graph_store(tmp_path):
+    """Yield a GraphMemoryStore on a temp SQLite file and close it afterwards.
+
+    The store is always closed in teardown so that `tmp_path` cleanup can
+    unlink the database file on Windows, which refuses to delete a file that
+    still has an open handle (POSIX would tolerate it).
+    """
+    from jarvis.memory.graph import GraphMemoryStore
+    db_file = tmp_path / "test.db"
+    store = GraphMemoryStore(str(db_file))
+    try:
+        yield store
+    finally:
+        store.close()
+