Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

716
evals/conftest.py Normal file
View File

@@ -0,0 +1,716 @@
"""
Shared fixtures and configuration for evals.
Evals test end-to-end quality of the reply engine with real or mock LLM responses.
"""
import sys
import os
import re
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import pytest
# Locate the repository root: the nearest ancestor directory of this file
# that contains ``src/jarvis``.
_this_file = Path(__file__).resolve()
for parent in _this_file.parents:
    if (parent / "src" / "jarvis").exists():
        ROOT = parent
        break
else:
    # No ancestor contains src/jarvis; fall back to the grandparent directory.
    ROOT = _this_file.parent.parent

SRC = ROOT / "src"
EVALS = ROOT / "evals"

# Each missing entry is pushed to the front of sys.path in turn, so the last
# one listed (EVALS) ends up with the highest import priority.
for _path_entry in (str(ROOT), str(SRC), str(EVALS)):
    if _path_entry not in sys.path:
        sys.path.insert(0, _path_entry)
from helpers import MockConfig, JUDGE_MODEL, is_judge_llm_available
# =============================================================================
# Shared Markers
# =============================================================================
# Evaluated once, at import time, and reused by the marker below.
_JUDGE_LLM_AVAILABLE = is_judge_llm_available()

# Marker for tests that need the judge model; they are skipped when it is
# reported as unavailable.
requires_judge_llm = pytest.mark.skipif(not _JUDGE_LLM_AVAILABLE, reason="Judge LLM not available")
# =============================================================================
# Test Case Descriptions
# =============================================================================
# Human-readable descriptions for test classes.
# Keyed by test class name; when a class has an entry here, the markdown
# report prints it as a blockquote under that class's heading.
CLASS_DESCRIPTIONS = {
    "TestResponseQuality": "LLM-as-judge evaluations for response quality",
    "TestContextUtilization": "Tests that agent uses location/time/memory context",
    "TestToolUsage": "Validates tool selection and argument quality",
    "TestMultiStepReasoning": "Complex scenarios requiring tool chaining and synthesis",
    "TestMemoryEnrichment": "Tests automatic memory enrichment keyword extraction",
    "TestLiveEndToEnd": "End-to-end tests against real LLM inference",
    "TestNutritionExtraction": "Tests LLM nutrition extraction accuracy for meal logging",
    "TestNutritionToolIntegration": "Tests full meal logging tool with macro extraction",
    "TestNutritionModelComparison": "Baseline tests for comparing nutrition extraction across models",
    "TestIntentJudgeAccuracy": "Intent judge accuracy for voice command classification",
    "TestIntentJudgePromptQuality": "Intent judge prompt construction quality",
    "TestIntentJudgeFallback": "Intent judge fallback behaviour when unavailable",
    "TestIntentJudgeMultiSegment": "Intent judge with multi-segment buffers and multi-person conversations",
    "TestWakeWordValidationSafetyNet": "Integration: listener rejects judge hallucinations when no wake word present",
    "TestEchoReasoningDistrust": "Integration: listener overrides judge echo claims when EchoDetector cleared",
    "TestHotWindowHeuristicAccuracy": "Integration: could_be_hot_window heuristic passes correct mode to judge",
    "TestProcessedSegmentFilteringIntegration": "Integration: processed segments excluded from judge prompt",
    "TestHotWindowUsesRawText": "Integration: hot window preserves full user text, wake word uses judge extraction",
    "TestMultiSegmentBufferIntegration": "Integration: multi-segment buffer with TTS echoes handled correctly",
    "TestStopCommandBypassesJudge": "Integration: stop commands during TTS bypass judge entirely",
    "TestKnowledgeExtractionQuality": "Tests that novel knowledge is correctly extracted from summaries",
    "TestKnowledgeExtractionRejection": "Tests that noise, stale data, and common knowledge are rejected",
    "TestKnowledgeExtractionReframing": "Tests that interaction descriptions are reframed as knowledge",
    "TestKnowledgeExtractionJudge": "LLM-as-judge evaluations of extraction quality",
    "TestTopicSwitching": "Tests correct tool selection when conversation topic changes",
    "TestFollowUpContext": "Tests context retention for follow-up questions",
    "TestMultiTurnExtended": "Extended multi-turn scenarios with longer conversations",
    "TestGreetingNoToolsLive": "Tests that greetings don't trigger tool calls",
    "TestHelpfulness": "Tests that agent uses tools proactively instead of deflecting",
    "TestDiaryRecencyOrder": "Tests that diary search returns newer entries before older ones",
    "TestGraphRecencySuperseding": "Tests that graph handles contradicting facts with date context",
    "TestRecencyJudge": "LLM judge evaluates whether newer information is preferred over older",
    "TestMalformedResponseAfterTools": "Tests that malformed LLM output after tool results is not surfaced",
    "TestCelebrityIdentityThenFollowUp": "Two-turn celebrity flow: identity query then pronoun follow-up",
    "TestSearchFailureWikipediaRescue": "Wikipedia-rescue payload is consumed correctly, not confabulated over",
    "TestMultiStepEntityQuery": "Single query requiring two sequential webSearch calls (director + filmography)",
}
# Descriptions for non-parametrized tests.
# Keyed by bare test function name (no class, no "[...]" suffix). Consulted by
# _get_test_description only when a test has no parametrize case id; tests
# missing from this table fall back to a humanised form of the function name.
TEST_DESCRIPTIONS = {
    "test_weather_response_quality": "Judge evaluates weather response quality",
    "test_location_context_in_search": "Location context flows to search queries",
    "test_simple_search_flow": "Agent calls webSearch for info queries",
    "test_tool_chaining_search_then_fetch": "Agent chains search → fetch for details",
    "test_nutrition_advice_uses_memory_and_data": "Agent uses memory + nutrition data",
    "test_enrichment_extracts_correct_keywords": "Enrichment extracts personalization keywords",
    "test_enrichment_provides_context_to_llm": "Enrichment results appear in system message",
    "test_llm_uses_enrichment_for_personalised_queries": "LLM uses enrichment-surfaced interests for personalised search",
    "test_weather_query_live": "Weather query is answered with current conditions",
    "test_personalized_query_recalls_memory_live": "Assistant checks memory before asking about interests",
    "test_interest_flavoured_query_live": "Interest-flavoured phrasings surface seeded interests in the reply",
    # Nutrition extraction tests
    "test_meal_extraction_accuracy": "Extracts accurate macros for common meals",
    "test_extraction_returns_valid_json_structure": "Returns valid JSON with all required fields",
    "test_extraction_handles_ambiguous_portions": "Handles ambiguous portion descriptions",
    "test_extraction_rejects_non_food": "Returns NONE for non-food inputs",
    "test_log_meal_tool_extracts_macros": "LogMealTool stores meals with macros",
    "test_simple_meal_extraction": "Simple meal baseline (2 boiled eggs)",
    "test_extraction_with_quantities": "Extraction with explicit quantities",
    # Multi-turn context tests
    "test_weather_then_store_hours": "Topic switch: weather → store hours uses webSearch",
    "test_weather_then_restaurant_search": "Topic switch: weather → restaurant uses webSearch",
    "test_search_then_weather": "Topic switch: search → weather uses getWeather",
    "test_follow_up_references_previous_context": "Follow-up references previous turn context",
    "test_three_turn_topic_changes": "3-turn conversation with topic changes",
    "test_rapid_topic_switching": "Rapid back-and-forth topic switching",
    # Greeting no-tools live tests
    "test_greeting_no_tools_live": "Greetings do not trigger tool calls",
    "test_user_instructions_no_tools_live": "User instructions do not trigger tool calls",
    "test_weather_still_triggers_tools_live": "Weather query still triggers tools after a greeting",
    # Helpfulness / anti-deflection tests
    "test_no_deflection_for_weather_forecast_live": "No deflection on weather forecast questions",
    "test_no_deflection_for_answerable_queries_live": "No deflection on answerable questions",
    "test_tool_retry_after_failure_live": "Assistant retries a tool after the first attempt fails",
    "test_graph_knowledge_surfaced_in_reply_live": "Graph-enriched facts surface in the reply, no denial",
    "test_does_not_deny_long_term_memory_live": "Assistant does not deny having long-term memory",
    # Multi-step entity / complex flow tests
    "test_chained_research_possessor_director": "Chained research: who directed Possessor and what else have they made",
    "test_parallel_comparison_paris_vs_london": "Parallel weather lookup: compare Paris and London",
    "test_director_then_filmography_requires_two_searches": "Director-then-filmography needs two searches",
    "test_two_turn_celebrity_flow": "Two-turn celebrity flow: identity then pronoun follow-up",
    "test_single_weather_call_terminates": "Single weather query ends after one tool call",
    "test_max_turn_triggers_digest": "Max-turn cap delivers a digest reply, never silence",
    # Knowledge extraction
    "test_judge_mixed_summary_filters_noise": "Mixed summary: keep novel facts, drop stale weather/recommendations",
    "test_judge_empty_conversation_returns_empty": "Trivial conversations produce no extracted facts",
    "test_open_ended_prompt_grounds_in_graph_context_live": "Open-ended prompt grounds in stored knowledge",
}
def _parse_parametrize_id(node_id: str) -> Optional[str]:
"""Extract the parametrize case ID from a node_id like 'test_foo[case-name]'.
Returns None if the bracket content is just a pytest-repeat suffix like '1-3'.
"""
match = re.search(r'\[(.+)\]$', node_id)
if not match:
return None
case_id = match.group(1)
# Check if this is just a pytest-repeat suffix (e.g., "1-3", "2-3")
# These have format "N-M" where N is run number and M is total runs
if re.match(r'^\d+-\d+$', case_id):
return None
# Strip pytest-repeat suffix from the end of case IDs (e.g., "greeting-1-3" -> "greeting")
case_id = re.sub(r'-\d+-\d+$', '', case_id)
return case_id
def _extract_judge_notes(stdout: Optional[str]) -> Optional[Dict[str, str]]:
"""Parse judge evaluation output from stdout."""
if not stdout:
return None
notes = {}
# Extract score
score_match = re.search(r'Score:\s*([\d.]+)', stdout)
if score_match:
notes["score"] = score_match.group(1)
# Extract reasoning
reasoning_match = re.search(r'Reasoning:\s*(.+?)(?:\n|$)', stdout)
if reasoning_match:
notes["reasoning"] = reasoning_match.group(1).strip()
# Extract response being evaluated
response_match = re.search(r'Response:\s*(.+?)(?:\.\.\.|$)', stdout)
if response_match:
notes["response"] = response_match.group(1).strip()
return notes if notes else None
def _humanise_test_name(test_name: str) -> str:
"""Turn ``test_some_thing_does_X`` into ``Some thing does X``.
Last-resort fallback used when a test has no entry in TEST_DESCRIPTIONS
and no parametrize id. Keeps the report readable for non-technical
readers — they shouldn't have to parse Python identifiers.
"""
name = test_name
if name.startswith("test_"):
name = name[5:]
name = name.replace("_", " ").strip()
if not name:
return test_name
return name[0].upper() + name[1:]
def _strip_redundant_prefix(label: str) -> str:
"""Drop noisy prefixes from human-readable case labels.
Every eval is live by design (the suite drives a real model), so the
``Live:`` / ``Live `` prefix is uninformative. Same for trailing model
suffixes like ``-gpt-oss:20b`` that pytest cross-products into
parametrize ids — the Model column already shows that.
"""
s = label.strip()
# Trailing "-<model>" suffix injected by pytest parametrize cross-product.
for suffix in ("-gpt-oss:20b", "-gemma4:e2b", "-gemma4:e4b"):
if s.endswith(suffix):
s = s[: -len(suffix)].rstrip()
break
# Leading "Live:" / "Live " prefix is redundant — the suite is live.
lower = s.lower()
for prefix in ("live: ", "live: ", "live "):
if lower.startswith(prefix):
s = s[len(prefix):].lstrip()
if s:
s = s[0].upper() + s[1:]
break
return s
def _get_test_description(test_name: str, case_id: Optional[str]) -> str:
    """Choose the label shown in the report for one test case.

    Order of preference:

    1. the parametrize ``case_id`` (set via ``pytest.param(id=...)``), which
       already is the description for parametrized tests;
    2. the TEST_DESCRIPTIONS entry for ``test_name``;
    3. a humanised form of ``test_name``, so readers never see a raw Python
       identifier.

    Labels from the first two sources have redundant prefixes and suffixes
    stripped.
    """
    if case_id:
        return _strip_redundant_prefix(case_id)

    looked_up = TEST_DESCRIPTIONS.get(test_name)
    if looked_up is None:
        return _humanise_test_name(test_name)
    return _strip_redundant_prefix(looked_up)
# =============================================================================
# Markdown Report Generation
# =============================================================================
@dataclass
class TestResult:
    """Outcome of one captured execution of one test."""

    # Full pytest node id of the run.
    name: str
    # One of: passed, failed, skipped, xfailed, xpassed.
    outcome: str
    # Duration of the captured phase, as reported by pytest.
    duration: float
    # Test class and bare function name parsed from the node id.
    class_name: str
    test_name: str
    # Parametrize case id, if the test is parametrized.
    case_id: Optional[str] = None
    # Human-readable label used in the report.
    description: str = ""
    # Skip reason, when one was available.
    reason: Optional[str] = None
    # Captured stdout and the judge notes parsed from it.
    stdout: Optional[str] = None
    judge_notes: Optional[Dict[str, str]] = None
@dataclass
class AggregatedTestResult:
    """All recorded runs of one logical test, with derived statistics.

    A logical test may be executed several times; every execution is kept in
    ``runs`` and the properties below summarise them.
    """

    name: str
    class_name: str
    test_name: str
    description: str
    runs: List[TestResult] = field(default_factory=list)

    def _count(self, *outcomes: str) -> int:
        """Number of runs whose outcome is one of ``outcomes``."""
        return len([run for run in self.runs if run.outcome in outcomes])

    @property
    def pass_count(self) -> int:
        """Runs that passed, including unexpected passes (xpassed)."""
        return self._count("passed", "xpassed")

    @property
    def fail_count(self) -> int:
        """Runs that failed."""
        return self._count("failed")

    @property
    def skip_count(self) -> int:
        """Runs that were skipped."""
        return self._count("skipped")

    @property
    def xfail_count(self) -> int:
        """Runs that failed as expected."""
        return self._count("xfailed")

    @property
    def total_runs(self) -> int:
        """Number of recorded runs, whatever their outcome."""
        return len(self.runs)

    @property
    def pass_rate(self) -> float:
        """Percentage of passing runs among passing + failing runs.

        Skipped and xfailed runs are left out of the denominator; the rate
        is 0.0 when nothing is countable.
        """
        countable = self.pass_count + self.fail_count
        if countable > 0:
            return self.pass_count / countable * 100
        return 0.0

    @property
    def total_duration(self) -> float:
        """Sum of the durations of all runs."""
        return sum(run.duration for run in self.runs)

    @property
    def avg_duration(self) -> float:
        """Mean duration per run, or 0.0 when there are no runs."""
        if not self.runs:
            return 0.0
        return self.total_duration / len(self.runs)

    @property
    def overall_outcome(self) -> str:
        """Single verdict for the whole set of runs.

        A verdict other than ``"partial"`` requires every run to share that
        outcome. The candidates are tried in a fixed order, so an empty run
        list reports ``"skipped"``.
        """
        total = self.total_runs
        candidates = (
            ("skipped", self.skip_count),
            ("xfailed", self.xfail_count),
            ("passed", self.pass_count),
            ("failed", self.fail_count),
        )
        for verdict, count in candidates:
            if count == total:
                return verdict
        return "partial"

    @property
    def pass_rate_str(self) -> str:
        """Pass rate formatted as ``X/Y (Z%)``, or a placeholder.

        When no run passed or failed the result is ``SKIPPED``,
        ``n/total XFAIL`` or ``N/A``, depending on what the runs contain.
        """
        countable = self.pass_count + self.fail_count
        if countable:
            return f"{self.pass_count}/{countable} ({self.pass_rate:.0f}%)"
        if self.skip_count > 0:
            return "SKIPPED"
        if self.xfail_count > 0:
            return f"{self.xfail_count}/{self.total_runs} XFAIL"
        return "N/A"

    @property
    def judge_notes(self) -> Optional[Dict[str, str]]:
        """Judge notes of the first run that has any, else ``None``."""
        return next((run.judge_notes for run in self.runs if run.judge_notes), None)

    @property
    def reason(self) -> Optional[str]:
        """Reason of the first run that recorded one, else ``None``."""
        return next((run.reason for run in self.runs if run.reason), None)
def _strip_repeat_suffix(node_id: str) -> str:
"""
Strip pytest-repeat iteration suffix from node ID.
pytest-repeat adds suffixes like [1-3], [2-3], [3-3] to repeated tests.
This strips those suffixes to get the base test identifier for aggregation.
"""
# Match patterns like [1-3], [2-3], [3-3] at the end of node ID
# But preserve parametrize IDs like [greeting-en], [weather-query], etc.
return re.sub(r'\[(\d+)-(\d+)\]$', '', node_id)
def _get_aggregation_key(result: TestResult) -> str:
    """Build the key under which repeated runs of one test are grouped.

    The key is ``class_name::test_name``, extended with ``::case_id`` when
    the result carries a non-empty case id. The case id is used as given;
    it is expected to have had any repeat suffix removed when it was parsed.
    """
    segments = [result.class_name, result.test_name]
    segments += [result.case_id] if result.case_id else []
    return "::".join(segments)
@dataclass
class EvalReport:
    """Collects captured test runs and renders them as a markdown report.

    Fields:
        results: One ``TestResult`` per captured run, in arrival order.
        start_time: When collection started, if recorded.
        end_time: Timestamp printed as "Generated"; "N/A" while unset.
        judge_model: Model name printed in the report header.
    """

    results: List[TestResult] = field(default_factory=list)
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    judge_model: str = ""

    def add_result(self, result: TestResult):
        """Record one captured run."""
        self.results.append(result)

    def get_aggregated_results(self) -> List[AggregatedTestResult]:
        """Group runs of the same test, preserving first-seen order."""
        aggregated: Dict[str, AggregatedTestResult] = {}
        for result in self.results:
            key = _get_aggregation_key(result)
            if key not in aggregated:
                # The first run of a test supplies the shared metadata.
                aggregated[key] = AggregatedTestResult(
                    name=_strip_repeat_suffix(result.name),
                    class_name=result.class_name,
                    test_name=result.test_name,
                    description=result.description,
                )
            aggregated[key].runs.append(result)
        return list(aggregated.values())

    @property
    def total_unique_tests(self) -> int:
        """Number of distinct tests after aggregating repeated runs."""
        return len(self.get_aggregated_results())

    @property
    def total_runs(self) -> int:
        """Number of captured runs."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Runs with outcome ``passed``."""
        return sum(1 for r in self.results if r.outcome == "passed")

    @property
    def failed(self) -> int:
        """Runs with outcome ``failed``."""
        return sum(1 for r in self.results if r.outcome == "failed")

    @property
    def skipped(self) -> int:
        """Runs with outcome ``skipped``."""
        return sum(1 for r in self.results if r.outcome == "skipped")

    @property
    def xfailed(self) -> int:
        """Runs with outcome ``xfailed``."""
        return sum(1 for r in self.results if r.outcome == "xfailed")

    @property
    def xpassed(self) -> int:
        """Runs with outcome ``xpassed``."""
        return sum(1 for r in self.results if r.outcome == "xpassed")

    @property
    def pass_rate(self) -> float:
        """Percentage of passing runs (passed + xpassed) among countable runs.

        Countable runs are passed + failed + xpassed; the rate is 0.0 when
        there are none.
        """
        countable = self.passed + self.failed + self.xpassed
        return (self.passed + self.xpassed) / countable * 100 if countable > 0 else 0.0

    @property
    def duration(self) -> float:
        """Sum of the durations of all captured runs."""
        return sum(r.duration for r in self.results)

    @staticmethod
    def _status_emoji(outcome: str) -> str:
        """Map an aggregated outcome to the marker shown in the report."""
        return {
            "passed": "✅",
            "failed": "❌",
            "skipped": "⏭️",
            "xfailed": "🔸",
            "partial": "⚠️",
        }.get(outcome, "❓")

    @staticmethod
    def _score_bar(raw_score: str) -> Optional[str]:
        """Render a 10-cell bar for a judge score, or ``None`` if unparsable.

        The score is read as a 0..1 fraction; values outside that range are
        clamped so the bar is always exactly 10 cells wide.
        """
        try:
            filled = int(float(raw_score) * 10)
        except (ValueError, OverflowError):
            return None
        filled = max(0, min(10, filled))
        return "█" * filled + "░" * (10 - filled)

    def _render_header(self, total_tests: int) -> List[str]:
        """Title block: generation time, judge model, duration, repeat count."""
        generated = self.end_time.strftime('%Y-%m-%d %H:%M:%S') if self.end_time else 'N/A'
        runs_per_test = self.total_runs // total_tests if total_tests > 0 else 0
        return [
            "# 🧪 Jarvis Evaluation Report",
            "",
            f"**Generated:** {generated}",
            f"**Judge Model:** `{self.judge_model}`",
            f"**Duration:** {self.duration:.2f}s",
            f"**Runs per test:** {runs_per_test}",
            "",
        ]

    def _render_summary(self, aggregated_results: List[AggregatedTestResult]) -> List[str]:
        """Summary table counting unique tests by aggregated outcome."""
        tally: Dict[str, int] = {}
        for result in aggregated_results:
            tally[result.overall_outcome] = tally.get(result.overall_outcome, 0) + 1
        return [
            "## 📊 Summary",
            "",
            "| Metric | Count |",
            "|--------|-------|",
            f"| ✅ Fully Passed (100%) | {tally.get('passed', 0)} |",
            f"| ⚠️ Partial Pass | {tally.get('partial', 0)} |",
            f"| ❌ Fully Failed (0%) | {tally.get('failed', 0)} |",
            f"| ⏭️ Skipped | {tally.get('skipped', 0)} |",
            f"| 🔸 Expected Fail | {tally.get('xfailed', 0)} |",
            f"| **Unique Tests** | **{len(aggregated_results)}** |",
            f"| **Total Runs** | **{self.total_runs}** |",
            "",
        ]

    def _render_pass_rate_bar(self) -> List[str]:
        """Overall pass-rate line, computed over individual runs."""
        pass_rate = self.pass_rate
        bar_filled = int(pass_rate / 5)  # 20 cells, 5 percentage points each
        bar = "█" * bar_filled + "░" * (20 - bar_filled)
        emoji = "🟢" if pass_rate >= 80 else "🟡" if pass_rate >= 50 else "🔴"
        # Use the same numerator and denominator as pass_rate, so the
        # fraction always agrees with the percentage printed beside it.
        passing_runs = self.passed + self.xpassed
        countable_runs = passing_runs + self.failed
        return [
            f"**Overall Pass Rate:** {emoji} `{bar}` **{pass_rate:.1f}%** ({passing_runs}/{countable_runs} runs)",
            "",
        ]

    def _render_judge_result(self, result: AggregatedTestResult) -> List[str]:
        """Detailed (non-table) entry for one test of a judge class."""
        lines = [
            f"#### {self._status_emoji(result.overall_outcome)} {result.description}",
            "",
            f"**Pass Rate:** {result.pass_rate_str}",
        ]
        notes = result.judge_notes
        if notes:
            if "response" in notes:
                lines.append(f"**Input:** `{notes['response']}`")
            if "score" in notes:
                score_bar = self._score_bar(notes["score"])
                if score_bar is None:
                    # Unparsable score: show the raw text, skip the bar.
                    lines.append(f"**Score:** {notes['score']}")
                else:
                    lines.append(f"**Score:** {score_bar} ({notes['score']})")
            if "reasoning" in notes:
                lines.append(f"**Judge notes:** {notes['reasoning']}")
        lines.append("")
        lines.append(f"*Avg Duration: {result.avg_duration:.2f}s*")
        lines.append("")
        return lines

    def _render_table(self, class_results: List[AggregatedTestResult]) -> List[str]:
        """Pass-rate table with one row per test."""
        lines = [
            "| Test Case | Pass Rate | Status | Avg Duration |",
            "|-----------|-----------|--------|--------------|",
        ]
        for result in class_results:
            status_text = result.overall_outcome.upper()
            reason = result.reason
            if reason:
                # Keep the status column narrow: at most 30 characters of reason.
                reason_short = reason[:30] + "..." if len(reason) > 30 else reason
                status_text += f" ({reason_short})"
            lines.append(
                f"| {result.description} | {result.pass_rate_str} | "
                f"{self._status_emoji(result.overall_outcome)} {status_text} | "
                f"{result.avg_duration:.2f}s |"
            )
        lines.append("")
        return lines

    def _render_class_section(self, class_name: str, class_results: List[AggregatedTestResult]) -> List[str]:
        """Heading, optional description and results for one test class."""
        fully_passed = sum(1 for r in class_results if r.overall_outcome == "passed")
        counted = sum(1 for r in class_results if r.overall_outcome != "skipped")
        if counted == 0:
            # Every test in the class was skipped: that is not a failure.
            class_emoji = "⏭️"
        elif fully_passed == counted:
            class_emoji = "✅"
        elif fully_passed > 0:
            class_emoji = "⚠️"
        else:
            class_emoji = "❌"

        lines = [f"### {class_emoji} {class_name}"]
        if class_name in CLASS_DESCRIPTIONS:
            lines.append(f"> {CLASS_DESCRIPTIONS[class_name]}")
        lines.append("")

        # The detailed layout is used only for classes whose name contains
        # "Judge" and that actually captured judge notes.
        has_judge_notes = "Judge" in class_name and any(r.judge_notes for r in class_results)
        if has_judge_notes:
            for result in class_results:
                lines.extend(self._render_judge_result(result))
        else:
            lines.extend(self._render_table(class_results))
        return lines

    def generate_markdown(self) -> str:
        """Generate the markdown report, with pass rates over repeated runs.

        Layout: header, summary table (per unique test), overall pass-rate
        bar (per run), one section per test class in first-seen order, and
        a footer.

        Returns:
            The complete report as one newline-joined string.
        """
        aggregated_results = self.get_aggregated_results()

        lines: List[str] = []
        lines.extend(self._render_header(len(aggregated_results)))
        lines.extend(self._render_summary(aggregated_results))
        lines.extend(self._render_pass_rate_bar())

        # Group aggregated results by class, keeping first-seen order.
        by_class: Dict[str, List[AggregatedTestResult]] = {}
        for result in aggregated_results:
            by_class.setdefault(result.class_name, []).append(result)

        lines.extend(["---", "", "## 📋 Detailed Results", ""])
        for class_name, class_results in by_class.items():
            lines.extend(self._render_class_section(class_name, class_results))

        lines.extend(["---", "", "*Report generated by Jarvis eval suite*"])
        return "\n".join(lines)
# Session-wide report; stays None unless report generation is switched on.
_eval_report: Optional[EvalReport] = None


def pytest_configure(config):
    """Create the eval report at session start when reporting is enabled.

    Reporting is opt-in: the report object is created only when the
    ``EVAL_GENERATE_REPORT`` environment variable is exactly ``"1"``.
    """
    global _eval_report
    if os.environ.get("EVAL_GENERATE_REPORT") != "1":
        return
    _eval_report = EvalReport(start_time=datetime.now(), judge_model=JUDGE_MODEL)
def pytest_runtest_logreport(report):
    """Capture one test result into the session report.

    pytest calls this hook once per phase (setup, call, teardown). A phase
    is recorded when it carries the test's verdict:

    * every ``call`` report;
    * ``setup`` reports that failed or were skipped. Marker-based skips
      (``skipif``) and ``xfail(run=False)`` are reported in setup and have
      no ``call`` phase, so ignoring them would drop those tests from the
      report entirely;
    * ``teardown`` reports that failed.

    Does nothing when report generation is disabled.
    """
    if _eval_report is None:
        return

    is_call = report.when == "call"
    is_setup_verdict = report.when == "setup" and report.outcome in ("failed", "skipped")
    is_teardown_failure = report.when == "teardown" and report.outcome == "failed"
    if not (is_call or is_setup_verdict or is_teardown_failure):
        return

    # Parse the node ID ("path::Class::test[case]") into class and test name.
    node_id = report.nodeid
    parts = node_id.split("::")
    class_name = parts[1] if len(parts) > 1 else "Unknown"
    full_test_name = parts[-1] if parts else node_id

    # For parametrized tests the case id doubles as the description.
    case_id = _parse_parametrize_id(full_test_name)
    test_name = full_test_name.split("[")[0]
    description = _get_test_description(test_name, case_id)

    # A report carrying "wasxfail" belongs to an xfail-marked test: a pass
    # is then an unexpected pass, anything else an expected failure.
    outcome = report.outcome
    if hasattr(report, "wasxfail"):
        outcome = "xpassed" if report.passed else "xfailed"

    # Skip reports carry a (path, lineno, reason) tuple in longrepr.
    reason = None
    if outcome == "skipped" and hasattr(report, "longrepr"):
        if isinstance(report.longrepr, tuple) and len(report.longrepr) >= 3:
            reason = str(report.longrepr[2])

    # Capture stdout and parse judge notes from it.
    stdout = None
    judge_notes = None
    if hasattr(report, "capstdout") and report.capstdout:
        stdout = report.capstdout
        judge_notes = _extract_judge_notes(stdout)

    # Fall back to the first report section whose name mentions stdout.
    if not stdout:
        for section_name, section_content in report.sections:
            if "stdout" in section_name.lower():
                stdout = section_content
                judge_notes = _extract_judge_notes(stdout)
                break

    _eval_report.add_result(TestResult(
        name=node_id,
        outcome=outcome,
        duration=report.duration,
        class_name=class_name,
        test_name=test_name,
        case_id=case_id,
        description=description,
        reason=reason,
        stdout=stdout,
        judge_notes=judge_notes,
    ))
def pytest_sessionfinish(session, exitstatus):
    """Write the markdown report at session end.

    The destination is the ``EVAL_REPORT_PATH`` environment variable when
    set, otherwise ``EVALS.md`` in the repository root. Does nothing when
    report generation is disabled.
    """
    if _eval_report is None:
        return

    _eval_report.end_time = datetime.now()

    report_path_str = os.environ.get("EVAL_REPORT_PATH")
    report_path = Path(report_path_str) if report_path_str else ROOT / "EVALS.md"

    markdown = _eval_report.generate_markdown()

    # A custom path may point into a directory that does not exist yet;
    # create it so the whole run's results are not lost at the last step.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the report contains emoji and other non-ASCII text.
    report_path.write_text(markdown, encoding="utf-8")

    try:
        print(f"\n📄 Eval report saved to: {report_path}")
    except UnicodeEncodeError:
        # Console encoding cannot represent the emoji; print plain text.
        print(f"\nEval report saved to: {report_path}")
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture
def mock_config():
    """Return a fresh ``MockConfig`` for each eval test."""
    config = MockConfig()
    return config
@pytest.fixture
def eval_db():
    """Yield a ``Database`` opened on ``":memory:"``; closed on teardown."""
    from jarvis.memory.db import Database

    database = Database(":memory:", sqlite_vss_path=None)
    yield database
    database.close()
@pytest.fixture
def eval_dialogue_memory():
    """Return a ``DialogueMemory`` configured for eval tests."""
    from jarvis.memory.conversation import DialogueMemory

    memory = DialogueMemory(inactivity_timeout=300, max_interactions=20)
    return memory
@pytest.fixture
def graph_store(tmp_path):
    """Yield a ``GraphMemoryStore`` on a temporary SQLite file.

    The store is always closed on teardown. That releases the SQLite
    connection so ``tmp_path`` cleanup can delete the file on Windows,
    which, unlike POSIX, refuses to unlink a file that is still open.
    """
    from jarvis.memory.graph import GraphMemoryStore

    db_file = tmp_path / "test.db"
    store = GraphMemoryStore(str(db_file))
    try:
        yield store
    finally:
        store.close()