Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.
- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md
Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
433
evals/test_recency_superseding.py
Normal file
433
evals/test_recency_superseding.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
Recency Superseding Evaluations
|
||||
|
||||
Tests that newer information correctly takes precedence over older information
|
||||
in both diary enrichment and knowledge graph contexts.
|
||||
|
||||
Scenarios:
|
||||
1. Diary search: newer entries about the same topic should rank first
|
||||
2. Graph enrichment: when presenting conflicting facts, the system should
|
||||
surface the most recent version
|
||||
|
||||
Run:
|
||||
EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh recency
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import (
|
||||
MockConfig,
|
||||
JUDGE_MODEL,
|
||||
JUDGE_BASE_URL,
|
||||
call_judge_llm,
|
||||
JudgeVerdict,
|
||||
)
|
||||
|
||||
from jarvis.memory.db import Database
|
||||
from jarvis.memory.graph_ops import merge_node_data
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Data
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class SupersedingCase:
    """A scenario in which a newer fact should take precedence over an older one.

    Attributes:
        description: Short human-readable label for the scenario.
        old_entry: Text of the older diary entry (stored first).
        old_date: Date string associated with the older entry.
        new_entry: Text of the newer diary entry (stored second, should win).
        new_date: Date string associated with the newer entry.
        search_keywords: Search keywords that should match both entries.
        newer_value_keywords: Keywords identifying the newer value, which
            should appear first in results.
        older_value_keywords: Keywords identifying the older value, which
            should NOT appear first.
    """

    description: str

    # --- older entry ---
    old_entry: str
    old_date: str

    # --- newer entry ---
    new_entry: str
    new_date: str

    # --- matching / verification keywords ---
    search_keywords: List[str]
    newer_value_keywords: List[str]
    older_value_keywords: List[str]
|
||||
|
||||
|
||||
# Shared scenarios for every test class in this module. Each pytest.param
# wraps one SupersedingCase and carries a readable test id.
SUPERSEDING_CASES = [
    # Scenario: the user's in-office weekdays moved from Mon/Wed to Mon/Thu.
    pytest.param(
        SupersedingCase(
            description="Office days changed",
            old_date="2026-01-15",
            old_entry=(
                "[2026-01-15] The user mentioned their office days are Monday and Wednesday. "
                "They commute to the Shoreditch office on those days."
            ),
            new_date="2026-03-20",
            new_entry=(
                "[2026-03-20] The user said their office days have changed to Monday and Thursday. "
                "The team restructured and now they go in on different days."
            ),
            search_keywords=["office", "days"],
            newer_value_keywords=["Thursday", "changed"],
            older_value_keywords=["Wednesday"],
        ),
        id="Office days changed from Mon/Wed to Mon/Thu",
    ),
    # Scenario: the user's diet switched from a bulking plan to a cutting plan.
    pytest.param(
        SupersedingCase(
            description="Diet plan updated",
            old_date="2025-12-01",
            old_entry=(
                "[2025-12-01] The user follows a 2200 kcal bulking diet with 180g protein daily. "
                "They eat five meals a day."
            ),
            new_date="2026-03-15",
            new_entry=(
                "[2026-03-15] The user switched to a 1800 kcal cutting diet with 150g protein daily. "
                "They're now doing intermittent fasting with a 16:8 window."
            ),
            search_keywords=["diet", "protein", "kcal"],
            newer_value_keywords=["1800", "cutting", "intermittent fasting"],
            older_value_keywords=["2200", "bulking"],
        ),
        id="Diet changed from bulking to cutting",
    ),
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: Diary Search Recency
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.eval
class TestDiaryRecencyOrder:
    """Tests that diary search returns newer entries before older ones
    when both match the same query."""

    @pytest.fixture
    def db_with_entries(self, request, tmp_path):
        """Create a temporary DB seeded with the old and new diary entries.

        Used with ``indirect=True`` parametrisation: ``request.param`` is the
        ``SupersedingCase`` to seed. Yields a ``(db, case)`` tuple and closes
        the database afterwards.
        """
        case: SupersedingCase = request.param

        db = Database(str(tmp_path / "test.db"))
        try:
            # Derive the topics from the case itself rather than hard-coding
            # office-related topics for every scenario; this mirrors how the
            # end-to-end test in this module seeds its diary entries.
            topics = ",".join(case.search_keywords)

            # Insertion order matters for the scenario: old entry first,
            # new entry second.
            for date_utc, summary in (
                (case.old_date, case.old_entry),
                (case.new_date, case.new_entry),
            ):
                db.upsert_conversation_summary(
                    date_utc=date_utc,
                    summary=summary,
                    topics=topics,
                    source_app="test",
                )

            yield db, case
        finally:
            # Close the DB even when seeding itself raised.
            db.close()

    @pytest.mark.parametrize("db_with_entries", SUPERSEDING_CASES, indirect=True)
    def test_newer_entry_appears_first(self, db_with_entries):
        """When two diary entries match the same keywords, the newer one
        should appear before the older one in search results."""
        db, case = db_with_entries

        from jarvis.memory.conversation import search_conversation_memory_by_keywords

        results = search_conversation_memory_by_keywords(
            db=db,
            keywords=case.search_keywords,
            max_results=10,
        )

        assert len(results) >= 2, (
            f"Expected at least 2 results for '{case.description}', got {len(results)}"
        )

        # The first result should contain the NEWER information
        first_result = results[0].lower()
        has_newer = any(kw.lower() in first_result for kw in case.newer_value_keywords)

        assert has_newer, (
            f"[{case.description}] First result should contain newer info "
            f"({case.newer_value_keywords}), but got:\n{results[0][:200]}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: Graph Superseding
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.eval
class TestGraphRecencySuperseding:
    """Tests that knowledge graph handles contradicting facts across dates
    by preserving temporal context that allows newer facts to take precedence."""

    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_newer_fact_appended_with_date_context(self, graph_store, case):
        """When a new fact contradicts an old one in the same node,
        both should be stored with date context so the LLM can reason
        about which is current."""
        # Defensive unwrap in case a raw pytest.param object is passed through.
        case = case.values[0] if hasattr(case, 'values') else case

        # Strip any existing "[date] " prefix and re-apply the case's date.
        # ``split("] ", 1)[-1]`` returns the whole string when the separator
        # is absent, so the date prefix is always added. The previous
        # unparenthesised ``a + b if cond else c`` dropped the prefix in the
        # fallback branch.
        old_fact_text = f"[{case.old_date}] " + case.old_entry.split("] ", 1)[-1]
        new_fact_text = f"[{case.new_date}] " + case.new_entry.split("] ", 1)[-1]

        # Create a node and add the old fact
        node = graph_store.create_node(
            name="Test Node",
            description=case.description,
            data=old_fact_text,
            parent_id="root",
        )

        # Append the new fact
        graph_store.append_to_node(node.id, new_fact_text)

        # Verify both facts are in the node
        updated = graph_store.get_node(node.id)
        assert updated is not None

        data_lower = updated.data.lower()
        # Both old and new values should be present (we append, not replace)
        has_old = any(kw.lower() in data_lower for kw in case.older_value_keywords)
        has_new = any(kw.lower() in data_lower for kw in case.newer_value_keywords)

        assert has_old and has_new, (
            f"[{case.description}] Node should contain both old and new facts. "
            f"Has old ({case.older_value_keywords}): {has_old}, "
            f"Has new ({case.newer_value_keywords}): {has_new}"
        )

        # The newer date should be present for temporal reasoning
        assert case.new_date in updated.data, (
            f"[{case.description}] Newer fact should include date prefix '{case.new_date}' "
            f"for temporal reasoning"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: Merge supersession (LLM rewrite drops the old contradicting line)
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.eval
class TestMergeSupersession:
    """Exercises `merge_node_data` against a real model (the judge model
    endpoint is used for the rewrite). When a new fact contradicts an
    existing line on the same node, the rewrite should drop the older line
    rather than append both. Without this behaviour a node (e.g. the User
    node) accumulates contradictions."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_merge_drops_contradicting_old_line(self, case, graph_store):
        """Merging the newer fact into a node holding the older fact should
        keep the newer keywords and remove the older ones."""
        if hasattr(case, 'values'):
            case = case.values[0]

        def dated(date, entry):
            # Drop any leading "[...] " prefix, then stamp with ``date``.
            # split(...)[-1] is the whole entry when no "] " is present.
            return f"[{date}] " + entry.split("] ", 1)[-1]

        old_line = dated(case.old_date, case.old_entry)
        new_line = dated(case.new_date, case.new_entry)

        # Node starts out holding only the older fact.
        node = graph_store.create_node(
            name="Test Node",
            description=case.description,
            data=old_line,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=[new_line],
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        updated = graph_store.get_node(node.id)
        assert updated is not None

        merged_text = updated.data.lower()
        has_new = any(kw.lower() in merged_text for kw in case.newer_value_keywords)
        has_old = any(kw.lower() in merged_text for kw in case.older_value_keywords)

        # Diagnostic output for eval runs.
        print(f"\n 📝 merged data for '{case.description}':\n {updated.data[:300]}")
        print(f" success={result.success} incorporated={result.incorporated_indices}")

        assert has_new, (
            f"[{case.description}] Merged data should retain newer info "
            f"({case.newer_value_keywords}).\n{updated.data}"
        )
        assert not has_old, (
            f"[{case.description}] Merged data should DROP older contradicting info "
            f"({case.older_value_keywords}). Supersession failed.\n{updated.data}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: LLM Judge — Does the system use the newer information?
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.eval
class TestRecencyJudge:
    """LLM-as-judge evaluation: given conflicting diary entries at different
    dates, does the system's enrichment context allow answering with the
    most recent information?"""

    @requires_judge_llm
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_judge_prefers_newer_information(self, case):
        """Give the judge LLM both diary entries and check that the date it
        reports as the newer one matches the case's ``new_date``."""
        if hasattr(case, 'values'):
            case = case.values[0]

        # The older entry is always presented first, the newer one second.
        context = f"Entry 1:\n{case.old_entry}\n\nEntry 2:\n{case.new_entry}"

        judge_system = """You are evaluating whether an AI assistant correctly uses the most recent information when answering.

You will be given:
1. Two diary entries about the same topic from DIFFERENT DATES
2. A question about that topic

Determine: which entry has the MORE RECENT date, and what answer that entry implies.

Respond with JSON:
{"newer_date": "YYYY-MM-DD", "correct_answer_keywords": ["keyword1", "keyword2"], "reasoning": "..."}"""

        judge_user = f"""Diary entries:
{context}

Question: Based on these entries, what is the current/latest information about: {case.description}?"""

        response = call_judge_llm(judge_system, judge_user, timeout_sec=120.0)
        assert response is not None, "Judge LLM returned no response"

        # Take the outermost {...} span of the reply and parse it as JSON.
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        assert json_match is not None, f"Judge response not valid JSON: {response}"
        verdict = json.loads(json_match.group())

        reported_date = verdict.get("newer_date")
        assert reported_date == case.new_date, (
            f"Judge identified wrong date as newer. "
            f"Expected {case.new_date}, got {reported_date}. "
            f"Reasoning: {verdict.get('reasoning')}"
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests: End-to-End — reply engine honours newer diary entries
|
||||
# =============================================================================
|
||||
|
||||
# Chat models exercised by the end-to-end test. The small model is expected
# to be flaky on this task (conflicting facts + recency reasoning); it is
# marked xfail rather than skipped so that a surprise improvement still
# shows up in the results.
_E2E_MODELS = [
    pytest.param("gpt-oss:20b", id="gpt-oss:20b"),
    pytest.param(
        "gemma4:e2b",
        marks=pytest.mark.xfail(
            strict=False,
            reason="Small model flakes on recency-superseding — tracked, not blocking",
        ),
        id="gemma4:e2b",
    ),
]
|
||||
|
||||
|
||||
def _query_for_case(case: "SupersedingCase") -> str:
|
||||
"""Build a natural-language query that targets the entity in conflict."""
|
||||
desc = case.description.lower()
|
||||
if "office" in desc:
|
||||
return "Which days do I go into the office these days?"
|
||||
if "diet" in desc:
|
||||
return "What does my current diet look like — calories and protein?"
|
||||
return f"What's the latest on: {case.description}?"
|
||||
|
||||
|
||||
@pytest.mark.eval
class TestReplyUsesNewerDiaryEntry:
    """End-to-end: with conflicting diary entries, the reply should reflect
    the newer one. Exercises the full reply engine (enrichment retrieval,
    injection ordering, and preamble framing)."""

    @requires_judge_llm
    @pytest.mark.parametrize("model", _E2E_MODELS)
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_reply_reflects_newer_entry(
        self, case, model, mock_config, eval_db, eval_dialogue_memory
    ):
        """Seed the diary with an old and a new entry, ask the reply engine
        about the topic, and require the newer keywords in the reply."""
        # The chat model is parametrised inside this test so that xfail can be
        # attached to the small model. The harness re-runs the whole file once
        # per judge phase, which would only duplicate work here because the
        # judge model has no effect on the reply engine's diary handling.
        # Skipping in the small-judge phase makes each (case, chat-model)
        # pair run exactly once.
        if "gemma4" in JUDGE_MODEL:
            pytest.skip("Chat model is parametrised here; only runs once per eval session (large judge phase)")
        if hasattr(case, 'values'):
            case = case.values[0]

        from jarvis.reply.engine import run_reply_engine

        # Seed diary with older (wrong) then newer (correct) entry.
        topics = ",".join(case.search_keywords)
        for entry_date, entry_text in (
            (case.old_date, case.old_entry),
            (case.new_date, case.new_entry),
        ):
            eval_db.upsert_conversation_summary(
                date_utc=entry_date,
                summary=entry_text,
                topics=topics,
                source_app="test",
            )

        mock_config.ollama_chat_model = model
        mock_config.memory_enrichment_source = "diary"

        query = _query_for_case(case)

        # Pin the location lookup to a fixed value for the duration of the call.
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: London, United Kingdom", None),
        )
        with location_patch:
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        assert reply and reply.strip(), f"[{model}] Reply engine returned empty response"

        reply_lower = reply.lower()
        has_newer = any(kw.lower() in reply_lower for kw in case.newer_value_keywords)
        has_older = any(kw.lower() in reply_lower for kw in case.older_value_keywords)
        has_only_older = has_older and not has_newer

        # Diagnostic output for eval runs.
        print(f"\n 🤖 {model} reply to: {query}")
        print(f" {reply[:240]}")
        print(f" newer kws {case.newer_value_keywords} present: {has_newer}")

        assert not has_only_older, (
            f"[{model}] Reply used ONLY older info "
            f"({case.older_value_keywords}) and ignored newer entry "
            f"({case.newer_value_keywords}).\nReply: {reply}"
        )
        assert has_newer, (
            f"[{model}] Reply did not reflect newer diary entry "
            f"({case.newer_value_keywords}).\nReply: {reply}"
        )
|
||||
Reference in New Issue
Block a user