Files
javis_bot/evals/test_recency_superseding.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

434 lines
16 KiB
Python

"""
Recency Superseding Evaluations
Tests that newer information correctly takes precedence over older information
in both diary enrichment and knowledge graph contexts.
Scenarios:
1. Diary search: newer entries about the same topic should rank first
2. Graph enrichment: when presenting conflicting facts, the system should
surface the most recent version
Run:
EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh recency
"""
import json
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional
from unittest.mock import patch
import pytest
from conftest import requires_judge_llm
from helpers import (
MockConfig,
JUDGE_MODEL,
JUDGE_BASE_URL,
call_judge_llm,
JudgeVerdict,
)
from jarvis.memory.db import Database
from jarvis.memory.graph_ops import merge_node_data
# =============================================================================
# Test Data
# =============================================================================
@dataclass
class SupersedingCase:
    """One "old fact vs. new fact" scenario shared by every test in this module.

    Each case pairs two diary entries about the same topic, written on
    different dates, together with the keywords the tests use to find the
    entries and to tell the older value apart from the newer one.
    """

    # Short human-readable label; used in assertion messages and prompts.
    description: str

    # --- older entry (always stored first) ---
    old_entry: str
    old_date: str

    # --- newer entry (stored second; the one expected to win) ---
    new_entry: str
    new_date: str

    # Keywords expected to match both entries when searching.
    search_keywords: List[str]

    # Keywords that identify the newer value (expected to surface first).
    newer_value_keywords: List[str]

    # Keywords that identify the older value (must not surface first).
    older_value_keywords: List[str]
# Scenario 1: the user's in-office weekdays moved from Mon/Wed to Mon/Thu.
_OFFICE_DAYS_CASE = SupersedingCase(
    description="Office days changed",
    old_date="2026-01-15",
    old_entry=(
        "[2026-01-15] The user mentioned their office days are Monday and Wednesday. "
        "They commute to the Shoreditch office on those days."
    ),
    new_date="2026-03-20",
    new_entry=(
        "[2026-03-20] The user said their office days have changed to Monday and Thursday. "
        "The team restructured and now they go in on different days."
    ),
    search_keywords=["office", "days"],
    newer_value_keywords=["Thursday", "changed"],
    older_value_keywords=["Wednesday"],
)

# Scenario 2: the user's diet switched from a bulking plan to a cutting plan.
_DIET_PLAN_CASE = SupersedingCase(
    description="Diet plan updated",
    old_date="2025-12-01",
    old_entry=(
        "[2025-12-01] The user follows a 2200 kcal bulking diet with 180g protein daily. "
        "They eat five meals a day."
    ),
    new_date="2026-03-15",
    new_entry=(
        "[2026-03-15] The user switched to a 1800 kcal cutting diet with 150g protein daily. "
        "They're now doing intermittent fasting with a 16:8 window."
    ),
    search_keywords=["diet", "protein", "kcal"],
    newer_value_keywords=["1800", "cutting", "intermittent fasting"],
    older_value_keywords=["2200", "bulking"],
)

# Parametrisation list consumed by every test class in this module.
SUPERSEDING_CASES = [
    pytest.param(_OFFICE_DAYS_CASE, id="Office days changed from Mon/Wed to Mon/Thu"),
    pytest.param(_DIET_PLAN_CASE, id="Diet changed from bulking to cutting"),
]
# =============================================================================
# Tests: Diary Search Recency
# =============================================================================
@pytest.mark.eval
class TestDiaryRecencyOrder:
    """Tests that diary search returns newer entries before older ones
    when both match the same query."""

    @pytest.fixture
    def db_with_entries(self, request, tmp_path):
        """Create a temporary DB seeded with the case's old and new diary entries.

        Used with ``indirect=True``: ``request.param`` is the
        ``SupersedingCase`` being exercised. Yields ``(db, case)`` and closes
        the database afterwards.
        """
        case: SupersedingCase = request.param
        db = Database(str(tmp_path / "test.db"))
        try:
            # Derive topics from the case itself rather than hard-coding the
            # office scenario's topics for every case (the diet case was
            # previously tagged "office,schedule,commute"). This matches how
            # the end-to-end test in this module seeds its diary.
            topics = ",".join(case.search_keywords)

            # Insertion order matters for the scenario: old entry first,
            # then the newer entry that is expected to win.
            for date_utc, summary in (
                (case.old_date, case.old_entry),
                (case.new_date, case.new_entry),
            ):
                db.upsert_conversation_summary(
                    date_utc=date_utc,
                    summary=summary,
                    topics=topics,
                    source_app="test",
                )
            yield db, case
        finally:
            # Close the handle even when seeding fails before the yield.
            db.close()

    @pytest.mark.parametrize("db_with_entries", SUPERSEDING_CASES, indirect=True)
    def test_newer_entry_appears_first(self, db_with_entries):
        """When two diary entries match the same keywords, the newer one
        should appear before the older one in search results."""
        db, case = db_with_entries
        from jarvis.memory.conversation import search_conversation_memory_by_keywords

        results = search_conversation_memory_by_keywords(
            db=db,
            keywords=case.search_keywords,
            max_results=10,
        )
        assert len(results) >= 2, (
            f"Expected at least 2 results for '{case.description}', got {len(results)}"
        )

        # The first result should contain the NEWER information
        first_result = results[0].lower()
        has_newer = any(kw.lower() in first_result for kw in case.newer_value_keywords)
        assert has_newer, (
            f"[{case.description}] First result should contain newer info "
            f"({case.newer_value_keywords}), but got:\n{results[0][:200]}"
        )
# =============================================================================
# Tests: Graph Superseding
# =============================================================================
@pytest.mark.eval
class TestGraphRecencySuperseding:
    """Tests that knowledge graph handles contradicting facts across dates
    by preserving temporal context that allows newer facts to take precedence."""

    @staticmethod
    def _dated_fact(date: str, entry: str) -> str:
        """Return ``entry`` prefixed with ``[date] ``.

        If ``entry`` contains ``"] "``, everything up to and including its
        first occurrence is dropped before the new prefix is added, so an
        entry that already carries a ``[YYYY-MM-DD] `` tag is not tagged twice.
        """
        body = entry.split("] ", 1)[-1] if "] " in entry else entry
        return f"[{date}] {body}"

    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_newer_fact_appended_with_date_context(self, graph_store, case):
        """When a new fact contradicts an old one in the same node,
        both should be stored with date context so the LLM can reason
        about which is current."""
        case = case.values[0] if hasattr(case, 'values') else case

        # Create a node holding the old fact. The original expression here was
        # `prefix + tail if cond else entry`, which parses as
        # `(prefix + tail) if cond else entry` and silently dropped the date
        # prefix in the fallback branch; the helper applies it in both branches.
        node = graph_store.create_node(
            name="Test Node",
            description=case.description,
            data=self._dated_fact(case.old_date, case.old_entry),
            parent_id="root",
        )

        # Append the new fact
        new_fact_text = self._dated_fact(case.new_date, case.new_entry)
        graph_store.append_to_node(node.id, new_fact_text)

        # Verify both facts are in the node
        updated = graph_store.get_node(node.id)
        assert updated is not None
        data_lower = updated.data.lower()

        # Both old and new values should be present (we append, not replace)
        has_old = any(kw.lower() in data_lower for kw in case.older_value_keywords)
        has_new = any(kw.lower() in data_lower for kw in case.newer_value_keywords)
        assert has_old and has_new, (
            f"[{case.description}] Node should contain both old and new facts. "
            f"Has old ({case.older_value_keywords}): {has_old}, "
            f"Has new ({case.newer_value_keywords}): {has_new}"
        )

        # The newer date should be present for temporal reasoning
        assert case.new_date in updated.data, (
            f"[{case.description}] Newer fact should include date prefix '{case.new_date}' "
            f"for temporal reasoning"
        )
# =============================================================================
# Tests: Merge supersession (LLM rewrite drops the old contradicting line)
# =============================================================================
@pytest.mark.eval
class TestMergeSupersession:
    """Runs `merge_node_data` against a real picker model.

    When a new fact contradicts a line already stored on the same node, the
    rewrite is expected to drop the older line rather than keep both. Without
    this behaviour the User node accumulates contradictions.
    """

    @requires_judge_llm
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_merge_drops_contradicting_old_line(self, case, graph_store):
        """Merging the newer fact must keep the new value and remove the old one."""
        if hasattr(case, 'values'):
            case = case.values[0]

        def redated(date, entry):
            # Strip everything up to the first "] " (if any) and re-tag with `date`.
            tail = entry.split("] ", 1)[-1] if "] " in entry else entry
            return f"[{date}] " + tail

        old_line = redated(case.old_date, case.old_entry)
        new_line = redated(case.new_date, case.new_entry)

        # Start from a node that only knows the old fact.
        node = graph_store.create_node(
            name="Test Node",
            description=case.description,
            data=old_line,
            parent_id="root",
        )

        # Ask the model-backed merge to fold the new fact in.
        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=[new_line],
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        updated = graph_store.get_node(node.id)
        assert updated is not None

        merged_text = updated.data.lower()
        has_new = any(kw.lower() in merged_text for kw in case.newer_value_keywords)
        has_old = any(kw.lower() in merged_text for kw in case.older_value_keywords)

        # Diagnostics are printed before asserting so they show up on failure.
        print(f"\n 📝 merged data for '{case.description}':\n {updated.data[:300]}")
        print(f" success={result.success} incorporated={result.incorporated_indices}")

        assert has_new, (
            f"[{case.description}] Merged data should retain newer info "
            f"({case.newer_value_keywords}).\n{updated.data}"
        )
        assert not has_old, (
            f"[{case.description}] Merged data should DROP older contradicting info "
            f"({case.older_value_keywords}). Supersession failed.\n{updated.data}"
        )
# =============================================================================
# Tests: LLM Judge — Does the system use the newer information?
# =============================================================================
@pytest.mark.eval
class TestRecencyJudge:
    """LLM-as-judge evaluation: given conflicting diary entries at different
    dates, does the system's enrichment context allow answering with the
    most recent information?"""

    @requires_judge_llm
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_judge_prefers_newer_information(self, case):
        """Ask a judge LLM which of the two diary entries is the more recent.

        Both entries are passed as context; the test passes when the
        ``newer_date`` field of the judge's JSON verdict equals the case's
        ``new_date``.
        """
        case = case.values[0] if hasattr(case, 'values') else case
        context = f"Entry 1:\n{case.old_entry}\n\nEntry 2:\n{case.new_entry}"
        judge_system = """You are evaluating whether an AI assistant correctly uses the most recent information when answering.
You will be given:
1. Two diary entries about the same topic from DIFFERENT DATES
2. A question about that topic
Determine: which entry has the MORE RECENT date, and what answer that entry implies.
Respond with JSON:
{"newer_date": "YYYY-MM-DD", "correct_answer_keywords": ["keyword1", "keyword2"], "reasoning": "..."}"""
        judge_user = f"""Diary entries:
{context}
Question: Based on these entries, what is the current/latest information about: {case.description}?"""

        response = call_judge_llm(judge_system, judge_user, timeout_sec=120.0)
        assert response is not None, "Judge LLM returned no response"

        # Parse judge response: take the outermost {...} span, since the model
        # may wrap the JSON in extra prose.
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        assert json_match is not None, f"Judge response not valid JSON: {response}"
        try:
            verdict = json.loads(json_match.group())
        except json.JSONDecodeError as exc:
            # A brace-delimited span is not necessarily valid JSON; report the
            # raw response instead of surfacing a bare decoder traceback.
            pytest.fail(f"Judge response not valid JSON ({exc}): {response}")

        assert verdict.get("newer_date") == case.new_date, (
            f"Judge identified wrong date as newer. "
            f"Expected {case.new_date}, got {verdict.get('newer_date')}. "
            f"Reasoning: {verdict.get('reasoning')}"
        )
# =============================================================================
# Tests: End-to-End — reply engine honours newer diary entries
# =============================================================================
# Chat models exercised by the end-to-end test. The small model is expected to
# be flaky on this task (conflicting facts + recency reasoning), so it is
# marked xfail rather than skipped: a surprise improvement still gets noticed.
_SMALL_MODEL_XFAIL = pytest.mark.xfail(
    reason="Small model flakes on recency-superseding — tracked, not blocking",
    strict=False,
)

_E2E_MODELS = [
    pytest.param("gpt-oss:20b", id="gpt-oss:20b"),
    pytest.param("gemma4:e2b", id="gemma4:e2b", marks=_SMALL_MODEL_XFAIL),
]
def _query_for_case(case: "SupersedingCase") -> str:
"""Build a natural-language query that targets the entity in conflict."""
desc = case.description.lower()
if "office" in desc:
return "Which days do I go into the office these days?"
if "diet" in desc:
return "What does my current diet look like — calories and protein?"
return f"What's the latest on: {case.description}?"
@pytest.mark.eval
class TestReplyUsesNewerDiaryEntry:
    """End-to-end check of the full reply engine (enrichment retrieval,
    injection ordering, and preamble framing): when the diary holds two
    conflicting entries, the reply should reflect the newer one."""

    @requires_judge_llm
    @pytest.mark.parametrize("model", _E2E_MODELS)
    @pytest.mark.parametrize("case", SUPERSEDING_CASES)
    def test_reply_reflects_newer_entry(
        self, case, model, mock_config, eval_db, eval_dialogue_memory
    ):
        """Seed both entries, ask about the topic, and check the reply's keywords."""
        # The chat model under test is parametrised here (so xfail can be
        # attached to the small model). The harness-level judge-model loop
        # re-runs this whole file once per judge phase, which is noise for this
        # test because the judge model does not affect the reply engine's diary
        # handling. Skipping in the small-judge phase makes each
        # (case, chat-model) pair run exactly once.
        if "gemma4" in JUDGE_MODEL:
            pytest.skip("Chat model is parametrised here; only runs once per eval session (large judge phase)")

        if hasattr(case, 'values'):
            case = case.values[0]

        from jarvis.reply.engine import run_reply_engine

        # Seed the diary: older (wrong) entry first, then the newer (correct) one.
        topics = ",".join(case.search_keywords)
        for entry_date, entry_text in (
            (case.old_date, case.old_entry),
            (case.new_date, case.new_entry),
        ):
            eval_db.upsert_conversation_summary(
                date_utc=entry_date,
                summary=entry_text,
                topics=topics,
                source_app="test",
            )

        mock_config.ollama_chat_model = model
        mock_config.memory_enrichment_source = "diary"
        query = _query_for_case(case)

        # Patch the location lookup to a constant value for the duration of the call.
        location_patch = patch(
            'jarvis.reply.engine.get_location_context_with_timezone',
            return_value=("Location: London, United Kingdom", None),
        )
        with location_patch:
            reply = run_reply_engine(
                db=eval_db,
                cfg=mock_config,
                tts=None,
                text=query,
                dialogue_memory=eval_dialogue_memory,
            )

        assert reply and reply.strip(), f"[{model}] Reply engine returned empty response"

        reply_text = reply.lower()
        has_newer = any(kw.lower() in reply_text for kw in case.newer_value_keywords)
        mentions_older = any(kw.lower() in reply_text for kw in case.older_value_keywords)
        has_only_older = mentions_older and not has_newer

        print(f"\n 🤖 {model} reply to: {query}")
        print(f" {reply[:240]}")
        print(f" newer kws {case.newer_value_keywords} present: {has_newer}")

        # Checked first so a reply built purely from stale data gets the more
        # specific failure message.
        assert not has_only_older, (
            f"[{model}] Reply used ONLY older info "
            f"({case.older_value_keywords}) and ignored newer entry "
            f"({case.newer_value_keywords}).\nReply: {reply}"
        )
        assert has_newer, (
            f"[{model}] Reply did not reflect newer diary entry "
            f"({case.newer_value_keywords}).\nReply: {reply}"
        )