Files
javis_bot/evals/test_diary_summariser_hygiene.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

241 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Diary Summariser Hygiene Evaluations (Live)
Verifies the summariser prompt does not preserve assistant failure/deflection
narration in diary entries. Without this hygiene, the assistant's own past
failures get retrieved as "conversation history" on future related queries and
prime the model to repeat the same deflection pattern.
Motivating field incident:
A user asked "tell me about Possessor" and the small model deflected. The
diary then recorded: "the assistant offered to search the web." On the next
day, the same user asked again, and the model imitated the recorded
deflection instead of calling webSearch.
Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh test_diary_summariser
"""
import pytest
from conftest import requires_judge_llm
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
# Verbatim phrases that give away assistant failure/deflection narration.
# A summary is expected to contain none of them; the tests in this module look
# for them as plain substrings of the lower-cased summary text.
# The wording is English only because the summariser output observed in the
# field was English -- the *rule* in the prompt itself is language-agnostic.
_DEFLECTION_PHRASES = (
    "could not provide",
    "lacked",
    "offered to search",
    "offer to search",
    "offered to perform",
    "unable to provide",
    "was unable",
    "did not have",
    "does not have",
    "had no specific",
    "no specific information",
    "no specific details",
    "clarified that",
    "indicated it",
    "initially could not",
    "failed to provide",
    "no information",
    "internal knowledge",
)
@pytest.mark.eval
@requires_judge_llm
class TestDiarySummariserHygieneLive:
    """Live tests that the summariser omits assistant failure narration.

    Every test feeds a short hand-written conversation to
    ``generate_conversation_summary`` using the judge endpoint and model
    (``JUDGE_BASE_URL`` / ``JUDGE_MODEL``) and inspects the lower-cased
    summary text. Hygiene violations are reported through ``pytest.xfail``
    rather than a hard failure; only the positive "content survived"
    requirements are plain assertions.
    """

    def _summarise(self, chunks: list[str]) -> tuple[str, str]:
        """Summarise *chunks* with the judge model.

        Args:
            chunks: Conversation turns, one string per turn.

        Returns:
            ``(summary, topics)``; a falsy value from the summariser is
            replaced by an empty string so callers can always call string
            methods on the result.
        """
        # Function-scope import, as in the original code.
        # NOTE(review): presumably keeps test collection working when the
        # jarvis package is not importable -- confirm.
        from jarvis.memory.conversation import generate_conversation_summary

        summary, topics = generate_conversation_summary(
            recent_chunks=chunks,
            previous_summary=None,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=60.0,
        )
        return summary or "", topics or ""

    @staticmethod
    def _phrases_in(lowered: str, phrases: tuple[str, ...]) -> list[str]:
        """Return the entries of *phrases* that occur as substrings of *lowered*.

        The order of *phrases* is preserved in the result.
        """
        return [phrase for phrase in phrases if phrase in lowered]

    @staticmethod
    def _xfail_if_narrated(
        hits: list[str],
        summary: str,
        what: str = "still narrated deflections",
    ) -> None:
        """Mark the running test as xfail when *hits* is non-empty.

        Args:
            hits: Deflection phrases found in the summary.
            summary: The full summary, echoed in the xfail reason.
            what: Verb phrase inserted into the xfail reason.
        """
        if hits:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} {what}: {hits}. "
                f"Summary: {summary}"
            )

    def test_omits_deflection_narration_for_unknown_entity(self):
        """A conversation where the assistant deflected on an unknown entity,
        then eventually found an answer, must summarise only the resolved fact —
        not the deflection."""
        chunks = [
            "User: Tell me about the Possessor movie.",
            "Assistant: I don't have specific information about Possessor. Would you like me to search the web for it?",
            "User: Yeah go ahead.",
            "Assistant: Possessor is a 2020 science-fiction horror film directed by Brandon Cronenberg, starring Andrea Riseborough.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")
        lowered = summary.lower()

        self._xfail_if_narrated(
            self._phrases_in(lowered, _DEFLECTION_PHRASES), summary
        )

        # Positive requirement: the resolved fact must appear.
        fact_markers = ("2020", "cronenberg", "film", "movie")
        assert "possessor" in lowered and any(
            marker in lowered for marker in fact_markers
        ), f"Resolved fact missing from summary: {summary}"

    def test_omits_deflection_when_topic_never_resolved(self):
        """When the topic is raised but never resolved, the summary should
        record the topic/user intent, not the assistant's deflection."""
        chunks = [
            "User: What do you know about the book Piranesi?",
            "Assistant: I don't have specific information about that book.",
            "User: No worries, let's talk about something else. What's the weather?",
            "Assistant: It's 15 degrees and cloudy in London.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")
        lowered = summary.lower()

        # The topic (Piranesi) may appear, but phrases narrating the
        # assistant's inability must not.
        self._xfail_if_narrated(
            self._phrases_in(lowered, _DEFLECTION_PHRASES), summary
        )

    def test_unrelated_topics_are_not_welded_into_one_clause(self):
        """Regression for the Possessor/Jarvis field incident.

        Two distinct topics (the 2020 Cronenberg film Possessor, and the
        MCU AI character named Jarvis) in the same conversation must not
        be summarised as a single welded clause like "the movie Possessor
        and the character Jarvis, identified as the MCU AI...". Downstream
        enrichment will treat the appositive as describing both referents
        and mislead the next reply.

        The sentence that mentions Possessor must not also contain MCU-
        specific tokens (Marvel / Stark / Vision / Avengers), and vice
        versa.
        """
        import re

        chunks = [
            "User: Have you seen the movie Possessor?",
            "Assistant: I don't have specific information about that film. Would you like me to search the web?",
            "User: No, unrelated — why are you called Jarvis?",
            "Assistant: My name is a nod to the MCU character Jarvis, the AI created by Tony Stark and later embodied by Vision.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        # Split after sentence-ending punctuation followed by whitespace.
        sentences = [
            part.strip()
            for part in re.split(r'(?<=[.!?])\s+', summary)
            if part.strip()
        ]

        # Tight phrase-level tokens — naked substrings like "vision" or "stark"
        # collide with common English words and would false-positive.
        mcu_tokens = (
            "tony stark",
            "marvel cinematic",
            "mcu",
            "embodied by vision",
            "avengers",
            "iron man",
        )

        welded = []
        for sentence in sentences:
            low = sentence.lower()
            if "possessor" in low and any(token in low for token in mcu_tokens):
                welded.append(sentence)

        if welded:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} welded Possessor with MCU-Jarvis "
                f"details in the same sentence: {welded}. Full summary: {summary}"
            )

        # Positive requirement: both topics must survive somewhere — the rule
        # is about separation, not suppression.
        lowered = summary.lower()
        assert "possessor" in lowered, f"Possessor topic dropped: {summary}"
        assert "jarvis" in lowered, f"Jarvis topic dropped: {summary}"

    def test_preserves_legitimate_user_preferences(self):
        """Regression guard: the hygiene rule must not strip legitimate content
        (user preferences, decisions, facts)."""
        chunks = [
            "User: I prefer Celsius for temperatures.",
            "Assistant: Got it, I'll use Celsius from now on.",
            "User: Also, I live in Hackney.",
            "Assistant: Noted.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")
        lowered = summary.lower()
        assert "celsius" in lowered, f"Preference dropped from summary: {summary}"
        assert "hackney" in lowered, f"Location dropped from summary: {summary}"

    def test_omits_deflection_narration_in_turkish(self):
        """Rule 6 of the summariser prompt promises to apply in every
        language, with explicit Turkish examples in the prompt body. This
        eval validates the multilingual claim end-to-end on the live
        judge model rather than relying on prompt-content assertions
        alone (which only prove the prompt *says* it works in any
        language, not that it actually does).

        Turkish was chosen because the prompt has explicit Turkish
        BAD/GOOD pairs and the user of this codebase speaks Turkish.
        Spanish would equally validate but would duplicate the same
        signal.
        """
        chunks = [
            "User: Hackney'de iyi bir restoran biliyor musun?",
            "Assistant: Hackney'deki güncel restoranlar hakkında özel bir bilgim yok. Web'de aramamı ister misin?",
            "User: Boşver. Bugün hava nasıl?",
            "Assistant: Londra'da hava 12 derece ve parçalı bulutlu.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")
        lowered = summary.lower()

        # Turkish deflection markers: assistant denying having information.
        # The summariser must not preserve these in Turkish either.
        turkish_deflections = (
            "bilgisi yok",         # "has no information"
            "bilgisi olmadığını",  # "that it has no information"
            "bilmediğini",         # "that it does not know"
            "yardımcı olamadı",    # "could not help"
            "aramamı ister",       # "would you like me to search"
            "aramayı önerdi",      # "suggested searching"
        )
        self._xfail_if_narrated(
            self._phrases_in(lowered, turkish_deflections),
            summary,
            what="narrated Turkish deflections",
        )

        # Nothing in this test constrains the language of the summary. If the
        # model answers in English, the Turkish markers above cannot match,
        # and the positive check below would still be satisfied by
        # language-neutral tokens such as "hackney" or "12" — so the English
        # phrases are screened as well.
        self._xfail_if_narrated(
            self._phrases_in(lowered, _DEFLECTION_PHRASES), summary
        )

        # Positive requirement: at least one of the surviving topics must
        # be recorded. The user asked about a restaurant AND the weather.
        # The rule is "drop deflections, keep topics" — the topics must
        # persist in some recognisable form.
        topic_markers = (
            "restoran",  # restaurant
            "hackney",
            "hava",      # weather
            "londra",    # London
            "12",        # the temperature
        )
        topic_present = any(marker in lowered for marker in topic_markers)
        assert topic_present, (
            f"Turkish summary dropped every topic, not just deflections: {summary}"
        )