Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
241 lines
9.9 KiB
Python
241 lines
9.9 KiB
Python
"""
|
||
Diary Summariser Hygiene Evaluations (Live)

Verifies the summariser prompt does not preserve assistant failure/deflection
narration in diary entries. Without this hygiene, the assistant's own past
failures get retrieved as "conversation history" on future related queries and
prime the model to repeat the same deflection pattern.

Motivating field incident:
    A user asked "tell me about Possessor" and the small model deflected. The
    diary then recorded: "the assistant offered to search the web." On the next
    day, the same user asked again, and the model imitated the recorded
    deflection instead of calling webSearch.

Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh test_diary_summariser
|
||
"""
|
||
|
||
import pytest
|
||
|
||
from conftest import requires_judge_llm
|
||
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
|
||
|
||
|
||
# Exact deflection phrases the summariser must not preserve verbatim.
# The phrases are English because the field-observed summariser output was
# English; the *rule* in the summariser prompt itself is language-agnostic.
# All entries are lower-case: the tests in this module look for them as
# substrings of the lower-cased summary.
_DEFLECTION_PHRASES: tuple[str, ...] = (
    "could not provide",
    "lacked",
    "offered to search",
    "offer to search",
    "offered to perform",
    "unable to provide",
    "was unable",
    "did not have",
    "does not have",
    "had no specific",
    "no specific information",
    "no specific details",
    "clarified that",
    "indicated it",
    "initially could not",
    "failed to provide",
    "no information",
    "internal knowledge",
)
|
||
|
||
|
||
@pytest.mark.eval
@requires_judge_llm
class TestDiarySummariserHygieneLive:
    """Live tests that the summariser omits assistant failure narration.

    Each test feeds a short scripted conversation to the summariser (run
    against the judge model) and inspects the returned summary text.
    Hygiene violations are reported through ``pytest.xfail``; the positive
    requirements (content that must survive in the summary) are hard
    assertions.
    """

    def _summarise(self, chunks: list[str]) -> tuple[str, str]:
        """Summarise *chunks* with the live judge model.

        Args:
            chunks: Conversation turns, one string per turn.

        Returns:
            A ``(summary, topics)`` pair. A falsy value returned by the
            summariser is replaced by ``""`` so callers can always call
            string methods on the result.
        """
        # Function-scope import: jarvis is only loaded when a test runs.
        from jarvis.memory.conversation import generate_conversation_summary

        summary, topics = generate_conversation_summary(
            recent_chunks=chunks,
            previous_summary=None,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=60.0,
        )
        return summary or "", topics or ""

    def _xfail_on_deflections(
        self,
        summary: str,
        phrases: tuple[str, ...],
        description: str,
    ) -> None:
        """Mark the running test as xfail if *summary* narrates a deflection.

        Args:
            summary: Summary text produced by the summariser.
            phrases: Lower-case deflection markers, matched as substrings
                of the lower-cased summary.
            description: Verb phrase inserted into the xfail message after
                the judge model name.
        """
        lowered = summary.lower()
        hits = [p for p in phrases if p in lowered]
        if hits:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} {description}: {hits}. "
                f"Summary: {summary}"
            )

    def test_omits_deflection_narration_for_unknown_entity(self):
        """A conversation where the assistant deflected on an unknown entity,
        then eventually found an answer, must summarise only the resolved fact —
        not the deflection."""
        chunks = [
            "User: Tell me about the Possessor movie.",
            "Assistant: I don't have specific information about Possessor. Would you like me to search the web for it?",
            "User: Yeah go ahead.",
            "Assistant: Possessor is a 2020 science-fiction horror film directed by Brandon Cronenberg, starring Andrea Riseborough.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        self._xfail_on_deflections(
            summary, _DEFLECTION_PHRASES, "still narrated deflections"
        )

        # Positive requirement: the resolved fact must appear.
        lowered = summary.lower()
        assert "possessor" in lowered and any(
            marker in lowered for marker in ("2020", "cronenberg", "film", "movie")
        ), f"Resolved fact missing from summary: {summary}"

    def test_omits_deflection_when_topic_never_resolved(self):
        """When the topic is raised but never resolved, the summary should
        record the topic/user intent, not the assistant's deflection."""
        chunks = [
            "User: What do you know about the book Piranesi?",
            "Assistant: I don't have specific information about that book.",
            "User: No worries, let's talk about something else. What's the weather?",
            "Assistant: It's 15 degrees and cloudy in London.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        # The topic (Piranesi) may appear, but phrases narrating the
        # assistant's inability must not.
        self._xfail_on_deflections(
            summary, _DEFLECTION_PHRASES, "still narrated deflections"
        )

        # Positive requirement: an empty (or topic-free) summary would pass
        # the hygiene check vacuously, so at least one topic from the
        # conversation must survive in some recognisable form.
        lowered = summary.lower()
        topic_present = any(
            t in lowered
            for t in ("piranesi", "book", "weather", "london", "15", "cloudy")
        )
        assert topic_present, (
            f"Summary dropped every topic, not just deflections: {summary}"
        )

    def test_unrelated_topics_are_not_welded_into_one_clause(self):
        """Regression for the Possessor/Jarvis field incident.

        Two distinct topics (the 2020 Cronenberg film Possessor, and the
        MCU AI character named Jarvis) in the same conversation must not
        be summarised as a single welded clause like "the movie Possessor
        and the character Jarvis, identified as the MCU AI...". Downstream
        enrichment will treat the appositive as describing both referents
        and mislead the next reply.

        The sentence that mentions Possessor must not also contain MCU-
        specific tokens (Marvel / Stark / Vision / Avengers), and vice
        versa.
        """
        import re

        chunks = [
            "User: Have you seen the movie Possessor?",
            "Assistant: I don't have specific information about that film. Would you like me to search the web?",
            "User: No, unrelated — why are you called Jarvis?",
            "Assistant: My name is a nod to the MCU character Jarvis, the AI created by Tony Stark and later embodied by Vision.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        # Split after sentence-ending punctuation followed by whitespace.
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]

        # Tight phrase-level tokens — naked substrings like "vision" or "stark"
        # collide with common English words and would false-positive.
        mcu_tokens = (
            "tony stark",
            "marvel cinematic",
            "mcu",
            "embodied by vision",
            "avengers",
            "iron man",
        )

        # A sentence is "welded" when it names Possessor and also carries
        # at least one MCU-specific token.
        welded = [
            s
            for s in sentences
            if "possessor" in s.lower()
            and any(t in s.lower() for t in mcu_tokens)
        ]

        if welded:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} welded Possessor with MCU-Jarvis "
                f"details in the same sentence: {welded}. Full summary: {summary}"
            )

        # Positive requirement: both topics must survive somewhere — the rule
        # is about separation, not suppression.
        lowered = summary.lower()
        assert "possessor" in lowered, f"Possessor topic dropped: {summary}"
        assert "jarvis" in lowered, f"Jarvis topic dropped: {summary}"

    def test_preserves_legitimate_user_preferences(self):
        """Regression guard: the hygiene rule must not strip legitimate content
        (user preferences, decisions, facts)."""
        chunks = [
            "User: I prefer Celsius for temperatures.",
            "Assistant: Got it, I'll use Celsius from now on.",
            "User: Also, I live in Hackney.",
            "Assistant: Noted.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        lowered = summary.lower()
        assert "celsius" in lowered, f"Preference dropped from summary: {summary}"
        assert "hackney" in lowered, f"Location dropped from summary: {summary}"

    def test_omits_deflection_narration_in_turkish(self):
        """Rule 6 of the summariser prompt promises to apply in every
        language, with explicit Turkish examples in the prompt body. This
        eval validates the multilingual claim end-to-end on the live
        judge model rather than relying on prompt-content assertions
        alone (which only prove the prompt *says* it works in any
        language, not that it actually does).

        Turkish was chosen because the prompt has explicit Turkish
        BAD/GOOD pairs and the user of this codebase speaks Turkish.
        Spanish would equally validate but would duplicate the same
        signal.
        """
        chunks = [
            "User: Hackney'de iyi bir restoran biliyor musun?",
            "Assistant: Hackney'deki güncel restoranlar hakkında özel bir bilgim yok. Web'de aramamı ister misin?",
            "User: Boşver. Bugün hava nasıl?",
            "Assistant: Londra'da hava 12 derece ve parçalı bulutlu.",
        ]
        summary, _ = self._summarise(chunks)
        print(f"\n Summary: {summary}")

        # Turkish deflection markers: assistant denying having information.
        # The summariser must not preserve these in Turkish either.
        turkish_deflections = (
            "bilgisi yok",  # "has no information"
            "bilgisi olmadığını",  # "that it has no information"
            "bilmediğini",  # "that it does not know"
            "yardımcı olamadı",  # "could not help"
            "aramamı ister",  # "would you like me to search"
            "aramayı önerdi",  # "suggested searching"
        )
        self._xfail_on_deflections(
            summary, turkish_deflections, "narrated Turkish deflections"
        )

        # Positive requirement: at least one of the surviving topics must
        # be recorded. The user asked about a restaurant AND the weather.
        # The rule is "drop deflections, keep topics" — the topics must
        # persist in some recognisable form.
        lowered = summary.lower()
        topic_present = any(t in lowered for t in (
            "restoran",  # restaurant
            "hackney",
            "hava",  # weather
            "londra",  # London
            "12",  # the temperature
        ))
        assert topic_present, (
            f"Turkish summary dropped every topic, not just deflections: {summary}"
        )
|
||
|