Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
130 lines
5.8 KiB
Python
130 lines
5.8 KiB
Python
"""
Memory Digest — Preference-Signal Surfacing (Live)

Guards that the memory digest distiller (``enrichment.digest_memory_for_query``)
surfaces past user engagement in the same domain as a taste/preference signal
for recommendation-style queries ("what should I watch tonight", "suggest a
restaurant", etc.), instead of returning NONE just because the snippets never
contain an explicitly stated preference.

Motivating field incident (2026-04-20):
User asked "what should I watch tonight, Jarvis?". The diary contained
fresh entries about the user engaging with the films Titanic and Possessor.
The digest returned NONE → the reply model formed a generic webSearch for
"what should I watch tonight" → the final reply recommended the generic
Rotten Tomatoes top-1 result ("Big Mistakes on Netflix"), ignoring the
user's actual taste and drawing on nothing from their history.

The general principle (encoded in the digest prompt): past interactions in
the query's domain are preference evidence even when no preference was
stated in plain words. This is domain-agnostic — it should hold for food,
books, music, news, films, anywhere.

Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_memory_digest_preferences.py -v
"""
|
|
|
|
import pytest
|
|
|
|
from conftest import requires_judge_llm
|
|
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
|
|
|
|
|
|
@pytest.mark.eval
@requires_judge_llm
class TestMemoryDigestSurfacesPreferenceSignals:
    """Live evals: engagement recorded in the diary should surface as a preference signal."""

    def _digest(self, query: str, diary_entries: list[str]) -> str:
        """Run the memory digest for ``query`` over ``diary_entries``.

        The distiller is imported inside the method and called against the
        judge endpoint/model (``JUDGE_BASE_URL`` / ``JUDGE_MODEL``) with an
        empty ``graph_parts`` list and ``timeout_sec=60.0``.

        Args:
            query: The user query to digest memory for.
            diary_entries: Diary snippets handed to the distiller.

        Returns:
            Whatever ``digest_memory_for_query`` returns; the tests treat a
            falsy value as the digest having returned NONE.
        """
        from jarvis.reply.enrichment import digest_memory_for_query

        result = digest_memory_for_query(
            query=query,
            diary_entries=diary_entries,
            graph_parts=[],
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=60.0,
        )
        return result

    def test_watch_recommendation_surfaces_recently_discussed_films(self):
        """Digest-level reproduction of the 2026-04-20 "what should I watch" incident."""
        entries = [
            "[2026-04-20] The user asked about the movie Titanic; the assistant "
            "summarised its plot and noted it is a 1997 film directed by James Cameron.",
            "[2026-04-19] The conversation focused on the film Possessor; the "
            "assistant said it is a 2020 sci-fi horror by Brandon Cronenberg.",
            "[2026-04-15] The user discussed their weekend plans and mentioned "
            "they had been busy with work projects.",
            "[2026-04-10] The user asked about the weather in London.",
        ]
        digest = self._digest("what should I watch tonight?", entries)
        print(f"\n Digest: {digest!r}")

        # An empty digest means the film engagement was not treated as a
        # preference signal; this is reported as an expected failure rather
        # than a hard failure.
        if not digest:
            reason = (
                f"Small judge model {JUDGE_MODEL} returned NONE for a "
                f"recommendation query despite recent film engagement. "
                f"This is the exact regression the prompt-level fix targets."
            )
            pytest.xfail(reason)

        text = digest.lower()
        # Passing requires at least one of the recently discussed titles.
        mentions_title = any(title in text for title in ("titanic", "possessor"))
        failure_message = (
            f"Digest did not surface any recently-engaged film as a preference "
            f"signal. Got: {digest!r}"
        )
        assert mentions_title, failure_message

    def test_restaurant_recommendation_surfaces_past_cuisine_interest(self):
        """Apply the same principle to food: past cuisine engagement should
        surface for a restaurant recommendation query."""
        entries = [
            "[2026-04-18] The user asked about ramen shops near their office "
            "and the assistant listed three in Shoreditch.",
            "[2026-04-12] The user discussed cooking a Thai green curry and "
            "asked how to balance the fish sauce.",
            "[2026-04-05] The user mentioned they had a dentist appointment.",
        ]
        digest = self._digest("suggest a restaurant for dinner tonight", entries)
        print(f"\n Digest: {digest!r}")

        # Same policy as the film test: an empty digest is an expected failure.
        if not digest:
            reason = (
                f"Small judge model {JUDGE_MODEL} returned NONE for a "
                f"restaurant recommendation despite recent cuisine engagement."
            )
            pytest.xfail(reason)

        text = digest.lower()
        # Passing requires at least one engaged cuisine/dish keyword.
        mentions_cuisine = any(term in text for term in ("ramen", "thai", "curry"))
        failure_message = (
            f"Digest did not surface any recently-engaged cuisine as a "
            f"preference signal. Got: {digest!r}"
        )
        assert mentions_cuisine, failure_message

    def test_unrelated_domain_still_returns_none(self):
        """Regression guard against over-surfacing: diary snippets from a
        wholly different domain must not be turned into a film preference
        for a recommendation query."""
        entries = [
            "[2026-04-18] The user asked about the population of Iceland; the "
            "assistant said it is roughly 380,000.",
            "[2026-04-12] The user asked for help debugging a Python import "
            "cycle in their work project.",
        ]
        digest = self._digest("what should I watch tonight?", entries)
        print(f"\n Digest: {digest!r}")

        # Neither snippet belongs to the films/entertainment domain, so an
        # empty digest is the ideal outcome and needs no further checks.
        if not digest:
            return

        # A non-empty digest is tolerated only if it contains none of the
        # film/viewing keywords (plain substring match on the lowered text).
        text = digest.lower()
        film_terms = ("film", "movie", "watch", "series", "show")
        fabricated = any(term in text for term in film_terms)
        failure_message = (
            f"Digest fabricated a film preference from unrelated snippets. "
            f"Got: {digest!r}"
        )
        assert not fabricated, failure_message
|