# javis_bot/evals/test_memory_digest_preferences.py

"""
Memory Digest — Preference-Signal Surfacing (Live)

Guards that the memory digest distiller (``enrichment.digest_memory_for_query``)
surfaces past user engagement in the same domain as a taste/preference signal
for recommendation-style queries ("what should I watch tonight", "suggest a
restaurant", etc.), instead of returning NONE just because the snippets never
contain an explicitly stated preference.

Motivating field incident (2026-04-20):
  User asked "what should I watch tonight, Jarvis?". The diary contained
  fresh entries about the user engaging with the films Titanic and Possessor.
  The digest returned NONE → the reply model formed a generic webSearch for
  "what should I watch tonight" → the final reply recommended the generic
  Rotten Tomatoes top-1 result ("Big Mistakes on Netflix"), ignoring the
  user's actual taste and drawing on nothing from their history.

The general principle (encoded in the digest prompt): past interactions in
the query's domain are preference evidence even when no preference was
stated in plain words. This is domain-agnostic — it should hold for food,
books, music, news, films, anywhere.

Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_memory_digest_preferences.py -v
"""

import pytest

from conftest import requires_judge_llm
from helpers import JUDGE_BASE_URL, JUDGE_MODEL


@pytest.mark.eval
@requires_judge_llm
class TestMemoryDigestSurfacesPreferenceSignals:
    """Live digest tests: same-domain engagement must surface as a taste signal.

    Every case hands ``digest_memory_for_query`` a recommendation-style query
    together with a few dated diary snippets, then inspects the returned text
    with case-insensitive substring checks.
    """

    @staticmethod
    def _found_terms(digest: str, terms: tuple[str, ...]) -> list[str]:
        """Return the entries of *terms* that occur in *digest*, ignoring case.

        Only the digest is lowercased before the substring comparison, so
        *terms* must already be given in lowercase.
        """
        haystack = digest.lower()
        return [term for term in terms if term in haystack]

    def _digest(self, query: str, diary_entries: list[str]) -> str:
        """Distil *diary_entries* into a digest for *query* via the judge model.

        Args:
            query: The user utterance the digest is built for.
            diary_entries: Diary snippets offered to the distiller.

        Returns:
            Whatever ``digest_memory_for_query`` returns. The tests below
            treat a falsy value as "the distiller answered NONE".
        """
        # Imported lazily, at call time, rather than at module import.
        from jarvis.reply.enrichment import digest_memory_for_query

        distiller_kwargs = {
            "query": query,
            "diary_entries": diary_entries,
            "graph_parts": [],  # these tests supply diary snippets only
            "ollama_base_url": JUDGE_BASE_URL,
            "ollama_chat_model": JUDGE_MODEL,
            "timeout_sec": 60.0,
        }
        return digest_memory_for_query(**distiller_kwargs)

    def test_watch_recommendation_surfaces_recently_discussed_films(self):
        """Replay the 2026-04-20 incident against the digest layer alone."""
        diary = [
            "[2026-04-20] The user asked about the movie Titanic; the assistant "
            "summarised its plot and noted it is a 1997 film directed by James Cameron.",
            "[2026-04-19] The conversation focused on the film Possessor; the "
            "assistant said it is a 2020 sci-fi horror by Brandon Cronenberg.",
            "[2026-04-15] The user discussed their weekend plans and mentioned "
            "they had been busy with work projects.",
            "[2026-04-10] The user asked about the weather in London.",
        ]
        digest = self._digest("what should I watch tonight?", diary)
        print(f"\n  Digest: {digest!r}")

        # An empty digest means the film engagement was not treated as a
        # preference signal; that outcome is reported as xfail, not failure.
        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for a "
                "recommendation query despite recent film engagement. "
                "This is the exact regression the prompt-level fix targets."
            )

        # Either of the recently discussed titles is enough.
        surfaced = self._found_terms(digest, ("titanic", "possessor"))
        assert surfaced, (
            "Digest did not surface any recently-engaged film as a preference "
            f"signal. Got: {digest!r}"
        )

    def test_restaurant_recommendation_surfaces_past_cuisine_interest(self):
        """Check the same principle in the food domain.

        Earlier ramen and Thai-curry conversations should surface when the
        user asks for a restaurant suggestion.
        """
        diary = [
            "[2026-04-18] The user asked about ramen shops near their office "
            "and the assistant listed three in Shoreditch.",
            "[2026-04-12] The user discussed cooking a Thai green curry and "
            "asked how to balance the fish sauce.",
            "[2026-04-05] The user mentioned they had a dentist appointment.",
        ]
        digest = self._digest("suggest a restaurant for dinner tonight", diary)
        print(f"\n  Digest: {digest!r}")

        # Empty digest: reported as xfail rather than as a hard failure.
        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for a "
                "restaurant recommendation despite recent cuisine engagement."
            )

        # Any one of the engaged cuisines/dishes is enough.
        surfaced = self._found_terms(digest, ("ramen", "thai", "curry"))
        assert surfaced, (
            "Digest did not surface any recently-engaged cuisine as a "
            f"preference signal. Got: {digest!r}"
        )

    def test_unrelated_domain_still_returns_none(self):
        """Guard against over-surfacing from unrelated snippets.

        Diary entries about Iceland's population and a Python import cycle
        carry no viewing preference, so a "what should I watch" query must
        not produce a digest that talks about films or shows.
        """
        diary = [
            "[2026-04-18] The user asked about the population of Iceland; the "
            "assistant said it is roughly 380,000.",
            "[2026-04-12] The user asked for help debugging a Python import "
            "cycle in their work project.",
        ]
        digest = self._digest("what should I watch tonight?", diary)
        print(f"\n  Digest: {digest!r}")

        # An empty digest is the ideal outcome; there is nothing more to check.
        if not digest:
            return

        # A non-empty digest is tolerated only while it stays clear of
        # viewing-related vocabulary, i.e. it has not invented a film
        # preference out of population statistics or Python debugging.
        viewing_terms = ("film", "movie", "watch", "series", "show")
        fabricated = bool(self._found_terms(digest, viewing_terms))
        assert not fabricated, (
            "Digest fabricated a film preference from unrelated snippets. "
            f"Got: {digest!r}"
        )