# javis_bot/evals/test_planner_personalisation.py

"""
Planner — Personalisation Detection (Live)

Guards that the task-list planner emits a ``searchMemory`` directive as
the first step for queries that implicitly depend on the user's own
interests, tastes, or history — even when the user did not use the word
"preference" or "history" in the query.

Motivating field incident (2026-04-24):
  User asked "Tell me some news that might interest me, Jarvis." The
  planner emitted ``webSearch query='current news'`` with no
  ``searchMemory`` step, so the engine skipped memory enrichment and the
  reply was a generic BBC front-page summary with no personalisation.

The planner's rule 2 already lists "preferences" as a trigger, but
gemma4:e2b doesn't pattern-match phrases like "interest me", "suggest
something for me", "what should I…" onto that category without concrete
examples. This eval asserts the prompt teaches the connection — adding
examples that name the exact linguistic shape of a personalisation
request.

Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_planner_personalisation.py -v
"""

import pytest

from conftest import requires_judge_llm
from helpers import JUDGE_BASE_URL, JUDGE_MODEL


def _cfg():
    """Return a lightweight config namespace aimed at the judge LLM.

    The namespace carries ``ollama_base_url`` and ``ollama_chat_model``
    taken from ``JUDGE_BASE_URL`` / ``JUDGE_MODEL``, empty-string values
    for the three per-role model fields, and the planner switched on with
    a 20 second timeout.
    """
    from types import SimpleNamespace

    settings = {
        # Endpoint and model come from the eval helpers.
        "ollama_base_url": JUDGE_BASE_URL,
        "ollama_chat_model": JUDGE_MODEL,
        # NOTE(review): empty per-role models presumably mean "fall back to
        # ollama_chat_model" — confirm against the planner's config handling.
        "planner_model": "",
        "tool_router_model": "",
        "intent_judge_model": "",
        "planner_enabled": True,
        "planner_timeout_sec": 20.0,
    }
    return SimpleNamespace(**settings)


# Tool catalogue passed as ``tools=`` to ``plan_query`` by the tests in this
# module. Written as a name -> description mapping and materialised into the
# ordered list of ``(name, description)`` pairs.
_TOOL_CATALOG = list(
    {
        "webSearch": "Search the web for current facts and events.",
        "getWeather": "Current weather and forecast for a location.",
        "stop": "End the turn and reply to the user.",
    }.items()
)


@pytest.mark.eval
@requires_judge_llm
class TestPlannerEmitsSearchMemoryForPersonalisedQueries:
    """Live regression checks for implicit-personalisation planning.

    Covers the field-reported "interest me" phrasing: queries that lean on
    the user's own tastes must plan a ``searchMemory`` step first, while
    plain general-knowledge queries must not plan one at all.
    """

    @staticmethod
    def _plan_and_report(query):
        """Plan *query*, echo the query and resulting plan, return the plan."""
        # Imported lazily, like the other planner names used by these tests,
        # so collecting this module does not itself import the planner.
        from jarvis.reply.planner import plan_query

        produced = plan_query(
            cfg=_cfg(),
            query=query,
            dialogue_context="",
            tools=_TOOL_CATALOG,
        )
        # Echoed so a failing run shows exactly what the planner emitted.
        print(f"\n  Query: {query!r}")
        print(f"  Plan: {produced}")
        return produced

    @pytest.mark.parametrize(
        "query",
        [
            "tell me some news that might interest me",
            "suggest something I'd enjoy watching tonight",
            "what should I cook for dinner",
            "recommend a book I'd like",
        ],
        ids=lambda text: text[:40],
    )
    def test_personalised_query_plans_memory_lookup_first(self, query):
        """Positive case: the plan is non-empty, requests memory, and has
        the searchMemory directive as its very first step."""
        from jarvis.reply.planner import (
            is_search_memory_step,
            plan_requires_memory,
        )

        plan = self._plan_and_report(query)

        assert plan, (
            f"Planner returned an empty plan for {query!r} — "
            "expected a multi-step plan starting with a searchMemory "
            "directive."
        )
        assert plan_requires_memory(plan), (
            "Planner did not request memory for personalised query "
            f"{query!r}. Plan: {plan}. "
            "The user's own interests are exactly what rule 2 of the "
            "planner prompt lists as a trigger for searchMemory."
        )
        # Position matters, not just presence: checked on the first step.
        assert is_search_memory_step(plan[0]), (
            "searchMemory must be the FIRST step so memory enrichment "
            "runs before any tool call. "
            f"Plan: {plan}"
        )

    @pytest.mark.parametrize(
        "query",
        [
            "what is the capital of France",
            "who is Britney Spears",
            "what's 2 plus 2",
        ],
        ids=lambda text: text[:40],
    )
    def test_general_knowledge_query_does_not_request_memory(self, query):
        """Negative case: a pure general-knowledge query must yield a
        non-empty plan without any searchMemory directive, since each
        unnecessary one costs a downstream memory-enrichment LLM call."""
        from jarvis.reply.planner import plan_requires_memory

        plan = self._plan_and_report(query)

        assert plan, f"Planner returned empty plan for {query!r}"
        assert not plan_requires_memory(plan), (
            "Planner wrongly requested searchMemory for a "
            f"general-knowledge query {query!r}. "
            "That wastes a memory-enrichment LLM call on every such turn. "
            f"Plan: {plan}"
        )