javis_bot/evals/test_merge_consolidation.py

"""
Merge consolidation evaluations.

`merge_node_data` advertises three behaviours beyond the supersession
case covered in `test_recency_superseding.py`:

  1. Near-duplicate dedupe — different wordings of the same fact
     collapse to one canonical line.
  2. Pattern consolidation — repeated activities fold into patterns
     ("ate sushi Mon", "ate sushi Thu" → "regularly eats sushi").
  3. Independence — an unrelated new fact must NOT silently drop an
     existing unrelated line. (The most dangerous failure mode: a
     hallucinated contradiction would erase real data.)

Plus a check that the batched signature works end-to-end with a real
picker model (the round-1 batching has unit tests but no eval).

Run:
    EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh merge_consolidation
"""

from dataclasses import dataclass
from typing import List

import pytest

from conftest import requires_judge_llm
from helpers import JUDGE_MODEL, JUDGE_BASE_URL

from jarvis.memory.graph_ops import merge_node_data


# =============================================================================
# Test data
# =============================================================================

@dataclass
class DedupeCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Substrings that must remain in the merged data.
    must_contain: List[str]
    # Substrings that should NOT appear (forbidden duplicates).
    must_not_contain: List[str]
    # Maximum line count after merge — caps near-dup explosion.
    max_lines: int


DEDUPE_CASES = [
    pytest.param(
        DedupeCase(
            description="Same fact, different wording",
            existing_data="The user lives in London.",
            new_facts=["The user is based in London."],
            must_contain=["london"],
            must_not_contain=[],
            max_lines=1,
        ),
        id="lives-in vs based-in London",
    ),
    pytest.param(
        DedupeCase(
            description="Job title rephrased",
            existing_data="The user works as a software engineer.",
            new_facts=["The user's job is software engineering."],
            must_contain=["software"],
            must_not_contain=[],
            max_lines=1,
        ),
        id="job rephrased",
    ),
]


@dataclass
class PatternCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Keyword that should appear in the consolidated pattern line
    # (e.g. "regularly", "often", "frequently", "every").
    pattern_keywords: List[str]
    # Subject the pattern is about (must remain).
    subject_keyword: str
    # Cap on lines — pattern consolidation should shrink, not grow.
    max_lines: int


@dataclass
class PatternBoundaryCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Substrings that MUST still be present in the merged output —
    # these are distinct one-off events that should not collapse
    # into a fake pattern.
    must_keep_distinct: List[str]


PATTERN_BOUNDARY_CASES = [
    pytest.param(
        PatternBoundaryCase(
            description="One-off events should not be patternised",
            existing_data=(
                "[2025-08-12] The user attended a wedding in Edinburgh.\n"
                "[2025-11-03] The user gave a conference talk in Berlin."
            ),
            new_facts=["[2026-04-25] The user moved house to Manchester."],
            # Three distinct, unrelated one-time events. Folding them
            # into "regularly travels" or similar would invent a
            # pattern that isn't there.
            must_keep_distinct=["edinburgh", "berlin", "manchester"],
        ),
        id="distinct one-off events",
        # Originally xfail(strict=False) — captured a regression where
        # `gemma4:e2b` clustered date-prefixed entries with a new
        # dated entry and silently dropped the older two. The case
        # now passes 3/3 reps on the small model after the
        # META-NARRATIVE rule landed. The causal link is not
        # verified, but the eval is the right place to catch a
        # regression so the marker is dropped and the case stands as
        # a regular PASS.
    ),
]


PATTERN_CASES = [
    pytest.param(
        PatternCase(
            description="Repeated sushi meals",
            existing_data=(
                "[2026-04-07] The user ate sushi for lunch.\n"
                "[2026-04-14] The user had sushi again.\n"
                "[2026-04-21] The user ordered sushi for dinner."
            ),
            new_facts=["[2026-04-25] The user ate sushi today."],
            pattern_keywords=["regularly", "often", "frequently", "weekly", "every", "tend"],
            subject_keyword="sushi",
            max_lines=3,
        ),
        id="sushi pattern",
    ),
]


@dataclass
class IndependenceCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Substrings that MUST survive — the new fact is unrelated and
    # has no business dropping these.
    must_keep: List[str]
    # Substrings the new fact should add.
    must_add: List[str]


INDEPENDENCE_CASES = [
    pytest.param(
        IndependenceCase(
            description="Vegetarian + unrelated meal mention",
            # Note: "user is vegetarian" + "user ate a Big Mac" is a
            # genuine contradiction the picker may legitimately
            # surface or pick a side on. Use clearly-orthogonal facts
            # instead so the eval is unambiguous.
            existing_data=(
                "The user has a peanut allergy.\n"
                "The user prefers tea over coffee."
            ),
            new_facts=["The user enjoys hiking on weekends."],
            must_keep=["peanut", "tea"],
            must_add=["hiking"],
        ),
        id="independent facts coexist",
    ),
    pytest.param(
        IndependenceCase(
            description="Job + new hobby",
            existing_data="The user works as a software engineer at Equals Money.",
            new_facts=["The user is learning to play the guitar."],
            must_keep=["software", "equals money"],
            must_add=["guitar"],
        ),
        id="job survives unrelated hobby fact",
    ),
]


@dataclass
class MetaNarrativeCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Substrings that must NOT remain after the merge — these are
    # extractor-artefact lines from earlier prompt versions
    # (assistant-narrating, capability denials) and have no place
    # in a knowledge node.
    must_drop_substrings: List[str]
    # Substrings that MUST remain — genuine knowledge or directives
    # that should not get over-pruned by the meta-narrative rule.
    must_keep_substrings: List[str]


META_NARRATIVE_CASES = [
    pytest.param(
        MetaNarrativeCase(
            description=(
                "Capability-denial line in Directives is dropped, "
                "real directive survives"
            ),
            # Mirrors the real bug report: a self-denial leaked into
            # Directives via an older extractor prompt and persisted
            # because no rewrite-on-write rule covered meta-narrative.
            # Consolidate-all (empty new_facts) should now scrub it
            # without touching the genuine British English directive.
            existing_data=(
                "Always reply in British English.\n"
                "The assistant is unable to navigate to a web page."
            ),
            new_facts=[],
            must_drop_substrings=[
                "unable to navigate",
                "the assistant is unable",
            ],
            must_keep_substrings=["british english"],
        ),
        id="capability denial dropped, directive kept",
    ),
    pytest.param(
        MetaNarrativeCase(
            description=(
                "Assistant-narrating WORLD line is dropped during "
                "self-consolidation"
            ),
            # The extractor's BANNED FACT FORMS list catches these at
            # write-time now, but lines emitted before #291 landed
            # still sit in nodes. Merge prompt must drop them too.
            existing_data=(
                "Possessor (2020) is directed by Brandon Cronenberg.\n"
                "The assistant suggested grilled salmon for dinner."
            ),
            new_facts=[],
            must_drop_substrings=[
                "the assistant suggested",
                "grilled salmon",
            ],
            must_keep_substrings=["possessor", "cronenberg"],
        ),
        id="assistant-suggested line dropped, lookup survives",
    ),
    pytest.param(
        MetaNarrativeCase(
            description=(
                "Polluted node receiving a new fact: meta-narrative "
                "drops AND the new fact lands"
            ),
            # Production path: a diary flush routes one new fact to a
            # node that already holds an older capability-denial line.
            # The merge must drop the denial AND incorporate the new
            # fact — capturing the worst case where the META rule
            # could steal attention from incorporation tracking.
            existing_data=(
                "Always reply in British English.\n"
                "The assistant is unable to navigate to a web page."
            ),
            new_facts=["Keep replies under three sentences."],
            must_drop_substrings=[
                "unable to navigate",
                "the assistant is unable",
            ],
            must_keep_substrings=[
                "british english",
                "three sentences",
            ],
        ),
        id="polluted node + new fact: drop and incorporate",
    ),
    pytest.param(
        MetaNarrativeCase(
            description=(
                "No meta-narrative present — merge must not invent "
                "drops (over-pruning guard)"
            ),
            # Counter-test for over-zealous interpretation of the new
            # rule. A clean Directives node with two genuine
            # imperatives must come through self-consolidation
            # untouched. If this fails the rule is too aggressive.
            existing_data=(
                "Always reply in British English.\n"
                "Keep replies under three sentences."
            ),
            new_facts=[],
            must_drop_substrings=[],
            must_keep_substrings=["british english", "three sentences"],
        ),
        id="genuine directives untouched",
    ),
]


@dataclass
class BatchedCase:
    description: str
    existing_data: str
    new_facts: List[str]
    # Each entry: list of substring alternatives — at least one must
    # appear in the merged data. Captures "the model phrased it
    # however it wanted, but the fact survived".
    expected_signals: List[List[str]]


BATCHED_CASES = [
    pytest.param(
        BatchedCase(
            description="Three independent new facts in one call",
            existing_data="The user lives in London.",
            new_facts=[
                "The user has a dog named Biscuit.",
                "The user prefers oat milk.",
                "The user is allergic to peanuts.",
            ],
            expected_signals=[
                ["london"],
                ["biscuit", "dog"],
                ["oat milk", "oat"],
                ["peanut"],
            ],
        ),
        id="batched 3 new facts",
    ),
]


def _line_count(data: str) -> int:
    return len([l for l in data.split("\n") if l.strip()])


# =============================================================================
# Tests
# =============================================================================

@pytest.mark.eval
class TestNearDuplicateDedupe:
    """Different wordings of the same fact must collapse to one line."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", DEDUPE_CASES)
    def test_near_duplicates_collapse(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()
        line_count = _line_count(merged)

        print(f"\n  📝 dedupe '{case.description}':\n     {merged[:300]}")
        print(f"     success={result.success} lines={line_count}")

        for kw in case.must_contain:
            assert kw.lower() in merged_lower, (
                f"[{case.description}] expected '{kw}' to survive merge.\n{merged}"
            )
        for kw in case.must_not_contain:
            assert kw.lower() not in merged_lower, (
                f"[{case.description}] forbidden '{kw}' leaked into merge.\n{merged}"
            )
        assert line_count <= case.max_lines, (
            f"[{case.description}] merge produced {line_count} lines, expected ≤ {case.max_lines} "
            f"(near-duplicates should collapse).\n{merged}"
        )


@pytest.mark.eval
class TestPatternConsolidation:
    """Repeated activities should fold into patterns rather than
    accumulate as a stack of dated entries."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", PATTERN_CASES)
    def test_repeated_activities_consolidate(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()
        line_count = _line_count(merged)

        print(f"\n  📝 pattern '{case.description}':\n     {merged[:300]}")
        print(f"     success={result.success} lines={line_count}")

        assert case.subject_keyword.lower() in merged_lower, (
            f"[{case.description}] subject '{case.subject_keyword}' lost from merge.\n{merged}"
        )
        has_pattern = any(kw in merged_lower for kw in case.pattern_keywords)
        assert has_pattern, (
            f"[{case.description}] expected pattern wording (any of {case.pattern_keywords}) "
            f"after consolidating repeated activities.\n{merged}"
        )
        assert line_count <= case.max_lines, (
            f"[{case.description}] {line_count} lines remain — repeated activities should "
            f"have consolidated to ≤ {case.max_lines}.\n{merged}"
        )


@pytest.mark.eval
class TestPatternBoundary:
    """Counter-example to `TestPatternConsolidation`: distinct one-off
    events MUST NOT be folded into a fabricated pattern. Pattern
    consolidation should fire on repetition, not on coincidence."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", PATTERN_BOUNDARY_CASES)
    def test_distinct_one_offs_stay_distinct(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()

        print(f"\n  📝 pattern-boundary '{case.description}':\n     {merged[:300]}")
        print(f"     success={result.success}")

        for kw in case.must_keep_distinct:
            assert kw.lower() in merged_lower, (
                f"[{case.description}] distinct event '{kw}' was folded away — "
                f"the picker invented a pattern from one-offs.\n{merged}"
            )


@pytest.mark.eval
class TestIndependenceOfUnrelatedFacts:
    """An unrelated new fact must NOT drop an existing unrelated line.
    Silent erasure of real data is the most dangerous failure mode of
    the rewrite-on-write merge — the hallucination guard catches
    runaway growth, but only this eval catches runaway shrinkage."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", INDEPENDENCE_CASES)
    def test_independent_facts_coexist(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()

        print(f"\n  📝 independence '{case.description}':\n     {merged[:300]}")
        print(f"     success={result.success}")

        for kw in case.must_keep:
            assert kw.lower() in merged_lower, (
                f"[{case.description}] existing fact containing '{kw}' was silently "
                f"dropped by an unrelated new fact — independence violated.\n{merged}"
            )
        for kw in case.must_add:
            assert kw.lower() in merged_lower, (
                f"[{case.description}] new fact containing '{kw}' did not land.\n{merged}"
            )


@pytest.mark.eval
class TestMetaNarrativePruning:
    """Lines that narrate the assistant's own behaviour, capabilities,
    or denials are extractor artefacts from earlier prompt versions,
    not user knowledge. The merge step must drop them during normal
    rewrite-on-write AND during the consolidate-all sweep. Counterpart
    to the extractor's BANNED FACT FORMS list — that catches them at
    write-time, this catches the historical leftovers."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", META_NARRATIVE_CASES)
    def test_meta_narrative_dropped_real_facts_kept(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()

        print(f"\n  📝 meta-narrative '{case.description}':\n     {merged[:300]}")
        print(f"     success={result.success}")

        for kw in case.must_drop_substrings:
            assert kw.lower() not in merged_lower, (
                f"[{case.description}] meta-narrative line containing "
                f"'{kw}' survived the merge — the rule did not fire.\n{merged}"
            )
        for kw in case.must_keep_substrings:
            assert kw.lower() in merged_lower, (
                f"[{case.description}] genuine fact containing '{kw}' was "
                f"over-pruned — the rule is too aggressive.\n{merged}"
            )

        # When new_facts is non-empty the merge must report at least
        # one incorporation. A regression where the META rule steals
        # attention from incorporation tracking would surface here as
        # `incorporated_indices == []` despite the fact landing in
        # the merged data — exactly the failure mode `_match_key`'s
        # tolerant punctuation strip was added to prevent.
        if case.new_facts:
            assert len(result.incorporated_indices) >= 1, (
                f"[{case.description}] new fact landed in merged data "
                f"but incorporated_indices is empty — orchestrator "
                f"would under-report the flush.\n"
                f"merged={merged}\nresult={result}"
            )


@pytest.mark.eval
class TestBatchedMerge:
    """Multiple new facts in one merge call must all land. Pins the
    round-1 batched signature against a real picker model."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", BATCHED_CASES)
    def test_all_batched_facts_land(self, case, graph_store):
        case = case.values[0] if hasattr(case, 'values') else case

        node = graph_store.create_node(
            name="T",
            description=case.description,
            data=case.existing_data,
            parent_id="root",
        )

        result = merge_node_data(
            store=graph_store,
            node_id=node.id,
            new_facts=case.new_facts,
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=30.0,
        )

        merged = graph_store.get_node(node.id).data
        merged_lower = merged.lower()
        line_count = _line_count(merged)

        print(f"\n  📝 batched '{case.description}':\n     {merged[:400]}")
        print(f"     success={result.success} lines={line_count} "
              f"incorporated={result.incorporated_indices}")

        for alternatives in case.expected_signals:
            assert any(alt.lower() in merged_lower for alt in alternatives), (
                f"[{case.description}] none of {alternatives} survived the batched merge.\n"
                f"{merged}"
            )

        # Lower bound on lines: at minimum the merged data should
        # contain a line per surviving fact. Upper bound is enforced
        # by the in-product hallucination guard, not this eval — a
        # cap here is brittle since legitimate consolidation could
        # cross it on a paraphrase the model picks differently.
        assert line_count >= len(case.expected_signals) - 1, (
            f"[{case.description}] {line_count} lines suspiciously low for "
            f"{len(case.expected_signals)} signals — facts may have been silently merged.\n"
            f"{merged}"
        )

        # Pin the round-1 batched reporting fix: every input fact
        # whose substance survived should be tracked in
        # `incorporated_indices`. An empty list when facts clearly
        # landed means the orchestrator under-reports flushes — the
        # exact regression `_match_key`'s tolerant punctuation strip
        # was added to prevent. Allow strict equality OR coverage of
        # all input indices, since the picker may legitimately
        # consolidate two new facts into one line.
        assert len(result.incorporated_indices) >= 1, (
            f"[{case.description}] incorporated_indices is empty despite facts landing — "
            f"reporting drift back. {result.incorporated_indices}"
        )