"""Tests for ``rewrite_all_diary_summaries`` — the LLM-driven bulk sweep that walks every row in ``conversation_summaries`` and asks the chat model to remove deflection narration. Replaces the regex-based scrub sweep tests in #366. The previous regex approach was English-only and accreted patterns whenever the model invented a new shape. The current sweep delegates the semantic check to the chat model itself, which is language-agnostic and improves automatically as models upgrade. The contract under test: 1. Walks every row, writes back rewritten text only when it changed. 2. Preserves ``ts_utc`` on rewrite — the audit trail must survive cleanup. 3. Empty rewrite → keep original, surface ``would_empty: true``. 4. LLM failure on a row → row left untouched, sweep continues. 5. Per-row write failure → row reported with ``error``, sweep continues. 6. Re-embeds rewritten rows when an embed model is configured. 7. Event payload contains counts/booleans only, never raw summary text. """ from __future__ import annotations import time import pytest from jarvis.memory import conversation as cmod from jarvis.memory.conversation import rewrite_all_diary_summaries from jarvis.memory.db import Database def _seed(db: Database, rows: list[tuple[str, str, str | None]]) -> None: """Seed (date_utc, summary, topics) tuples into the DB.""" for date_utc, summary, topics in rows: db.upsert_conversation_summary( date_utc=date_utc, summary=summary, topics=topics, source_app="jarvis", ) class TestRewriteSweepBehaviour: def test_walks_every_row_and_rewrites_only_dirty_ones(self, tmp_path, monkeypatch): db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "The user asked X. The assistant could not help.", None), ("2026-04-15", "The user prefers Celsius.", None), ("2026-04-27", "The user asked Y. The assistant did not have info.", None), ]) # Fake LLM: drop any sentence containing "the assistant". def fake_call(*args, **kwargs): text = args[3] if len(args) >= 4 else kwargs.get("user_prompt", "") for marker in ("<<>>", "<<>>"): text = text.replace(marker, "") text = text.replace("Return the cleaned text only.", "").strip() kept = [s.strip() for s in text.split(".") if s.strip() and "the assistant" not in s.lower()] return ". ".join(kept) + ("." if kept else "") monkeypatch.setattr(cmod, "call_llm_direct", fake_call) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert len(events) == 3 rewritten = [e for e in events if e["rewritten"]] assert len(rewritten) == 2 rows = {r["date_utc"]: r["summary"] for r in db.get_all_conversation_summaries()} assert "could not" not in rows["2026-04-10"].lower() assert "did not have" not in rows["2026-04-27"].lower() # Clean row is byte-identical to the seed. assert rows["2026-04-15"] == "The user prefers Celsius." def test_preserves_ts_utc_on_rewrite(self, tmp_path, monkeypatch): """A maintenance pass must not make cleaned rows look like new writes — the audit trail of when a row was *originally* authored is the only signal users have to verify diary provenance.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ]) original_ts = db.get_all_conversation_summaries()[0]["ts_utc"] # Sleep so a stomped ts_utc would be detectably different. time.sleep(1.1) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: "User asked X.", ) list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) new_ts = db.get_all_conversation_summaries()[0]["ts_utc"] assert new_ts == original_ts, ( "ts_utc was stomped — audit trail is destroyed by a maintenance pass" ) def test_empty_rewrite_keeps_original_and_surfaces_would_empty(self, tmp_path, monkeypatch): """If the model returns empty (entire row was deflection), keep the original. Empty diary entries are worse than slightly-leaky ones — retrieval treats absence as 'no record'.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "The assistant could not help. The assistant offered to search.", None), ]) monkeypatch.setattr(cmod, "call_llm_direct", lambda *a, **k: "") events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert len(events) == 1 assert events[0]["would_empty"] is True assert events[0]["rewritten"] is False # Row must still be there with original content. rows = db.get_all_conversation_summaries() assert rows[0]["summary"].startswith("The assistant could not help") def test_llm_failure_on_one_row_does_not_stop_sweep(self, tmp_path, monkeypatch): """Per-row failure must be fail-open. The sweep continues with the remaining rows so a transient model hiccup on one date does not abandon the rest of the diary.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ("2026-04-15", "User asked Y. The assistant could not help.", None), ("2026-04-27", "User asked Z. The assistant could not help.", None), ]) calls = {"n": 0} def flaky(*args, **kwargs): calls["n"] += 1 if calls["n"] == 2: raise RuntimeError("ollama timeout") return "User asked something." monkeypatch.setattr(cmod, "call_llm_direct", flaky) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert len(events) == 3 errors = [e for e in events if e.get("error")] assert len(errors) == 1 # Other two rows still got rewritten. rewritten = [e for e in events if e["rewritten"]] assert len(rewritten) == 2 def test_event_payload_contains_no_raw_summary_text(self, tmp_path, monkeypatch): """Privacy contract: per-row events must contain only counts, booleans, and the date — never any portion of the diary text.""" db = Database(tmp_path / "jarvis.db") sentinel = "thisIsAUniqueSentinelStringThatMustNotLeak" _seed(db, [ ("2026-04-10", f"User said {sentinel}. The assistant could not help.", None), ]) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: f"User said {sentinel}.", ) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) for ev in events: for v in ev.values(): assert sentinel not in str(v), ( f"diary content leaked into event field: {ev}" ) def test_error_field_is_class_name_only_never_message(self, tmp_path, monkeypatch): """Stringified exception messages can echo offending input back to the caller. The error field must be the class name only.""" db = Database(tmp_path / "jarvis.db") sentinel = "secretDiaryTokenInExceptionMessage" _seed(db, [ ("2026-04-10", f"User said {sentinel}. The assistant could not help.", None), ]) def boom(*a, **k): raise ValueError(f"oops {sentinel}") monkeypatch.setattr(cmod, "call_llm_direct", boom) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert len(events) == 1 assert events[0]["error"] == "RewriteFailed" for v in events[0].values(): assert sentinel not in str(v) def test_unchanged_rewrite_does_not_trigger_writeback(self, tmp_path, monkeypatch): """If the LLM returns the input verbatim (clean row), no DB write happens and the embedding stays put. Equivalent of the topic optimiser's 'topics_changed=False → skip writeback' rule.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-15", "The user prefers Celsius.", None), ]) original_ts = db.get_all_conversation_summaries()[0]["ts_utc"] time.sleep(1.1) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: "The user prefers Celsius.", ) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert events[0]["rewritten"] is False # ts_utc must not have changed since no write happened. assert db.get_all_conversation_summaries()[0]["ts_utc"] == original_ts def test_handles_empty_diary_without_calling_llm(self, tmp_path, monkeypatch): db = Database(tmp_path / "jarvis.db") called = {"n": 0} def tracker(*a, **k): called["n"] += 1 return "" monkeypatch.setattr(cmod, "call_llm_direct", tracker) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert events == [] assert called["n"] == 0 def test_strips_markdown_fences_from_model_output(self, tmp_path, monkeypatch): """Some models wrap output in ```text fences despite instructions. The sweep must strip them so the persisted summary is plain text.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ]) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: "```\nUser asked X.\n```", ) list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) persisted = db.get_all_conversation_summaries()[0]["summary"] assert persisted == "User asked X." assert "```" not in persisted def test_strips_single_line_backtick_wrap(self, tmp_path, monkeypatch): r"""Regression: the previous regex strip treated ``\`\`\`X\`\`\``` as one giant opening fence and consumed the whole response, tripping the empty-rewrite guard and dropping a perfectly good rewrite. The fix unwraps both single-line and multi-line fence shapes.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ]) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: "```User asked X.```", ) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) # The rewrite must land — not get dropped via the would_empty guard. assert events[0]["rewritten"] is True assert events[0]["would_empty"] is False persisted = db.get_all_conversation_summaries()[0]["summary"] assert persisted == "User asked X." def test_strips_language_tagged_fences(self, tmp_path, monkeypatch): """Models often emit ```text\\n...\\n``` despite being told no markdown. The language tag (anything between the opening ``` and the first newline) must be dropped along with the fence.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ]) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: "```text\nUser asked X.\n```", ) list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) persisted = db.get_all_conversation_summaries()[0]["summary"] assert persisted == "User asked X." def test_strips_echoed_untrusted_fence_markers(self, tmp_path, monkeypatch): """The diary text is wrapped in ``<<>>`` markers before being passed to the model (treat-as-data framing). Some models echo those markers back. They must be stripped so the markers don't end up persisted in the diary.""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-10", "User asked X. The assistant could not help.", None), ]) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: ( "<<>>\n" "User asked X.\n" "<<>>" ), ) list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) persisted = db.get_all_conversation_summaries()[0]["summary"] assert persisted == "User asked X." assert "BEGIN UNTRUSTED" not in persisted assert "END UNTRUSTED" not in persisted def test_whitespace_only_difference_is_treated_as_no_change(self, tmp_path, monkeypatch): """Idempotence: the LLM may return content with different leading/ trailing whitespace. We compare stripped texts, so this should not trigger a writeback (no embedding refresh, ts_utc preserved).""" db = Database(tmp_path / "jarvis.db") _seed(db, [ ("2026-04-15", "The user prefers Celsius.", None), ]) original_ts = db.get_all_conversation_summaries()[0]["ts_utc"] time.sleep(1.1) monkeypatch.setattr( cmod, "call_llm_direct", lambda *a, **k: " The user prefers Celsius. \n", ) events = list(rewrite_all_diary_summaries( db, ollama_base_url="http://localhost", ollama_chat_model="test", )) assert events[0]["rewritten"] is False assert db.get_all_conversation_summaries()[0]["ts_utc"] == original_ts