Files
javis_bot/evals/test_memory_digest_identity.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

262 lines
11 KiB
Python

"""
Memory Digest — Identity-Query Fact Surfacing (Live)

Guards that the memory digest distiller (``enrichment.digest_memory_for_query``)
surfaces user-stated facts about the user (location, interests, ongoing
plans, biography) when the current query asks who the user is or what the
assistant knows about them, rather than surfacing past Q&A topics the user
merely asked about.

Motivating field incident:
    The user asked "what do you know about me?". The diary contained a
    user-stated fact ("goes boxing near E3 2WS") alongside a past Q&A where
    the user asked for the area of a rectangle. The digest surfaced the
    rectangle question, which is not a fact about the user at all — leading
    the reply model to miss the actual identity signal entirely.

General principle (encoded in the digest prompt): for identity queries,
user-stated facts dominate over past Q&A topics, and multiple such facts
should be surfaced when present.

Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_memory_digest_identity.py -v
"""
import pytest
from conftest import requires_judge_llm
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
@pytest.mark.eval
@requires_judge_llm
class TestMemoryDigestSurfacesIdentityFacts:
    """Live tests that the digest prefers user-stated facts for identity queries.

    Each test hands a hand-written diary to ``digest_memory_for_query`` (via
    ``_digest``) and then inspects the lower-cased digest for keyword
    substrings. Tests that need a non-empty digest ``xfail`` rather than
    fail when the judge model returns nothing, so a weak model shows up as
    an expected failure instead of a false regression.
    """

    @staticmethod
    def _mentions_any(text: str, keywords: tuple[str, ...]) -> bool:
        """Return True when ``text`` contains at least one of ``keywords``.

        Plain substring matching; callers pass already lower-cased text and
        lower-case keywords.
        """
        return any(kw in text for kw in keywords)

    def _digest(self, query: str, diary_entries: list[str]) -> str:
        """Run the memory digest for ``query`` over ``diary_entries``.

        The call uses the judge model endpoint (``JUDGE_BASE_URL`` /
        ``JUDGE_MODEL``), no graph parts and a 60 second timeout.

        Args:
            query: The user query the digest is distilled for.
            diary_entries: Diary snippets offered to the distiller.

        Returns:
            Whatever ``digest_memory_for_query`` returns. The tests treat a
            falsy value as "the model surfaced nothing" — presumably an
            empty string or ``None``; confirm against the enrichment module.
        """
        # Imported lazily so collecting this module does not import jarvis.
        from jarvis.reply.enrichment import digest_memory_for_query

        return digest_memory_for_query(
            query=query,
            diary_entries=diary_entries,
            graph_parts=[],
            ollama_base_url=JUDGE_BASE_URL,
            ollama_chat_model=JUDGE_MODEL,
            timeout_sec=60.0,
        )

    def test_identity_query_surfaces_user_stated_fact_over_past_qa(self):
        """Reproduces the field incident directly at the digest layer.

        Padding filler ensures the raw block exceeds ``_DIGEST_MIN_CHARS``
        (400) so the distil LLM actually runs — below that threshold the
        raw text is passed through unchanged and this test would be a
        no-op.
        """
        diary = [
            "[2026-04-10] The user said they go boxing near E3 2WS.",
            "[2026-04-12] The user asked for the area of a rectangle 7 by 9; "
            "the assistant said 63.",
            "[2026-04-11] The user asked what the capital of Peru is; the "
            "assistant said Lima. They also asked about the population and "
            "the assistant said it is roughly 10 million in the metro area.",
            "[2026-04-09] The user asked the assistant to convert 200 USD to "
            "GBP; the assistant said approximately 158 GBP at the current rate.",
            "[2026-04-08] The user asked the assistant for the boiling point "
            "of water at sea level; the assistant said 100 degrees Celsius.",
        ]

        digest = self._digest("what do you know about me?", diary)
        print(f"\n Digest: {digest!r}")

        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for an "
                "identity query despite user-stated facts being present."
            )

        lowered = digest.lower()
        surfaced_fact = self._mentions_any(lowered, ("boxing", "e3"))

        # Past Q&A topics that must stay out of an identity digest. The
        # field-incident topic (rectangle area) is the primary guard;
        # currency and boiling-point are included because they are
        # numeric/factoid Q&As with no user-preference character — the
        # exact failure class the identity rule targets.
        surfaced_past_qa = self._mentions_any(
            lowered,
            ("rectangle", "7 by 9", "area of", "usd", "gbp", "boiling"),
        )

        assert surfaced_fact, (
            "Digest did not surface the user-stated boxing/location fact "
            f"for an identity query. Got: {digest!r}"
        )
        assert not surfaced_past_qa, (
            "Digest surfaced past Q&A topics as if they were facts "
            f"about the user. Got: {digest!r}"
        )

    def test_identity_query_surfaces_multiple_user_facts_when_present(self):
        """When several user-stated facts exist, the digest should combine
        them rather than pick just one."""
        diary = [
            "[2026-04-10] The user said they live in East London.",
            "[2026-04-11] The user said they are vegetarian.",
            "[2026-04-12] The user said they are learning Japanese.",
            "[2026-04-13] The user asked about the capital of Peru; the "
            "assistant said Lima.",
            "[2026-04-09] The user asked the assistant to convert 200 USD to "
            "GBP; the assistant said approximately 158 GBP at the current rate.",
            "[2026-04-08] The user asked the boiling point of water at sea "
            "level; the assistant said 100 degrees Celsius.",
        ]

        digest = self._digest("tell me about myself", diary)
        print(f"\n Digest: {digest!r}")

        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for an "
                "identity query despite multiple user-stated facts."
            )

        lowered = digest.lower()

        # Count how many of the three stated facts made it into the digest;
        # two out of three is the pass bar.
        facts_hit = sum(
            kw in lowered
            for kw in ("east london", "vegetarian", "japanese")
        )
        assert facts_hit >= 2, (
            "Digest surfaced fewer than 2 of the 3 user-stated facts for "
            f"an identity query. Got: {digest!r}"
        )

        past_qa_leak = self._mentions_any(lowered, ("usd", "gbp", "boiling"))
        assert not past_qa_leak, (
            "Digest leaked a past Q&A topic into an identity-query "
            f"digest. Got: {digest!r}"
        )

    def test_identity_query_with_only_past_qa_returns_none_or_no_false_facts(self):
        """Regression guard: if NO user-stated facts exist, the digest must
        not fabricate a user fact from past Q&A topics.

        An empty/absent digest is an acceptable outcome here, so unlike the
        sibling tests there is no ``xfail`` on a falsy digest.
        """
        diary = [
            "[2026-04-12] The user asked for the area of a rectangle 7 by 9; "
            "the assistant said 63.",
            "[2026-04-13] The user asked about the capital of Peru; the "
            "assistant said Lima.",
            "[2026-04-11] The user asked the assistant to convert 200 USD to "
            "GBP; the assistant said approximately 158 GBP at the current rate.",
            "[2026-04-10] The user asked the boiling point of water at sea "
            "level; the assistant said 100 degrees Celsius.",
            "[2026-04-09] The user asked for the capital of Australia; the "
            "assistant said Canberra.",
        ]

        digest = self._digest("what do you know about me?", diary)
        print(f"\n Digest: {digest!r}")

        # Coerce a falsy digest to "" before lower-casing: a None digest is
        # a passing result for this test and must not raise AttributeError.
        lowered = (digest or "").lower()

        fabricated_user_fact = self._mentions_any(
            lowered,
            (
                "user likes math",
                "user is interested in math",
                "user likes geography",
                "user is interested in peru",
            ),
        )
        assert not fabricated_user_fact, (
            "Digest fabricated a user-preference claim from past Q&A "
            f"topics. Got: {digest!r}"
        )

    def test_identity_query_does_not_trigger_recommendation_engagement_rule(self):
        """Cross-rule guard: the recommendation-engagement rule says past
        interactions count as preference signals for 'what should I watch'.
        An IDENTITY query with the same film-engagement diary must not
        mistakenly treat the films as facts about the user — the identity
        rule still applies and past Q&A topics stay out unless the snippet
        explicitly says the user is into that topic."""
        diary = [
            "[2026-04-20] The user asked about the movie Titanic; the "
            "assistant summarised its plot and noted it is a 1997 film "
            "directed by James Cameron.",
            "[2026-04-19] The conversation focused on the film Possessor; "
            "the assistant said it is a 2020 sci-fi horror by Brandon "
            "Cronenberg.",
            "[2026-04-10] The user said they live in East London and work "
            "as a software engineer.",
        ]

        digest = self._digest("what do you know about me?", diary)
        print(f"\n Digest: {digest!r}")

        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for an "
                "identity query despite user-stated facts present."
            )

        lowered = digest.lower()

        # "engineer" also matches "software engineer", so one keyword
        # covers the occupation fact.
        user_fact_surfaced = self._mentions_any(
            lowered, ("east london", "engineer")
        )
        assert user_fact_surfaced, (
            "Digest did not surface the user-stated location/occupation "
            f"fact for an identity query. Got: {digest!r}"
        )

        # The film Q&As must NOT be presented as user facts. The identity
        # rule's "not a fact unless the snippet says the user is into it"
        # clause must override the recommendation-engagement rule here.
        film_presented_as_user_fact = self._mentions_any(
            lowered,
            (
                "the user likes",
                "the user enjoys",
                "the user is a fan",
                "the user is into",
                "taste signal",
                "already covered",
            ),
        )
        assert not film_presented_as_user_fact, (
            "Digest applied the recommendation-engagement rule to an "
            "identity query: films framed as user taste/preference. "
            f"Got: {digest!r}"
        )

    def test_recommendation_query_still_surfaces_engagement_when_user_facts_present(self):
        """Reverse cross-rule guard: a recommendation query alongside
        user-stated facts must still surface engagement-as-preference.
        The identity rule's 'prefer user-stated facts' must not suppress
        the recommendation rule's engagement signals."""
        diary = [
            "[2026-04-20] The user asked about the movie Titanic; the "
            "assistant summarised its plot and noted it is a 1997 film "
            "directed by James Cameron.",
            "[2026-04-19] The conversation focused on the film Possessor; "
            "the assistant said it is a 2020 sci-fi horror by Brandon "
            "Cronenberg.",
            "[2026-04-10] The user said they live in East London.",
        ]

        digest = self._digest("what should I watch tonight?", diary)
        print(f"\n Digest: {digest!r}")

        if not digest:
            pytest.xfail(
                f"Small judge model {JUDGE_MODEL} returned NONE for a "
                "recommendation query despite engagement signals present."
            )

        lowered = digest.lower()
        engagement_surfaced = self._mentions_any(
            lowered, ("titanic", "possessor")
        )
        assert engagement_surfaced, (
            "Digest suppressed engagement-as-preference signals on a "
            "recommendation query, likely because the identity rule "
            f"dominated. Got: {digest!r}"
        )