Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
281
tests/test_diary_poisoning_defence.py
Normal file
281
tests/test_diary_poisoning_defence.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Unit tests for diary-poisoning defences.
|
||||
|
||||
Two defences against the "assistant's own past deflection, narrated in the diary,
|
||||
primes future sessions to repeat the same deflection" failure mode:
|
||||
|
||||
1. Summariser prompt forbids narrating assistant failures/deflections as facts.
|
||||
2. Reply engine injects diary entries under a reference-only framing rather than
|
||||
as authoritative "conversation history".
|
||||
|
||||
Both were motivated by a field regression where the small model deflected on
|
||||
"tell me about Possessor" because an earlier same-day diary entry narrated
|
||||
"the assistant offered to search the web" — which the model then imitated.
|
||||
"""
|
||||
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from jarvis.memory.conversation import generate_conversation_summary
|
||||
|
||||
|
||||
class TestSummariserForbidsDeflectionNarration:
    """Contract tests for the system prompt used by generate_conversation_summary.

    Every test captures the system prompt handed to a stubbed LLM call and
    checks, by substring, that it carries one guard rail against diary
    poisoning: no narration of assistant failures, a stated reason for that
    rule, attribution of assistant claims, language independence, and
    separation of unrelated topics.
    """

    def _capture_system_prompt(self) -> str:
        """Run generate_conversation_summary against a stubbed LLM and return its system prompt.

        ``jarvis.llm.call_llm_direct`` is patched with a stub that records the
        ``system_prompt`` argument and answers with a fixed SUMMARY/TOPICS reply.

        Returns:
            The system prompt string the summariser passed to the LLM call.

        Raises:
            AssertionError: if the stub was never invoked, i.e. the summariser
                did not go through the patched ``jarvis.llm.call_llm_direct``.
        """
        captured = {}

        def fake_call(base_url, model, system_prompt, user_prompt, **kwargs):
            captured['system_prompt'] = system_prompt
            return "SUMMARY: x\nTOPICS: a, b"

        with patch('jarvis.llm.call_llm_direct', side_effect=fake_call):
            generate_conversation_summary(
                recent_chunks=["User: hi", "Assistant: hello"],
                previous_summary=None,
                ollama_base_url="http://localhost:11434",
                ollama_chat_model="test-model",
            )

        # Fail with a diagnosis instead of a bare KeyError when the patch
        # target is not the callable the summariser actually uses.
        assert 'system_prompt' in captured, (
            "generate_conversation_summary never called the patched "
            "jarvis.llm.call_llm_direct, so no system prompt was captured."
        )
        return captured['system_prompt']

    def test_prompt_forbids_narrating_failures(self):
        """The prompt must carry an injunction and name a concrete failure pattern."""
        prompt = self._capture_system_prompt()
        lowered = prompt.lower()
        # The prompt must explicitly forbid narrating assistant failures.
        # Accepts any clear injunction shape ("never narrate", "do not narrate",
        # "drop every sentence", etc.) — what matters is that the directive
        # is present, not its exact phrasing.
        assert any(injunction in lowered for injunction in (
            "never narrate", "do not narrate", "do not record", "do not preserve",
            "drop every sentence", "drop all forms of",
        )), "Summariser prompt must explicitly forbid narrating assistant failures."
        # Must name at least one specific failure pattern — "deflect", "lacked",
        # "offered to search", "failed to" — otherwise the rule is too abstract
        # for small models.
        assert any(term in lowered for term in (
            "deflect", "lacked", "offered to search", "failed to",
        )), "Summariser prompt must name specific failure patterns to omit."

    def test_prompt_explains_why_failures_must_be_omitted(self):
        """The prompt must give a reason, so the LLM generalises to variants it didn't see."""
        prompt = self._capture_system_prompt()
        lowered = prompt.lower()
        assert any(phrase in lowered for phrase in (
            "repeat the same",
            "train future",
            "generalise",
            "generalize",
            "transient",
        )), "Summariser prompt must explain why failure narration is harmful."

    def test_prompt_requires_attribution_for_assistant_entity_claims(self):
        """Regression for the real-world Possessor poisoning.

        Field DB contained a diary entry reading:
            "The user initially inquired about the movie Possessor, and the
            assistant provided information stating it is a 2006 science
            fiction film directed by Brandon Cronenberg..."

        The assistant had hallucinated the year; the summariser recorded
        the claim under an "the assistant provided information stating…"
        wrapper but the digest later stripped the attribution, and the
        claim ended up in the next session's system prompt as if it were
        established fact.

        The right fix is attribution preservation, not content deletion —
        we want the summariser to be faithful (so corrections and
        tool-grounded answers survive in the log) while making clear WHO
        said WHAT, so downstream readers can calibrate trust.
        """
        prompt = self._capture_system_prompt()
        lowered = prompt.lower()
        # The prompt must require attribution for assistant entity claims.
        assert "attribut" in lowered, (
            "Summariser prompt must require attribution of assistant claims "
            "(e.g. write 'the assistant said X' rather than bare 'X')."
        )
        # Must warn against promoting attributed claims into unattributed
        # assertions — that's the exact failure mode that poisoned the DB.
        assert "unattributed" in lowered or "without attribution" in lowered or (
            "strip" in lowered and "attribution" in lowered
        ), (
            "Summariser prompt must forbid stripping attribution from an "
            "assistant claim (unattributed claims poison downstream)."
        )
        # Concrete good/bad example pair showing the failure mode.
        assert "possessor" in lowered or "piranesi" in lowered, (
            "Summariser prompt should include a concrete good/bad example "
            "for attributed assistant claims."
        )
        # Must handle the correction chain — user correcting the assistant
        # should result in BOTH being logged, not silent replacement.
        assert "correct" in lowered, (
            "Summariser prompt must explain how to handle user corrections "
            "of assistant claims (preserve both; don't replace silently)."
        )

    def test_prompt_is_language_agnostic(self):
        """The rule must apply to all languages, not only English."""
        lowered = self._capture_system_prompt().lower()
        assert "any language" in lowered or "all languages" in lowered, (
            "Summariser rule must explicitly apply across languages."
        )

    def test_prompt_forbids_welding_unrelated_topics(self):
        """Regression for the Possessor/Jarvis field incident.

        Field DB contained a diary entry reading:
            "The conversation focused on the movie 'Possessor' and the character
            'Jarvis,' identified as the artificial intelligence from the
            Marvel Cinematic Universe, created by Tony Stark and later
            embodied by Vision."

        Two distinct topics (the 2020 Cronenberg film Possessor, and the MCU
        AI character named Jarvis) were welded into one clause via "and" plus
        a dangling appositive. Downstream enrichment treated the MCU
        description as pertaining to Possessor, and a later session produced
        a plausible-but-wrong reply grounded in the corrupted summary.

        The rule is a sibling to the attribution rule: attribution without
        topic-separation still permits compound clauses, and compound clauses
        are the mechanism by which unrelated facts get retrieved together.
        """
        prompt = self._capture_system_prompt()
        lowered = prompt.lower()

        # Must forbid joining unrelated topics.
        assert any(phrase in lowered for phrase in (
            "do not weld",
            "not weld",
            "one topic per sentence",
            "separate sentence",
            "separate sentences",
        )), (
            "Summariser prompt must forbid welding unrelated topics into one clause."
        )

        # Must name the specific linguistic mechanism (shared appositive /
        # dangling modifier) — otherwise small models won't recognise the
        # failure mode.
        assert "appositive" in lowered or "relative clause" in lowered or "dangl" in lowered, (
            "Summariser prompt must name the shared-appositive / dangling-modifier "
            "mechanism so small models recognise the failure mode."
        )

        # Concrete good/bad example using the field-observed Possessor/Jarvis
        # case (the same one used elsewhere in the prompt — but here about
        # topic separation, not attribution).
        assert "jarvis" in lowered and "possessor" in lowered, (
            "Summariser prompt should include the Possessor/Jarvis topic-welding "
            "BAD→GOOD example."
        )
|
||||
|
||||
|
||||
class TestRewriteDeflectionSystemPrompt:
    """Contract tests for ``_REWRITE_DEFLECTION_SYSTEM_PROMPT``.

    The bulk-rewrite prompt is its own LLM context, separate from the
    summariser, so it has to spell out its guarantees itself. Diary rows
    written before the summariser was tightened depend on this rewrite to
    be cleaned up, and everything downstream (graph extraction, enrichment,
    future replies) inherits whatever the rewrite produces.
    """

    def _prompt(self) -> str:
        """Return the rewrite system prompt, imported at call time."""
        from jarvis.memory.conversation import _REWRITE_DEFLECTION_SYSTEM_PROMPT

        return _REWRITE_DEFLECTION_SYSTEM_PROMPT

    def test_prompt_names_the_canonical_deflection_shapes(self):
        """Every listed deflection verb shape must appear in the prompt."""
        text = self._prompt().lower()
        # A bare "remove deflection" instruction is too abstract — small
        # models read past it — so the prompt has to enumerate enough verb
        # shapes for a small model to generalise from.
        required_shapes = (
            "could not", "couldn't", "cannot", "did not", "does not",
            "was unable", "was not able", "failed to",
            "offered to search", "lacks",
        )
        for shape in required_shapes:
            assert shape in text, (
                f"Rewrite prompt must name the {shape!r} shape so small "
                f"models recognise the failure pattern."
            )

    def test_prompt_protects_attributed_claims_and_user_facts(self):
        """Content the summariser may keep must also survive the rewrite.

        Without this guard the rewrite will strip attributed assistant
        claims (a third-party fact attributed to the assistant) and
        user-stated facts.
        """
        text = self._prompt().lower()
        # The kept categories are named so the model knows what NOT to drop.
        keep_markers = ("attributed", "user said", "user-stated")
        assert any(marker in text for marker in keep_markers), (
            "Rewrite prompt must explicitly list KEEP categories "
            "(attributed assistant claims, user-stated facts)."
        )
        assert "verbatim" in text, (
            "Rewrite prompt must instruct the model to keep non-deflection "
            "content verbatim — otherwise it paraphrases and corrupts."
        )

    def test_prompt_is_language_agnostic(self):
        """The rewrite rule must be stated as applying in any language."""
        text = self._prompt().lower()
        scope_phrases = ("any language", "every language", "all languages")
        assert any(phrase in text for phrase in scope_phrases), (
            "Rewrite prompt must apply across languages — the leak shows "
            "up in any language the user speaks."
        )

    def test_prompt_forbids_translation(self):
        """The rewrite must stay in the diary's original language.

        A rewrite that translates the diary breaks downstream FTS,
        embeddings, and graph extraction — all of which expect the
        original language.
        """
        text = self._prompt().lower()
        explicit_ban = any(
            phrase in text for phrase in ("not translate", "do not translate")
        )
        keeps_language = "keep" in text and "language" in text
        assert explicit_ban or keeps_language, (
            "Rewrite prompt must forbid translation of the output."
        )

    def test_prompt_specifies_empty_output_for_all_deflection_rows(self):
        """A row that is *entirely* deflection must yield the empty string.

        The Python layer's empty-rewrite guard then keeps the original
        (an empty diary entry would be worse — retrieval treats absence
        as 'no record').
        """
        text = self._prompt().lower()
        assert "empty" in text, (
            "Rewrite prompt must instruct the model how to handle a row "
            "that is entirely deflection (return empty)."
        )
|
||||
|
||||
|
||||
class TestDiaryEnrichmentInjectionFraming:
    """Source-level checks that the reply engine frames diary enrichment as reference-only."""

    @staticmethod
    def _engine_source() -> str:
        """Return the source text of ``jarvis.reply.engine`` via ``inspect.getsource``."""
        import inspect

        from jarvis.reply import engine

        return inspect.getsource(engine)

    def test_engine_injects_diary_under_reference_only_label(self):
        """The literal injection string used by _build_initial_system_message must signal reference-only use."""
        # Assert on the source text rather than running the engine end-to-end:
        # a full reply engine invocation pulls in the network stack.
        lowered = self._engine_source().lower()
        assert "reference only" in lowered, (
            "Engine must label diary enrichment as 'reference only' to prevent imitation."
        )
        disclaimers = (
            "do not treat them as instructions",
            "not treat them as instructions",
        )
        assert any(phrase in lowered for phrase in disclaimers), (
            "Engine must explicitly tell the model not to treat diary entries as instructions."
        )

    def test_engine_does_not_use_bare_conversation_history_label(self):
        """The old 'Relevant conversation history:' label read as authoritative context.

        Kept as a regression guard: reverting to the bare label makes this
        test fail, forcing the reference-only framing to be preserved.
        """
        source = self._engine_source()
        # The exact old string literal, as it would be spelled in the engine
        # source, without the reference-only qualifier.
        bare_label = '"\\nRelevant conversation history:\\n"'
        assert bare_label not in source, (
            "Engine must not use the bare 'Relevant conversation history:' label — "
            "it reads as authoritative and primes small models to imitate past deflections."
        )
|
||||
Reference in New Issue
Block a user