Files
javis_bot/tests/test_engine_hot_window_caches.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

307 lines
13 KiB
Python

"""End-to-end coverage for the hot-window scratch caches in run_reply_engine.
Three caches share one primitive (DialogueMemory.hot_cache_*):
1. Warm profile block — query-agnostic, keyed on a constant.
2. Memory enrichment extractor — keyed on the redacted query (+topic hint).
3. Tool router output — keyed on redacted query + strategy + catalogue.
All three should fire on the second matching turn within the hot window so
follow-up queries don't pay for SQLite reads or LLM hops they already did.
Also covers the C1 fix: when the planner explicitly emits a `searchMemory`
step, the recall gate must NOT short-circuit memory enrichment even when
hot-window coverage is high.
"""
from unittest.mock import Mock, patch
import pytest
from src.jarvis.memory.conversation import DialogueMemory
from src.jarvis.reply.engine import run_reply_engine
def _mock_cfg():
cfg = Mock()
cfg.ollama_base_url = "http://localhost:11434"
cfg.ollama_chat_model = "test-large"
cfg.voice_debug = False
cfg.llm_tools_timeout_sec = 8.0
cfg.llm_embed_timeout_sec = 10.0
cfg.llm_chat_timeout_sec = 45.0
cfg.llm_digest_timeout_sec = 8.0
cfg.memory_enrichment_max_results = 5
cfg.memory_enrichment_source = "diary"
cfg.memory_digest_enabled = False
cfg.tool_result_digest_enabled = False
cfg.location_ip_address = None
cfg.location_auto_detect = False
cfg.location_enabled = False
cfg.agentic_max_turns = 8
cfg.tool_search_max_calls = 3
cfg.tool_selection_strategy = "all"
cfg.tool_carryover_max_turns = 2
cfg.tool_carryover_per_entry_chars = 1200
cfg.mcps = {}
cfg.llm_thinking_enabled = False
cfg.tts_engine = "none"
cfg.ollama_embed_model = "test-embed"
cfg.db_path = ":memory:"
return cfg
@pytest.mark.unit
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "", "directives": ""},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.select_tools", return_value=[])
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_tool_router_cached_across_turns(
    mock_chat,
    mock_extract,
    mock_extractor,
    mock_plan,
    mock_select,
    _mock_graph,
    _mock_warm,
    _mock_fmt,
):
    """The tool router must run once for two identical queries.

    Both turns go through ``run_reply_engine`` with the same
    ``DialogueMemory``; the second turn is expected to be served from the
    hot-window cache, so the patched ``select_tools`` is called exactly once.
    """
    # Mocks arrive bottom-up: the decorator closest to ``def`` is the first parameter.
    reply_texts = ["hello", "hello again"]
    mock_chat.side_effect = [{"message": {"content": body}} for body in reply_texts]
    mock_extract.side_effect = list(reply_texts)

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    # Same utterance twice against the same dialogue memory.
    for _ in range(2):
        run_reply_engine(
            db=database, cfg=config, tts=None, text="say hi", dialogue_memory=memory
        )

    router_calls = mock_select.call_count
    assert router_calls == 1, (
        f"router should be cached on identical query; called {router_calls} times"
    )
@pytest.mark.unit
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "", "directives": ""},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_router_fallback_to_all_tools_is_not_cached(
    mock_chat,
    mock_extract,
    mock_extractor,
    mock_plan,
    _mock_graph,
    _mock_warm,
    _mock_fmt,
):
    """A router result equal to the full tool catalogue must not be cached.

    Returning every tool is the router's parse-failure fail-open path. If the
    engine stored that in the conversation-scoped cache, one small-model
    fluke would pin ``allowed_tools`` to "all N" for the rest of the session,
    overwhelm the planner, and starve the chat model.

    Field trace (2026-05-03): the user said "navigate to youtube.com". The
    router LLM flaked and fell open to ~41 tools, the cache stored that set,
    every later navigate attempt replayed the cached 41 tools, and the small
    chat model produced an empty reply ("Sorry, I had trouble processing
    that"). Pre-#281 this could not happen because the router re-rolled on
    every turn.
    """
    from src.jarvis.tools.registry import BUILTIN_TOOLS

    reply_texts = ["hello", "hello again"]
    mock_chat.side_effect = [{"message": {"content": body}} for body in reply_texts]
    mock_extract.side_effect = list(reply_texts)

    # Simulate the fail-open path: the router hands back every built-in tool.
    every_tool = list(BUILTIN_TOOLS.keys())

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    with patch(
        "src.jarvis.reply.engine.select_tools",
        return_value=every_tool,
    ) as mock_select:
        for _ in range(2):
            run_reply_engine(
                db=database,
                cfg=config,
                tts=None,
                text="navigate to youtube",
                dialogue_memory=memory,
            )

    router_calls = mock_select.call_count
    assert router_calls == 2, (
        "fall-open-to-all-tools must not be cached; the router should re-run "
        f"on the second identical turn — was called {router_calls} times"
    )
@pytest.mark.unit
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "", "directives": ""},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.select_tools", return_value=[])
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch(
    "src.jarvis.reply.engine.extract_search_params_for_memory",
    return_value={"keywords": ["x"], "questions": []},
)
@patch(
    "src.jarvis.memory.conversation.search_conversation_memory_by_keywords",
    return_value=[],
)
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_memory_extractor_cached_across_turns(
    mock_chat,
    mock_extract,
    _mock_search,
    mock_extractor,
    _mock_plan,
    _mock_select,
    _mock_graph,
    _mock_warm,
    _mock_fmt,
):
    """The memory-enrichment extractor must run once for a repeated query.

    With an empty plan the fail-open path runs the extractor; the second
    identical follow-up must skip that extractor LLM call.

    The recall gate would also fire on a tool-grounded follow-up, so the
    dialogue is deliberately kept free of tool messages to exercise the
    extractor path on both turns.
    """
    reply_texts = ["first", "second"]
    mock_chat.side_effect = [{"message": {"content": body}} for body in reply_texts]
    mock_extract.side_effect = list(reply_texts)

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    for _ in range(2):
        run_reply_engine(
            db=database,
            cfg=config,
            tts=None,
            text="tell me about pushkin",
            dialogue_memory=memory,
        )

    extractor_calls = mock_extractor.call_count
    assert extractor_calls == 1, (
        f"extractor should be cached; called {extractor_calls} times"
    )
@pytest.mark.unit
@patch(
    "src.jarvis.memory.graph_ops.format_warm_profile_block",
    return_value="warm-block",
)
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "u", "directives": "d"},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.select_tools", return_value=[])
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_warm_profile_cached_across_turns(
    mock_chat,
    mock_extract,
    _mock_extractor,
    _mock_plan,
    _mock_select,
    _mock_graph,
    mock_build,
    _mock_fmt,
):
    """The warm profile must be built once even when the query changes.

    The warm profile is query-agnostic, so two turns with different text that
    share one ``DialogueMemory`` should reuse the cached block rather than
    re-traversing the graph store: ``build_warm_profile`` is called once.
    """
    reply_texts = ["a", "b"]
    mock_chat.side_effect = [{"message": {"content": body}} for body in reply_texts]
    mock_extract.side_effect = list(reply_texts)

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    # Different utterances on purpose: the cache key does not depend on the query.
    for utterance in ("hi", "anything else"):
        run_reply_engine(
            db=database, cfg=config, tts=None, text=utterance, dialogue_memory=memory
        )

    build_calls = mock_build.call_count
    assert build_calls == 1, (
        f"warm profile should be built once and cached; got {build_calls} calls"
    )
@pytest.mark.unit
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "", "directives": ""},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.select_tools", return_value=[])
@patch(
    "src.jarvis.reply.engine.plan_query",
    return_value=["searchMemory topic='justin bieber'", "reply"],
)
@patch(
    "src.jarvis.reply.engine.extract_search_params_for_memory",
    return_value={"keywords": ["bieber"], "questions": []},
)
@patch(
    "src.jarvis.memory.conversation.search_conversation_memory_by_keywords",
    return_value=[],
)
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_planner_search_memory_overrides_recall_gate(
    mock_chat,
    mock_extract,
    _mock_search,
    mock_extractor,
    _mock_plan,
    _mock_select,
    _mock_graph,
    _mock_warm,
    _mock_fmt,
):
    """C1 fix: an explicit `searchMemory` plan step beats the recall gate.

    The hot window is seeded with a fresh tool result that overlaps the
    query, which would normally let the recall gate short-circuit memory
    enrichment. Because the planner emits `searchMemory`, the extractor must
    still run.
    """
    mock_chat.side_effect = [{"message": {"content": "Canadian singer."}}]
    mock_extract.side_effect = ["Canadian singer."]

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    # Seed a prior exchange whose tool output would otherwise satisfy the recall gate.
    prior_tool_message = {
        "role": "tool",
        "tool_call_id": "c1",
        "content": "Justin Bieber is a Canadian singer with the song Baby.",
    }
    memory.add_message("user", "who is justin bieber")
    memory.record_tool_turn([prior_tool_message])
    memory.add_message("assistant", "Canadian singer.")

    run_reply_engine(
        db=database,
        cfg=config,
        tts=None,
        text="bieber more about justin",
        dialogue_memory=memory,
    )

    # The planner explicitly asked for memory, so the extractor has to be invoked.
    assert mock_extractor.call_count == 1, (
        "extractor must run when planner emits searchMemory, "
        "regardless of recall-gate coverage"
    )
@pytest.mark.unit
@patch("src.jarvis.memory.graph_ops.format_warm_profile_block", return_value="")
@patch(
    "src.jarvis.memory.graph_ops.build_warm_profile",
    return_value={"user": "", "directives": ""},
)
@patch("src.jarvis.memory.graph.GraphMemoryStore")
@patch("src.jarvis.reply.engine.select_tools", return_value=[])
@patch("src.jarvis.reply.engine.plan_query", return_value=[])
@patch("src.jarvis.reply.engine.extract_search_params_for_memory", return_value={})
@patch("src.jarvis.reply.engine.extract_text_from_response")
@patch("src.jarvis.reply.engine.chat_with_messages")
def test_new_conversation_clears_cache_and_carryover(
    mock_chat,
    mock_extract,
    _mock_extractor,
    _mock_plan,
    mock_select,
    _mock_graph,
    _mock_warm,
    _mock_fmt,
):
    """A new conversation must start without stale cache or tool carryover.

    Once the previous conversation has lapsed past the inactivity window, the
    engine has to wipe the conversation-scoped cache and any leftover tool
    carryover before running the new turn; otherwise state from an earlier
    session would leak into a fresh one.
    """
    mock_chat.side_effect = [{"message": {"content": "fresh"}}]
    mock_extract.side_effect = ["fresh"]

    memory = DialogueMemory()
    config = _mock_cfg()
    database = Mock()

    # Seed cache entries and tool carryover as if left behind by a lapsed session.
    warm_key = memory.WARM_PROFILE_CACHE_KEY
    memory.hot_cache_put(warm_key, "old-block")
    memory.hot_cache_put("router:old", ["webSearch"])
    memory.record_tool_turn(
        [{"role": "tool", "tool_call_id": "c1", "content": "ancient result"}]
    )

    # Sanity-check the seeded state before exercising the engine.
    assert memory._tool_turns
    assert memory.hot_cache_get(warm_key) == "old-block"

    # No recent messages exist, so the engine treats this turn as a new conversation.
    run_reply_engine(
        db=database, cfg=config, tts=None, text="hello", dialogue_memory=memory
    )

    # The stale router entry must be gone (full hot-cache wipe).
    # NOTE(review): the seeded warm-profile entry is not re-checked after the
    # turn — presumably because the engine may repopulate that key; confirm.
    assert memory.hot_cache_get("router:old") is None

    # Carryover from before must have been cleared on entry. Anything recorded
    # later could only come from THIS reply, and the mocked chat returns a
    # final reply with no tool calls.
    assert memory._tool_turns == []