Files
javis_bot/tests/test_intent_judge.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

891 lines
34 KiB
Python

"""Tests for the intent judge module."""
import pytest
from unittest.mock import patch, MagicMock
from jarvis.listening.intent_judge import (
IntentJudge,
IntentJudgeConfig,
IntentJudgment,
create_intent_judge,
)
from jarvis.listening.transcript_buffer import TranscriptSegment
class TestIntentJudgeConfig:
    """Checks the stock and user-supplied values held by IntentJudgeConfig."""

    def test_default_config(self):
        """A config built without arguments exposes the expected defaults."""
        defaults = IntentJudgeConfig()

        assert defaults.assistant_name == "Jarvis"
        assert defaults.model == "gemma4:e2b"
        assert defaults.timeout_sec == 15.0
        assert defaults.aliases == []

    def test_custom_config(self):
        """Keyword overrides are readable back from the config unchanged."""
        overrides = {
            "assistant_name": "Friday",
            "model": "llama3.2:1b",
            "aliases": ["computer"],
        }
        custom = IntentJudgeConfig(**overrides)

        assert custom.assistant_name == "Friday"
        assert custom.model == "llama3.2:1b"
        assert custom.aliases == ["computer"]
class TestIntentJudgment:
    """Checks construction of the IntentJudgment result object."""

    def test_basic_judgment(self):
        """Values handed to the constructor are exposed as attributes."""
        fields = {
            "directed": True,
            "query": "what time is it",
            "stop": False,
            "confidence": "high",
            "reasoning": "clear wake word",
        }
        verdict = IntentJudgment(**fields)

        assert verdict.directed is True
        assert verdict.query == "what time is it"
        assert verdict.stop is False
        assert verdict.confidence == "high"
class TestIntentJudge:
    """Unit tests for IntentJudge: construction, availability, prompts, parsing, API calls."""

    # Dotted path of the HTTP call these tests replace when exercising judge().
    _POST_TARGET = 'jarvis.listening.intent_judge.requests.post'

    # ------------------------------------------------------------------
    # Private helpers (not collected by pytest: names lack the test prefix)
    # ------------------------------------------------------------------

    @staticmethod
    def _ready_judge():
        """Return a default judge whose ``_available`` flag is forced to True."""
        ready = IntentJudge()
        ready._available = True
        return ready

    @staticmethod
    def _http_reply(status_code, llm_text=None):
        """Build a stand-in HTTP response.

        When *llm_text* is given it becomes the ``response`` field returned
        by the mock's ``.json()``.
        """
        reply = MagicMock()
        reply.status_code = status_code
        if llm_text is not None:
            reply.json.return_value = {"response": llm_text}
        return reply

    @staticmethod
    def _render_user_prompt(judge, segments, **overrides):
        """Call ``_build_user_prompt`` with neutral keyword values plus *overrides*."""
        params = {
            "wake_timestamp": None,
            "last_tts_text": "",
            "last_tts_finish_time": 0.0,
            "in_hot_window": False,
        }
        params.update(overrides)
        return judge._build_user_prompt(segments, **params)

    def _judge_probe(self, judge, **post_behaviour):
        """Run ``judge.judge`` on a one-segment transcript with the POST call patched.

        *post_behaviour* is forwarded to ``patch`` (``return_value`` or
        ``side_effect``).
        """
        probe = [TranscriptSegment("test", 1000.0, 1001.0)]
        with patch(self._POST_TARGET, **post_behaviour):
            return judge.judge(probe)

    # ------------------------------------------------------------------
    # Construction and availability
    # ------------------------------------------------------------------

    def test_init(self):
        """A judge built without arguments carries the default assistant name."""
        assert IntentJudge().config.assistant_name == "Jarvis"

    def test_init_with_config(self):
        """A judge built from a custom config keeps that config."""
        friday = IntentJudge(IntentJudgeConfig(assistant_name="Friday"))
        assert friday.config.assistant_name == "Friday"

    def test_available_when_requests_installed(self):
        """``available`` is True when flagged available with no recorded error time."""
        judge = self._ready_judge()
        judge._last_error_time = 0.0
        assert judge.available is True

    def test_unavailable_during_error_cooldown(self):
        """``available`` is False while a just-recorded error is inside the cooldown."""
        import time

        judge = self._ready_judge()
        judge._last_error_time = time.time()
        judge._error_cooldown = 30.0
        assert judge.available is False

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def test_build_system_prompt(self):
        """The configured assistant name shows up in the system prompt."""
        friday = IntentJudge(IntentJudgeConfig(assistant_name="Friday"))
        assert "Friday" in friday._build_system_prompt()

    def test_build_user_prompt_basic(self):
        """The transcript text is carried into the user prompt."""
        transcript = [TranscriptSegment("hello jarvis", 1000.0, 1001.0)]
        rendered = self._render_user_prompt(
            IntentJudge(), transcript, wake_timestamp=1000.5
        )
        assert "hello jarvis" in rendered

    def test_build_user_prompt_hot_window(self):
        """Hot-window mode is flagged in the user prompt."""
        transcript = [TranscriptSegment("what time is it", 1000.0, 1001.0)]
        rendered = self._render_user_prompt(
            IntentJudge(), transcript, in_hot_window=True
        )
        assert "HOT WINDOW" in rendered

    def test_build_user_prompt_normalises_aliases(self):
        """Configured aliases (misheard wake words) are rewritten to the assistant name."""
        aliased = IntentJudge(
            IntentJudgeConfig(
                assistant_name="Jarvis",
                aliases=["jervis", "jaivis", "jar is"],
            )
        )
        utterances = (
            ("Jervis what time is it", 1000.0, 1001.0),
            ("Jaivis tell me a joke", 1002.0, 1003.0),
            ("hey Jar is, are you there", 1004.0, 1005.0),
        )
        transcript = [TranscriptSegment(*spoken) for spoken in utterances]

        rendered = self._render_user_prompt(
            aliased, transcript, wake_timestamp=1000.5
        )

        for misheard in ("Jervis", "Jaivis", "Jar is"):
            assert misheard not in rendered
        # One canonical wake word per rewritten segment, at minimum.
        assert rendered.count("Jarvis") >= 3

    def test_build_user_prompt_alias_word_boundary(self):
        """Alias rewriting only fires on whole words, never inside longer words."""
        judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=["jar"]))

        # A standalone "jar" is rewritten: the user configured it as an alias.
        standalone = self._render_user_prompt(
            judge, [TranscriptSegment("put the jar on the table", 1000.0, 1001.0)]
        )
        assert "Jarvis" in standalone

        # "jarring" merely contains the alias and must survive untouched.
        embedded = self._render_user_prompt(
            judge, [TranscriptSegment("the noise was jarring", 1000.0, 1001.0)]
        )
        assert "jarring" in embedded
        assert "Jarvisring" not in embedded

    def test_build_user_prompt_no_aliases_unchanged(self):
        """An empty alias list leaves segment text as it was transcribed."""
        judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=[]))
        rendered = self._render_user_prompt(
            judge, [TranscriptSegment("Jervis what time", 1000.0, 1001.0)]
        )
        assert "Jervis" in rendered

    def test_build_user_prompt_with_tts(self):
        """The last TTS output is surfaced in the user prompt."""
        transcript = [
            TranscriptSegment("the weather is nice", 1000.0, 1001.0, is_during_tts=True),
        ]
        rendered = self._render_user_prompt(
            IntentJudge(),
            transcript,
            last_tts_text="The weather is nice and sunny",
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )
        assert "TTS" in rendered
        assert "weather is nice and sunny" in rendered

    # ------------------------------------------------------------------
    # Response parsing
    # ------------------------------------------------------------------

    def test_parse_response_valid_json(self):
        """A well-formed JSON reply is turned into a judgment."""
        raw = (
            '{"directed": true, "query": "what time", '
            '"stop": false, "confidence": "high", "reasoning": "clear"}'
        )
        verdict = IntentJudge()._parse_response(raw)

        assert verdict is not None
        assert verdict.directed is True
        assert verdict.query == "what time"
        assert verdict.stop is False
        assert verdict.confidence == "high"

    def test_parse_response_with_extra_text(self):
        """Leading chatter before the JSON object does not defeat parsing."""
        raw = (
            'Here is my analysis: {"directed": true, "query": "test", '
            '"stop": false, "confidence": "medium", "reasoning": "test"}'
        )
        verdict = IntentJudge()._parse_response(raw)

        assert verdict is not None
        assert verdict.directed is True

    def test_parse_response_invalid_json(self):
        """Text with no JSON in it yields None."""
        assert IntentJudge()._parse_response("This is not valid JSON at all") is None

    def test_parse_response_missing_fields(self):
        """Absent fields fall back to their defaults."""
        verdict = IntentJudge()._parse_response('{"directed": true}')

        assert verdict is not None
        assert verdict.directed is True
        assert verdict.query == ""
        assert verdict.stop is False
        assert verdict.confidence == "low"

    # ------------------------------------------------------------------
    # judge(): happy path and early exits
    # ------------------------------------------------------------------

    def test_judge_returns_none_when_unavailable(self):
        """judge() gives None when the judge is flagged unavailable."""
        judge = IntentJudge()
        judge._available = False
        assert judge.judge([TranscriptSegment("test", 1000.0, 1001.0)]) is None

    def test_judge_returns_none_for_empty_segments(self):
        """judge() gives None for an empty transcript."""
        assert IntentJudge().judge([]) is None

    def test_judge_with_mock_api(self):
        """judge() posts to the API and parses the returned judgment."""
        judge = self._ready_judge()
        reply = self._http_reply(
            200,
            '{"directed": true, "query": "what time is it", '
            '"stop": false, "confidence": "high", "reasoning": "wake word detected"}',
        )
        transcript = [TranscriptSegment("jarvis what time is it", 1000.0, 1002.0)]

        with patch(self._POST_TARGET, return_value=reply):
            verdict = judge.judge(
                transcript,
                wake_timestamp=1000.5,
                last_tts_text="",
                last_tts_finish_time=0.0,
                in_hot_window=False,
            )

        assert verdict is not None
        assert verdict.directed is True
        assert verdict.query == "what time is it"

    # ------------------------------------------------------------------
    # judge(): failure handling and backoff
    # ------------------------------------------------------------------

    def test_judge_handles_api_error(self):
        """A 500 reply results in None rather than an exception."""
        judge = self._ready_judge()
        outcome = self._judge_probe(judge, return_value=self._http_reply(500))
        assert outcome is None

    def test_judge_handles_timeout(self):
        """A request timeout results in None rather than an exception."""
        import requests as real_requests

        judge = self._ready_judge()
        outcome = self._judge_probe(judge, side_effect=real_requests.Timeout())
        assert outcome is None

    def test_timeout_does_not_trigger_backoff(self):
        """A timeout must leave the 30s cooldown untouched.

        Voice interaction has many turns in quick succession, so one slow call
        must not disable intent judging for the following half minute. The
        upstream engagement gate (wake word / hot window / TTS) already keeps
        ambient speech from hammering Ollama, which makes an immediate retry
        on the next real engagement safe.
        """
        import requests as real_requests

        judge = self._ready_judge()
        judge._last_error_time = 0.0

        self._judge_probe(judge, side_effect=real_requests.Timeout())

        assert judge._last_error_time == 0.0, "timeout must NOT lock out future calls"
        assert judge.available is True, "judge must remain available after a single timeout"

    def test_http_error_does_not_trigger_backoff(self):
        """A transient HTTP error (e.g. 503) must leave the 30s cooldown untouched.

        The rationale matches the timeout case: retry on the next engagement
        signal instead of locking out intent judging.
        """
        judge = self._ready_judge()
        judge._last_error_time = 0.0

        self._judge_probe(judge, return_value=self._http_reply(503))

        assert judge._last_error_time == 0.0
        assert judge.available is True

    def test_connection_error_does_trigger_backoff(self):
        """A connection error (Ollama really down) must start the 30s cooldown.

        With the server unreachable, retrying on every engagement only wastes
        time; this is the single case where backing off is appropriate, giving
        Ollama a chance to come back.
        """
        import requests as real_requests

        judge = self._ready_judge()
        judge._last_error_time = 0.0

        self._judge_probe(
            judge,
            side_effect=real_requests.ConnectionError("refused"),
        )

        assert judge._last_error_time > 0.0
        assert judge.available is False

    # ------------------------------------------------------------------
    # last_failure_reason bookkeeping
    # ------------------------------------------------------------------

    def test_last_failure_reason_recorded_on_timeout(self):
        """The judge remembers a timeout as the reason for its last failure."""
        import requests as real_requests

        judge = self._ready_judge()
        self._judge_probe(judge, side_effect=real_requests.Timeout())
        assert "timeout" in judge.last_failure_reason.lower()

    def test_last_failure_reason_recorded_on_http_error(self):
        """A non-200 status code is recorded in the failure reason."""
        judge = self._ready_judge()
        # Start from a clean slate so no cooldown is in effect.
        judge._last_error_time = 0.0

        self._judge_probe(judge, return_value=self._http_reply(503))

        assert "503" in judge.last_failure_reason

    def test_last_failure_reason_cleared_on_success(self):
        """A successful judgment wipes a previously stored failure reason."""
        judge = self._ready_judge()
        judge._last_failure_reason = "timeout"
        reply = self._http_reply(
            200,
            '{"directed": false, "query": "", '
            '"stop": false, "confidence": "high", "reasoning": "ok"}',
        )

        verdict = self._judge_probe(judge, return_value=reply)

        assert verdict is not None
        assert judge.last_failure_reason == ""
class TestResponseParserRobustness:
    """Parser edge cases that were observed in real model output."""

    def test_parse_response_with_nested_braces(self):
        """Braces inside JSON string values do not break parsing.

        An earlier regex-based extractor rejected any nested brace, which
        surfaced as spurious "unavailable" errors whenever the model quoted
        code in its reasoning.
        """
        raw = (
            '{"directed": true, "query": "format as {json}", '
            '"stop": false, "confidence": "high", '
            '"reasoning": "user asked about {formatting}"}'
        )
        verdict = IntentJudge()._parse_response(raw)

        assert verdict is not None
        assert verdict.directed is True
        assert "json" in verdict.query

    def test_parse_response_with_markdown_code_fence(self):
        """JSON wrapped in a ```json fenced block is still parsed."""
        raw = (
            '```json\n'
            '{"directed": true, "query": "hi", "stop": false, '
            '"confidence": "high", "reasoning": "ok"}'
            '\n```'
        )
        verdict = IntentJudge()._parse_response(raw)

        assert verdict is not None
        assert verdict.directed is True
        assert verdict.query == "hi"

    def test_parse_response_normalises_aliases_in_query(self):
        """Wake-word aliases are rewritten in the extracted query too.

        Field capture (2026-04-21): Whisper heard 'Chavis', the judge echoed
        it in ``query``, and the reply engine then treated 'random pop artist,
        Chavis' as the user's intent, polluting memory search and prompts.
        The rewrite is case-insensitive and limited to word boundaries.
        """
        judge = IntentJudge(
            IntentJudgeConfig(
                assistant_name="Jarvis",
                aliases=["chavis", "jervis"],
            )
        )
        raw = (
            '{"directed": true, "query": "tell me a random pop artist, Chavis", '
            '"stop": false, "confidence": "high", "reasoning": "ok"}'
        )

        verdict = judge._parse_response(raw)

        assert verdict is not None
        assert verdict.directed is True
        # The misheard name must be gone and the canonical one present.
        assert "chavis" not in verdict.query.lower(), (
            f"Alias leaked into query: {verdict.query!r}"
        )
        assert "Jarvis" in verdict.query, (
            f"Expected canonical name in query, got: {verdict.query!r}"
        )

    def test_parse_response_no_aliases_leaves_query_untouched(self):
        """Without aliases the query comes back exactly as the model wrote it."""
        judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=[]))
        raw = (
            '{"directed": true, '
            '"query": "what is the weather like", '
            '"stop": false, "confidence": "high", "reasoning": "ok"}'
        )

        verdict = judge._parse_response(raw)

        assert verdict is not None
        assert verdict.query == "what is the weather like"
class TestCreateIntentJudge:
    """Tests for the create_intent_judge factory."""

    @staticmethod
    def _fake_cfg(**settings):
        """Return a MagicMock application config carrying *settings* as attributes."""
        return MagicMock(**settings)

    def test_creates_judge_with_defaults(self):
        """The factory builds a judge that uses the configured model."""
        cfg = self._fake_cfg(
            intent_judge_enabled=True,
            intent_judge_model="gemma4:e2b",
            ollama_base_url="http://localhost:11434",
            intent_judge_timeout_sec=3.0,
            wake_word="jarvis",
            wake_aliases=[],
        )

        built = create_intent_judge(cfg)

        assert built is not None
        assert built.config.model == "gemma4:e2b"

    def test_always_returns_judge_when_requests_available(self):
        """The factory returns a judge whenever requests is importable (per spec)."""
        # intent_judge_enabled is deliberately left unset on this config.
        cfg = self._fake_cfg(
            intent_judge_model="gemma4:e2b",
            ollama_base_url="http://localhost:11434",
            intent_judge_timeout_sec=3.0,
            wake_word="jarvis",
            wake_aliases=[],
        )

        built = create_intent_judge(cfg)

        # Per spec a judge is always created; fallback happens only when unavailable.
        assert built is not None
class TestWarmUp:
    """Tests for IntentJudge.warm_up()."""

    # The whole requests module, as seen by the intent judge, is replaced here.
    _REQUESTS_TARGET = "jarvis.listening.intent_judge.requests"

    def test_warmup_posts_to_generate_with_keep_alive(self):
        """Warm-up sends an /api/generate request that keeps the model loaded."""
        judge = IntentJudge(IntentJudgeConfig(model="gemma4:e2b"))

        with patch(self._REQUESTS_TARGET) as fake_requests:
            fake_requests.post.return_value = MagicMock(status_code=200)
            succeeded = judge.warm_up()

            assert succeeded is True
            sent = fake_requests.post.call_args
            payload = sent.kwargs["json"]
            assert sent.args[0].endswith("/api/generate")
            assert payload["model"] == "gemma4:e2b"
            assert payload["keep_alive"] == "30m"
            assert payload["stream"] is False

    def test_warmup_returns_false_on_http_error(self):
        """A non-200 status from Ollama makes warm-up report failure."""
        judge = IntentJudge()

        with patch(self._REQUESTS_TARGET) as fake_requests:
            fake_requests.post.return_value = MagicMock(status_code=500)
            assert judge.warm_up() is False

    def test_warmup_swallows_exceptions(self):
        """An exception from the transport is turned into a False result."""
        judge = IntentJudge()

        with patch(self._REQUESTS_TARGET) as fake_requests:
            fake_requests.post.side_effect = RuntimeError("boom")
            assert judge.warm_up() is False

    def test_warmup_skipped_when_unavailable(self):
        """Warm-up returns False when the judge is flagged unavailable."""
        judge = IntentJudge()
        judge._available = False
        assert judge.warm_up() is False
class TestEchoFollowUpPattern:
    """Tests for transcripts where a TTS echo is followed by a real follow-up."""

    # What the microphone picked up: an echo of the TTS, then the user's reply.
    _HEARD = "London has 8 hours of daylight. That's cool tell me more"
    # What the assistant had just spoken.
    _SPOKEN = "London has around 8 hours of daylight"

    def _echo_transcript(self):
        """Return the single-segment transcript shared by the echo tests."""
        return [TranscriptSegment(self._HEARD, 1000.0, 1003.0)]

    def test_system_prompt_includes_echo_followup_guidance(self):
        """The system prompt covers the echo + follow-up situation."""
        system_prompt = IntentJudge()._build_system_prompt()

        # The during-TTS marker is explained ...
        assert "(during TTS)" in system_prompt
        # ... and echo is mentioned by name.
        assert "echo" in system_prompt.lower()

    def test_user_prompt_with_echo_and_followup(self):
        """The user prompt renders hot-window mode and the spoken TTS text."""
        judge = IntentJudge()

        rendered = judge._build_user_prompt(
            self._echo_transcript(),
            wake_timestamp=None,
            last_tts_text=self._SPOKEN,
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        assert "HOT WINDOW" in rendered
        # The TTS text made it into the prompt.
        assert "8 hours of daylight" in rendered

    def test_judge_extracts_followup_from_echo_mixed_transcript(self):
        """judge() hands back the follow-up the model extracted from an echo-mixed transcript."""
        judge = IntentJudge()
        judge._available = True

        # Canned model output that separates the echo from the follow-up.
        reply = MagicMock()
        reply.status_code = 200
        reply.json.return_value = {
            "response": (
                '{"directed": true, '
                '"query": "that\'s cool tell me more", '
                '"stop": false, "confidence": "high", '
                '"reasoning": "first part matches TTS (echo), second part is user follow-up"}'
            )
        }
        transcript = self._echo_transcript()

        with patch('jarvis.listening.intent_judge.requests.post', return_value=reply):
            verdict = judge.judge(
                transcript,
                wake_timestamp=None,
                last_tts_text=self._SPOKEN,
                last_tts_finish_time=999.0,
                in_hot_window=True,
            )

        assert verdict is not None
        assert verdict.directed is True
        # The query must be the follow-up, not the echoed TTS.
        assert "tell me more" in verdict.query.lower()
class TestCurrentSegmentMarker:
    """Tests for the CURRENT - JUDGE THIS marker."""

    _MARKER = "CURRENT - JUDGE THIS"

    @staticmethod
    def _hot_window_prompt(segments, current_text):
        """Render the user prompt in hot-window mode for *segments* and *current_text*."""
        return IntentJudge()._build_user_prompt(
            segments,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text=current_text,
        )

    def test_current_segment_marked_in_prompt(self):
        """The segment named by current_text gets the marker."""
        transcript = [
            TranscriptSegment("old query from before", 1000.0, 1001.0),
            TranscriptSegment("hello jarvis", 1002.0, 1003.0),  # the new one
        ]

        rendered = self._hot_window_prompt(transcript, "hello jarvis")

        assert self._MARKER in rendered
        # The quoted current segment is present in the prompt as well.
        assert '"hello jarvis"' in rendered

    def test_current_segment_not_marked_when_no_match(self):
        """No marker is emitted when current_text matches no segment."""
        transcript = [TranscriptSegment("hello jarvis", 1000.0, 1001.0)]

        rendered = self._hot_window_prompt(transcript, "something else")

        assert self._MARKER not in rendered

    def test_current_segment_case_insensitive_match(self):
        """Matching current_text against segments ignores letter case."""
        transcript = [TranscriptSegment("Hello Jarvis", 1000.0, 1001.0)]

        rendered = self._hot_window_prompt(transcript, "hello jarvis")

        assert self._MARKER in rendered

    def test_judge_passes_current_text_to_prompt(self):
        """judge() forwards current_text so the posted prompt carries the marker."""
        judge = IntentJudge()
        judge._available = True

        reply = MagicMock()
        reply.status_code = 200
        reply.json.return_value = {
            "response": (
                '{"directed": true, "query": "no thank you", '
                '"stop": false, "confidence": "high", "reasoning": "user response"}'
            )
        }
        transcript = [
            TranscriptSegment("old processed query", 1000.0, 1001.0),
            TranscriptSegment("no thank you", 1002.0, 1003.0),
        ]

        with patch(
            'jarvis.listening.intent_judge.requests.post',
            return_value=reply,
        ) as fake_post:
            judge.judge(
                transcript,
                wake_timestamp=None,
                last_tts_text="Would you like more info?",
                last_tts_finish_time=1001.5,
                in_hot_window=True,
                current_text="no thank you",
            )

        # Inspect the prompt that was actually sent to the API.
        posted_prompt = fake_post.call_args.kwargs["json"]["prompt"]
        assert self._MARKER in posted_prompt

    def test_system_prompt_includes_current_segment_guidance(self):
        """The system prompt explains what the marker means."""
        system_prompt = IntentJudge()._build_system_prompt()

        assert self._MARKER in system_prompt
        assert "segment to judge" in system_prompt.lower()
class TestCrossSegmentContextInPrompt:
    """Tests that prompts support resolving references across segments.

    When the CURRENT segment uses a vague reference such as "that", "it" or
    "this", the judge is expected to draw on PREVIOUS segments to turn it into
    a complete query.
    """

    def test_system_prompt_encourages_cross_segment_resolution(self):
        """The system prompt tells the model to use other segments for context."""
        lowered = IntentJudge()._build_system_prompt().lower()
        hints = ("previous", "other segment", "background")

        assert any(hint in lowered for hint in hints), (
            "System prompt should mention using previous/background segments to resolve references"
        )

    def test_system_prompt_has_cross_segment_example(self):
        """The system prompt carries guidance phrased around earlier/background segments."""
        lowered = IntentJudge()._build_system_prompt().lower()
        # Any of these phrasings signals a multi-segment scenario in the prompt.
        phrasings = ("previous segment", "background context", "earlier segment")

        assert any(phrase in lowered for phrase in phrasings), (
            "System prompt should have guidance about using earlier/background segments for context"
        )

    def test_context_segments_included_in_user_prompt(self):
        """Unprocessed background segments without a wake word stay in the user prompt."""
        background = "I think dinosaurs are cool"
        addressed = "What do you think about that Jarvis"
        transcript = [
            TranscriptSegment(background, 1000.0, 1001.0),
            TranscriptSegment(addressed, 1002.0, 1003.0),
        ]

        rendered = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=1002.5,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
            current_text=addressed,
        )

        # The first segment supplies the context; the second is the one judged.
        assert "dinosaurs are cool" in rendered
        assert "What do you think about that Jarvis" in rendered
        assert "CURRENT - JUDGE THIS" in rendered
class TestProcessedSegmentFiltering:
    """Tests for dropping already-processed segments from the prompt.

    Segments whose query has already been extracted should be left out of the
    intent judge prompt so that old queries are not extracted a second time.
    """

    @staticmethod
    def _hot_window_prompt(segments, current_text):
        """Render the user prompt in hot-window mode for *segments* and *current_text*."""
        return IntentJudge()._build_user_prompt(
            segments,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text=current_text,
        )

    def test_processed_segments_filtered_from_prompt(self):
        """A processed segment is left out while the current one is kept."""
        transcript = [
            TranscriptSegment("jarvis whats the weather", 1000.0, 1001.0, processed=True),
            TranscriptSegment("jarvis tell me a joke", 1002.0, 1003.0),
        ]

        rendered = self._hot_window_prompt(transcript, "jarvis tell me a joke")

        assert "whats the weather" not in rendered
        assert "tell me a joke" in rendered

    def test_current_segment_shown_even_if_processed(self):
        """The current segment is rendered even when it is flagged processed."""
        # Not expected in practice, but the prompt builder must cope with it.
        transcript = [
            TranscriptSegment("jarvis tell me a joke", 1000.0, 1001.0, processed=True),
        ]

        rendered = self._hot_window_prompt(transcript, "jarvis tell me a joke")

        # It is the segment under judgment, so it has to be visible and marked.
        assert "tell me a joke" in rendered
        assert "CURRENT - JUDGE THIS" in rendered

    def test_multiple_processed_segments_all_filtered(self):
        """Every processed segment is dropped, not only the first."""
        transcript = [
            TranscriptSegment("first old query", 1000.0, 1001.0, processed=True),
            TranscriptSegment("second old query", 1001.0, 1002.0, processed=True),
            TranscriptSegment("new query", 1002.0, 1003.0),
        ]

        rendered = self._hot_window_prompt(transcript, "new query")

        for stale in ("first old query", "second old query"):
            assert stale not in rendered
        assert "new query" in rendered

    def test_unprocessed_context_segments_preserved(self):
        """Unprocessed context segments survive while processed ones are dropped."""
        transcript = [
            TranscriptSegment("I wonder about the weather", 1000.0, 1001.0),  # context
            TranscriptSegment("jarvis old query", 1001.0, 1002.0, processed=True),  # processed
            TranscriptSegment("Yeah me too", 1002.0, 1003.0),  # context
            TranscriptSegment("jarvis what do you think", 1003.0, 1004.0),  # current
        ]

        rendered = self._hot_window_prompt(transcript, "jarvis what do you think")

        # Background chatter stays available as context.
        assert "I wonder about the weather" in rendered
        assert "Yeah me too" in rendered
        # The already-handled query is gone.
        assert "old query" not in rendered
        # The segment under judgment is present.
        assert "what do you think" in rendered