Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
891 lines
34 KiB
Python
891 lines
34 KiB
Python
"""Tests for the intent judge module."""
|
|
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
from jarvis.listening.intent_judge import (
|
|
IntentJudge,
|
|
IntentJudgeConfig,
|
|
IntentJudgment,
|
|
create_intent_judge,
|
|
)
|
|
from jarvis.listening.transcript_buffer import TranscriptSegment
|
|
|
|
|
|
class TestIntentJudgeConfig:
    """Checks the default and overridden field values of IntentJudgeConfig."""

    def test_default_config(self):
        """A config built with no arguments carries the expected defaults."""
        defaults = IntentJudgeConfig()

        assert defaults.assistant_name == "Jarvis"
        assert defaults.model == "gemma4:e2b"
        assert defaults.timeout_sec == 15.0
        assert defaults.aliases == []

    def test_custom_config(self):
        """Keyword arguments passed to the constructor are stored on the config."""
        custom = IntentJudgeConfig(assistant_name="Friday", model="llama3.2:1b", aliases=["computer"])

        assert custom.assistant_name == "Friday"
        assert custom.model == "llama3.2:1b"
        assert custom.aliases == ["computer"]
|
|
|
|
|
|
class TestIntentJudgment:
    """Checks that IntentJudgment stores the fields it is constructed with."""

    def test_basic_judgment(self):
        """The constructor arguments are readable back as attributes."""
        fields = {
            "directed": True,
            "query": "what time is it",
            "stop": False,
            "confidence": "high",
            "reasoning": "clear wake word",
        }
        verdict = IntentJudgment(**fields)

        assert verdict.directed is True
        assert verdict.query == "what time is it"
        assert verdict.stop is False
        assert verdict.confidence == "high"
|
|
|
|
|
|
class TestIntentJudge:
    """Covers IntentJudge construction, prompt building, response parsing and error paths."""

    # ------------------------------------------------------------------
    # Construction and availability
    # ------------------------------------------------------------------

    def test_init(self):
        """A judge created without a config uses the "Jarvis" assistant name."""
        default_judge = IntentJudge()
        assert default_judge.config.assistant_name == "Jarvis"

    def test_init_with_config(self):
        """The assistant name from a supplied config is visible on the judge."""
        friday_judge = IntentJudge(IntentJudgeConfig(assistant_name="Friday"))
        assert friday_judge.config.assistant_name == "Friday"

    def test_available_when_requests_installed(self):
        """`available` is True when `_available` is set and no error time is recorded."""
        ready_judge = IntentJudge()
        ready_judge._available = True
        ready_judge._last_error_time = 0.0
        assert ready_judge.available is True

    def test_unavailable_during_error_cooldown(self):
        """`available` is False when the last error happened within the cooldown."""
        import time

        cooling_judge = IntentJudge()
        cooling_judge._available = True
        # An error stamped "now" with a 30s cooldown is still cooling down.
        cooling_judge._last_error_time = time.time()
        cooling_judge._error_cooldown = 30.0
        assert cooling_judge.available is False

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def test_build_system_prompt(self):
        """The configured assistant name appears in the system prompt."""
        friday_judge = IntentJudge(IntentJudgeConfig(assistant_name="Friday"))
        assert "Friday" in friday_judge._build_system_prompt()

    def test_build_user_prompt_basic(self):
        """The segment text appears in the user prompt."""
        transcript = [TranscriptSegment("hello jarvis", 1000.0, 1001.0)]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=1000.5,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
        )
        assert "hello jarvis" in user_prompt

    def test_build_user_prompt_hot_window(self):
        """With `in_hot_window=True` the user prompt contains "HOT WINDOW"."""
        transcript = [TranscriptSegment("what time is it", 1000.0, 1001.0)]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
        )
        assert "HOT WINDOW" in user_prompt

    def test_build_user_prompt_normalises_aliases(self):
        """Configured aliases are replaced by the assistant name in the user prompt."""
        alias_cfg = IntentJudgeConfig(
            assistant_name="Jarvis",
            aliases=["jervis", "jaivis", "jar is"],
        )
        transcript = [
            TranscriptSegment("Jervis what time is it", 1000.0, 1001.0),
            TranscriptSegment("Jaivis tell me a joke", 1002.0, 1003.0),
            TranscriptSegment("hey Jar is, are you there", 1004.0, 1005.0),
        ]
        user_prompt = IntentJudge(alias_cfg)._build_user_prompt(
            transcript,
            wake_timestamp=1000.5,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
        )

        # None of the misheard spellings may survive...
        for misheard in ("Jervis", "Jaivis", "Jar is"):
            assert misheard not in user_prompt
        # ...and each of the three segments now carries the primary wake word.
        assert user_prompt.count("Jarvis") >= 3

    def test_build_user_prompt_alias_word_boundary(self):
        """Alias replacement applies to whole words only, not to substrings."""
        jar_judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=["jar"]))
        shared_kwargs = dict(
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
        )

        # A standalone "jar" is rewritten, which is expected because the
        # user explicitly configured it as an alias.
        standalone = jar_judge._build_user_prompt(
            [TranscriptSegment("put the jar on the table", 1000.0, 1001.0)],
            **shared_kwargs,
        )
        assert "Jarvis" in standalone

        # "jarring" merely starts with the alias and must be left alone.
        embedded = jar_judge._build_user_prompt(
            [TranscriptSegment("the noise was jarring", 1000.0, 1001.0)],
            **shared_kwargs,
        )
        assert "jarring" in embedded
        assert "Jarvisring" not in embedded

    def test_build_user_prompt_no_aliases_unchanged(self):
        """With an empty alias list the segment text reaches the prompt as-is."""
        plain_judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=[]))
        user_prompt = plain_judge._build_user_prompt(
            [TranscriptSegment("Jervis what time", 1000.0, 1001.0)],
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
        )
        assert "Jervis" in user_prompt

    def test_build_user_prompt_with_tts(self):
        """The last TTS text and a "TTS" marker appear in the user prompt."""
        transcript = [
            TranscriptSegment("the weather is nice", 1000.0, 1001.0, is_during_tts=True),
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="The weather is nice and sunny",
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )
        assert "TTS" in user_prompt
        assert "weather is nice and sunny" in user_prompt

    # ------------------------------------------------------------------
    # Response parsing
    # ------------------------------------------------------------------

    def test_parse_response_valid_json(self):
        """A well-formed JSON reply is turned into a judgment with matching fields."""
        raw = '{"directed": true, "query": "what time", "stop": false, "confidence": "high", "reasoning": "clear"}'
        parsed = IntentJudge()._parse_response(raw)

        assert parsed is not None
        assert parsed.directed is True
        assert parsed.query == "what time"
        assert parsed.stop is False
        assert parsed.confidence == "high"

    def test_parse_response_with_extra_text(self):
        """Leading prose before the JSON object does not prevent parsing."""
        raw = 'Here is my analysis: {"directed": true, "query": "test", "stop": false, "confidence": "medium", "reasoning": "test"}'
        parsed = IntentJudge()._parse_response(raw)

        assert parsed is not None
        assert parsed.directed is True

    def test_parse_response_invalid_json(self):
        """Text without any JSON object yields None."""
        parsed = IntentJudge()._parse_response("This is not valid JSON at all")
        assert parsed is None

    def test_parse_response_missing_fields(self):
        """Fields absent from the JSON fall back to empty/False/"low" defaults."""
        parsed = IntentJudge()._parse_response('{"directed": true}')

        assert parsed is not None
        assert parsed.directed is True
        assert parsed.query == ""
        assert parsed.stop is False
        assert parsed.confidence == "low"

    # ------------------------------------------------------------------
    # judge(): happy path and failure handling
    # ------------------------------------------------------------------

    def test_judge_returns_none_when_unavailable(self):
        """judge() yields None when `_available` is False."""
        offline_judge = IntentJudge()
        offline_judge._available = False

        outcome = offline_judge.judge([TranscriptSegment("test", 1000.0, 1001.0)])

        assert outcome is None

    def test_judge_returns_none_for_empty_segments(self):
        """judge() yields None when given no segments."""
        assert IntentJudge().judge([]) is None

    def test_judge_with_mock_api(self):
        """judge() posts to the API and parses the JSON carried in "response"."""
        judge = IntentJudge()
        judge._available = True

        fake_reply = MagicMock(status_code=200)
        fake_reply.json.return_value = {
            "response": '{"directed": true, "query": "what time is it", "stop": false, "confidence": "high", "reasoning": "wake word detected"}'
        }
        transcript = [TranscriptSegment("jarvis what time is it", 1000.0, 1002.0)]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=fake_reply):
            outcome = judge.judge(
                transcript,
                wake_timestamp=1000.5,
                last_tts_text="",
                last_tts_finish_time=0.0,
                in_hot_window=False,
            )

        assert outcome is not None
        assert outcome.directed is True
        assert outcome.query == "what time is it"

    def test_judge_handles_api_error(self):
        """An HTTP 500 reply makes judge() return None instead of raising."""
        judge = IntentJudge()
        judge._available = True

        server_error = MagicMock(status_code=500)
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=server_error):
            outcome = judge.judge(transcript)

        assert outcome is None

    def test_judge_handles_timeout(self):
        """A requests Timeout makes judge() return None instead of raising."""
        import requests as real_requests

        judge = IntentJudge()
        judge._available = True
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', side_effect=real_requests.Timeout()):
            outcome = judge.judge(transcript)

        assert outcome is None

    def test_timeout_does_not_trigger_backoff(self):
        """A timeout must leave the error cooldown untouched.

        Rationale carried over from the original test: voice interaction has
        many turns, so one slow call should not disable intent judging for
        the following half-minute. An upstream engagement-signal gate (wake
        word / hot window / TTS) is said to keep ambient speech from
        hammering Ollama, which makes an immediate retry on the next real
        engagement acceptable.
        """
        import requests as real_requests

        judge = IntentJudge()
        judge._available = True
        judge._last_error_time = 0.0
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', side_effect=real_requests.Timeout()):
            judge.judge(transcript)

        assert judge._last_error_time == 0.0, "timeout must NOT lock out future calls"
        assert judge.available is True, "judge must remain available after a single timeout"

    def test_http_error_does_not_trigger_backoff(self):
        """A transient HTTP error (503 here) must leave the error cooldown untouched.

        The reasoning mirrors the timeout case: retry on the next engagement
        signal rather than locking out intent judging.
        """
        judge = IntentJudge()
        judge._available = True
        judge._last_error_time = 0.0

        unavailable_reply = MagicMock(status_code=503)
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=unavailable_reply):
            judge.judge(transcript)

        assert judge._last_error_time == 0.0
        assert judge.available is True

    def test_connection_error_does_trigger_backoff(self):
        """A connection error records an error time and makes the judge unavailable.

        Rationale carried over from the original test: when the server is
        unreachable, retrying on every engagement only wastes time, so this
        is the one failure where backing off is appropriate.
        """
        import requests as real_requests

        judge = IntentJudge()
        judge._available = True
        judge._last_error_time = 0.0
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        refused = real_requests.ConnectionError("refused")
        with patch('jarvis.listening.intent_judge.requests.post', side_effect=refused):
            judge.judge(transcript)

        assert judge._last_error_time > 0.0
        assert judge.available is False

    # ------------------------------------------------------------------
    # last_failure_reason bookkeeping
    # ------------------------------------------------------------------

    def test_last_failure_reason_recorded_on_timeout(self):
        """After a timeout, `last_failure_reason` mentions "timeout"."""
        import requests as real_requests

        judge = IntentJudge()
        judge._available = True
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', side_effect=real_requests.Timeout()):
            judge.judge(transcript)

        assert "timeout" in judge.last_failure_reason.lower()

    def test_last_failure_reason_recorded_on_http_error(self):
        """After a non-200 reply, `last_failure_reason` contains the status code."""
        judge = IntentJudge()
        judge._available = True
        # Start from a clean slate so no cooldown interferes with the call.
        judge._last_error_time = 0.0

        unavailable_reply = MagicMock(status_code=503)
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=unavailable_reply):
            judge.judge(transcript)

        assert "503" in judge.last_failure_reason

    def test_last_failure_reason_cleared_on_success(self):
        """A successful judgment resets `last_failure_reason` to an empty string."""
        judge = IntentJudge()
        judge._available = True
        judge._last_failure_reason = "timeout"

        ok_reply = MagicMock(status_code=200)
        ok_reply.json.return_value = {
            "response": '{"directed": false, "query": "", "stop": false, "confidence": "high", "reasoning": "ok"}'
        }
        transcript = [TranscriptSegment("test", 1000.0, 1001.0)]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=ok_reply):
            outcome = judge.judge(transcript)

        assert outcome is not None
        assert judge.last_failure_reason == ""
|
|
|
|
|
|
class TestResponseParserRobustness:
    """Regression tests for `_parse_response` on awkward model output."""

    def test_parse_response_with_nested_braces(self):
        """JSON whose string values contain braces is still parsed.

        Regression guard: according to the original note, an earlier
        brace-matching regex could not cope with nested braces and caused
        spurious "unavailable" errors when the model quoted code in its
        reasoning.
        """
        raw = '{"directed": true, "query": "format as {json}", "stop": false, "confidence": "high", "reasoning": "user asked about {formatting}"}'
        parsed = IntentJudge()._parse_response(raw)

        assert parsed is not None
        assert parsed.directed is True
        assert "json" in parsed.query

    def test_parse_response_with_markdown_code_fence(self):
        """JSON wrapped in a Markdown json code fence is still parsed."""
        raw = '```json\n{"directed": true, "query": "hi", "stop": false, "confidence": "high", "reasoning": "ok"}\n```'
        parsed = IntentJudge()._parse_response(raw)

        assert parsed is not None
        assert parsed.directed is True
        assert parsed.query == "hi"

    def test_parse_response_normalises_aliases_in_query(self):
        """Aliases inside the returned ``query`` are rewritten to the primary name.

        Background from the original test (field capture dated 2026-04-21):
        Whisper heard 'Chavis', the judge echoed it in ``query``, and the
        reply engine then treated 'random pop artist, Chavis' as the user's
        intent, polluting memory search and prompts. The rewrite is expected
        to be case-insensitive and limited to word boundaries.
        """
        judge = IntentJudge(
            IntentJudgeConfig(assistant_name="Jarvis", aliases=["chavis", "jervis"])
        )
        response = (
            '{"directed": true, '
            '"query": "tell me a random pop artist, Chavis", '
            '"stop": false, "confidence": "high", "reasoning": "ok"}'
        )

        result = judge._parse_response(response)

        assert result is not None
        assert result.directed is True
        # The misheard name must be gone and the canonical one present.
        assert "chavis" not in result.query.lower(), (
            f"Alias leaked into query: {result.query!r}"
        )
        assert "Jarvis" in result.query, (
            f"Expected canonical name in query, got: {result.query!r}"
        )

    def test_parse_response_no_aliases_leaves_query_untouched(self):
        """Without aliases the query comes back exactly as the model wrote it."""
        judge = IntentJudge(IntentJudgeConfig(assistant_name="Jarvis", aliases=[]))
        raw = (
            '{"directed": true, "query": "what is the weather like", '
            '"stop": false, "confidence": "high", "reasoning": "ok"}'
        )

        parsed = judge._parse_response(raw)

        assert parsed is not None
        assert parsed.query == "what is the weather like"
|
|
|
|
|
|
class TestCreateIntentJudge:
    """Checks the `create_intent_judge` factory against mocked settings objects."""

    def test_creates_judge_with_defaults(self):
        """The factory returns a judge whose model matches the settings."""
        settings = MagicMock(
            intent_judge_enabled=True,
            intent_judge_model="gemma4:e2b",
            ollama_base_url="http://localhost:11434",
            intent_judge_timeout_sec=3.0,
            wake_word="jarvis",
            wake_aliases=[],
        )

        built = create_intent_judge(settings)

        assert built is not None
        assert built.config.model == "gemma4:e2b"

    def test_always_returns_judge_when_requests_available(self):
        """The factory returns a judge even when `intent_judge_enabled` is not set explicitly."""
        # `intent_judge_enabled` is deliberately left as an auto-created mock attribute.
        settings = MagicMock(
            intent_judge_model="gemma4:e2b",
            ollama_base_url="http://localhost:11434",
            intent_judge_timeout_sec=3.0,
            wake_word="jarvis",
            wake_aliases=[],
        )

        built = create_intent_judge(settings)

        # Per the original note, the spec says a judge is always created and
        # callers fall back only when it reports itself unavailable.
        assert built is not None
|
|
|
|
|
|
class TestWarmUp:
    """Checks IntentJudge.warm_up() with the module-level `requests` replaced by a mock."""

    def test_warmup_posts_to_generate_with_keep_alive(self):
        """warm_up() posts to /api/generate with the model, a 30m keep_alive and no streaming."""
        judge = IntentJudge(IntentJudgeConfig(model="gemma4:e2b"))

        with patch("jarvis.listening.intent_judge.requests") as fake_requests:
            fake_requests.post.return_value = MagicMock(status_code=200)
            warmed = judge.warm_up()

        assert warmed is True
        positional, keyword = fake_requests.post.call_args
        payload = keyword["json"]
        assert positional[0].endswith("/api/generate")
        assert payload["model"] == "gemma4:e2b"
        assert payload["keep_alive"] == "30m"
        assert payload["stream"] is False

    def test_warmup_returns_false_on_http_error(self):
        """A non-200 status makes warm_up() return False."""
        judge = IntentJudge()

        with patch("jarvis.listening.intent_judge.requests") as fake_requests:
            fake_requests.post.return_value = MagicMock(status_code=500)
            warmed = judge.warm_up()
            assert warmed is False

    def test_warmup_swallows_exceptions(self):
        """An exception raised by the POST makes warm_up() return False rather than propagate."""
        judge = IntentJudge()

        with patch("jarvis.listening.intent_judge.requests") as fake_requests:
            fake_requests.post.side_effect = RuntimeError("boom")
            warmed = judge.warm_up()
            assert warmed is False

    def test_warmup_skipped_when_unavailable(self):
        """With `_available` False, warm_up() returns False."""
        offline_judge = IntentJudge()
        offline_judge._available = False
        assert offline_judge.warm_up() is False
|
|
|
|
|
|
class TestEchoFollowUpPattern:
    """Checks handling of a transcript that mixes a TTS echo with a user follow-up."""

    def test_system_prompt_includes_echo_followup_guidance(self):
        """The system prompt explains the "(during TTS)" marker and mentions echo."""
        system_prompt = IntentJudge()._build_system_prompt()

        assert "(during TTS)" in system_prompt  # marker is explained
        assert "echo" in system_prompt.lower()  # echo is mentioned

    def test_user_prompt_with_echo_and_followup(self):
        """The user prompt carries the hot-window marker and the last TTS text."""
        mixed_segment = TranscriptSegment(
            "London has 8 hours of daylight. That's cool tell me more",
            1000.0, 1003.0
        )
        user_prompt = IntentJudge()._build_user_prompt(
            [mixed_segment],
            wake_timestamp=None,
            last_tts_text="London has around 8 hours of daylight",
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        assert "HOT WINDOW" in user_prompt
        assert "8 hours of daylight" in user_prompt  # TTS text is included

    def test_judge_extracts_followup_from_echo_mixed_transcript(self):
        """judge() returns the follow-up query that the (mocked) model extracted."""
        judge = IntentJudge()
        judge._available = True

        # Canned model reply that separates the echo from the follow-up.
        fake_reply = MagicMock(status_code=200)
        fake_reply.json.return_value = {
            "response": '{"directed": true, "query": "that\'s cool tell me more", "stop": false, "confidence": "high", "reasoning": "first part matches TTS (echo), second part is user follow-up"}'
        }
        mixed_segment = TranscriptSegment(
            "London has 8 hours of daylight. That's cool tell me more",
            1000.0, 1003.0
        )

        with patch('jarvis.listening.intent_judge.requests.post', return_value=fake_reply):
            outcome = judge.judge(
                [mixed_segment],
                wake_timestamp=None,
                last_tts_text="London has around 8 hours of daylight",
                last_tts_finish_time=999.0,
                in_hot_window=True,
            )

        assert outcome is not None
        assert outcome.directed is True
        # The query is the follow-up part, not the echoed TTS sentence.
        assert "tell me more" in outcome.query.lower()
|
|
|
|
|
|
class TestCurrentSegmentMarker:
    """Checks when the "CURRENT - JUDGE THIS" marker appears in prompts."""

    def test_current_segment_marked_in_prompt(self):
        """A segment matching `current_text` causes the marker to appear."""
        transcript = [
            TranscriptSegment("old query from before", 1000.0, 1001.0),
            TranscriptSegment("hello jarvis", 1002.0, 1003.0),  # the new one
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="hello jarvis",
        )

        assert "CURRENT - JUDGE THIS" in user_prompt
        # The quoted text of the current segment is present as well.
        assert '"hello jarvis"' in user_prompt

    def test_current_segment_not_marked_when_no_match(self):
        """No marker appears when `current_text` matches none of the segments."""
        user_prompt = IntentJudge()._build_user_prompt(
            [TranscriptSegment("hello jarvis", 1000.0, 1001.0)],
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="something else",
        )

        assert "CURRENT - JUDGE THIS" not in user_prompt

    def test_current_segment_case_insensitive_match(self):
        """A `current_text` differing only in letter case still triggers the marker."""
        user_prompt = IntentJudge()._build_user_prompt(
            [TranscriptSegment("Hello Jarvis", 1000.0, 1001.0)],
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="hello jarvis",
        )

        assert "CURRENT - JUDGE THIS" in user_prompt

    def test_judge_passes_current_text_to_prompt(self):
        """The prompt that judge() posts to the API contains the marker."""
        judge = IntentJudge()
        judge._available = True

        fake_reply = MagicMock(status_code=200)
        fake_reply.json.return_value = {
            "response": '{"directed": true, "query": "no thank you", "stop": false, "confidence": "high", "reasoning": "user response"}'
        }
        transcript = [
            TranscriptSegment("old processed query", 1000.0, 1001.0),
            TranscriptSegment("no thank you", 1002.0, 1003.0),
        ]

        with patch('jarvis.listening.intent_judge.requests.post', return_value=fake_reply) as post_spy:
            judge.judge(
                transcript,
                wake_timestamp=None,
                last_tts_text="Would you like more info?",
                last_tts_finish_time=1001.5,
                in_hot_window=True,
                current_text="no thank you",
            )

        # Inspect the JSON payload of the recorded POST call.
        _, sent_kwargs = post_spy.call_args
        sent_prompt = sent_kwargs["json"]["prompt"]
        assert "CURRENT - JUDGE THIS" in sent_prompt

    def test_system_prompt_includes_current_segment_guidance(self):
        """The system prompt names the marker and the phrase "segment to judge"."""
        system_prompt = IntentJudge()._build_system_prompt()

        assert "CURRENT - JUDGE THIS" in system_prompt
        assert "segment to judge" in system_prompt.lower()
|
|
|
|
|
|
class TestCrossSegmentContextInPrompt:
    """Checks that prompts support resolving references across segments.

    The scenario: the CURRENT segment uses a vague reference ("that", "it",
    "this") and the judge is expected to draw on PREVIOUS segments to turn
    it into a complete query.
    """

    def test_system_prompt_encourages_cross_segment_resolution(self):
        """The system prompt mentions previous, other or background segments."""
        lowered = IntentJudge()._build_system_prompt().lower()

        cues = ("previous", "other segment", "background")
        assert any(cue in lowered for cue in cues), (
            "System prompt should mention using previous/background segments to resolve references"
        )

    def test_system_prompt_has_cross_segment_example(self):
        """The system prompt refers to a previous/earlier segment or background context."""
        lowered = IntentJudge()._build_system_prompt().lower()

        # Any of these phrases indicates guidance for a multi-segment
        # scenario, where context lives outside the wake-word segment.
        cues = ("previous segment", "background context", "earlier segment")
        assert any(cue in lowered for cue in cues), (
            "System prompt should have guidance about using earlier/background segments for context"
        )

    def test_context_segments_included_in_user_prompt(self):
        """An earlier, unprocessed segment appears in the prompt next to the current one."""
        transcript = [
            TranscriptSegment("I think dinosaurs are cool", 1000.0, 1001.0),
            TranscriptSegment("What do you think about that Jarvis", 1002.0, 1003.0),
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=1002.5,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
            current_text="What do you think about that Jarvis",
        )

        # The first segment supplies the context for "that" in the second.
        assert "dinosaurs are cool" in user_prompt
        assert "What do you think about that Jarvis" in user_prompt
        assert "CURRENT - JUDGE THIS" in user_prompt
|
|
|
|
|
|
class TestProcessedSegmentFiltering:
    """Checks that segments flagged `processed=True` are left out of the user prompt.

    Segments whose query was already extracted should not reach the intent
    judge again, so that old queries are not re-extracted.
    """

    def test_processed_segments_filtered_from_prompt(self):
        """A processed segment is omitted while the current one is kept."""
        transcript = [
            TranscriptSegment("jarvis whats the weather", 1000.0, 1001.0, processed=True),
            TranscriptSegment("jarvis tell me a joke", 1002.0, 1003.0),
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="jarvis tell me a joke",
        )

        assert "whats the weather" not in user_prompt  # processed: dropped
        assert "tell me a joke" in user_prompt  # current: kept

    def test_current_segment_shown_even_if_processed(self):
        """A segment that is both processed and current is still shown and marked."""
        # Not expected to occur in practice; the prompt should cope anyway.
        transcript = [
            TranscriptSegment("jarvis tell me a joke", 1000.0, 1001.0, processed=True),
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="jarvis tell me a joke",
        )

        # It is the segment under judgment, so it has to be visible.
        assert "tell me a joke" in user_prompt
        assert "CURRENT - JUDGE THIS" in user_prompt

    def test_multiple_processed_segments_all_filtered(self):
        """Every processed segment is omitted, not only the first."""
        transcript = [
            TranscriptSegment("first old query", 1000.0, 1001.0, processed=True),
            TranscriptSegment("second old query", 1001.0, 1002.0, processed=True),
            TranscriptSegment("new query", 1002.0, 1003.0),
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="new query",
        )

        assert "first old query" not in user_prompt
        assert "second old query" not in user_prompt
        assert "new query" in user_prompt

    def test_unprocessed_context_segments_preserved(self):
        """Unprocessed context segments stay in the prompt; the processed one is dropped."""
        transcript = [
            TranscriptSegment("I wonder about the weather", 1000.0, 1001.0),  # context
            TranscriptSegment("jarvis old query", 1001.0, 1002.0, processed=True),  # already handled
            TranscriptSegment("Yeah me too", 1002.0, 1003.0),  # context
            TranscriptSegment("jarvis what do you think", 1003.0, 1004.0),  # current
        ]
        user_prompt = IntentJudge()._build_user_prompt(
            transcript,
            wake_timestamp=None,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=True,
            current_text="jarvis what do you think",
        )

        # Unprocessed context around the handled segment is preserved.
        assert "I wonder about the weather" in user_prompt
        assert "Yeah me too" in user_prompt
        # The handled segment is gone.
        assert "old query" not in user_prompt
        # The segment under judgment is present.
        assert "what do you think" in user_prompt
|