Files
javis_bot/tests/test_query_validation.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

614 lines
24 KiB
Python

"""
Tests for wake word validation in the listener.
These tests verify that:
1. Wake word presence is verified in wake word mode
2. Hot window mode doesn't require wake word
3. Various state timing scenarios are handled correctly
"""
import pytest
from unittest.mock import patch, MagicMock
import time
from jarvis.listening.wake_detection import is_wake_word_detected
class TestWakeWordValidation:
    """Tests for wake word presence validation in wake word mode.

    The listener must verify the wake word is present when:
    1. We're in wake word mode (not hot window)
    2. Intent judge says directed=true
    """

    def test_wake_word_detected_with_jarvis(self):
        """Wake word detection finds 'jarvis' in text."""
        text = "hey jarvis what time is it"
        assert is_wake_word_detected(text, "jarvis", []) is True

    def test_wake_word_detected_with_alias(self):
        """Wake word detection finds an alias."""
        text = "hey assistant what time is it"
        assert is_wake_word_detected(text, "jarvis", ["assistant"]) is True

    def test_wake_word_not_detected_without_wake_word(self):
        """Wake word detection returns False when no wake word is present."""
        text = "how are you"
        assert is_wake_word_detected(text, "jarvis", []) is False

    def test_wake_word_not_detected_similar_but_different(self):
        """Wake word detection doesn't match similar words."""
        text = "I was jarring some preserves"
        # "jarring" is similar to "jarvis" but should not match with a high
        # threshold.
        assert is_wake_word_detected(text, "jarvis", [], fuzzy_ratio=0.9) is False

    def test_bug_scenario_no_wake_word_in_query(self):
        """Reject 'How are you?' in wake word mode when no wake word is said.

        Bug scenario: the intent judge says directed=true for 'How are you?'
        but there is no wake word, so the query must be rejected.
        """
        text = "how are you"
        wake_word = "jarvis"
        aliases = []

        # Wake word mode (not hot window) is what makes the check mandatory.
        could_be_hot_window = False
        requires_wake_word = not could_be_hot_window
        assert requires_wake_word is True

        # There is no "jarvis" in "how are you", so this must be False.
        has_wake_word = is_wake_word_detected(text, wake_word, aliases)
        assert has_wake_word is False, "Should reject - no wake word in text"

    def test_valid_query_with_wake_word(self):
        """Valid scenario: wake word is present in the query."""
        text = "jarvis what's the weather"
        wake_word = "jarvis"
        aliases = []
        has_wake_word = is_wake_word_detected(text, wake_word, aliases)
        assert has_wake_word is True

    def test_hot_window_mode_no_wake_word_needed(self):
        """In hot window mode, wake word is not required."""
        text = "tell me more"
        wake_word = "jarvis"
        aliases = []

        # The utterance genuinely lacks a wake word, so it would be rejected
        # in wake word mode ...
        assert is_wake_word_detected(text, wake_word, aliases) is False

        # ... but in hot window mode the wake word requirement is lifted and
        # the intent judge decides based on context.
        could_be_hot_window = True
        requires_wake_word = not could_be_hot_window
        assert requires_wake_word is False

    def test_wake_word_with_fuzzy_match(self):
        """Fuzzy matching catches slight variations."""
        text = "hey jarv what time is it"  # Slight typo
        wake_word = "jarvis"
        aliases = []
        # With a lower fuzzy ratio (0.7), "jarv" is expected to match
        # "jarvis". NOTE(review): the original author quotes the similarity
        # as about 0.73 - confirm against the matcher's metric.
        result = is_wake_word_detected(text, wake_word, aliases, fuzzy_ratio=0.7)
        assert result is True

    def test_wake_word_case_insensitive(self):
        """Wake word detection works on upper-case input once lowered."""
        text = "JARVIS what time is it"
        wake_word = "jarvis"
        aliases = []
        # The function expects lowercase text, so the caller lowers it first.
        assert is_wake_word_detected(text.lower(), wake_word, aliases) is True
class TestIntentJudgeWakeWordValidation:
    """Integration tests for intent judge + wake word validation."""

    @staticmethod
    def _should_accept(directed, query, could_be_hot_window, wake_timestamp,
                       text_lower, wake_word, aliases):
        """Return whether a judged utterance should be accepted.

        Single copy of the validation logic these tests exercise (the tests
        describe it as mirroring the listener).

        Args:
            directed: Intent judge verdict that the speech was directed at
                the assistant.
            query: Query extracted by the intent judge; empty means nothing
                to act on.
            could_be_hot_window: True when the utterance may belong to the
                hot window, in which case no wake word is needed.
            wake_timestamp: Time the wake word was detected, or None.
            text_lower: Lower-cased transcript to search for the wake word.
            wake_word: Primary wake word.
            aliases: Alternative wake words.

        Returns:
            False when the judge did not produce a directed, non-empty
            query; True in hot window mode; otherwise whether a wake word
            was seen (via timestamp or in the text).
        """
        if not (directed and query):
            return False
        if could_be_hot_window:
            # Hot window - no wake word needed.
            return True
        # Wake word mode: a recorded timestamp short-circuits the text scan.
        return wake_timestamp is not None or is_wake_word_detected(
            text_lower, wake_word, aliases
        )

    def test_intent_judge_directed_rejected_without_wake_word(self):
        """Reject directed=true in wake word mode when no wake word exists.

        Simulates the bug: the intent judge (incorrectly) says directed=true
        but neither the text nor the wake timestamp carries a wake word.
        """
        should_accept = self._should_accept(
            directed=True,
            query="how are you",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=None,  # No wake word recorded
            text_lower="how are you",
            wake_word="jarvis",
            aliases=[],
        )
        assert should_accept is False, "Should reject - no wake word in wake word mode"

    def test_intent_judge_directed_accepted_with_wake_word(self):
        """Intent judge directed=true is accepted when wake word is present."""
        should_accept = self._should_accept(
            directed=True,
            query="what's the weather",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=None,  # Doesn't matter, text has wake word
            text_lower="jarvis what's the weather",
            wake_word="jarvis",
            aliases=[],
        )
        assert should_accept is True, "Should accept - wake word present"

    def test_intent_judge_directed_accepted_with_timestamp(self):
        """Intent judge directed=true is accepted when wake_timestamp is set."""
        should_accept = self._should_accept(
            directed=True,
            query="what's the weather",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=1000.5,  # Wake word was already detected
            text_lower="what's the weather",  # Wake word might be trimmed already
            wake_word="jarvis",
            aliases=[],
        )
        assert should_accept is True, "Should accept - wake_timestamp is set"

    def test_hot_window_always_accepts_directed(self):
        """In hot window mode, directed=true is always accepted."""
        should_accept = self._should_accept(
            directed=True,
            query="tell me more",
            could_be_hot_window=True,  # Hot window mode
            wake_timestamp=None,
            text_lower="tell me more",
            wake_word="jarvis",
            aliases=[],
        )
        assert should_accept is True, "Should accept - hot window mode"

    def test_hot_window_uses_actual_text_not_intent_judge_query(self):
        """In hot window mode, the actual user text should be used as the query.

        Regression test: previously the intent judge's extracted query was
        used, which could lose words (e.g. extracting "I" from
        "No, I'm good.").
        Per spec: "Hot window input should reflect what the user actually said."
        """
        text_lower = "no, i'm good."
        intent_judgment_query = "I"  # Bad extraction by small LLM
        # In hot window mode, we should use text_lower, not
        # intent_judgment_query.
        hot_query = text_lower
        assert hot_query == "no, i'm good."
        assert hot_query != intent_judgment_query
class TestWakeTimestampCapture:
    """Tests that _wake_timestamp is set when a wake word is detected.

    Bug fix: _wake_timestamp was never set, only initialised to None and
    cleared. This meant the intent judge always received wake_timestamp=None,
    so it never marked segments with "(WAKE WORD DETECTED)" and fell back to
    incorrect reasoning - classifying directed queries as not directed.
    """

    @staticmethod
    def _make_listener():
        """Build a minimal listener-like mock with default wake settings."""
        listener = MagicMock()
        listener._wake_timestamp = None
        listener.tts = None
        listener.cfg = MagicMock()
        listener.cfg.wake_word = "jarvis"
        listener.cfg.wake_aliases = []
        listener.cfg.wake_fuzzy_ratio = 0.78
        return listener

    @staticmethod
    def _capture_wake_timestamp(listener, text_lower, utterance_start_time,
                                in_hot_window):
        """Simulate the wake timestamp capture step on ``listener``.

        Reads the wake settings from ``listener.cfg`` and, outside the hot
        window, stores ``utterance_start_time`` in
        ``listener._wake_timestamp`` when the wake word is detected in
        ``text_lower``. In hot window mode detection is skipped entirely.
        """
        wake_word = listener.cfg.wake_word
        # The primary wake word is always included among the aliases.
        aliases = list(set(listener.cfg.wake_aliases) | {wake_word})
        fuzzy_ratio = float(listener.cfg.wake_fuzzy_ratio)
        if not in_hot_window:
            if is_wake_word_detected(text_lower, wake_word, aliases, fuzzy_ratio):
                listener._wake_timestamp = utterance_start_time

    def test_wake_timestamp_set_on_wake_word_detection(self):
        """_wake_timestamp is set to utterance_start_time when wake word is detected."""
        listener = self._make_listener()
        self._capture_wake_timestamp(
            listener,
            text_lower="jarvis what's the weather tomorrow",
            utterance_start_time=1000.5,
            in_hot_window=False,
        )
        assert listener._wake_timestamp == 1000.5, \
            "_wake_timestamp should be set to utterance_start_time when wake word detected"

    def test_wake_timestamp_not_set_without_wake_word(self):
        """_wake_timestamp stays None when no wake word is present."""
        listener = self._make_listener()
        self._capture_wake_timestamp(
            listener,
            text_lower="what's the weather tomorrow",
            utterance_start_time=1000.5,
            in_hot_window=False,
        )
        assert listener._wake_timestamp is None, \
            "_wake_timestamp should stay None when no wake word detected"

    def test_wake_timestamp_not_set_in_hot_window(self):
        """_wake_timestamp is not set in hot window mode (no wake word needed)."""
        listener = self._make_listener()
        # The text contains the wake word, but hot window mode skips detection.
        self._capture_wake_timestamp(
            listener,
            text_lower="jarvis what's the weather",
            utterance_start_time=1000.5,
            in_hot_window=True,
        )
        assert listener._wake_timestamp is None, \
            "_wake_timestamp should not be set in hot window mode"
class TestStateTimingScenarios:
    """Tests for state timing and transitions.

    These tests document how various timing scenarios involving wake word,
    TTS, and hot window states are expected to be classified. Each test
    also asserts the premise of its scenario so the timeline it describes
    cannot silently drift from the values used.
    """

    def test_utterance_time_matters_not_processing_time(self):
        """What matters is WHEN the user started speaking, not when
        processing completes.
        """
        hot_window_end_time = 1000.0

        # Scenario: user spoke during hot window, processed after expiry.
        utterance_start_time = 998.0  # During hot window
        processing_time = 1002.0  # After hot window expired
        assert processing_time > hot_window_end_time

        # Treated as hot window because the user STARTED during hot window.
        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        assert spoke_during_hot_window is True

    def test_utterance_after_hot_window_requires_wake_word(self):
        """Utterance that started after hot window requires wake word."""
        hot_window_end_time = 1000.0

        # User started speaking after hot window ended.
        utterance_start_time = 1002.0

        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        # Not in the hot window, so this utterance requires a wake word.
        assert spoke_during_hot_window is False

    def test_utterance_spanning_hot_window_expiry(self):
        """Utterance that started during hot window but ended after expiry
        should still be treated as hot window.
        """
        tts_finish_time = 995.0
        hot_window_seconds = 5.0
        hot_window_end_time = tts_finish_time + hot_window_seconds  # 1000.0

        # User started during hot window, finished after.
        utterance_start_time = 998.0
        utterance_end_time = 1003.0
        assert utterance_end_time > hot_window_end_time

        # The key check: did the user START during hot window?
        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        assert spoke_during_hot_window is True

    def test_long_utterance_during_tts(self):
        """Long utterance that started during TTS should be treated as a
        potential follow-up or interrupt.
        """
        tts_start_time = 990.0
        tts_finish_time = 1010.0  # 20 second TTS

        # User started speaking during TTS and kept going after it ended.
        utterance_start_time = 1005.0
        utterance_end_time = 1015.0
        assert utterance_end_time > tts_finish_time

        spoke_during_tts = tts_start_time <= utterance_start_time < tts_finish_time
        assert spoke_during_tts is True

    def test_quick_followup_after_tts(self):
        """Quick follow-up right after TTS should be in hot window."""
        tts_finish_time = 1000.0
        echo_tolerance = 0.3
        hot_window_seconds = 3.0

        # User speaks right after TTS.
        utterance_start_time = 1000.5

        # Should be well within hot window (tolerance + window length).
        time_since_tts = utterance_start_time - tts_finish_time
        in_hot_window = time_since_tts < (echo_tolerance + hot_window_seconds)
        assert in_hot_window is True
class TestHotWindowQueryValidation:
    """Hot window behaviour: stop commands and wake-word interrupts."""

    def test_stop_command_validation(self):
        """A bare stop command is recognisable in the segment text."""
        segment = "stop"
        lowered = segment.lower()
        # Stop commands are always accepted when detected.
        assert "stop" in lowered

    def test_interrupt_during_tts(self):
        """An interrupt spoken during TTS carries the wake word."""
        segment = "jarvis stop talking"
        detected = is_wake_word_detected(segment.lower(), "jarvis", [])
        assert detected is True
class TestHotWindowEchoRejection:
    """Document that rejecting an echo must NOT expire the hot window.

    Bug scenario: the user says a follow-up, but the TTS echo is transcribed
    first. The echo gets rejected, yet the hot window has to remain active
    for the real follow-up that comes immediately after.
    """

    def test_echo_rejection_should_not_expire_hot_window(self):
        """Echo rejection keeps the hot window alive for the follow-up.

        Scenario from real usage:
        1. TTS finishes at 13:12:24.390, hot window starts (3 seconds)
        2. User says: "No, that's you. I was talking to Google."
        3. But Whisper first transcribes TTS echo (97.3% similarity)
        4. Echo is correctly rejected
        5. BUG (fixed): Hot window was being expired here
        6. Real follow-up arrives but hot window is already gone

        The fix: echo rejection clears voice state but keeps hot window
        active.
        """
        # Simulated timeline.
        hot_window_duration = 3.0
        tts_finish_time = 1000.0
        window_closes_at = tts_finish_time + hot_window_duration  # 1003.0

        echo_at = 1000.5  # Echo lands inside the hot window
        followup_at = 1001.2  # Real follow-up lands inside it as well

        # Both events fall before the window closes.
        for arrival in (echo_at, followup_at):
            assert arrival < window_closes_at

        # After the echo is rejected, the follow-up 0.7 seconds later must
        # still find the hot window open.
        gap = followup_at - echo_at
        assert gap < hot_window_duration, \
            "Follow-up should be within hot window if echo didn't expire it"

    def test_real_followup_after_echo_is_accepted(self):
        """The real follow-up after a rejected echo needs no wake word.

        The hot window stays active, so the follow-up is accepted even
        though it does not contain the wake word.
        """
        # The user's real follow-up does not contain the wake word ...
        utterance = "no that's you i was talking to google"
        assert is_wake_word_detected(utterance, "jarvis", []) is False

        # ... but in hot window mode none is required (the listener trusts
        # the intent judge for hot window speech).
        in_hot_window = True
        wake_word_required = not in_hot_window
        assert wake_word_required is False
class TestQueryValidationNotUsed:
    """Tests documenting why we DON'T use query-to-segment text matching.

    Query validation (checking if the LLM's extracted query matches the
    segment text) was considered but rejected because it has both false
    positives and false negatives that make it unreliable.

    Instead, we rely on:
    1. Wake word presence check (in wake word mode)
    2. CURRENT - JUDGE THIS prompt marker (guides LLM to right segment)
    3. Processed segment filtering (old queries filtered from prompt)
    """

    def test_false_negative_synthesized_query_paraphrased(self):
        """FALSE NEGATIVE: valid synthesized query rejected due to paraphrasing.

        User says: "Jarvis what do you think"
        LLM synthesizes: "share your thoughts on the weather"
        These have almost no word overlap - validation would reject a valid
        query!
        """
        text = "jarvis what do you think"
        synthesized_query = "share your thoughts on the weather"
        # Remove wake word for fair comparison.
        text_without_wake = text.replace("jarvis", "").strip()

        # Check 1: substring match fails in every direction.
        assert synthesized_query not in text
        assert text not in synthesized_query
        assert text_without_wake not in synthesized_query

        # Check 2: word overlap.
        text_words = set(text_without_wake.split())  # {what, do, you, think}
        query_words = set(synthesized_query.split())  # {share, your, thoughts, on, the, weather}
        overlap = text_words & query_words
        # "you" and "your" are different tokens, so the exact-word overlap
        # is below any majority threshold. This valid query would be
        # INCORRECTLY REJECTED.
        assert len(overlap) < len(query_words) / 2, "Low overlap would reject valid query"

    def test_false_negative_synthesized_query_context_heavy(self):
        """FALSE NEGATIVE: valid query with heavy context synthesis rejected.

        Multi-person conversation about iPhone, user asks "Jarvis how much".
        LLM synthesizes: "how much does the new iPhone 15 Pro Max cost in
        the UK"
        """
        text = "jarvis how much"
        synthesized_query = "how much does the new iPhone 15 Pro Max cost in the UK"
        text_without_wake = text.replace("jarvis", "").strip()  # "how much"
        # Substring check passes! "how much" is in the query.
        assert text_without_wake in synthesized_query

        # But what if the user said it differently?
        text2 = "jarvis what's the price"
        text2_without_wake = text2.replace("jarvis", "").strip()  # "what's the price"
        # This would FAIL - different phrasing.
        assert text2_without_wake not in synthesized_query

    def test_false_positive_coincidental_overlap(self):
        """FALSE POSITIVE: wrong-segment query accepted due to coincidental overlap.

        User says: "hey assistant, how are you doing, tell me the weather"
        LLM extracts from WRONG segment: "how are you"
        But "how are you" IS in the current text!
        """
        current_text = "hey assistant how are you doing tell me the weather"
        wrong_query = "how are you"  # From a different segment!
        # This INCORRECTLY PASSES - query is a substring of the text.
        assert wrong_query in current_text, "Wrong query passes validation!"

    def test_false_positive_common_words_overlap(self):
        """FALSE POSITIVE risk: wrong query shares common words with the text.

        User says: "assistant what time is it"
        Wrong segment had: "what time should we leave for dinner"
        A half-of-query-words threshold rejects both wrong queries below,
        but the shorter one only barely - the validation is fragile.
        """
        current_text = "assistant what time is it"
        current_words = set(current_text.split())

        # Full wrong query: overlap {what, time} = 2 words against a
        # threshold of 7 / 2 = 3.5, so it would be rejected.
        wrong_query = "what time should we leave for dinner"
        query_words = set(wrong_query.split())
        overlap = current_words & query_words
        assert overlap == {"what", "time"}
        assert len(overlap) < len(query_words) / 2

        # Shorter wrong query: same overlap against a threshold of
        # 5 / 2 = 2.5 - still rejected, but by half a word.
        wrong_query_short = "what time should we leave"
        query_words_short = set(wrong_query_short.split())
        overlap_short = current_words & query_words_short
        assert overlap_short == {"what", "time"}
        assert len(overlap_short) < len(query_words_short) / 2
        # The point: one more shared common word would flip the verdict.
        assert len(overlap_short) + 1 > len(query_words_short) / 2

    def test_wake_word_check_is_reliable(self):
        """Wake word check accepts and rejects the cases validation gets wrong.

        If the user says "how are you" without wake word:
        - Wake word check correctly rejects (no "jarvis")
        If the user says "jarvis what do you think":
        - Wake word check correctly accepts (has "jarvis")
        - LLM can synthesize any query it wants
        """
        # Case 1: no wake word - correctly rejected.
        text_no_wake = "how are you"
        assert is_wake_word_detected(text_no_wake, "jarvis", []) is False

        # Case 2: has wake word - correctly accepted.
        text_with_wake = "jarvis what do you think"
        assert is_wake_word_detected(text_with_wake, "jarvis", []) is True
        # The LLM can then synthesize: "what do you think about the weather".
        # We trust the LLM's synthesis because the wake word validated user
        # intent.