Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
614 lines
24 KiB
Python
614 lines
24 KiB
Python
"""
Tests for wake word validation in the listener.

These tests verify that:
1. Wake word presence is verified in wake word mode
2. Hot window mode doesn't require wake word
3. Various state timing scenarios are handled correctly
"""
|
|
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
import time
|
|
|
|
from jarvis.listening.wake_detection import is_wake_word_detected
|
|
|
|
|
|
class TestWakeWordValidation:
    """Tests for wake word presence validation in wake word mode.

    The listener must verify wake word is present when:
    1. We're in wake word mode (not hot window)
    2. Intent judge says directed=true
    """

    def test_wake_word_detected_with_jarvis(self):
        """The primary wake word 'jarvis' is found inside an utterance."""
        text = "hey jarvis what time is it"
        assert is_wake_word_detected(text, "jarvis", []) is True

    def test_wake_word_detected_with_alias(self):
        """A configured alias counts as the wake word."""
        text = "hey assistant what time is it"
        assert is_wake_word_detected(text, "jarvis", ["assistant"]) is True

    def test_wake_word_not_detected_without_wake_word(self):
        """Detection returns False when neither wake word nor alias is present."""
        text = "how are you"
        assert is_wake_word_detected(text, "jarvis", []) is False

    def test_wake_word_not_detected_similar_but_different(self):
        """A merely similar word must not match under a strict fuzzy ratio."""
        text = "I was jarring some preserves"
        # "jarring" resembles "jarvis" but should not pass a 0.9 threshold.
        assert is_wake_word_detected(text, "jarvis", [], fuzzy_ratio=0.9) is False

    def test_bug_scenario_no_wake_word_in_query(self):
        """
        Bug scenario: Intent judge says directed=true for 'How are you?'
        but there's no wake word - this should be rejected in wake word mode.
        """
        text = "how are you"
        wake_word = "jarvis"
        aliases = []

        # Wake word mode (not hot window): the wake word must be verified.
        could_be_hot_window = False

        if not could_be_hot_window:
            has_wake_word = is_wake_word_detected(text, wake_word, aliases)
            # There is no "jarvis" in "how are you", so this must be False.
            assert has_wake_word is False, "Should reject - no wake word in text"

    def test_valid_query_with_wake_word(self):
        """Valid scenario: Wake word is present in the query."""
        text = "jarvis what's the weather"
        wake_word = "jarvis"
        aliases = []

        has_wake_word = is_wake_word_detected(text, wake_word, aliases)
        assert has_wake_word is True

    def test_hot_window_mode_no_wake_word_needed(self):
        """In hot window mode, wake word is not required.

        The follow-up text carries no wake word, so the wake word check
        would reject it; hot window mode is what makes the check
        unnecessary. Both halves are asserted so the test cannot pass
        vacuously.
        """
        text = "tell me more"
        wake_word = "jarvis"
        aliases = []

        # In hot window mode the intent judge decides based on context.
        could_be_hot_window = True
        requires_wake_word = not could_be_hot_window

        # The check would fail for this text if it were applied...
        assert is_wake_word_detected(text, wake_word, aliases) is False
        # ...but hot window mode does not require it.
        assert requires_wake_word is False

    def test_wake_word_with_fuzzy_match(self):
        """Fuzzy matching catches slight variations."""
        text = "hey jarv what time is it"  # Slight typo
        wake_word = "jarvis"
        aliases = []

        # With the relaxed 0.7 ratio, "jarv" is expected to match "jarvis"
        # (the original author's note puts their similarity at about 0.73).
        result = is_wake_word_detected(text, wake_word, aliases, fuzzy_ratio=0.7)
        assert result is True

    def test_wake_word_case_insensitive(self):
        """An upper-case wake word is detected once the text is lower-cased.

        The detector is handed lower-cased text here (the caller is
        expected to normalise case), so this pins the caller-side
        normalisation rather than case folding inside the detector.
        """
        text = "JARVIS what time is it"
        wake_word = "jarvis"
        aliases = []

        # Function expects lowercase text
        assert is_wake_word_detected(text.lower(), wake_word, aliases) is True
|
|
|
|
|
|
class TestIntentJudgeWakeWordValidation:
    """Integration tests for intent judge + wake word validation."""

    @staticmethod
    def _should_accept(
        text_lower,
        *,
        directed,
        query,
        could_be_hot_window,
        wake_timestamp,
        wake_word="jarvis",
        aliases=None,
    ):
        """Replicate the listener's acceptance decision for a judged segment.

        Args:
            text_lower: Lower-cased transcript of the current segment.
            directed: The intent judge's ``directed`` verdict.
            query: The query extracted by the intent judge.
            could_be_hot_window: True when the segment may belong to a hot
                window, in which case no wake word is needed.
            wake_timestamp: Timestamp set by the audio wake detector, or
                None when it detected nothing.
            wake_word: Primary wake word.
            aliases: Extra accepted wake words; defaults to none.

        Returns:
            A truthy value when the segment should be accepted.
        """
        if not (directed and query):
            return False
        if could_be_hot_window:
            # Hot window - no wake word needed.
            return True
        # Wake word mode: either the audio detector fired or the text
        # itself must contain the wake word.
        return wake_timestamp is not None or is_wake_word_detected(
            text_lower, wake_word, [] if aliases is None else aliases
        )

    def test_intent_judge_directed_rejected_without_wake_word(self):
        """
        Simulate the bug: Intent judge says directed=true but no wake word.
        In wake word mode, this should be rejected.
        """
        should_accept = self._should_accept(
            "how are you",
            directed=True,  # Intent judge (incorrectly) returns directed=true
            query="how are you",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=None,  # No wake word detected by audio detector
        )

        assert should_accept is False, "Should reject - no wake word in wake word mode"

    def test_intent_judge_directed_accepted_with_wake_word(self):
        """Intent judge directed=true is accepted when wake word is present."""
        should_accept = self._should_accept(
            "jarvis what's the weather",
            directed=True,
            query="what's the weather",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=None,  # Doesn't matter, text has wake word
        )

        assert should_accept is True, "Should accept - wake word present"

    def test_intent_judge_directed_accepted_with_timestamp(self):
        """Intent judge directed=true is accepted when wake_timestamp is set."""
        should_accept = self._should_accept(
            "what's the weather",  # Wake word might be trimmed already
            directed=True,
            query="what's the weather",
            could_be_hot_window=False,  # Wake word mode
            wake_timestamp=1000.5,  # Wake word was detected by audio detector
        )

        assert should_accept is True, "Should accept - wake_timestamp is set"

    def test_hot_window_always_accepts_directed(self):
        """In hot window mode, directed=true is always accepted."""
        should_accept = self._should_accept(
            "tell me more",
            directed=True,
            query="tell me more",
            could_be_hot_window=True,  # Hot window mode
            wake_timestamp=None,
        )

        assert should_accept is True, "Should accept - hot window mode"

    def test_hot_window_uses_actual_text_not_intent_judge_query(self):
        """In hot window mode, the actual user text should be used as the query.

        Regression test: previously the intent judge's extracted query was used,
        which could lose words (e.g. extracting "I" from "No, I'm good.").
        Per spec: "Hot window input should reflect what the user actually said."
        """
        text_lower = "no, i'm good."
        intent_judgment_query = "I"  # Bad extraction by small LLM

        # In hot window mode, we should use text_lower, not intent_judgment_query
        hot_query = text_lower
        assert hot_query == "no, i'm good."
        assert hot_query != intent_judgment_query
|
|
|
|
|
|
class TestWakeTimestampCapture:
    """Tests that _wake_timestamp is set when a wake word is detected.

    Bug fix: _wake_timestamp was never set, only initialised to None and
    cleared. This meant the intent judge always received wake_timestamp=None,
    so it never marked segments with "(WAKE WORD DETECTED)" and fell back to
    incorrect reasoning — classifying directed queries as not directed.
    """

    @staticmethod
    def _make_listener():
        """Build a minimal listener-like mock with wake word configuration."""
        listener = MagicMock()
        listener._wake_timestamp = None
        listener.cfg = MagicMock()
        listener.cfg.wake_word = "jarvis"
        listener.cfg.wake_aliases = []
        listener.cfg.wake_fuzzy_ratio = 0.78
        return listener

    @staticmethod
    def _capture_wake_timestamp(listener, text_lower, utterance_start_time, in_hot_window):
        """Simulate the wake timestamp capture from _process_transcript.

        Outside the hot window, ``listener._wake_timestamp`` is set to
        ``utterance_start_time`` when the wake word (or an alias) is
        detected in ``text_lower``. In the hot window, wake word detection
        is skipped and the listener is left untouched.
        """
        if in_hot_window:
            return

        wake_word = listener.cfg.wake_word
        # The primary wake word is always part of the accepted alias set.
        aliases = list(set(listener.cfg.wake_aliases) | {wake_word})
        fuzzy_ratio = float(listener.cfg.wake_fuzzy_ratio)

        if is_wake_word_detected(text_lower, wake_word, aliases, fuzzy_ratio):
            listener._wake_timestamp = utterance_start_time

    def test_wake_timestamp_set_on_wake_word_detection(self):
        """_wake_timestamp is set to utterance_start_time when wake word is detected."""
        listener = self._make_listener()
        listener.tts = None

        self._capture_wake_timestamp(
            listener,
            text_lower="jarvis what's the weather tomorrow",
            utterance_start_time=1000.5,
            in_hot_window=False,
        )

        assert listener._wake_timestamp == 1000.5, \
            "_wake_timestamp should be set to utterance_start_time when wake word detected"

    def test_wake_timestamp_not_set_without_wake_word(self):
        """_wake_timestamp stays None when no wake word is present."""
        listener = self._make_listener()

        self._capture_wake_timestamp(
            listener,
            text_lower="what's the weather tomorrow",
            utterance_start_time=1000.5,
            in_hot_window=False,
        )

        assert listener._wake_timestamp is None, \
            "_wake_timestamp should stay None when no wake word detected"

    def test_wake_timestamp_not_set_in_hot_window(self):
        """_wake_timestamp is not set in hot window mode (no wake word needed)."""
        listener = self._make_listener()

        # In hot window, we skip wake word detection even though the text
        # contains the wake word.
        self._capture_wake_timestamp(
            listener,
            text_lower="jarvis what's the weather",
            utterance_start_time=1000.5,
            in_hot_window=True,
        )

        assert listener._wake_timestamp is None, \
            "_wake_timestamp should not be set in hot window mode"
|
|
|
|
|
|
class TestStateTimingScenarios:
    """Tests for state timing and transitions.

    These tests verify that the listener correctly handles various
    timing scenarios involving wake word, TTS, and hot window states.
    """

    def test_utterance_time_matters_not_processing_time(self):
        """
        Key principle: What matters is WHEN the user started speaking,
        not when processing completes.
        """
        hot_window_end_time = 1000.0

        # Scenario 1: User spoke during hot window, processed after expiry
        utterance_start_time = 998.0  # During hot window
        processing_time = 1002.0  # After hot window expired

        # Precondition of the scenario: processing really finished late.
        assert processing_time > hot_window_end_time

        # Should be treated as hot window because user STARTED during hot window
        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        assert spoke_during_hot_window is True

    def test_utterance_after_hot_window_requires_wake_word(self):
        """Utterance that started after hot window requires wake word."""
        hot_window_end_time = 1000.0

        # User started speaking after hot window ended
        utterance_start_time = 1002.0  # After hot window

        # Not in the hot window, so this utterance requires the wake word.
        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        assert spoke_during_hot_window is False

    def test_utterance_spanning_hot_window_expiry(self):
        """
        Utterance that started during hot window but ended after expiry
        should still be treated as hot window.
        """
        tts_finish_time = 995.0
        hot_window_seconds = 5.0
        hot_window_end_time = tts_finish_time + hot_window_seconds  # 1000.0

        # User started during hot window, finished after
        utterance_start_time = 998.0
        utterance_end_time = 1003.0

        # Precondition of the scenario: the utterance really spans the expiry.
        assert utterance_start_time < hot_window_end_time < utterance_end_time

        # The key check: did user START during hot window?
        spoke_during_hot_window = utterance_start_time < hot_window_end_time
        assert spoke_during_hot_window is True

    def test_long_utterance_during_tts(self):
        """
        Long utterance that started during TTS should be treated as
        potential follow-up or interrupt.
        """
        tts_start_time = 990.0
        tts_finish_time = 1010.0  # 20 second TTS

        # User started speaking during TTS
        utterance_start_time = 1005.0  # During TTS
        utterance_end_time = 1015.0  # After TTS ended

        # Precondition of the scenario: the utterance outlasts the TTS.
        assert utterance_end_time > tts_finish_time

        spoke_during_tts = tts_start_time <= utterance_start_time < tts_finish_time
        assert spoke_during_tts is True

    def test_quick_followup_after_tts(self):
        """Quick follow-up right after TTS should be in hot window."""
        tts_finish_time = 1000.0
        echo_tolerance = 0.3
        hot_window_seconds = 3.0

        # User speaks right after TTS
        utterance_start_time = 1000.5  # Just after TTS

        # Should be well within hot window
        time_since_tts = utterance_start_time - tts_finish_time
        in_hot_window = time_since_tts < (echo_tolerance + hot_window_seconds)

        assert in_hot_window is True
|
|
|
|
|
|
class TestHotWindowQueryValidation:
    """Hot window behaviour checks for stop commands and interrupts."""

    def test_stop_command_validation(self):
        """A stop command is recognised inside the hot window."""
        segment = "stop"
        lowered = segment.lower()
        # Stop commands are always accepted when detected.
        assert "stop" in lowered

    def test_interrupt_during_tts(self):
        """An interrupt spoken over TTS is honoured when it has the wake word."""
        segment = "jarvis stop talking"
        trigger = "jarvis"

        detected = is_wake_word_detected(segment.lower(), trigger, [])
        assert detected is True
|
|
|
|
|
|
class TestHotWindowEchoRejection:
    """Documents that rejecting a TTS echo must NOT expire the hot window.

    Bug scenario: the user speaks a follow-up, but the TTS echo gets
    transcribed first. The echo is rejected, yet the hot window has to
    stay open for the genuine follow-up arriving right afterwards.
    """

    def test_echo_rejection_should_not_expire_hot_window(self):
        """
        Bug fix test: Echo rejection must NOT expire hot window.

        Scenario taken from real usage:
        1. TTS finishes at 13:12:24.390 and a 3 second hot window opens
        2. The user says: "No, that's you. I was talking to Google."
        3. Whisper transcribes the TTS echo first (97.3% similarity)
        4. The echo is correctly rejected
        5. BUG (fixed): the hot window used to be expired at this point
        6. The real follow-up arrives, but the hot window is already gone

        The fix: echo rejection clears voice state but keeps the hot
        window active.
        """
        # Timeline simulation
        hot_window_duration = 3.0
        tts_finish_time = 1000.0
        window_closes_at = tts_finish_time + hot_window_duration  # 1003.0

        echo_at = 1000.5  # echo lands inside the hot window
        followup_at = 1001.2  # so does the genuine follow-up

        # Both events fall inside the hot window.
        for arrival in (echo_at, followup_at):
            assert arrival < window_closes_at

        # Key assertion: once the echo is rejected, the window must still
        # be open for the follow-up that arrives 0.7 seconds later.
        gap = followup_at - echo_at
        assert gap < hot_window_duration, \
            "Follow-up should be within hot window if echo didn't expire it"

    def test_real_followup_after_echo_is_accepted(self):
        """
        A real follow-up still works after the echo has been rejected.

        Because the hot window stays active, the follow-up needs no wake
        word.
        """
        # The user's genuine follow-up, spoken without the wake word.
        spoken = "no that's you i was talking to google"
        trigger = "jarvis"

        detected = is_wake_word_detected(spoken, trigger, [])
        assert detected is False

        # In hot window mode it should be accepted regardless
        # (the listener trusts the intent judge for hot window speech).
        in_hot_window = True
        wake_word_needed = not in_hot_window

        # No wake word required in hot window
        assert wake_word_needed is False
|
|
|
|
|
|
class TestQueryValidationNotUsed:
    """Tests documenting why we DON'T use query-to-segment text matching.

    Query validation (checking if LLM's extracted query matches the segment text)
    was considered but rejected because it has both false positives and false
    negatives that make it unreliable.

    Instead, we rely on:
    1. Wake word presence check (in wake word mode)
    2. CURRENT - JUDGE THIS prompt marker (guides LLM to right segment)
    3. Processed segment filtering (old queries filtered from prompt)
    """

    def test_false_negative_synthesized_query_paraphrased(self):
        """
        FALSE NEGATIVE: Valid synthesized query rejected due to paraphrasing.

        User says: "Jarvis what do you think"
        LLM synthesizes: "share your thoughts on the weather"
        These have almost no word overlap - validation would reject valid query!
        """
        text = "jarvis what do you think"
        synthesized_query = "share your thoughts on the weather"

        # Remove wake word for fair comparison
        text_without_wake = text.replace("jarvis", "").strip()

        # Check 1: substring match
        assert synthesized_query not in text
        assert text not in synthesized_query
        assert text_without_wake not in synthesized_query

        # Check 2: word overlap
        text_words = set(text_without_wake.split())  # {what, do, you, think}
        query_words = set(synthesized_query.split())  # {share, your, thoughts, on, the, weather}
        overlap = text_words & query_words

        # "you" and "your" are close but not an exact match, so a
        # word-overlap rule would INCORRECTLY REJECT this valid query.
        assert len(overlap) < len(query_words) / 2, "Low overlap would reject valid query"

    def test_false_negative_synthesized_query_context_heavy(self):
        """
        FALSE NEGATIVE: Valid query with heavy context synthesis rejected.

        Multi-person conversation about iPhone, user asks "Jarvis how much"
        LLM synthesizes: "how much does the new iPhone 15 Pro Max cost in the UK"
        """
        text = "jarvis how much"
        synthesized_query = "how much does the new iPhone 15 Pro Max cost in the UK"

        text_without_wake = text.replace("jarvis", "").strip()  # "how much"

        # Substring check passes! "how much" is in the query
        assert text_without_wake in synthesized_query

        # But what if user said it differently?
        text2 = "jarvis what's the price"
        text2_without_wake = text2.replace("jarvis", "").strip()  # "what's the price"

        # This would FAIL - different phrasing
        assert text2_without_wake not in synthesized_query

    def test_false_positive_coincidental_overlap(self):
        """
        FALSE POSITIVE: Wrong segment query accepted due to coincidental overlap.

        User says: "hey assistant, how are you doing, tell me the weather"
        LLM extracts from WRONG segment: "how are you"
        But "how are you" IS in the current text!
        """
        current_text = "hey assistant how are you doing tell me the weather"
        wrong_query = "how are you"  # From a different segment!

        # This INCORRECTLY PASSES - query is substring of text
        assert wrong_query in current_text, "Wrong query passes validation!"

    def test_false_positive_common_words_overlap(self):
        """
        FALSE POSITIVE: Wrong query has word overlap with common phrases.

        User says: "assistant what time is it"
        Wrong segment had: "what time should we leave for dinner"

        Both wrong queries below end up rejected by a half-the-words
        overlap rule, but the shorter one only barely, which shows how
        fragile such a rule is.
        """
        current_text = "assistant what time is it"
        wrong_query = "what time should we leave for dinner"

        # Word overlap
        current_words = set(current_text.split())
        query_words = set(wrong_query.split())
        overlap = current_words & query_words

        # Overlap is 2 words against a 7-word query (threshold 3.5):
        # this one would be rejected.
        assert overlap == {"what", "time"}
        assert len(query_words) == 7
        assert len(overlap) < len(query_words) / 2

        # But with shorter wrong query:
        wrong_query_short = "what time should we leave"
        query_words_short = set(wrong_query_short.split())
        overlap_short = current_words & query_words_short

        # Overlap is still 2 words, now against a 5-word query
        # (threshold 2.5): still rejected, but barely.
        assert overlap_short == {"what", "time"}
        assert len(query_words_short) == 5
        assert len(overlap_short) < len(query_words_short) / 2

        # The point: validation is fragile and unreliable

    def test_wake_word_check_is_reliable(self):
        """
        Wake word check is reliable - no false positives or negatives.

        If user says "how are you" without wake word:
        - Wake word check correctly rejects (no "jarvis")

        If user says "jarvis what do you think":
        - Wake word check correctly accepts (has "jarvis")
        - LLM can synthesize any query it wants
        """
        # Case 1: No wake word - correctly rejected
        text_no_wake = "how are you"
        assert is_wake_word_detected(text_no_wake, "jarvis", []) is False

        # Case 2: Has wake word - correctly accepted
        text_with_wake = "jarvis what do you think"
        assert is_wake_word_detected(text_with_wake, "jarvis", []) is True

        # The LLM can then synthesize: "what do you think about the weather"
        # We trust the LLM's synthesis because the wake word validated user intent
|