Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.
- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md
Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
822 lines
34 KiB
Python
822 lines
34 KiB
Python
"""
|
|
Tests for echo detection module.
|
|
|
|
These tests verify that TTS echo detection properly identifies
|
|
when heard audio is an echo of TTS output vs genuine user speech.
|
|
"""
|
|
|
|
import time
|
|
import pytest
|
|
from jarvis.listening.echo_detection import EchoDetector
|
|
|
|
|
|
class TestTextNormalization:
    """Checks that normalization bridges TTS text and Whisper transcriptions."""

    def test_normalize_celsius_symbol(self):
        """'9°C' is expanded to '9 degrees celsius' and the symbol disappears."""
        normalized = EchoDetector()._normalize_for_comparison("It's 9°C outside")
        assert "9 degrees celsius" in normalized
        assert "°" not in normalized

    def test_normalize_fahrenheit_symbol(self):
        """'48°F' is expanded to '48 degrees fahrenheit'."""
        normalized = EchoDetector()._normalize_for_comparison("It's 48°F")
        assert "48 degrees fahrenheit" in normalized

    def test_normalize_generic_degree(self):
        """A degree symbol with no unit letter becomes the word 'degrees'."""
        normalized = EchoDetector()._normalize_for_comparison("Turn it to 180°")
        assert "180 degrees" in normalized

    def test_normalize_with_space(self):
        """Whitespace between the number and the degree symbol is tolerated."""
        normalized = EchoDetector()._normalize_for_comparison("It's 9 °C")
        assert "9 degrees celsius" in normalized

    def test_normalize_removes_parentheses(self):
        """Parentheses are stripped while both temperatures are still expanded."""
        normalized = EchoDetector()._normalize_for_comparison("It's 48°F (9°C)")
        # Neither bracket may survive normalization.
        for bracket in ("(", ")"):
            assert bracket not in normalized
        # Both the outer and the parenthesised value must be spelled out.
        for phrase in ("48 degrees fahrenheit", "9 degrees celsius"):
            assert phrase in normalized
|
|
|
|
|
|
class TestTextSimilarity:
    """Exercises EchoDetector._check_text_similarity on heard-vs-TTS pairs."""

    def test_exact_match(self):
        """Identical strings are reported as similar."""
        checker = EchoDetector()
        matched = checker._check_text_similarity("hello world", "hello world")
        assert matched is True

    def test_case_insensitive_match(self):
        """Differences in letter case do not prevent a match."""
        checker = EchoDetector()
        matched = checker._check_text_similarity("Hello World", "hello world")
        assert matched is True

    def test_partial_match(self):
        """A heard fragment contained in the TTS text is a match."""
        checker = EchoDetector()
        spoken = "the weather today is sunny and warm"
        fragment = "sunny and warm"
        assert checker._check_text_similarity(fragment, spoken) is True

    def test_no_match(self):
        """Unrelated sentences are not reported as similar."""
        checker = EchoDetector()
        matched = checker._check_text_similarity("what time is it", "the weather is nice")
        assert matched is False

    def test_degree_symbol_match(self):
        """TTS text with a degree symbol matches its spelled-out transcription."""
        checker = EchoDetector()
        spoken = "It's currently 9°C outside"
        transcribed = "It's currently 9 degrees celsius outside"
        assert checker._check_text_similarity(transcribed, spoken) is True

    def test_empty_strings(self):
        """An empty string on either side (or both) never matches."""
        checker = EchoDetector()
        # (heard, tts) pairs, checked in the same order as before.
        for heard, spoken in (("", "hello"), ("hello", ""), ("", "")):
            assert checker._check_text_similarity(heard, spoken) is False

    def test_higher_threshold_in_hot_window(self):
        """The ``threshold`` argument decides whether a near-miss counts as echo.

        The heard text contains typos, so it is similar to the TTS text but
        not identical; the test expects its score to sit between 80 and 95.
        """
        checker = EchoDetector()
        spoken = "the weather forecast"
        misspelled = "the weether forcast"  # typos - similar but not exact

        matches_at_80 = checker._check_text_similarity(misspelled, spoken, threshold=80)
        matches_at_95 = checker._check_text_similarity(misspelled, spoken, threshold=95)

        # A permissive threshold accepts the near-miss ...
        assert matches_at_80 is True
        # ... while a strict threshold rejects it.
        assert matches_at_95 is False
|
|
|
|
|
|
class TestEchoRejection:
    """Covers the top-level should_reject_as_echo decision."""

    def test_no_rejection_without_tts(self):
        """With no TTS ever tracked, nothing is treated as echo."""
        verdict = EchoDetector().should_reject_as_echo("hello", current_energy=0.01)
        assert verdict is False

    def test_rejects_echo_during_tts(self):
        """Text matching the TTS output is rejected while TTS is playing."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather is nice today")

        # The utterance begins right after TTS playback starts.
        started_at = time.time()

        verdict = echo_detector.should_reject_as_echo(
            heard_text="nice today",
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=started_at,
        )
        assert verdict is True

    def test_accepts_different_text_during_tts(self):
        """Text unrelated to the TTS output (an interruption) is accepted."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather is nice")

        verdict = echo_detector.should_reject_as_echo(
            heard_text="stop",
            current_energy=0.05,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=time.time(),
        )
        assert verdict is False

    def test_rejects_echo_in_cooldown_window(self):
        """Matching, low-energy text right after TTS finishes is rejected."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("hello world", baseline_energy=0.01)
        echo_detector.track_tts_finish()

        # The utterance begins immediately after TTS finished.
        started_at = time.time()

        verdict = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.008,  # below the 0.01 baseline
            is_during_tts=False,
            utterance_start_time=started_at,
        )
        assert verdict is True

    def test_accepts_high_energy_in_cooldown(self):
        """Matching text with energy well above baseline is accepted."""
        echo_detector = EchoDetector(energy_spike_threshold=2.0)
        echo_detector.track_tts_start("hello world", baseline_energy=0.01)
        echo_detector.track_tts_finish()

        started_at = time.time()

        verdict = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.05,  # 5x the 0.01 baseline
            is_during_tts=False,
            utterance_start_time=started_at,
        )
        assert verdict is False

    def test_accepts_after_extended_window(self):
        """Matching text is accepted once the echo window has long passed."""
        echo_detector = EchoDetector(echo_tolerance=0.3)
        echo_detector.track_tts_start("hello world")
        echo_detector.track_tts_finish()

        # Place the utterance start ahead of "now" and back-date the TTS
        # finish time so the two are seconds apart.
        started_at = time.time() + 2.0
        echo_detector._last_tts_finish_time = time.time() - 2.0

        verdict = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.01,
            is_during_tts=False,
            utterance_start_time=started_at,
        )
        assert verdict is False

    @pytest.mark.unit
    def test_rejects_echo_during_tts_with_timing_drift(self):
        """Echo is rejected during TTS even if the timing estimate is off.

        If playback runs faster or slower than estimated, timing-based
        segment matching may compare against the wrong part of the TTS text.
        A long fragment taken from elsewhere in the TTS text is still
        expected to be rejected as echo.
        """
        echo_detector = EchoDetector()
        forecast = (
            "the weather tomorrow is expected to be mostly cloudy with a high "
            "of around 8 degrees celsius 46.4 degrees fahrenheit and a low of "
            "2 degrees celsius 35.6 degrees fahrenheit it should be quite breezy"
        )
        echo_detector.track_tts_start(forecast)

        # The utterance nominally starts 2 seconds into playback ...
        started_at = echo_detector._tts_start_time + 2.0

        # ... but the heard fragment is the tail of the TTS text, so the
        # timing estimate and the content disagree.
        tail_fragment = "35.6 degrees fahrenheit it should be quite breezy"

        verdict = echo_detector.should_reject_as_echo(
            heard_text=tail_fragment,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=started_at,
        )
        assert verdict is True, "Should reject echo via full-TTS fallback when segment matching fails"

    @pytest.mark.unit
    def test_accepts_stop_command_during_tts_fallback(self):
        """A one-word 'stop' during TTS is not swallowed as echo.

        Short commands must remain usable to interrupt playback.
        """
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather tomorrow will be sunny and warm")

        verdict = echo_detector.should_reject_as_echo(
            heard_text="stop",
            current_energy=0.05,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=time.time(),
        )
        assert verdict is False, "Stop command should not be rejected during TTS"
|
|
|
|
|
|
class TestLeadingEchoCleanup:
    """Covers cleanup_leading_echo, which trims an echoed prefix from heard text."""

    def test_cleanup_leading_overlap(self):
        """Leading words that repeat the end of the TTS text are dropped."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather today is sunny"

        cleaned = echo_detector.cleanup_leading_echo("is sunny what time is it")
        assert cleaned == "what time is it"

    def test_no_cleanup_when_no_overlap(self):
        """Text sharing no prefix with the TTS tail is returned unchanged."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"

        spoken_by_user = "what time is it"
        assert echo_detector.cleanup_leading_echo(spoken_by_user) == spoken_by_user

    def test_no_cleanup_short_overlap(self):
        """A single overlapping word is not enough to trigger cleanup."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"

        one_word_overlap = "nice what time is it"
        assert echo_detector.cleanup_leading_echo(one_word_overlap) == one_word_overlap

    def test_cleanup_requires_remainder(self):
        """If the whole heard text is echo, it is returned unchanged."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"

        echo_only = "is nice"  # nothing would remain after trimming
        assert echo_detector.cleanup_leading_echo(echo_only) == echo_only

    def test_cleanup_fuzzy_word_match(self):
        """Cleanup tolerates Whisper mis-transcriptions (Tbilisi vs T-Valisi)."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = (
            "I don't have a direct way to predict tomorrow's weather, "
            "but I can check for you. Let me search for the forecast in Tbilisi."
        )

        echo_plus_reply = (
            "i don't have a direct way to predict tomorrow's weather "
            "but i can check for you let me search for the forecast in t-valisi "
            "you already searched so i can see the tool calls"
        )
        cleaned = echo_detector.cleanup_leading_echo(echo_plus_reply)
        assert "you already searched" in cleaned
        assert "forecast" not in cleaned
|
|
|
|
|
|
class TestHotWindowEchoDetection:
    """Tests for echo detection in hot window mode."""

    def test_higher_threshold_in_hot_window(self):
        """A loosely related follow-up is accepted when in_hot_window=True."""
        detector = EchoDetector()
        detector.track_tts_start("tell me about the weather today")
        detector.track_tts_finish()

        utterance_start = time.time()

        # Text that's somewhat similar to the TTS output but not the same.
        result = detector.should_reject_as_echo(
            heard_text="tell me more",
            current_energy=0.01,
            is_during_tts=False,
            utterance_start_time=utterance_start,
            in_hot_window=True,  # Hot window mode
        )
        # "tell me more" is different enough that it must not be rejected.
        assert result is False

    def test_partial_echo_from_long_tts(self):
        """Detects partial echo from a long TTS response.

        This tests the scenario where TTS outputs a long response and Whisper
        picks up only a portion of it, potentially with transcription errors.
        Common in rooms with echo/reverb at higher volumes.
        """
        detector = EchoDetector()
        # Simulate a long weather response.
        tts_text = (
            "You're in London, and I've got the latest weather update for you: "
            "it's currently overcast with light rain showers, and the temperature "
            "is around 8 degrees celsius at 18:48 UTC. I'd recommend grabbing an "
            "umbrella to stay dry. Would you like me to suggest any outdoor "
            "activities or provide more weather details?"
        )
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Partial echo that Whisper picked up (with some transcription variations).
        partial_echo = "the temperature is around 8 degrees celsius. I'd recommend grabbing an umbrella"

        # Should detect as echo - this is clearly part of the TTS output.
        result = detector._check_text_similarity(partial_echo, tts_text, threshold=70)
        assert result is True, "Should detect partial echo at threshold 70"

    def test_echo_with_whisper_transcription_errors(self):
        """Detects echo even with Whisper transcription errors.

        Whisper sometimes mishears numbers and times (e.g., "18:48" as "1848").
        The fuzzy matching should still catch these as echo.
        """
        detector = EchoDetector()
        tts_text = "the temperature is 8 degrees celsius at 18:48 UTC"
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Whisper transcription with errors.
        heard_with_errors = "the temperature is around 8 degrees celsius at 1848 UTC"

        # Should still detect similarity despite transcription errors.
        result = detector._check_text_similarity(heard_with_errors, tts_text, threshold=70)
        assert result is True, "Should detect echo despite transcription errors"

    def test_echo_question_from_tts(self):
        """Detects when a question from TTS is echoed back.

        TTS often ends with questions like "Would you like more details?"
        These should be detected as echo, not new user queries.
        """
        detector = EchoDetector()
        tts_text = (
            "The weather is nice today. Would you like me to suggest "
            "any outdoor activities or provide more weather details?"
        )
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Echo of the question portion.
        echoed_question = "would you like me to suggest any outdoor activities"

        result = detector._check_text_similarity(echoed_question, tts_text, threshold=70)
        assert result is True, "Should detect echoed question from TTS"

    def test_accepts_genuine_followup_in_hot_window(self):
        """Accepts genuine follow-up that differs from TTS content."""
        detector = EchoDetector()
        tts_text = "The weather in London is currently overcast with rain"
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        utterance_start = time.time()

        # Genuine follow-up question - different content.
        followup = "what about tomorrow's forecast"

        result = detector.should_reject_as_echo(
            heard_text=followup,
            current_energy=0.03,
            is_during_tts=False,
            utterance_start_time=utterance_start,
            in_hot_window=True,
        )
        assert result is False, "Should accept genuine follow-up question"

    def test_threshold_70_catches_partial_matches(self):
        """Verifies threshold 70 catches partial echo matches.

        A prefix of the TTS text must be reported as echo at threshold 70.
        The same comparison at threshold 92 is only exercised, not asserted.
        """
        detector = EchoDetector()
        tts_text = "London has about 8 hours of daylight in winter months"

        # Partial echo: the TTS text with its tail missing.
        partial_echo = "London has about 8 hours of daylight"

        # At threshold 70, should match (this is clearly a partial echo).
        result_70 = detector._check_text_similarity(partial_echo, tts_text, threshold=70)
        assert result_70 is True, "Threshold 70 should catch partial echo"

        # At threshold 92 the outcome depends on the fuzzy match algorithm, so
        # the call is made only to confirm it does not raise; the result is
        # intentionally not bound or asserted.
        detector._check_text_similarity(partial_echo, tts_text, threshold=92)
|
|
|
|
|
|
class TestSalvageDuringTTS:
    """Covers cleanup_leading_echo_during_tts.

    These cases feed the detector utterances that begin while TTS is still
    playing, i.e. a mix of TTS echo followed by real user speech, and check
    which part is returned.
    """

    @pytest.fixture
    def detector(self):
        """Provide a fresh EchoDetector for each test."""
        return EchoDetector()

    def test_salvages_user_speech_after_echo(self, detector):
        """User speech following an echoed TTS tail is kept, the echo is not.

        Scenario: the user starts talking during TTS, so the microphone
        captures the end of the TTS output plus the user's own question.
        """
        detector._last_tts_text = (
            "According to the BBC Weather forecast, tomorrow in Kensington is expected "
            "to be quite gloomy with overcast conditions. You might want to bundle up "
            "and plan your outdoor activities accordingly."
        )
        detector._tts_start_time = 1000.0

        # End of the TTS output followed by what the user actually said.
        mixed = (
            "You might want to bundle up and plan your outdoor activities accordingly. "
            "Okay, let's switch the topic now. I want to talk about philosophy."
        )

        # The utterance began 10 seconds after TTS started.
        salvaged = detector.cleanup_leading_echo_during_tts(
            mixed, tts_rate=200, utterance_start_time=1010.0
        )
        lowered = salvaged.lower()

        assert "bundle up" not in lowered, "Echo portion should be removed"
        assert "philosophy" in lowered, "User's actual question should be preserved"
        assert "switch the topic" in lowered, "User's speech should be preserved"

    def test_salvage_with_timing_mismatch(self, detector):
        """Salvage still works when the timing estimate is inaccurate.

        Real-world scenario: microphone timing does not line up exactly with
        TTS timing because of processing delays, pre-roll buffering, etc.
        """
        detector._last_tts_text = (
            "It's going to be quite chilly. You might want to bundle up "
            "and plan your outdoor activities accordingly."
        )
        detector._tts_start_time = 1000.0

        # TTS tail plus the user's question.
        mixed = "plan your outdoor activities accordingly. What do you think life is about?"

        salvaged = detector.cleanup_leading_echo_during_tts(
            mixed, tts_rate=200, utterance_start_time=1005.0
        )
        lowered = salvaged.lower()

        assert "outdoor activities" not in lowered, "Echo should be removed"
        assert "life is about" in lowered, "User's question should be preserved"

    def test_no_salvage_when_no_overlap(self, detector):
        """Text unrelated to the TTS output is returned untouched."""
        detector._last_tts_text = "The weather is nice today"
        detector._tts_start_time = 1000.0

        question = "What time is it?"
        salvaged = detector.cleanup_leading_echo_during_tts(
            question, tts_rate=200, utterance_start_time=1005.0
        )

        assert salvaged == question, "Should return original when no echo overlap"

    def test_no_salvage_when_all_echo(self, detector):
        """When the whole utterance is echo, the original text is returned."""
        detector._last_tts_text = "The weather is nice and sunny today"
        detector._tts_start_time = 1000.0

        # Everything heard is the tail of the TTS text; no user speech follows.
        echo_only = "nice and sunny today"
        salvaged = detector.cleanup_leading_echo_during_tts(
            echo_only, tts_rate=200, utterance_start_time=1005.0
        )

        # Nothing would remain after trimming, so the input comes back as-is.
        assert salvaged == echo_only

    def test_echo_not_in_salvaged_output(self, detector):
        """No echoed phrase may leak into the salvaged text.

        This is the critical guarantee: text returned as the user's speech
        must not contain words that came from the TTS output.
        """
        detector._last_tts_text = (
            "According to the forecast, it will rain tomorrow. "
            "Would you like me to suggest indoor activities?"
        )
        detector._tts_start_time = 1000.0

        mixed = "Would you like me to suggest indoor activities? No thanks, tell me about philosophy instead."
        salvaged = detector.cleanup_leading_echo_during_tts(
            mixed, tts_rate=200, utterance_start_time=1008.0
        )
        lowered = salvaged.lower()

        # Echoed phrases must be gone ...
        assert "suggest indoor activities" not in lowered, "Echo phrase must not be in output"
        assert "would you like" not in lowered, "Echo phrase must not be in output"
        # ... while the user's own request survives.
        assert "philosophy" in lowered, "User's request should be preserved"
|
|
|
|
|
|
class TestRealWorldSalvageScenarios:
    """Regression cases for salvage behaviour taken from real usage.

    Situations covered:
    - Temperature notation differences (5.7°C vs "5.7 degrees Celsius")
    - The user speaking right after the TTS output, so both are captured
    - Whisper transcriptions that deviate from the TTS text
    """

    @pytest.fixture
    def detector(self):
        """Provide a fresh EchoDetector for each test."""
        return EchoDetector()

    def test_temperature_notation_mismatch(self, detector):
        """User speech is salvaged when the temperature is transcribed in words.

        Real scenario: TTS says "5.7°C" while Whisper writes
        "5.7 degrees Celsius", so a naive word-by-word comparison of the two
        texts does not line up.
        """
        detector._last_tts_text = (
            "It's going to be a bit chilly tomorrow in Kensington, "
            "with overcast skies and a temperature around 5.7°C."
        )

        # The echoed sentence (temperature spelled out) plus the user's remark.
        mixed = (
            "It's going to be a bit chilly tomorrow in Kensington "
            "with overcast skies and a temperature around 5.7 degrees Celsius. "
            "Nice, you remembered not to say it in Fahrenheit."
        )

        lowered = detector.cleanup_leading_echo(mixed).lower()

        # The user's follow-up survives ...
        assert "nice" in lowered, "User's follow-up should be preserved"
        assert "fahrenheit" in lowered, "User's comment should be preserved"
        # ... and the echoed sentence does not.
        assert "chilly tomorrow" not in lowered, "Echo should be removed"

    def test_user_appends_speech_to_full_tts_echo(self, detector):
        """The full TTS sentence plus a user reply yields only the reply.

        The microphone captures the entire TTS output followed by the user's
        response; cleanup_leading_echo should strip the TTS part.
        """
        detector._last_tts_text = "Would you like some help finding one?"

        mixed = "Would you like some help finding one? No thanks, I'm good."

        lowered = detector.cleanup_leading_echo(mixed).lower()

        assert "no thanks" in lowered, "User's response should be preserved"
        # Accept the reply with or without its apostrophe.
        assert any(variant in lowered for variant in ("i'm good", "im good")), "User's response should be preserved"
        assert "would you like" not in lowered, "Echo should be removed"

    def test_salvage_preserves_user_question(self, detector):
        """A follow-up question after the echoed sentence is kept."""
        detector._last_tts_text = "The weather tomorrow will be cloudy with a high of 12 degrees."

        mixed = "The weather tomorrow will be cloudy with a high of 12 degrees. What about the day after?"

        lowered = detector.cleanup_leading_echo(mixed).lower()

        assert "what about" in lowered, "User's question should be preserved"
        assert "day after" in lowered, "User's question should be preserved"
        assert "cloudy" not in lowered, "Echo should be removed"

    def test_no_salvage_when_heard_matches_tts_exactly(self, detector):
        """Heard text identical to the TTS output is returned unchanged.

        Guards against salvaging a stray trailing word out of pure echo.
        """
        sentence = "Would you like some help finding one?"
        detector._last_tts_text = sentence

        # The heard text is the same sentence; there is no user speech in it.
        echo_only = "Would you like some help finding one?"

        cleaned = detector.cleanup_leading_echo(echo_only)

        assert cleaned == echo_only, "Should return original when no user speech to salvage"

    def test_salvage_with_minor_transcription_errors(self, detector):
        """Cleanup copes with small transcription deviations without failing."""
        detector._last_tts_text = "I can see you're interested in finding out more about this topic."

        # Punctuation and the apostrophe are missing from the transcription.
        mixed = "I can see youre interested in finding out more about this topic tell me about philosophy"

        cleaned = detector.cleanup_leading_echo(mixed)

        # Only a minimal guarantee is checked here: a value is returned.
        assert cleaned is not None
|
|
|
|
|
|
class TestFullTTSFallbackSalvage:
    """Covers salvaging user speech when heard text matches the full TTS output.

    When heard text contains TTS echo followed by user speech, the utterance
    should not be rejected wholesale; the user's part should be recoverable.

    Bug scenario these tests guard against:
    - TTS: "...Temperature will be around 10°C (50°F). A great day to grab a cuppa."
    - Heard: "50 degrees Fahrenheit. A great day to grab a cup. Tell me a random topic."
    - Previously: the whole utterance was rejected as echo
    - Expected now: "Tell me a random topic" is salvaged from the suffix
    """

    @pytest.fixture
    def detector(self):
        """Provide a fresh EchoDetector for each test."""
        return EchoDetector()

    def test_salvages_user_speech_from_mixed_echo(self, detector):
        """An utterance with user speech after the echo is not rejected.

        The heard text starts with the tail of the TTS output and ends with
        the user's own request.
        """
        spoken = (
            "I think there's been a mix-up! We were just talking about the weather "
            "in Kensington, London. Let me check again. According to the tool, "
            "tomorrow's forecast for Kensington is: Overcast with a chance of light "
            "drizzle. Temperature will be around 10°C (50°F). A great day to grab "
            "a cuppa and enjoy the outdoors."
        )
        detector.track_tts_start(spoken)
        detector._tts_start_time = 1000.0

        # Tail of the TTS output followed by the user's request.
        mixed = (
            "50 degrees Fahrenheit. A great day to grab a cup and enjoy the outdoors. "
            "Fine, yeah. Then tell me a random topic about philosophy."
        )

        rejected = detector.should_reject_as_echo(
            heard_text=mixed,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200,
            utterance_start_time=1012.0,  # 12 s after the pinned TTS start
        )

        assert rejected is False, (
            "Should NOT reject when there's user speech to salvage. "
            "The full-TTS fallback should check for salvageable suffix."
        )

    def test_still_rejects_pure_echo_in_fallback(self, detector):
        """Heard text made up only of TTS content is still rejected."""
        spoken = (
            "I think there's been a mix-up! We were just talking about the weather. "
            "Let me check again. Tomorrow's forecast is overcast with light drizzle. "
            "Temperature will be around 10°C."
        )
        detector.track_tts_start(spoken)
        detector._tts_start_time = 1000.0

        # Only echoed content; the user said nothing.
        echo_only = "Tomorrow's forecast is overcast with light drizzle. Temperature will be around 10 degrees Celsius."

        rejected = detector.should_reject_as_echo(
            heard_text=echo_only,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200,
            utterance_start_time=1005.0,
        )

        assert rejected is True, "Pure echo should still be rejected"

    def test_salvage_suffix_from_echo_returns_user_speech(self, detector):
        """_salvage_suffix_from_echo returns the user's part of a mixed utterance."""
        detector._last_tts_text = "The weather is nice. Would you like to hear more?"
        detector._tts_start_time = 1000.0

        mixed = "Would you like to hear more? No thanks, tell me about philosophy."

        salvaged = detector._salvage_suffix_from_echo(
            mixed, tts_rate=200, utterance_start_time=1005.0
        )

        assert salvaged is not None
        lowered = salvaged.lower()
        assert "philosophy" in lowered, "User speech should be salvaged"
        assert "would you like" not in lowered, "Echo should be removed"

    def test_salvage_returns_none_for_pure_echo(self, detector):
        """_salvage_suffix_from_echo yields nothing new for pure echo."""
        detector._last_tts_text = "The weather is nice today."
        detector._tts_start_time = 1000.0

        # The heard text is exactly the TTS sentence.
        echo_only = "The weather is nice today."

        salvaged = detector._salvage_suffix_from_echo(
            echo_only, tts_rate=200, utterance_start_time=1005.0
        )

        # Either outcome is accepted: no salvage at all, or the input unchanged.
        assert salvaged is None or salvaged == echo_only
|
|
|
|
|
|
class TestRightmostEchoBoundarySalvage:
    """Field regression: a follow-up prefixed by a Whisper-mangled echo tail.

    Captured from a real session on 2026-04-20:
      TTS said:       "The movie Possessor is a psychological thriller that
                       explores themes of surveillance and identity."
      User said:      "Who made it?"
      Whisper heard:  "laws, themes of surveillance and identity. Who made it?"

    The user began speaking inside the 3s follow-up hot window, and Whisper
    merged the echo tail picked up by the microphone with the real follow-up.
    Before the commit that added these tests, the salvage paths either left
    the text unchanged (exact-word cleanup fails because 'laws' does not
    match 'explores') or cut the result down to 'made it?' (fuzzy prefix
    iteration tried the SHORTEST suffix first). Neither is acceptable: the
    complete follow-up, 'Who made it?', has to survive so the intent judge
    can dispatch it.
    """

    @pytest.fixture
    def detector_with_tts(self):
        """Return ``(detector, now)`` with TTS state pinned relative to ``now``.

        The detector is set up as if the TTS sentence started 10 s ago,
        finished 1 s ago and lasted exactly 9 s.
        """
        echo_detector = EchoDetector()
        spoken = (
            "The movie Possessor is a psychological thriller that "
            "explores themes of surveillance and identity."
        )
        reference = time.time()
        echo_detector._last_tts_text = spoken
        echo_detector._tts_start_time = reference - 10.0
        echo_detector._last_tts_finish_time = reference - 1.0
        echo_detector._tts_exact_duration = 9.0
        return echo_detector, reference

    def test_salvages_full_follow_up_after_whisper_mangled_echo_prefix(self, detector_with_tts):
        """The whole follow-up is salvaged and no echo-tail word remains."""
        echo_detector, _reference = detector_with_tts
        mixed = "laws, themes of surveillance and identity. Who made it?"

        salvaged = echo_detector.salvage_after_echo_tail(mixed)

        assert salvaged is not None, "expected a salvage, got None (rejection)"
        lowered = salvaged.lower()
        # Every word of the real follow-up has to be present.
        assert "who" in lowered
        assert "made" in lowered
        assert "it" in lowered
        # No word from the echo tail may leak into the result.
        assert "surveillance" not in lowered
        assert "identity" not in lowered
        assert "themes" not in lowered
        assert "laws" not in lowered

    def test_returns_none_when_heard_is_pure_echo(self, detector_with_tts):
        """Heard text consisting only of the echo tail yields no salvage."""
        echo_detector, _reference = detector_with_tts
        # Nothing follows the echo tail, so there is nothing to salvage.
        salvaged = echo_detector.salvage_after_echo_tail("themes of surveillance and identity")
        assert salvaged is None

    def test_returns_none_when_heard_shares_nothing_with_tts(self, detector_with_tts):
        """Heard text with no echo prefix yields no salvage."""
        echo_detector, _reference = detector_with_tts
        # With no echo prefix the caller keeps the text as-is.
        salvaged = echo_detector.salvage_after_echo_tail("what is the weather tomorrow in London")
        assert salvaged is None
|