Files
javis_bot/tests/test_echo_detection.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

822 lines
34 KiB
Python

"""
Tests for echo detection module.
These tests verify that TTS echo detection properly identifies
when heard audio is an echo of TTS output vs genuine user speech.
"""
import time
import pytest
from jarvis.listening.echo_detection import EchoDetector
class TestTextNormalization:
    """Exercises ``EchoDetector._normalize_for_comparison``.

    The inputs contain degree symbols and parentheses; the assertions look
    for the spelled-out wording in the normalized string.
    """

    def test_normalize_celsius_symbol(self):
        """'9°C' is rewritten as '9 degrees celsius' with no '°' left over."""
        normalized = EchoDetector()._normalize_for_comparison("It's 9°C outside")

        assert "9 degrees celsius" in normalized
        assert "°" not in normalized

    def test_normalize_fahrenheit_symbol(self):
        """'48°F' is rewritten as '48 degrees fahrenheit'."""
        normalized = EchoDetector()._normalize_for_comparison("It's 48°F")

        assert "48 degrees fahrenheit" in normalized

    def test_normalize_generic_degree(self):
        """A degree symbol without a unit letter becomes plain 'degrees'."""
        normalized = EchoDetector()._normalize_for_comparison("Turn it to 180°")

        assert "180 degrees" in normalized

    def test_normalize_with_space(self):
        """A space between the number and '°C' does not stop normalization."""
        normalized = EchoDetector()._normalize_for_comparison("It's 9 °C")

        assert "9 degrees celsius" in normalized

    def test_normalize_removes_parentheses(self):
        """Parentheses are dropped while both temperatures are expanded."""
        normalized = EchoDetector()._normalize_for_comparison("It's 48°F (9°C)")

        # Neither bracket may survive normalization.
        for bracket in "()":
            assert bracket not in normalized
        # Both readings must appear in their spelled-out form.
        for spoken in ("48 degrees fahrenheit", "9 degrees celsius"):
            assert spoken in normalized
class TestTextSimilarity:
    """Exercises ``EchoDetector._check_text_similarity`` on heard/TTS pairs."""

    def test_exact_match(self):
        """Identical strings count as a match."""
        matcher = EchoDetector()
        outcome = matcher._check_text_similarity("hello world", "hello world")
        assert outcome is True

    def test_case_insensitive_match(self):
        """A difference in letter case alone does not prevent a match."""
        matcher = EchoDetector()
        outcome = matcher._check_text_similarity("Hello World", "hello world")
        assert outcome is True

    def test_partial_match(self):
        """Heard text that is a fragment of the TTS text counts as a match."""
        matcher = EchoDetector()
        outcome = matcher._check_text_similarity(
            "sunny and warm",
            "the weather today is sunny and warm",
        )
        assert outcome is True

    def test_no_match(self):
        """Unrelated sentences are not reported as a match."""
        matcher = EchoDetector()
        outcome = matcher._check_text_similarity("what time is it", "the weather is nice")
        assert outcome is False

    def test_degree_symbol_match(self):
        """TTS text with '°C' matches a transcription that spells it out."""
        matcher = EchoDetector()
        spoken = "It's currently 9°C outside"
        transcribed = "It's currently 9 degrees celsius outside"
        assert matcher._check_text_similarity(transcribed, spoken) is True

    def test_empty_strings(self):
        """An empty string on either side (or both) never matches."""
        matcher = EchoDetector()
        # (heard, tts) pairs, checked in this order.
        for heard, tts in (("", "hello"), ("hello", ""), ("", "")):
            assert matcher._check_text_similarity(heard, tts) is False

    def test_higher_threshold_in_hot_window(self):
        """The ``threshold`` argument decides whether a near-miss matches.

        The hot window uses a higher threshold (92) to reduce false
        rejections; here a misspelled phrase is compared at 80 and at 95 to
        show the parameter changes the verdict.
        """
        matcher = EchoDetector()
        reference = "the weather forecast"
        # Misspelled on purpose: close to the reference, but not identical.
        # The pair is expected to score roughly 89-92.
        misheard = "the weether forcast"

        matched_at_80 = matcher._check_text_similarity(misheard, reference, threshold=80)
        matched_at_95 = matcher._check_text_similarity(misheard, reference, threshold=95)

        # 80 sits below the expected score, so the pair matches ...
        assert matched_at_80 is True
        # ... while 95 sits above it, so the pair is rejected.
        assert matched_at_95 is False
class TestEchoRejection:
    """Exercises the top-level verdict of ``EchoDetector.should_reject_as_echo``."""

    def test_no_rejection_without_tts(self):
        """With no TTS ever tracked, nothing is treated as echo."""
        echo_detector = EchoDetector()
        rejected = echo_detector.should_reject_as_echo("hello", current_energy=0.01)
        assert rejected is False

    def test_rejects_echo_during_tts(self):
        """Text matching the TTS output is rejected while TTS is playing."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather is nice today")

        rejected = echo_detector.should_reject_as_echo(
            heard_text="nice today",
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200.0,
            # The utterance begins right after TTS starts.
            utterance_start_time=time.time(),
        )

        assert rejected is True

    def test_accepts_different_text_during_tts(self):
        """Non-matching text during TTS is accepted (user interruption)."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather is nice")
        started_at = time.time()

        rejected = echo_detector.should_reject_as_echo(
            heard_text="stop",
            current_energy=0.05,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=started_at,
        )

        assert rejected is False

    def test_rejects_echo_in_cooldown_window(self):
        """Matching, low-energy text right after TTS finishes is rejected."""
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("hello world", baseline_energy=0.01)
        echo_detector.track_tts_finish()

        rejected = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.008,  # Low energy (below baseline * threshold)
            is_during_tts=False,
            # The utterance begins immediately after TTS ends.
            utterance_start_time=time.time(),
        )

        assert rejected is True

    def test_accepts_high_energy_in_cooldown(self):
        """High-energy speech in the cooldown is accepted as a real user."""
        echo_detector = EchoDetector(energy_spike_threshold=2.0)
        echo_detector.track_tts_start("hello world", baseline_energy=0.01)
        echo_detector.track_tts_finish()

        rejected = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.05,  # High energy (5x baseline)
            is_during_tts=False,
            utterance_start_time=time.time(),
        )

        assert rejected is False

    def test_accepts_after_extended_window(self):
        """Speech arriving after the extended echo window is accepted."""
        echo_detector = EchoDetector(echo_tolerance=0.3)
        echo_detector.track_tts_start("hello world")
        echo_detector.track_tts_finish()

        # Place the utterance well after the TTS finish time.
        # NOTE(review): the start is 2s in the future while the finish is
        # back-dated by 2s, so the simulated gap is ~4s - confirm intent.
        started_at = time.time() + 2.0
        echo_detector._last_tts_finish_time = time.time() - 2.0  # TTS finished 2s ago

        rejected = echo_detector.should_reject_as_echo(
            heard_text="hello world",
            current_energy=0.01,
            is_during_tts=False,
            utterance_start_time=started_at,
        )

        assert rejected is False

    @pytest.mark.unit
    def test_rejects_echo_during_tts_with_timing_drift(self):
        """Echo is rejected during TTS even if segment matching misses it.

        If TTS plays faster or slower than expected, timing-based segment
        matching may compare against the wrong part of the TTS text. The
        full-TTS fallback is expected to catch such cases for long
        utterances.
        """
        echo_detector = EchoDetector()
        # A weather forecast response.
        forecast = (
            "the weather tomorrow is expected to be mostly cloudy with a high "
            "of around 8 degrees celsius 46.4 degrees fahrenheit and a low of "
            "2 degrees celsius 35.6 degrees fahrenheit it should be quite breezy"
        )
        echo_detector.track_tts_start(forecast)

        # The utterance is stamped 2 seconds into TTS, yet the audio it
        # carries comes from much later in the response (timing drift).
        started_at = echo_detector._tts_start_time + 2.0
        # A fragment from late in the forecast; the segment chosen by timing
        # will not contain it.
        late_fragment = "35.6 degrees fahrenheit it should be quite breezy"

        rejected = echo_detector.should_reject_as_echo(
            heard_text=late_fragment,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=started_at,
        )

        # Expected to be rejected via the full-TTS fallback (8 words, 100% similarity).
        assert rejected is True, "Should reject echo via full-TTS fallback when segment matching fails"

    @pytest.mark.unit
    def test_accepts_stop_command_during_tts_fallback(self):
        """Short commands must not be caught by the full-TTS fallback.

        The fallback only applies to utterances longer than 4 words, so a
        one-word command such as 'stop' should still get through during TTS.
        """
        echo_detector = EchoDetector()
        echo_detector.track_tts_start("the weather tomorrow will be sunny and warm")
        started_at = time.time()

        rejected = echo_detector.should_reject_as_echo(
            heard_text="stop",
            current_energy=0.05,
            is_during_tts=True,
            tts_rate=200.0,
            utterance_start_time=started_at,
        )

        assert rejected is False, "Stop command should not be rejected during TTS"
class TestLeadingEchoCleanup:
    """Exercises ``EchoDetector.cleanup_leading_echo``.

    Each test seeds ``_last_tts_text`` directly and checks what the cleanup
    returns for a given heard string.
    """

    def test_cleanup_leading_overlap(self):
        """Leading words that repeat the end of the TTS text are stripped."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather today is sunny"

        cleaned = echo_detector.cleanup_leading_echo("is sunny what time is it")

        assert cleaned == "what time is it"

    def test_no_cleanup_when_no_overlap(self):
        """Text sharing nothing with the TTS output is returned unchanged."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"
        utterance = "what time is it"

        assert echo_detector.cleanup_leading_echo(utterance) == utterance

    def test_no_cleanup_short_overlap(self):
        """A single overlapping word is not enough to trigger cleanup."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"
        # Only the first word ("nice") overlaps with the TTS ending.
        utterance = "nice what time is it"

        assert echo_detector.cleanup_leading_echo(utterance) == utterance

    def test_cleanup_requires_remainder(self):
        """Nothing is stripped when the whole heard text is echo."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = "the weather is nice"
        # Stripping the echo would leave an empty string.
        utterance = "is nice"

        assert echo_detector.cleanup_leading_echo(utterance) == utterance

    def test_cleanup_fuzzy_word_match(self):
        """Cleanup tolerates Whisper misspellings (e.g. Tbilisi vs T-Valisi)."""
        echo_detector = EchoDetector()
        echo_detector._last_tts_text = (
            "I don't have a direct way to predict tomorrow's weather, "
            "but I can check for you. Let me search for the forecast in Tbilisi."
        )
        # The echoed response (city name mangled) followed by real user speech.
        utterance = (
            "i don't have a direct way to predict tomorrow's weather "
            "but i can check for you let me search for the forecast in t-valisi "
            "you already searched so i can see the tool calls"
        )

        cleaned = echo_detector.cleanup_leading_echo(utterance)

        assert "you already searched" in cleaned
        assert "forecast" not in cleaned
class TestHotWindowEchoDetection:
    """Tests for echo detection in hot window mode.

    Covers both the end-to-end verdict (``should_reject_as_echo`` with
    ``in_hot_window=True``) and the raw similarity check at the thresholds
    used around the hot window (70 and 92).
    """

    def test_higher_threshold_in_hot_window(self):
        """Uses stricter matching in hot window to allow more follow-up speech."""
        detector = EchoDetector()
        detector.track_tts_start("tell me about the weather today")
        detector.track_tts_finish()
        utterance_start = time.time()

        # Text that's somewhat similar to the TTS output but not the same.
        result = detector.should_reject_as_echo(
            heard_text="tell me more",
            current_energy=0.01,
            is_during_tts=False,
            utterance_start_time=utterance_start,
            in_hot_window=True,  # Hot window mode
        )

        # Should be less likely to reject in hot window due to higher threshold
        # (the actual behavior depends on similarity scores).
        assert result is False  # "tell me more" is different enough

    def test_partial_echo_from_long_tts(self):
        """Detects partial echo from a long TTS response.

        This tests the scenario where TTS outputs a long response and Whisper
        picks up only a portion of it, potentially with transcription errors.
        Common in rooms with echo/reverb at higher volumes.
        """
        detector = EchoDetector()
        # Simulate a long weather response.
        tts_text = (
            "You're in London, and I've got the latest weather update for you: "
            "it's currently overcast with light rain showers, and the temperature "
            "is around 8 degrees celsius at 18:48 UTC. I'd recommend grabbing an "
            "umbrella to stay dry. Would you like me to suggest any outdoor "
            "activities or provide more weather details?"
        )
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Partial echo that Whisper picked up (with some transcription variations).
        partial_echo = "the temperature is around 8 degrees celsius. I'd recommend grabbing an umbrella"

        # Only the similarity check is exercised here, so no utterance
        # timestamp is needed.
        result = detector._check_text_similarity(partial_echo, tts_text, threshold=70)

        # Should detect as echo - this is clearly part of the TTS output.
        assert result is True, "Should detect partial echo at threshold 70"

    def test_echo_with_whisper_transcription_errors(self):
        """Detects echo even with Whisper transcription errors.

        Whisper sometimes mishears numbers and times (e.g., "18:48" as "1848").
        The fuzzy matching should still catch these as echo.
        """
        detector = EchoDetector()
        tts_text = "the temperature is 8 degrees celsius at 18:48 UTC"
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Whisper transcription with errors.
        heard_with_errors = "the temperature is around 8 degrees celsius at 1848 UTC"

        # Should still detect similarity despite transcription errors.
        result = detector._check_text_similarity(heard_with_errors, tts_text, threshold=70)
        assert result is True, "Should detect echo despite transcription errors"

    def test_echo_question_from_tts(self):
        """Detects when a question from TTS is echoed back.

        TTS often ends with questions like "Would you like more details?"
        These should be detected as echo, not new user queries.
        """
        detector = EchoDetector()
        tts_text = (
            "The weather is nice today. Would you like me to suggest "
            "any outdoor activities or provide more weather details?"
        )
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()

        # Echo of the question portion.
        echoed_question = "would you like me to suggest any outdoor activities"

        result = detector._check_text_similarity(echoed_question, tts_text, threshold=70)
        assert result is True, "Should detect echoed question from TTS"

    def test_accepts_genuine_followup_in_hot_window(self):
        """Accepts genuine follow-up that differs from TTS content."""
        detector = EchoDetector()
        tts_text = "The weather in London is currently overcast with rain"
        detector.track_tts_start(tts_text)
        detector.track_tts_finish()
        utterance_start = time.time()

        # Genuine follow-up question - different content.
        followup = "what about tomorrow's forecast"

        result = detector.should_reject_as_echo(
            heard_text=followup,
            current_energy=0.03,
            is_during_tts=False,
            utterance_start_time=utterance_start,
            in_hot_window=True,
        )

        assert result is False, "Should accept genuine follow-up question"

    def test_threshold_70_catches_partial_matches(self):
        """Verifies threshold 70 catches partial echo matches.

        When using threshold 70 in hot window for fast rejection,
        partial echoes with ~75% similarity should be caught.
        """
        detector = EchoDetector()
        tts_text = "London has about 8 hours of daylight in winter months"
        # Partial echo with some differences.
        partial_echo = "London has about 8 hours of daylight"

        # At threshold 70, should match (this is clearly a partial echo).
        result_70 = detector._check_text_similarity(partial_echo, tts_text, threshold=70)
        assert result_70 is True, "Threshold 70 should catch partial echo"

        # At threshold 92 (default hot window) the verdict depends on the
        # fuzzy match algorithm, and ambiguous cases are left to the intent
        # judge. The call is kept purely as a smoke check that the stricter
        # threshold does not raise; its result is deliberately not asserted.
        detector._check_text_similarity(partial_echo, tts_text, threshold=92)
class TestSalvageDuringTTS:
    """Exercises ``EchoDetector.cleanup_leading_echo_during_tts``.

    The method under test is the salvage step for utterances that begin
    while TTS is still playing, i.e. a mix of echo and real user speech.
    Each test seeds ``_last_tts_text`` and ``_tts_start_time`` by hand.
    """

    @pytest.fixture
    def detector(self):
        return EchoDetector()

    def test_salvages_user_speech_after_echo(self, detector):
        """User speech following a TTS echo is extracted.

        Scenario: the user starts talking during TTS, so the mic records the
        tail of the TTS response followed by the user's actual question.
        """
        detector._last_tts_text = (
            "According to the BBC Weather forecast, tomorrow in Kensington is expected "
            "to be quite gloomy with overcast conditions. You might want to bundle up "
            "and plan your outdoor activities accordingly."
        )
        detector._tts_start_time = 1000.0
        # Tail of the TTS response, then what the user really said.
        captured = (
            "You might want to bundle up and plan your outdoor activities accordingly. "
            "Okay, let's switch the topic now. I want to talk about philosophy."
        )

        # The utterance began 10 seconds after TTS started.
        salvaged = detector.cleanup_leading_echo_during_tts(
            captured, tts_rate=200, utterance_start_time=1010.0
        )
        salvaged_lower = salvaged.lower()

        # The echo goes, the user's words stay.
        assert "bundle up" not in salvaged_lower, "Echo portion should be removed"
        assert "philosophy" in salvaged_lower, "User's actual question should be preserved"
        assert "switch the topic" in salvaged_lower, "User's speech should be preserved"

    def test_salvage_with_timing_mismatch(self, detector):
        """Salvage still works when the timing estimate is off.

        Real-world scenario: mic timing never lines up exactly with TTS
        timing because of audio processing delays, pre-roll buffer, etc.
        """
        detector._last_tts_text = (
            "It's going to be quite chilly. You might want to bundle up "
            "and plan your outdoor activities accordingly."
        )
        detector._tts_start_time = 1000.0
        # Tail of the TTS response plus the user's question. The timing
        # estimate will be wrong, so the full-text fallback has to find it.
        captured = "plan your outdoor activities accordingly. What do you think life is about?"

        salvaged = detector.cleanup_leading_echo_during_tts(
            captured, tts_rate=200, utterance_start_time=1005.0
        )
        salvaged_lower = salvaged.lower()

        assert "outdoor activities" not in salvaged_lower, "Echo should be removed"
        assert "life is about" in salvaged_lower, "User's question should be preserved"

    def test_no_salvage_when_no_overlap(self, detector):
        """Heard text with no TTS overlap comes back untouched."""
        detector._last_tts_text = "The weather is nice today"
        detector._tts_start_time = 1000.0
        captured = "What time is it?"

        salvaged = detector.cleanup_leading_echo_during_tts(
            captured, tts_rate=200, utterance_start_time=1005.0
        )

        assert salvaged == captured, "Should return original when no echo overlap"

    def test_no_salvage_when_all_echo(self, detector):
        """An utterance that is entirely echo comes back untouched."""
        detector._last_tts_text = "The weather is nice and sunny today"
        detector._tts_start_time = 1000.0
        # Matches the end of the TTS text in full, so no user speech remains.
        captured = "nice and sunny today"

        salvaged = detector.cleanup_leading_echo_during_tts(
            captured, tts_rate=200, utterance_start_time=1005.0
        )

        # With nothing left after stripping, the original is expected back.
        assert salvaged == captured

    def test_echo_not_in_salvaged_output(self, detector):
        """No echo text may leak into the salvaged output.

        This is the critical check: whatever is handed back must not
        accidentally contain words that came from the TTS response.
        """
        detector._last_tts_text = (
            "According to the forecast, it will rain tomorrow. "
            "Would you like me to suggest indoor activities?"
        )
        detector._tts_start_time = 1000.0
        captured = "Would you like me to suggest indoor activities? No thanks, tell me about philosophy instead."

        salvaged = detector.cleanup_leading_echo_during_tts(
            captured, tts_rate=200, utterance_start_time=1008.0
        )
        salvaged_lower = salvaged.lower()

        # Critical: none of the echoed phrases may appear.
        assert "suggest indoor activities" not in salvaged_lower, "Echo phrase must not be in output"
        assert "would you like" not in salvaged_lower, "Echo phrase must not be in output"
        # The user's own request has to survive.
        assert "philosophy" in salvaged_lower, "User's request should be preserved"
class TestRealWorldSalvageScenarios:
    """Regression tests built from salvage failures seen in production.

    Issues captured here:
    - Temperature notation differences (5.7°C vs "5.7 degrees Celsius")
    - The user appending speech to a TTS echo
    - Whisper transcriptions that differ from the TTS text
    """

    @pytest.fixture
    def detector(self):
        return EchoDetector()

    def test_temperature_notation_mismatch(self, detector):
        """User speech is salvaged despite a differently written temperature.

        Real scenario: TTS says "5.7°C" but Whisper writes "5.7 degrees
        Celsius". Salvage used to fail here because word-level matching did
        not line the two up.
        """
        detector._last_tts_text = "It's going to be a bit chilly tomorrow in Kensington, with overcast skies and a temperature around 5.7°C."
        # Same sentence as transcribed by Whisper, then the user's remark.
        captured = "It's going to be a bit chilly tomorrow in Kensington with overcast skies and a temperature around 5.7 degrees Celsius. Nice, you remembered not to say it in Fahrenheit."

        salvaged_lower = detector.cleanup_leading_echo(captured).lower()

        # The follow-up remark must survive ...
        assert "nice" in salvaged_lower, "User's follow-up should be preserved"
        assert "fahrenheit" in salvaged_lower, "User's comment should be preserved"
        # ... and the echoed sentence must not.
        assert "chilly tomorrow" not in salvaged_lower, "Echo should be removed"

    def test_user_appends_speech_to_full_tts_echo(self, detector):
        """The user answers right after TTS and the mic records both.

        The heard text is the complete TTS sentence followed by the user's
        reply; ``cleanup_leading_echo`` should drop the former and return the
        latter.
        """
        detector._last_tts_text = "Would you like some help finding one?"
        # Full TTS sentence immediately followed by the reply.
        captured = "Would you like some help finding one? No thanks, I'm good."

        salvaged_lower = detector.cleanup_leading_echo(captured).lower()

        # The reply has to be there (with or without the apostrophe).
        assert "no thanks" in salvaged_lower, "User's response should be preserved"
        assert "i'm good" in salvaged_lower or "im good" in salvaged_lower, "User's response should be preserved"
        # The echoed question has to be gone.
        assert "would you like" not in salvaged_lower, "Echo should be removed"

    def test_salvage_preserves_user_question(self, detector):
        """A follow-up question after the echo is kept intact."""
        detector._last_tts_text = "The weather tomorrow will be cloudy with a high of 12 degrees."
        captured = "The weather tomorrow will be cloudy with a high of 12 degrees. What about the day after?"

        salvaged_lower = detector.cleanup_leading_echo(captured).lower()

        assert "what about" in salvaged_lower, "User's question should be preserved"
        assert "day after" in salvaged_lower, "User's question should be preserved"
        assert "cloudy" not in salvaged_lower, "Echo should be removed"

    def test_no_salvage_when_heard_matches_tts_exactly(self, detector):
        """Heard text identical to the TTS text is returned unchanged.

        Guards against salvaging a stray trailing word out of pure echo.
        """
        detector._last_tts_text = "Would you like some help finding one?"
        # Identical to the TTS text: there is no user speech in it.
        captured = "Would you like some help finding one?"

        salvaged = detector.cleanup_leading_echo(captured)

        # Full echo, so the original string is expected back.
        assert salvaged == captured, "Should return original when no user speech to salvage"

    def test_salvage_with_minor_transcription_errors(self, detector):
        """Minor Whisper transcription errors do not break the salvage call."""
        detector._last_tts_text = "I can see you're interested in finding out more about this topic."
        # Whisper may drop punctuation or introduce small differences.
        captured = "I can see youre interested in finding out more about this topic tell me about philosophy"

        salvaged = detector.cleanup_leading_echo(captured)

        # Whether the request is actually salvaged depends on how far the
        # transcription drifts; the minimum requirement is a non-None result.
        assert salvaged is not None
class TestFullTTSFallbackSalvage:
    """Covers salvaging user speech inside the full-TTS fallback path.

    The full-TTS fallback (threshold 70) exists to catch echoes whose timing
    has drifted too far for segment matching. When the heard text is TTS echo
    followed by user speech, the user speech should be salvaged rather than
    the whole utterance being rejected.

    Real bug scenario:
    - TTS: "...Temperature will be around 10°C (50°F). A great day to grab a cuppa."
    - Heard: "50 degrees Fahrenheit. A great day to grab a cup. Tell me a random topic."
    - OLD behavior: Rejected entire utterance as echo (74.6% similarity to full TTS)
    - NEW behavior: Salvage "Tell me a random topic" from the suffix
    """

    @pytest.fixture
    def detector(self):
        return EchoDetector()

    def test_salvages_user_speech_from_mixed_echo(self, detector):
        """An utterance with user speech after the echo is not rejected.

        The similarity match finds the echoed prefix, but the trailing user
        speech should be salvaged instead of discarded.
        """
        response = (
            "I think there's been a mix-up! We were just talking about the weather "
            "in Kensington, London. Let me check again. According to the tool, "
            "tomorrow's forecast for Kensington is: Overcast with a chance of light "
            "drizzle. Temperature will be around 10°C (50°F). A great day to grab "
            "a cuppa and enjoy the outdoors."
        )
        detector.track_tts_start(response)
        detector._tts_start_time = 1000.0
        # Tail of the response followed by what the user said.
        captured = (
            "50 degrees Fahrenheit. A great day to grab a cup and enjoy the outdoors. "
            "Fine, yeah. Then tell me a random topic about philosophy."
        )

        rejected = detector.should_reject_as_echo(
            heard_text=captured,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200,
            utterance_start_time=1012.0,  # Near end of TTS
        )

        # There is salvageable user speech, so rejection would be wrong.
        assert rejected is False, (
            "Should NOT reject when there's user speech to salvage. "
            "The full-TTS fallback should check for salvageable suffix."
        )

    def test_still_rejects_pure_echo_in_fallback(self, detector):
        """Echo with no user speech is still rejected by the fallback."""
        response = (
            "I think there's been a mix-up! We were just talking about the weather. "
            "Let me check again. Tomorrow's forecast is overcast with light drizzle. "
            "Temperature will be around 10°C."
        )
        detector.track_tts_start(response)
        detector._tts_start_time = 1000.0
        # Only echoed content, nothing from the user.
        captured = "Tomorrow's forecast is overcast with light drizzle. Temperature will be around 10 degrees Celsius."

        rejected = detector.should_reject_as_echo(
            heard_text=captured,
            current_energy=0.01,
            is_during_tts=True,
            tts_rate=200,
            utterance_start_time=1005.0,
        )

        assert rejected is True, "Pure echo should still be rejected"

    def test_salvage_suffix_from_echo_returns_user_speech(self, detector):
        """``_salvage_suffix_from_echo`` hands back the user's part."""
        detector._last_tts_text = "The weather is nice. Would you like to hear more?"
        detector._tts_start_time = 1000.0
        captured = "Would you like to hear more? No thanks, tell me about philosophy."

        salvaged = detector._salvage_suffix_from_echo(
            captured, tts_rate=200, utterance_start_time=1005.0
        )

        assert salvaged is not None
        salvaged_lower = salvaged.lower()
        assert "philosophy" in salvaged_lower, "User speech should be salvaged"
        assert "would you like" not in salvaged_lower, "Echo should be removed"

    def test_salvage_returns_none_for_pure_echo(self, detector):
        """``_salvage_suffix_from_echo`` salvages nothing from pure echo."""
        detector._last_tts_text = "The weather is nice today."
        detector._tts_start_time = 1000.0
        # Identical to the TTS text, so there is no suffix to salvage.
        captured = "The weather is nice today."

        salvaged = detector._salvage_suffix_from_echo(
            captured, tts_rate=200, utterance_start_time=1005.0
        )

        # Either "nothing salvaged" (None) or the untouched original is fine.
        assert salvaged is None or salvaged == captured
class TestRightmostEchoBoundarySalvage:
    """Field regression: follow-up that starts with a Whisper-mangled echo tail.

    Captured from a real session on 2026-04-20:

        TTS said: "The movie Possessor is a psychological thriller that
                   explores themes of surveillance and identity."
        User said: "Who made it?"
        Whisper heard: "laws, themes of surveillance and identity. Who made it?"

    The user started speaking inside the 3s follow-up hot window, and
    Whisper merged the mic-captured echo tail with the real follow-up.
    Every salvage path in the codebase before this commit either returned
    the text unchanged (exact-word cleanup - fails because 'laws' doesn't
    match 'explores') or truncated the salvage to just 'made it?' (fuzzy
    prefix iteration picks the SHORTEST suffix first). Both are wrong:
    the whole follow-up - 'Who made it?' - must survive so the intent
    judge can dispatch it.
    """

    @pytest.fixture
    def detector_with_tts(self):
        """Return ``(detector, now)`` with the Possessor TTS state pre-seeded.

        The detector's private timing fields are set by hand so that TTS
        started 10s before ``now``, lasted 9s, and finished 1s before ``now``.
        """
        d = EchoDetector()
        tts = (
            "The movie Possessor is a psychological thriller that "
            "explores themes of surveillance and identity."
        )
        # Uses the module-level ``time`` import; a single timestamp keeps the
        # start/finish offsets consistent with each other.
        now = time.time()
        d._last_tts_text = tts
        d._tts_start_time = now - 10.0
        d._last_tts_finish_time = now - 1.0
        d._tts_exact_duration = 9.0
        return d, now

    def test_salvages_full_follow_up_after_whisper_mangled_echo_prefix(self, detector_with_tts):
        """The whole follow-up survives and none of the echo tail leaks."""
        detector, _now = detector_with_tts
        heard = "laws, themes of surveillance and identity. Who made it?"

        result = detector.salvage_after_echo_tail(heard)

        assert result is not None, "expected a salvage, got None (rejection)"
        lowered = result.lower()

        # All three words of the real follow-up must survive the salvage.
        # Compare whole words: a substring test for "it" would also be
        # satisfied by any longer word that merely contains those letters.
        salvaged_words = {token.strip(".,!?") for token in lowered.split()}
        for expected_word in ("who", "made", "it"):
            assert expected_word in salvaged_words

        # None of the echo-tail filler should leak through. Substring checks
        # are intentional here: they are the stricter form for absence.
        for echo_word in ("surveillance", "identity", "themes", "laws"):
            assert echo_word not in lowered

    def test_returns_none_when_heard_is_pure_echo(self, detector_with_tts):
        """Pure echo tail yields ``None`` because nothing follows it."""
        detector, _now = detector_with_tts
        heard = "themes of surveillance and identity"

        # Nothing non-echo after the tail - nothing to salvage.
        result = detector.salvage_after_echo_tail(heard)

        assert result is None

    def test_returns_none_when_heard_shares_nothing_with_tts(self, detector_with_tts):
        """Text unrelated to the TTS output yields ``None``."""
        detector, _now = detector_with_tts
        heard = "what is the weather tomorrow in London"

        # No echo prefix at all - no salvage needed; caller keeps the text as-is.
        result = detector.salvage_after_echo_tail(heard)

        assert result is None