Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_intent_judge.py
+++ b/evals/test_intent_judge.py
@@ -0,0 +1,962 @@
+"""
+Evals for the Intent Judge LLM.
+
+Deduplicated suite: 22 cases covering all behaviour axes from the original 59.
+See PR description / commit message for the dedup rationale.
+"""
+
+import pytest
+from unittest.mock import patch, MagicMock
+from dataclasses import dataclass
+from typing import Optional, List, Union
+
+from helpers import JUDGE_MODEL, JUDGE_BASE_URL, is_judge_llm_available
+
+
+# =============================================================================
+# Test Data
+# =============================================================================
+
+@dataclass
+class IntentJudgeTestCase:
+    """Test case for intent judge evaluation."""
+    name: str
+    transcript: str
+    last_tts_text: str
+    in_hot_window: bool
+    wake_timestamp: Optional[float]
+    expected_directed: bool
+    expected_query_contains: Optional[Union[str, List[str]]]
+    expected_query_not_contains: Optional[Union[str, List[str]]] = None
+    expected_stop: bool = False
+
+
+# Single-segment cases - one per distinct behaviour axis.
+INTENT_JUDGE_TEST_CASES = [
+    # Wake word + simple question (canonical directed+extract)
+    IntentJudgeTestCase(
+        name="wake_word_simple_question",
+        transcript="Jarvis what time is it",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="time",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word at sentence end, adjacent to a named entity. Regression guard:
+    # the judge previously left "Jarvis" in the query, causing the reply engine
+    # to treat "Possessor Jarvis" as the film title instead of "Possessor".
+    IntentJudgeTestCase(
+        name="wake_word_trailing_after_named_entity",
+        transcript="what do you know about the movie called Possessor Jarvis",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1001.5,
+        expected_directed=True,
+        expected_query_contains="possessor",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word mid-sentence (not at start, not at end). Ensures the judge
+    # removes every occurrence, not just the leading one.
+    IntentJudgeTestCase(
+        name="wake_word_mid_sentence",
+        transcript="hey Jarvis what's the weather in London",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.3,
+        expected_directed=True,
+        expected_query_contains="weather",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word + command/imperative addressed to the assistant (not a question)
+    IntentJudgeTestCase(
+        name="wake_word_command_timer",
+        transcript="Jarvis set a timer for 5 minutes",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="timer",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word + statement/command to remember something
+    IntentJudgeTestCase(
+        name="wake_word_statement_remember",
+        transcript="Jarvis remind me to call mum at 5pm",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="mum",
+    ),
+    # Wake word + casual share-of-information statement (no explicit command
+    # or question). Regression guard: the judge previously rejected these as
+    # "not directed" because the sentence was a statement about the user's
+    # own action rather than a command or question, even though the wake
+    # word was clearly addressed to the assistant.
+    IntentJudgeTestCase(
+        name="wake_word_share_statement_burger",
+        transcript="Jarvis, I just ate a burger from McDonald's.",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="burger",
+        expected_query_not_contains="jarvis",
+    ),
+    IntentJudgeTestCase(
+        name="wake_word_share_statement_feeling",
+        transcript="Jarvis I'm feeling a bit tired today",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="tired",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word at the END of a declarative statement. Position of the wake
+    # word must not affect directedness — this pattern must also be directed.
+    IntentJudgeTestCase(
+        name="wake_word_share_statement_trailing",
+        transcript="My flight just got cancelled, Jarvis",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1001.5,
+        expected_directed=True,
+        expected_query_contains="flight",
+        expected_query_not_contains="jarvis",
+    ),
+    # Wake word at the END of a declarative statement that contains a
+    # capitalised brand/product name immediately before "Jarvis". Regression:
+    # gemma4:e2b misread "big Mac Jarvis" as the compound name "Mac Jarvis",
+    # treating "Jarvis" as a surname rather than the wake word, and returned
+    # directed=false despite its own reasoning stating it found the wake word.
+    IntentJudgeTestCase(
+        name="wake_word_trailing_after_capitalised_brand",
+        transcript="I just ate a big Mac Jarvis",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1001.5,
+        expected_directed=True,
+        expected_query_contains="big Mac",
+        expected_query_not_contains="jarvis",
+    ),
+    # Self-contained imperative with an intentionally open subject ("something",
+    # "anything", "a joke") — these are valid queries and must not be treated
+    # as vague references or standalone "re-issue prior question" imperatives.
+    # Regression: gemma4:e2b was returning directed=false with reasoning "no
+    # extractable query" on "Jarvis say something please" because it conflated
+    # the open subject with a topic-less question.
+    IntentJudgeTestCase(
+        name="wake_word_open_imperative_say_something",
+        transcript="Jarvis say something please",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="say something",
+        expected_query_not_contains="jarvis",
+    ),
+    IntentJudgeTestCase(
+        name="wake_word_open_imperative_tell_me_a_joke",
+        transcript="Jarvis tell me a joke",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="joke",
+        expected_query_not_contains="jarvis",
+    ),
+    IntentJudgeTestCase(
+        name="wake_word_open_imperative_tell_me_anything",
+        transcript="Jarvis tell me anything",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="anything",
+        expected_query_not_contains="jarvis",
+    ),
+    IntentJudgeTestCase(
+        name="wake_word_open_imperative_give_me_advice",
+        transcript="Jarvis give me advice please",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="advice",
+        expected_query_not_contains="jarvis",
+    ),
+    IntentJudgeTestCase(
+        name="wake_word_open_imperative_surprise_me",
+        transcript="Jarvis surprise me",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.5,
+        expected_directed=True,
+        expected_query_contains="surprise",
+        expected_query_not_contains="jarvis",
+    ),
+    # Same-segment context synthesis (distinct from simple wake+Q)
+    IntentJudgeTestCase(
+        name="context_synthesis_weather_opinion",
+        transcript="I think the weather is great today in London. What do you think, Jarvis?",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.8,
+        expected_directed=True,
+        expected_query_contains="weather",
+    ),
+    # Echo + user follow-up in hot window
+    IntentJudgeTestCase(
+        name="echo_plus_followup_extracted",
+        transcript="London has 8 hours of daylight. That's quite cool. Tell me more.",
+        last_tts_text="On this day, London receives around 7-8 hours of daylight.",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains="more",
+    ),
+    # Stop command during TTS
+    IntentJudgeTestCase(
+        name="stop_command_during_tts",
+        transcript="stop",
+        last_tts_text="Let me tell you about the history of...",
+        in_hot_window=False,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains=None,
+        expected_stop=True,
+    ),
+    # No wake word, not hot window -> not directed
+    IntentJudgeTestCase(
+        name="no_wake_word_casual_speech",
+        transcript="I think the weather is nice today",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=None,
+        expected_directed=False,
+        expected_query_contains=None,
+    ),
+    # Wake word only mentioned in narrative -> not directed
+    IntentJudgeTestCase(
+        name="mentioned_in_narrative_past_tense",
+        transcript="I told my friend about Jarvis yesterday",
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.8,
+        expected_directed=False,
+        expected_query_contains=None,
+    ),
+    # Hot window simple follow-up
+    IntentJudgeTestCase(
+        name="hot_window_simple_followup",
+        transcript="What about next week?",
+        last_tts_text="The weather this weekend will be rainy.",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains="next week",
+    ),
+]
+
+
+@dataclass
+class MultiSegmentTestCase:
+    """Test case with multiple transcript segments (realistic buffer state)."""
+    name: str
+    segments: list
+    last_tts_text: str
+    in_hot_window: bool
+    wake_timestamp: Optional[float]
+    expected_directed: bool
+    expected_query_contains: Optional[Union[str, List[str]]]
+    expected_query_not_contains: Optional[Union[str, List[str]]] = None
+    expected_stop: bool = False
+    aliases: Optional[List[str]] = None
+
+
+MULTI_SEGMENT_TEST_CASES = [
+    # Real-logs scenario: echo + rejected similar + wake retry
+    MultiSegmentTestCase(
+        name="echo_plus_rejected_similar_plus_wake_retry",
+        segments=[
+            ("and relatively windy, about 11 kilometers per hour", False),
+            ("Okay, well, what about any new movies tomorrow?", False),
+            ("Jarvis, what about new movies tomorrow?", False),
+        ],
+        last_tts_text="Tomorrow's weather in Kensington looks a bit gloomy, with overcast conditions expected. It'll be quite cool, around 6°C, and relatively windy, about 11 km/h.",
+        in_hot_window=False,
+        wake_timestamp=1004.5,
+        expected_directed=True,
+        expected_query_contains="movies",
+        expected_query_not_contains="weather",
+    ),
+    # Hot window with echo in buffer + user follow-up
+    MultiSegmentTestCase(
+        name="buffer_echo_then_followup_hot_window",
+        segments=[
+            ("The weather is sunny and warm", False),
+            ("What about the weekend?", False),
+        ],
+        last_tts_text="The weather today is sunny and warm, around 20 degrees.",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains="weekend",
+        expected_query_not_contains="sunny",
+    ),
+    # Stop command with TTS echoes in buffer
+    MultiSegmentTestCase(
+        name="multiple_echoes_then_interrupt",
+        segments=[
+            ("Let me tell you about", True),
+            ("the history of", True),
+            ("Jarvis stop", False),
+        ],
+        last_tts_text="Let me tell you about the history of ancient Rome.",
+        in_hot_window=False,
+        wake_timestamp=1002.0,
+        expected_directed=True,
+        expected_query_contains=None,
+        expected_stop=True,
+    ),
+    # No wake word in multi-segment buffer
+    MultiSegmentTestCase(
+        name="no_wake_word_in_buffer",
+        segments=[
+            ("How are you?", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=None,
+        expected_directed=False,
+        expected_query_contains=None,
+    ),
+    # Context synthesis with prior ambient speech that must be filtered
+    MultiSegmentTestCase(
+        name="context_synthesis_with_prior_ambient",
+        segments=[
+            ("Did you see the game last night?", False),
+            ("Yeah it was amazing", False),
+            ("The food here is excellent. Jarvis, what's the best dish to order?", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.0,
+        expected_directed=True,
+        expected_query_contains="dish",
+        expected_query_not_contains="game",
+    ),
+    # Multi-person conversation: context synthesis across speakers without explicit pronoun
+    MultiSegmentTestCase(
+        name="multi_person_weather_discussion",
+        segments=[
+            ("I wonder what the weather will be like tomorrow", False),
+            ("Yeah we should check before planning the picnic", False),
+            ("Jarvis what do you think", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.0,
+        expected_directed=True,
+        expected_query_contains="weather",
+    ),
+    # Multi-person + vague reference ("that" = iPhone from earlier segment)
+    MultiSegmentTestCase(
+        name="multi_person_vague_reference",
+        segments=[
+            ("The new iPhone looks pretty cool", False),
+            ("I heard the camera is amazing", False),
+            ("Jarvis how much does that cost", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.0,
+        expected_directed=True,
+        expected_query_contains="iphone",
+    ),
+    # User statement follow-up in hot window (not an echo of TTS question)
+    MultiSegmentTestCase(
+        name="user_followup_statement_after_question_nihilism",
+        segments=[
+            ("Some people find that appealing", True),
+            ("While others see it as a bleak outlook", True),
+            ("What are your thoughts on nihilism", True),
+            ("I think it's way more ridiculous than absurdism. Absurdism is the way to go.", False),
+        ],
+        last_tts_text="Nihilism is an interesting philosophical position. Some people find it appealing, while others see it as a bleak outlook. What are your thoughts on nihilism?",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains="absurdism",
+        expected_query_not_contains="what are your thoughts",
+    ),
+    # Cross-segment vague reference ("that" -> dinosaurs)
+    MultiSegmentTestCase(
+        name="cross_segment_dinosaur_opinion",
+        segments=[
+            ("I think dinosaurs are cool", False),
+            ("What do you think about that Jarvis", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1002.5,
+        expected_directed=True,
+        expected_query_contains="dinosaur",
+    ),
+    # Imperative resolution: "answer that" -> re-issue prior question
+    MultiSegmentTestCase(
+        name="cross_segment_answer_that_weather",
+        segments=[
+            ("Sorry, how's the weather today?", False),
+            ("Jarvis, answer that", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1002.5,
+        expected_directed=True,
+        expected_query_contains="weather",
+        expected_query_not_contains="answer that",
+    ),
+    # Imperative resolution with unrelated noise between Q and imperative
+    MultiSegmentTestCase(
+        name="cross_segment_answer_that_with_noise",
+        segments=[
+            ("How tall is Mount Everest", False),
+            ("Charlie sands to that", False),
+            ("Jarvis answer that", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.5,
+        expected_directed=True,
+        expected_query_contains="everest",
+        expected_query_not_contains="answer that",
+    ),
+    # Whisper tense variant of imperative ("answered that")
+    MultiSegmentTestCase(
+        name="cross_segment_answered_that_whisper_variant",
+        segments=[
+            ("Sorry, how's the weather today?", False),
+            ("Jarvis answered that", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1002.5,
+        expected_directed=True,
+        expected_query_contains="weather",
+        expected_query_not_contains="answered that",
+    ),
+    # Multi-word imperative variant
+    MultiSegmentTestCase(
+        name="cross_segment_go_ahead_and_answer",
+        segments=[
+            ("What's the capital of Portugal", False),
+            ("Jarvis go ahead and answer", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1002.5,
+        expected_directed=True,
+        expected_query_contains="portugal",
+        expected_query_not_contains="go ahead and answer",
+    ),
+    # Imperative superseded by new explicit question in same segment
+    MultiSegmentTestCase(
+        name="cross_segment_imperative_superseded_by_new_question",
+        segments=[
+            ("How's the weather today?", False),
+            ("Jarvis, answer that — actually, what time is it?", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1002.5,
+        expected_directed=True,
+        expected_query_contains="time",
+        expected_query_not_contains="weather",
+    ),
+    # Cross-segment follow-up in hot window (topic extension)
+    MultiSegmentTestCase(
+        name="cross_segment_hot_window_followup",
+        segments=[
+            ("The capital of France is Paris", True),
+            ("What about Germany", False),
+        ],
+        last_tts_text="The capital of France is Paris, known as the City of Light.",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains="germany",
+    ),
+    # Alias (Whisper mishearing) should be treated as the wake word. Without
+    # alias normalisation the small model sees "Jervis" and decides the user
+    # is addressing a different person.
+    MultiSegmentTestCase(
+        name="alias_treated_as_wake_word",
+        segments=[
+            ("Jervis, what time is it in London?", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1000.8,
+        expected_directed=True,
+        expected_query_contains="time",
+        aliases=["jervis", "jaivis", "jervis", "javis"],
+    ),
+    # Alias mid-utterance after narrative context — the model must still
+    # recognise the addressee as the assistant and resolve the vague reference.
+    MultiSegmentTestCase(
+        name="alias_after_narrative_context",
+        segments=[
+            ("The new iPhone looks pretty cool", False),
+            ("I heard the camera is amazing", False),
+            ("Jaivis how much does that cost", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.0,
+        expected_directed=True,
+        expected_query_contains="iphone",
+        aliases=["jervis", "jaivis", "jervis", "javis"],
+    ),
+    # Buried target sentence amid interleaved unrelated chatter (multi-topic
+    # disambiguation). Two separate topics coexist in the buffer — iPhone
+    # pricing thread and an unrelated Yankees game discussion. The wake-word
+    # segment contains a vague reference ("it") that must resolve to the
+    # correct thread (iPhone), not the most recent unrelated topic.
+    MultiSegmentTestCase(
+        name="buried_target_amid_unrelated_chatter",
+        segments=[
+            ("The new iPhone looks pretty cool", False),
+            ("Did you see the Yankees game last night", False),
+            ("I heard the camera is amazing on that phone", False),
+            ("Yeah that was a great play in the ninth inning", False),
+            ("Jarvis how much does it cost", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1008.5,
+        expected_directed=True,
+        expected_query_contains="iphone",
+        expected_query_not_contains="yankees",
+    ),
+    # Same buried-target disambiguation, but the wake-word question has no
+    # explicit pronoun ("what's the price" instead of "how much does it cost").
+    # The judge must still resolve the topic from prior segments — a query of
+    # "what's the price" is not answerable alone.
+    MultiSegmentTestCase(
+        name="buried_target_topicless_question",
+        segments=[
+            ("so anyway the meeting ran really long yesterday", False),
+            ("did you catch the ball game", False),
+            ("the new iPhone is out", False),
+            ("yeah they lost again though", False),
+            ("I want the pro model", False),
+            ("Jarvis what's the price", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1010.5,
+        expected_directed=True,
+        # Parent-noun rule: resolving to a sub-item ("pro model") must also
+        # include the parent noun/brand ("iPhone") — "pro model" alone is
+        # not self-contained.
+        expected_query_contains=["iphone", "pro"],
+        expected_query_not_contains="ball game",
+    ),
+    # Vague reference "they" — the AirPods are the only plural antecedent
+    # that can be cost-queried, so "how much do they cost" must resolve to
+    # the AirPods thread and include the brand/noun in the query.
+    MultiSegmentTestCase(
+        name="buried_target_plural_vague_ref_they",
+        segments=[
+            ("the AirPods sound great", False),
+            ("yeah the bass is really punchy", False),
+            ("Jarvis how much do they cost", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1006.5,
+        expected_directed=True,
+        expected_query_contains="airpods",
+    ),
+    # Hot-window override: a topic-less follow-up ("tell me more") in hot
+    # window must stay directed=true even though a topic-rich earlier buffer
+    # would otherwise trigger the topic-resolution heuristic. The HOT WINDOW
+    # rule must win over the "topic-less question" vague-reference rule.
+    MultiSegmentTestCase(
+        name="hot_window_override_topicless_followup",
+        segments=[
+            ("the new iPhone is out", False),
+            ("I want the pro model", False),
+            ("tell me more", False),
+        ],
+        last_tts_text="The iPhone 16 Pro has a titanium frame and a new camera system.",
+        in_hot_window=True,
+        wake_timestamp=None,
+        expected_directed=True,
+        expected_query_contains=None,
+    ),
+    # Wake word mid-utterance after narrative buffer, addressing the assistant.
+    # Real-world case: user was discussing Mata Hari in the background, then
+    # turned to the assistant with "Jarvis, do you know what she's talking about,
+    # about Mata Hari?". The small model mis-classified as "not directed" with
+    # reasoning that contradicted the verdict. The wake word is mid-utterance
+    # here but the trailing clause addresses the assistant directly ("do YOU
+    # know"), so this must be DIRECTED.
+    MultiSegmentTestCase(
+        name="wake_word_after_narrative_addresses_assistant",
+        segments=[
+            ("The dude was a lie upon the lie", False),
+            ("Mata Hari was never a traitor, she was an honest woman", False),
+            ("Jarvis, do you know what she's talking about, about Mata Hari?", False),
+        ],
+        last_tts_text="",
+        in_hot_window=False,
+        wake_timestamp=1004.5,
+        expected_directed=True,
+        expected_query_contains="mata hari",
+    ),
+]
+
+
+# Cases known to fail with the small model on the current prompt.
+# Track regressions / future prompt improvements here.
+KNOWN_FAILING_CASES: set = set()
+
+
+# =============================================================================
+# Helper Functions
+# =============================================================================
+
+def _as_substring_list(value):
+    """Normalise an expected_query_contains / _not_contains value to a list."""
+    if value is None:
+        return []
+    if isinstance(value, str):
+        return [value]
+    return list(value)
+
+
+def create_transcript_segment(
+    text: str,
+    start_time: float = 1000.0,
+    is_during_tts: bool = False,
+    processed: bool = False,
+):
+    """Create a TranscriptSegment for testing."""
+    from jarvis.listening.transcript_buffer import TranscriptSegment
+    return TranscriptSegment(
+        text=text,
+        start_time=start_time,
+        end_time=start_time + 2.0,
+        energy=0.01,
+        is_during_tts=is_during_tts,
+        processed=processed,
+    )
+
+
+def run_intent_judge(case: IntentJudgeTestCase):
+    """Run the intent judge on a test case."""
+    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig
+
+    judge = IntentJudge(IntentJudgeConfig(
+        assistant_name="Jarvis",
+        model="gemma4:e2b",
+        timeout_sec=10.0,
+    ))
+
+    if not judge.available:
+        return None
+
+    segments = [create_transcript_segment(case.transcript)]
+
+    return judge.judge(
+        segments=segments,
+        wake_timestamp=case.wake_timestamp,
+        last_tts_text=case.last_tts_text,
+        last_tts_finish_time=999.0 if case.last_tts_text else 0.0,
+        in_hot_window=case.in_hot_window,
+        current_text=case.transcript,
+    )
+
+
+def run_intent_judge_multi_segment(case: "MultiSegmentTestCase"):
+    """Run the intent judge on a multi-segment test case."""
+    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig
+
+    judge = IntentJudge(IntentJudgeConfig(
+        assistant_name="Jarvis",
+        aliases=list(case.aliases or []),
+        model="gemma4:e2b",
+        timeout_sec=10.0,
+    ))
+
+    if not judge.available:
+        return None
+
+    segments = []
+    base_time = 1000.0
+    for i, (text, is_during_tts) in enumerate(case.segments):
+        segments.append(create_transcript_segment(
+            text=text,
+            start_time=base_time + (i * 2.0),
+            is_during_tts=is_during_tts,
+        ))
+
+    current_text = ""
+    for text, is_during_tts in reversed(case.segments):
+        if not is_during_tts:
+            current_text = text
+            break
+
+    return judge.judge(
+        segments=segments,
+        wake_timestamp=case.wake_timestamp,
+        last_tts_text=case.last_tts_text,
+        last_tts_finish_time=999.0 if case.last_tts_text else 0.0,
+        in_hot_window=case.in_hot_window,
+        current_text=current_text,
+    )
+
+
+def is_intent_judge_available() -> bool:
+    """Check if the intent judge model is available."""
+    import requests
+    try:
+        resp = requests.get("http://127.0.0.1:11434/api/tags", timeout=2)
+        if resp.status_code != 200:
+            return False
+        data = resp.json()
+        models = [m.get("name", "") for m in data.get("models", [])]
+        return any("gemma4" in m for m in models)
+    except Exception:
+        return False
+
+
+def _skip_if_not_intent_judge_phase():
+    """Intent judge tests are fixed to gemma4:e2b and would run twice under the
+    multi-model eval matrix. Skip during the large-model phase to keep runtime
+    down; they still run once during the small-model (gemma4) phase."""
+    if "gemma4" not in JUDGE_MODEL:
+        pytest.skip(f"Intent judge tests only run in the gemma4 phase (current: {JUDGE_MODEL})")
+
+
+# =============================================================================
+# Tests
+# =============================================================================
+
+class TestIntentJudgeAccuracy:
+    """Evals for intent judge accuracy."""
+
+    @pytest.mark.parametrize("case", INTENT_JUDGE_TEST_CASES, ids=lambda c: c.name)
+    def test_intent_judge_case(self, case: IntentJudgeTestCase):
+        _skip_if_not_intent_judge_phase()
+        if not is_intent_judge_available():
+            pytest.skip("Intent judge model (gemma4) not available")
+
+        if case.name in KNOWN_FAILING_CASES:
+            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")
+
+        result = run_intent_judge(case)
+
+        if result is None:
+            pytest.fail("Intent judge returned None")
+
+        print(f"\n{'='*60}")
+        print(f"Test Case: {case.name}")
+        print(f"Transcript: {case.transcript}")
+        print(f"TTS: {case.last_tts_text[:50]}..." if case.last_tts_text else "TTS: None")
+        print(f"Mode: {'hot_window' if case.in_hot_window else 'wake_word'}")
+        print(f"{'='*60}")
+        print(f"Result: directed={result.directed}, query='{result.query}', stop={result.stop}")
+        print(f"Confidence: {result.confidence}")
+        print(f"Reasoning: {result.reasoning}")
+        print(f"{'='*60}")
+
+        assert result.directed == case.expected_directed, (
+            f"Expected directed={case.expected_directed}, got {result.directed}. "
+            f"Reasoning: {result.reasoning}"
+        )
+        assert result.stop == case.expected_stop, (
+            f"Expected stop={case.expected_stop}, got {result.stop}. "
+            f"Reasoning: {result.reasoning}"
+        )
+        for needle in _as_substring_list(case.expected_query_contains):
+            assert needle.lower() in (result.query or "").lower(), (
+                f"Expected query to contain '{needle}', "
+                f"got '{result.query}'. Reasoning: {result.reasoning}"
+            )
+        if result.query:
+            for needle in _as_substring_list(case.expected_query_not_contains):
+                assert needle.lower() not in result.query.lower(), (
+                    f"Expected query to NOT contain '{needle}', "
+                    f"got '{result.query}'. Reasoning: {result.reasoning}"
+                )
+
+
+class TestIntentJudgePromptQuality:
+    """Tests for intent judge prompt construction quality."""
+
+    def test_hot_window_mode_indicated_in_prompt(self):
+        from jarvis.listening.intent_judge import IntentJudge
+
+        judge = IntentJudge()
+        segments = [create_transcript_segment("hello")]
+
+        prompt = judge._build_user_prompt(
+            segments=segments,
+            wake_timestamp=None,
+            last_tts_text="Test TTS",
+            last_tts_finish_time=999.0,
+            in_hot_window=True,
+        )
+
+        assert "HOT WINDOW" in prompt
+
+    def test_tts_text_included_for_echo_detection(self):
+        from jarvis.listening.intent_judge import IntentJudge
+
+        judge = IntentJudge()
+        segments = [create_transcript_segment("The weather is nice")]
+        tts_text = "The weather today is nice and sunny"
+
+        prompt = judge._build_user_prompt(
+            segments=segments,
+            wake_timestamp=None,
+            last_tts_text=tts_text,
+            last_tts_finish_time=999.0,
+            in_hot_window=True,
+        )
+
+        assert "nice and sunny" in prompt
+
+    def test_system_prompt_has_echo_guidance(self):
+        from jarvis.listening.intent_judge import IntentJudge
+
+        judge = IntentJudge()
+        prompt = judge._build_system_prompt()
+
+        assert "echo" in prompt.lower()
+        assert "(during TTS)" in prompt
+
+
+class TestIntentJudgeFallback:
+    """Tests for intent judge fallback behaviour."""
+
+    def test_returns_none_when_ollama_unavailable(self):
+        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig
+
+        judge = IntentJudge(IntentJudgeConfig(
+            ollama_base_url="http://127.0.0.1:99999",
+            timeout_sec=1.0,
+        ))
+
+        segments = [create_transcript_segment("test")]
+        result = judge.judge(segments)
+
+        assert result is None
+
+
+class TestIntentJudgeMultiSegment:
+    """Evals for intent judge with realistic multi-segment transcript buffers."""
+
+    @pytest.mark.parametrize("case", MULTI_SEGMENT_TEST_CASES, ids=lambda c: c.name)
+    def test_multi_segment_case(self, case: MultiSegmentTestCase):
+        _skip_if_not_intent_judge_phase()
+        if not is_intent_judge_available():
+            pytest.skip("Intent judge model (gemma4) not available")
+
+        if case.name in KNOWN_FAILING_CASES:
+            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")
+
+        result = run_intent_judge_multi_segment(case)
+
+        if result is None:
+            pytest.fail("Intent judge returned None")
+
+        print(f"\n{'='*60}")
+        print(f"Test Case: {case.name}")
+        print(f"Segments:")
+        for text, is_tts in case.segments:
+            marker = " (during TTS)" if is_tts else ""
+            print(f"  - \"{text}\"{marker}")
+        print(f"TTS: {case.last_tts_text[:50]}..." if case.last_tts_text else "TTS: None")
+        print(f"Mode: {'hot_window' if case.in_hot_window else 'wake_word'}")
+        print(f"{'='*60}")
+        print(f"Result: directed={result.directed}, query='{result.query}', stop={result.stop}")
+        print(f"Confidence: {result.confidence}")
+        print(f"Reasoning: {result.reasoning}")
+        print(f"{'='*60}")
+
+        assert result.directed == case.expected_directed, (
+            f"Expected directed={case.expected_directed}, got {result.directed}. "
+            f"Reasoning: {result.reasoning}"
+        )
+        assert result.stop == case.expected_stop, (
+            f"Expected stop={case.expected_stop}, got {result.stop}. "
+            f"Reasoning: {result.reasoning}"
+        )
+        for needle in _as_substring_list(case.expected_query_contains):
+            assert needle.lower() in (result.query or "").lower(), (
+                f"Expected query to contain '{needle}', "
+                f"got '{result.query}'. Reasoning: {result.reasoning}"
+            )
+        if result.query:
+            for needle in _as_substring_list(case.expected_query_not_contains):
+                assert needle.lower() not in result.query.lower(), (
+                    f"Expected query to NOT contain '{needle}', "
+                    f"got '{result.query}'. Reasoning: {result.reasoning}"
+                )
+
+
+class TestProcessedSegmentFiltering:
+    """Tests for processed segment filtering in intent judge."""
+
+    def test_processed_segment_not_reextracted(self):
+        _skip_if_not_intent_judge_phase()
+        if not is_intent_judge_available():
+            pytest.skip("Intent judge model (gemma4) not available")
+
+        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig
+
+        judge = IntentJudge(IntentJudgeConfig(
+            assistant_name="Jarvis",
+            model="gemma4:e2b",
+            timeout_sec=10.0,
+        ))
+
+        segments = [
+            create_transcript_segment(
+                text="Jarvis what's the weather in London",
+                start_time=1000.0,
+                processed=True,
+            ),
+            create_transcript_segment(
+                text="Jarvis tell me a random topic",
+                start_time=1010.0,
+                processed=False,
+            ),
+        ]
+
+        result = judge.judge(
+            segments=segments,
+            wake_timestamp=1010.0,
+            last_tts_text="",
+            last_tts_finish_time=0.0,
+            in_hot_window=False,
+            current_text="Jarvis tell me a random topic",
+        )
+
+        assert result is not None
+        assert result.directed is True
+        assert "random" in result.query.lower() or "topic" in result.query.lower(), (
+            f"Expected query about 'random topic', got '{result.query}'."
+        )
+        assert "weather" not in result.query.lower(), (
+            f"Query contains 'weather' from processed segment: '{result.query}'"
+        )
+
+        print(f"\n✅ Correctly extracted new query: '{result.query}'")