Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
963 lines
36 KiB
Python
963 lines
36 KiB
Python
"""
|
|
Evals for the Intent Judge LLM.
|
|
|
|
Deduplicated suite: 42 parametrised cases (20 single-segment, 22 multi-segment) covering all behaviour axes from the original 59.
|
|
See PR description / commit message for the dedup rationale.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
from dataclasses import dataclass
|
|
from typing import Optional, List, Union
|
|
|
|
from helpers import JUDGE_MODEL, JUDGE_BASE_URL, is_judge_llm_available
|
|
|
|
|
|
# =============================================================================
|
|
# Test Data
|
|
# =============================================================================
|
|
|
|
@dataclass
class IntentJudgeTestCase:
    """One single-transcript scenario for the intent judge eval.

    The leading fields are the inputs handed to the judge; the ``expected_*``
    fields describe the verdict the eval asserts on.
    """

    # Identifier; also used as the pytest parametrize id.
    name: str
    # Text wrapped into the single transcript segment given to the judge.
    transcript: str
    # Most recent assistant speech ("" when there was none).
    last_tts_text: str
    # Hot-window flag forwarded to the judge.
    in_hot_window: bool
    # Wake timestamp forwarded to the judge, or None.
    wake_timestamp: Union[float, None]
    # Expected ``directed`` verdict.
    expected_directed: bool
    # Substring(s) the extracted query must contain (compared case-insensitively).
    expected_query_contains: Union[str, List[str], None]
    # Substring(s) the extracted query must not contain (only checked when a
    # query was returned).
    expected_query_not_contains: Union[str, List[str], None] = None
    # Expected ``stop`` verdict.
    expected_stop: bool = False
|
|
|
|
|
|
# Single-segment cases - one per distinct behaviour axis.
# TestIntentJudgeAccuracy feeds each entry to run_intent_judge, which wraps
# `transcript` in a single transcript segment before calling the judge.
INTENT_JUDGE_TEST_CASES = [
    # Wake word + simple question (canonical directed+extract)
    IntentJudgeTestCase(
        name="wake_word_simple_question",
        transcript="Jarvis what time is it",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="time",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at sentence end, adjacent to a named entity. Regression guard:
    # the judge previously left "Jarvis" in the query, causing the reply engine
    # to treat "Possessor Jarvis" as the film title instead of "Possessor".
    IntentJudgeTestCase(
        name="wake_word_trailing_after_named_entity",
        transcript="what do you know about the movie called Possessor Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="possessor",
        expected_query_not_contains="jarvis",
    ),
    # Wake word mid-sentence (not at start, not at end). Ensures the judge
    # removes every occurrence, not just the leading one.
    IntentJudgeTestCase(
        name="wake_word_mid_sentence",
        transcript="hey Jarvis what's the weather in London",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.3,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="jarvis",
    ),
    # Wake word + command/imperative addressed to the assistant (not a question)
    IntentJudgeTestCase(
        name="wake_word_command_timer",
        transcript="Jarvis set a timer for 5 minutes",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="timer",
        expected_query_not_contains="jarvis",
    ),
    # Wake word + statement/command to remember something
    IntentJudgeTestCase(
        name="wake_word_statement_remember",
        transcript="Jarvis remind me to call mum at 5pm",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="mum",
    ),
    # Wake word + casual share-of-information statement (no explicit command
    # or question). Regression guard: the judge previously rejected these as
    # "not directed" because the sentence was a statement about the user's
    # own action rather than a command or question, even though the wake
    # word was clearly addressed to the assistant.
    IntentJudgeTestCase(
        name="wake_word_share_statement_burger",
        transcript="Jarvis, I just ate a burger from McDonald's.",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="burger",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_share_statement_feeling",
        transcript="Jarvis I'm feeling a bit tired today",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="tired",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at the END of a declarative statement. Position of the wake
    # word must not affect directedness — this pattern must also be directed.
    IntentJudgeTestCase(
        name="wake_word_share_statement_trailing",
        transcript="My flight just got cancelled, Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="flight",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at the END of a declarative statement that contains a
    # capitalised brand/product name immediately before "Jarvis". Regression:
    # gemma4:e2b misread "big Mac Jarvis" as the compound name "Mac Jarvis",
    # treating "Jarvis" as a surname rather than the wake word, and returned
    # directed=false despite its own reasoning stating it found the wake word.
    IntentJudgeTestCase(
        name="wake_word_trailing_after_capitalised_brand",
        transcript="I just ate a big Mac Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="big Mac",
        expected_query_not_contains="jarvis",
    ),
    # Self-contained imperative with an intentionally open subject ("something",
    # "anything", "a joke") — these are valid queries and must not be treated
    # as vague references or standalone "re-issue prior question" imperatives.
    # Regression: gemma4:e2b was returning directed=false with reasoning "no
    # extractable query" on "Jarvis say something please" because it conflated
    # the open subject with a topic-less question.
    IntentJudgeTestCase(
        name="wake_word_open_imperative_say_something",
        transcript="Jarvis say something please",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="say something",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_tell_me_a_joke",
        transcript="Jarvis tell me a joke",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="joke",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_tell_me_anything",
        transcript="Jarvis tell me anything",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="anything",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_give_me_advice",
        transcript="Jarvis give me advice please",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="advice",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_surprise_me",
        transcript="Jarvis surprise me",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="surprise",
        expected_query_not_contains="jarvis",
    ),
    # Same-segment context synthesis (distinct from simple wake+Q)
    IntentJudgeTestCase(
        name="context_synthesis_weather_opinion",
        transcript="I think the weather is great today in London. What do you think, Jarvis?",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=True,
        expected_query_contains="weather",
    ),
    # Echo + user follow-up in hot window
    IntentJudgeTestCase(
        name="echo_plus_followup_extracted",
        transcript="London has 8 hours of daylight. That's quite cool. Tell me more.",
        last_tts_text="On this day, London receives around 7-8 hours of daylight.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="more",
    ),
    # Stop command during TTS
    IntentJudgeTestCase(
        name="stop_command_during_tts",
        transcript="stop",
        last_tts_text="Let me tell you about the history of...",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains=None,
        expected_stop=True,
    ),
    # No wake word, not hot window -> not directed
    IntentJudgeTestCase(
        name="no_wake_word_casual_speech",
        transcript="I think the weather is nice today",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Wake word only mentioned in narrative -> not directed
    IntentJudgeTestCase(
        name="mentioned_in_narrative_past_tense",
        transcript="I told my friend about Jarvis yesterday",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Hot window simple follow-up
    IntentJudgeTestCase(
        name="hot_window_simple_followup",
        transcript="What about next week?",
        last_tts_text="The weather this weekend will be rainy.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="next week",
    ),
]
|
|
|
|
|
|
@dataclass
class MultiSegmentTestCase:
    """One multi-segment scenario, modelling a realistic transcript buffer.

    The leading fields are the inputs handed to the judge; the ``expected_*``
    fields describe the verdict the eval asserts on.
    """

    # Identifier; also used as the pytest parametrize id.
    name: str
    # Buffer contents, oldest first, as (text, is_during_tts) pairs.
    segments: list
    # Most recent assistant speech ("" when there was none).
    last_tts_text: str
    # Hot-window flag forwarded to the judge.
    in_hot_window: bool
    # Wake timestamp forwarded to the judge, or None.
    wake_timestamp: Union[float, None]
    # Expected ``directed`` verdict.
    expected_directed: bool
    # Substring(s) the extracted query must contain (compared case-insensitively).
    expected_query_contains: Union[str, List[str], None]
    # Substring(s) the extracted query must not contain (only checked when a
    # query was returned).
    expected_query_not_contains: Union[str, List[str], None] = None
    # Expected ``stop`` verdict.
    expected_stop: bool = False
    # Wake-word aliases passed to the judge config; None means no aliases.
    aliases: Union[List[str], None] = None
|
|
|
|
|
|
# Multi-segment cases. Each `segments` entry is a (text, is_during_tts) pair,
# listed oldest first; run_intent_judge_multi_segment turns them into
# transcript segments spaced 2.0s apart starting at t=1000.0.
MULTI_SEGMENT_TEST_CASES = [
    # Real-logs scenario: echo + rejected similar + wake retry
    MultiSegmentTestCase(
        name="echo_plus_rejected_similar_plus_wake_retry",
        segments=[
            ("and relatively windy, about 11 kilometers per hour", False),
            ("Okay, well, what about any new movies tomorrow?", False),
            ("Jarvis, what about new movies tomorrow?", False),
        ],
        last_tts_text="Tomorrow's weather in Kensington looks a bit gloomy, with overcast conditions expected. It'll be quite cool, around 6°C, and relatively windy, about 11 km/h.",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="movies",
        expected_query_not_contains="weather",
    ),
    # Hot window with echo in buffer + user follow-up
    MultiSegmentTestCase(
        name="buffer_echo_then_followup_hot_window",
        segments=[
            ("The weather is sunny and warm", False),
            ("What about the weekend?", False),
        ],
        last_tts_text="The weather today is sunny and warm, around 20 degrees.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="weekend",
        expected_query_not_contains="sunny",
    ),
    # Stop command with TTS echoes in buffer
    MultiSegmentTestCase(
        name="multiple_echoes_then_interrupt",
        segments=[
            ("Let me tell you about", True),
            ("the history of", True),
            ("Jarvis stop", False),
        ],
        last_tts_text="Let me tell you about the history of ancient Rome.",
        in_hot_window=False,
        wake_timestamp=1002.0,
        expected_directed=True,
        expected_query_contains=None,
        expected_stop=True,
    ),
    # No wake word in multi-segment buffer
    MultiSegmentTestCase(
        name="no_wake_word_in_buffer",
        segments=[
            ("How are you?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Context synthesis with prior ambient speech that must be filtered
    MultiSegmentTestCase(
        name="context_synthesis_with_prior_ambient",
        segments=[
            ("Did you see the game last night?", False),
            ("Yeah it was amazing", False),
            ("The food here is excellent. Jarvis, what's the best dish to order?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="dish",
        expected_query_not_contains="game",
    ),
    # Multi-person conversation: context synthesis across speakers without explicit pronoun
    MultiSegmentTestCase(
        name="multi_person_weather_discussion",
        segments=[
            ("I wonder what the weather will be like tomorrow", False),
            ("Yeah we should check before planning the picnic", False),
            ("Jarvis what do you think", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="weather",
    ),
    # Multi-person + vague reference ("that" = iPhone from earlier segment)
    MultiSegmentTestCase(
        name="multi_person_vague_reference",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("I heard the camera is amazing", False),
            ("Jarvis how much does that cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="iphone",
    ),
    # User statement follow-up in hot window (not an echo of TTS question)
    MultiSegmentTestCase(
        name="user_followup_statement_after_question_nihilism",
        segments=[
            ("Some people find that appealing", True),
            ("While others see it as a bleak outlook", True),
            ("What are your thoughts on nihilism", True),
            ("I think it's way more ridiculous than absurdism. Absurdism is the way to go.", False),
        ],
        last_tts_text="Nihilism is an interesting philosophical position. Some people find it appealing, while others see it as a bleak outlook. What are your thoughts on nihilism?",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="absurdism",
        expected_query_not_contains="what are your thoughts",
    ),
    # Cross-segment vague reference ("that" -> dinosaurs)
    MultiSegmentTestCase(
        name="cross_segment_dinosaur_opinion",
        segments=[
            ("I think dinosaurs are cool", False),
            ("What do you think about that Jarvis", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="dinosaur",
    ),
    # Imperative resolution: "answer that" -> re-issue prior question
    MultiSegmentTestCase(
        name="cross_segment_answer_that_weather",
        segments=[
            ("Sorry, how's the weather today?", False),
            ("Jarvis, answer that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="answer that",
    ),
    # Imperative resolution with unrelated noise between Q and imperative
    MultiSegmentTestCase(
        name="cross_segment_answer_that_with_noise",
        segments=[
            ("How tall is Mount Everest", False),
            ("Charlie sands to that", False),
            ("Jarvis answer that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="everest",
        expected_query_not_contains="answer that",
    ),
    # Whisper tense variant of imperative ("answered that")
    MultiSegmentTestCase(
        name="cross_segment_answered_that_whisper_variant",
        segments=[
            ("Sorry, how's the weather today?", False),
            ("Jarvis answered that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="answered that",
    ),
    # Multi-word imperative variant
    MultiSegmentTestCase(
        name="cross_segment_go_ahead_and_answer",
        segments=[
            ("What's the capital of Portugal", False),
            ("Jarvis go ahead and answer", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="portugal",
        expected_query_not_contains="go ahead and answer",
    ),
    # Imperative superseded by new explicit question in same segment
    MultiSegmentTestCase(
        name="cross_segment_imperative_superseded_by_new_question",
        segments=[
            ("How's the weather today?", False),
            ("Jarvis, answer that — actually, what time is it?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="time",
        expected_query_not_contains="weather",
    ),
    # Cross-segment follow-up in hot window (topic extension)
    MultiSegmentTestCase(
        name="cross_segment_hot_window_followup",
        segments=[
            ("The capital of France is Paris", True),
            ("What about Germany", False),
        ],
        last_tts_text="The capital of France is Paris, known as the City of Light.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="germany",
    ),
    # Alias (Whisper mishearing) should be treated as the wake word. Without
    # alias normalisation the small model sees "Jervis" and decides the user
    # is addressing a different person.
    # NOTE(review): "jervis" appears twice in the alias list — confirm whether
    # a different variant was intended.
    MultiSegmentTestCase(
        name="alias_treated_as_wake_word",
        segments=[
            ("Jervis, what time is it in London?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=True,
        expected_query_contains="time",
        aliases=["jervis", "jaivis", "jervis", "javis"],
    ),
    # Alias mid-utterance after narrative context — the model must still
    # recognise the addressee as the assistant and resolve the vague reference.
    MultiSegmentTestCase(
        name="alias_after_narrative_context",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("I heard the camera is amazing", False),
            ("Jaivis how much does that cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="iphone",
        aliases=["jervis", "jaivis", "jervis", "javis"],
    ),
    # Buried target sentence amid interleaved unrelated chatter (multi-topic
    # disambiguation). Two separate topics coexist in the buffer — iPhone
    # pricing thread and an unrelated Yankees game discussion. The wake-word
    # segment contains a vague reference ("it") that must resolve to the
    # correct thread (iPhone), not the most recent unrelated topic.
    MultiSegmentTestCase(
        name="buried_target_amid_unrelated_chatter",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("Did you see the Yankees game last night", False),
            ("I heard the camera is amazing on that phone", False),
            ("Yeah that was a great play in the ninth inning", False),
            ("Jarvis how much does it cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1008.5,
        expected_directed=True,
        expected_query_contains="iphone",
        expected_query_not_contains="yankees",
    ),
    # Same buried-target disambiguation, but the wake-word question has no
    # explicit pronoun ("what's the price" instead of "how much does it cost").
    # The judge must still resolve the topic from prior segments — a query of
    # "what's the price" is not answerable alone.
    MultiSegmentTestCase(
        name="buried_target_topicless_question",
        segments=[
            ("so anyway the meeting ran really long yesterday", False),
            ("did you catch the ball game", False),
            ("the new iPhone is out", False),
            ("yeah they lost again though", False),
            ("I want the pro model", False),
            ("Jarvis what's the price", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1010.5,
        expected_directed=True,
        # Parent-noun rule: resolving to a sub-item ("pro model") must also
        # include the parent noun/brand ("iPhone") — "pro model" alone is
        # not self-contained.
        expected_query_contains=["iphone", "pro"],
        expected_query_not_contains="ball game",
    ),
    # Vague reference "they" — the AirPods are the only plural antecedent
    # that can be cost-queried, so "how much do they cost" must resolve to
    # the AirPods thread and include the brand/noun in the query.
    MultiSegmentTestCase(
        name="buried_target_plural_vague_ref_they",
        segments=[
            ("the AirPods sound great", False),
            ("yeah the bass is really punchy", False),
            ("Jarvis how much do they cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1006.5,
        expected_directed=True,
        expected_query_contains="airpods",
    ),
    # Hot-window override: a topic-less follow-up ("tell me more") in hot
    # window must stay directed=true even though a topic-rich earlier buffer
    # would otherwise trigger the topic-resolution heuristic. The HOT WINDOW
    # rule must win over the "topic-less question" vague-reference rule.
    MultiSegmentTestCase(
        name="hot_window_override_topicless_followup",
        segments=[
            ("the new iPhone is out", False),
            ("I want the pro model", False),
            ("tell me more", False),
        ],
        last_tts_text="The iPhone 16 Pro has a titanium frame and a new camera system.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains=None,
    ),
    # Wake word mid-utterance after narrative buffer, addressing the assistant.
    # Real-world case: user was discussing Mata Hari in the background, then
    # turned to the assistant with "Jarvis, do you know what she's talking about,
    # about Mata Hari?". The small model mis-classified as "not directed" with
    # reasoning that contradicted the verdict. The wake word is mid-utterance
    # here but the trailing clause addresses the assistant directly ("do YOU
    # know"), so this must be DIRECTED.
    MultiSegmentTestCase(
        name="wake_word_after_narrative_addresses_assistant",
        segments=[
            ("The dude was a lie upon the lie", False),
            ("Mata Hari was never a traitor, she was an honest woman", False),
            ("Jarvis, do you know what she's talking about, about Mata Hari?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="mata hari",
    ),
]
|
|
|
|
|
|
# Cases known to fail with the small model on the current prompt.
# Track regressions / future prompt improvements here.
# Entries are case names; the parametrised evals call pytest.xfail for any
# case whose name is listed, before the judge is invoked.
KNOWN_FAILING_CASES: set = set()
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
def _as_substring_list(value):
|
|
"""Normalise an expected_query_contains / _not_contains value to a list."""
|
|
if value is None:
|
|
return []
|
|
if isinstance(value, str):
|
|
return [value]
|
|
return list(value)
|
|
|
|
|
|
def create_transcript_segment(
    text: str,
    start_time: float = 1000.0,
    is_during_tts: bool = False,
    processed: bool = False,
):
    """Build a ``TranscriptSegment`` fixture for the evals.

    Args:
        text: Segment text.
        start_time: Segment start; the end is fixed at ``start_time + 2.0``.
        is_during_tts: Forwarded to the segment unchanged.
        processed: Forwarded to the segment unchanged.

    Returns:
        A ``TranscriptSegment`` with a constant energy of 0.01.
    """
    # Imported lazily, as in the rest of this module, so collecting the file
    # does not import the jarvis package.
    from jarvis.listening.transcript_buffer import TranscriptSegment

    segment_fields = dict(
        text=text,
        start_time=start_time,
        end_time=start_time + 2.0,
        energy=0.01,
        is_during_tts=is_during_tts,
        processed=processed,
    )
    return TranscriptSegment(**segment_fields)
|
|
|
|
|
|
def run_intent_judge(case: IntentJudgeTestCase):
    """Run the gemma4:e2b intent judge on a single-transcript case.

    Args:
        case: Scenario whose transcript becomes the only buffered segment and
            is also passed as ``current_text``.

    Returns:
        ``None`` when ``judge.available`` is falsy, otherwise whatever
        ``judge.judge`` returns.
    """
    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

    config = IntentJudgeConfig(
        assistant_name="Jarvis",
        model="gemma4:e2b",
        timeout_sec=10.0,
    )
    judge = IntentJudge(config)
    if not judge.available:
        return None

    buffered = [create_transcript_segment(case.transcript)]
    # With prior TTS the finish time is 999.0, just before the default
    # segment start of 1000.0; without TTS it is 0.0.
    tts_finished_at = 999.0 if case.last_tts_text else 0.0

    return judge.judge(
        segments=buffered,
        wake_timestamp=case.wake_timestamp,
        last_tts_text=case.last_tts_text,
        last_tts_finish_time=tts_finished_at,
        in_hot_window=case.in_hot_window,
        current_text=case.transcript,
    )
|
|
|
|
|
|
def run_intent_judge_multi_segment(case: "MultiSegmentTestCase"):
    """Run the gemma4:e2b intent judge on a multi-segment case.

    Segments are laid out 2.0s apart starting at t=1000.0. ``current_text``
    is the text of the last segment not captured during TTS, or "" when every
    segment was captured during TTS.

    Returns:
        ``None`` when ``judge.available`` is falsy, otherwise whatever
        ``judge.judge`` returns.
    """
    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

    config = IntentJudgeConfig(
        assistant_name="Jarvis",
        aliases=list(case.aliases or []),
        model="gemma4:e2b",
        timeout_sec=10.0,
    )
    judge = IntentJudge(config)
    if not judge.available:
        return None

    base_time = 1000.0
    buffered = [
        create_transcript_segment(
            text=text,
            start_time=base_time + (index * 2.0),
            is_during_tts=during_tts,
        )
        for index, (text, during_tts) in enumerate(case.segments)
    ]

    # Latest segment that was not captured during TTS, if any.
    latest_live_text = next(
        (text for text, during_tts in reversed(case.segments) if not during_tts),
        "",
    )
    tts_finished_at = 999.0 if case.last_tts_text else 0.0

    return judge.judge(
        segments=buffered,
        wake_timestamp=case.wake_timestamp,
        last_tts_text=case.last_tts_text,
        last_tts_finish_time=tts_finished_at,
        in_hot_window=case.in_hot_window,
        current_text=latest_live_text,
    )
|
|
|
|
|
|
def is_intent_judge_available() -> bool:
    """Report whether a gemma4 model is listed by the local model server.

    Queries ``/api/tags`` on 127.0.0.1:11434 (presumably Ollama, given the
    ``ollama_base_url`` option used elsewhere in this file) with a 2s timeout.

    Returns:
        True when the response is HTTP 200 and some listed model name
        contains "gemma4"; False otherwise, including on any error during
        the probe.
    """
    import requests

    try:
        response = requests.get("http://127.0.0.1:11434/api/tags", timeout=2)
        if response.status_code != 200:
            return False
        payload = response.json()
        model_names = [entry.get("name", "") for entry in payload.get("models", [])]
        for model_name in model_names:
            if "gemma4" in model_name:
                return True
        return False
    except Exception:
        # Best-effort probe: any failure simply means "not available".
        return False
|
|
|
|
|
|
def _skip_if_not_intent_judge_phase():
    """Skip the calling test unless the eval phase's judge model is gemma4.

    The intent judge evals pin their own model (gemma4:e2b), so running them
    again in the large-model phase of the multi-model eval matrix would only
    repeat the same work. They run once, in the small-model (gemma4) phase.
    """
    if "gemma4" in JUDGE_MODEL:
        return
    pytest.skip(f"Intent judge tests only run in the gemma4 phase (current: {JUDGE_MODEL})")
|
|
|
|
|
|
# =============================================================================
|
|
# Tests
|
|
# =============================================================================
|
|
|
|
class TestIntentJudgeAccuracy:
    """Live-model evals for single-transcript intent judge verdicts."""

    @staticmethod
    def _print_report(case, result):
        """Print the case inputs and the judge's verdict for debugging."""
        print(f"\n{'='*60}")
        print(f"Test Case: {case.name}")
        print(f"Transcript: {case.transcript}")
        print(f"TTS: {case.last_tts_text[:50]}..." if case.last_tts_text else "TTS: None")
        print(f"Mode: {'hot_window' if case.in_hot_window else 'wake_word'}")
        print(f"{'='*60}")
        print(f"Result: directed={result.directed}, query='{result.query}', stop={result.stop}")
        print(f"Confidence: {result.confidence}")
        print(f"Reasoning: {result.reasoning}")
        print(f"{'='*60}")

    @pytest.mark.parametrize("case", INTENT_JUDGE_TEST_CASES, ids=lambda c: c.name)
    def test_intent_judge_case(self, case: IntentJudgeTestCase):
        """Check the directed/stop verdicts and query content for one case."""
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        if case.name in KNOWN_FAILING_CASES:
            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")

        result = run_intent_judge(case)
        if result is None:
            pytest.fail("Intent judge returned None")

        self._print_report(case, result)

        assert result.directed == case.expected_directed, (
            f"Expected directed={case.expected_directed}, got {result.directed}. "
            f"Reasoning: {result.reasoning}"
        )
        assert result.stop == case.expected_stop, (
            f"Expected stop={case.expected_stop}, got {result.stop}. "
            f"Reasoning: {result.reasoning}"
        )

        # Required substrings: a missing query counts as an empty string.
        for needle in _as_substring_list(case.expected_query_contains):
            assert needle.lower() in (result.query or "").lower(), (
                f"Expected query to contain '{needle}', "
                f"got '{result.query}'. Reasoning: {result.reasoning}"
            )

        # Forbidden substrings are only checked when a query was returned.
        if not result.query:
            return
        for needle in _as_substring_list(case.expected_query_not_contains):
            assert needle.lower() not in result.query.lower(), (
                f"Expected query to NOT contain '{needle}', "
                f"got '{result.query}'. Reasoning: {result.reasoning}"
            )
|
|
|
|
|
|
class TestIntentJudgePromptQuality:
    """Checks on the text of the prompts the intent judge builds.

    These tests only call the prompt-building methods of a default-configured
    ``IntentJudge``.
    """

    def test_hot_window_mode_indicated_in_prompt(self):
        """The user prompt names HOT WINDOW mode when in_hot_window is True."""
        from jarvis.listening.intent_judge import IntentJudge

        user_prompt = IntentJudge()._build_user_prompt(
            segments=[create_transcript_segment("hello")],
            wake_timestamp=None,
            last_tts_text="Test TTS",
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        assert "HOT WINDOW" in user_prompt

    def test_tts_text_included_for_echo_detection(self):
        """The user prompt carries the last TTS text."""
        from jarvis.listening.intent_judge import IntentJudge

        spoken = "The weather today is nice and sunny"
        user_prompt = IntentJudge()._build_user_prompt(
            segments=[create_transcript_segment("The weather is nice")],
            wake_timestamp=None,
            last_tts_text=spoken,
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        # "nice and sunny" occurs only in the TTS text, not in the segment.
        assert "nice and sunny" in user_prompt

    def test_system_prompt_has_echo_guidance(self):
        """The system prompt mentions echoes and the "(during TTS)" marker."""
        from jarvis.listening.intent_judge import IntentJudge

        system_prompt = IntentJudge()._build_system_prompt()

        assert "echo" in system_prompt.lower()
        assert "(during TTS)" in system_prompt
|
|
|
|
|
|
class TestIntentJudgeFallback:
    """Checks how the intent judge behaves when its backend cannot be reached."""

    def test_returns_none_when_ollama_unavailable(self):
        """judge() returns None when pointed at an unreachable base URL."""
        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

        # 99999 is above the highest valid TCP port (65535), so nothing can
        # be listening at this address.
        unreachable = IntentJudgeConfig(
            ollama_base_url="http://127.0.0.1:99999",
            timeout_sec=1.0,
        )
        judge = IntentJudge(unreachable)

        verdict = judge.judge([create_transcript_segment("test")])

        assert verdict is None
|
|
|
|
|
|
class TestIntentJudgeMultiSegment:
    """Live-model evals for verdicts on realistic multi-segment buffers."""

    @staticmethod
    def _print_report(case, result):
        """Print the buffered segments and the judge's verdict for debugging."""
        print(f"\n{'='*60}")
        print(f"Test Case: {case.name}")
        print("Segments:")
        for text, is_tts in case.segments:
            marker = " (during TTS)" if is_tts else ""
            print(f" - \"{text}\"{marker}")
        print(f"TTS: {case.last_tts_text[:50]}..." if case.last_tts_text else "TTS: None")
        print(f"Mode: {'hot_window' if case.in_hot_window else 'wake_word'}")
        print(f"{'='*60}")
        print(f"Result: directed={result.directed}, query='{result.query}', stop={result.stop}")
        print(f"Confidence: {result.confidence}")
        print(f"Reasoning: {result.reasoning}")
        print(f"{'='*60}")

    @pytest.mark.parametrize("case", MULTI_SEGMENT_TEST_CASES, ids=lambda c: c.name)
    def test_multi_segment_case(self, case: MultiSegmentTestCase):
        """Check the directed/stop verdicts and query content for one case."""
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        if case.name in KNOWN_FAILING_CASES:
            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")

        result = run_intent_judge_multi_segment(case)
        if result is None:
            pytest.fail("Intent judge returned None")

        self._print_report(case, result)

        assert result.directed == case.expected_directed, (
            f"Expected directed={case.expected_directed}, got {result.directed}. "
            f"Reasoning: {result.reasoning}"
        )
        assert result.stop == case.expected_stop, (
            f"Expected stop={case.expected_stop}, got {result.stop}. "
            f"Reasoning: {result.reasoning}"
        )

        # Required substrings: a missing query counts as an empty string.
        for needle in _as_substring_list(case.expected_query_contains):
            assert needle.lower() in (result.query or "").lower(), (
                f"Expected query to contain '{needle}', "
                f"got '{result.query}'. Reasoning: {result.reasoning}"
            )

        # Forbidden substrings are only checked when a query was returned.
        if not result.query:
            return
        for needle in _as_substring_list(case.expected_query_not_contains):
            assert needle.lower() not in result.query.lower(), (
                f"Expected query to NOT contain '{needle}', "
                f"got '{result.query}'. Reasoning: {result.reasoning}"
            )
|
|
|
|
|
|
class TestProcessedSegmentFiltering:
    """Tests for processed segment filtering in intent judge."""

    def test_processed_segment_not_reextracted(self):
        """A segment flagged processed must not leak into the new query.

        The buffer holds an earlier weather question with ``processed=True``
        and a new, unprocessed request. The extracted query must come from
        the new request and must not mention the weather.
        """
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

        judge = IntentJudge(IntentJudgeConfig(
            assistant_name="Jarvis",
            model="gemma4:e2b",
            timeout_sec=10.0,
        ))

        segments = [
            create_transcript_segment(
                text="Jarvis what's the weather in London",
                start_time=1000.0,
                processed=True,
            ),
            create_transcript_segment(
                text="Jarvis tell me a random topic",
                start_time=1010.0,
                processed=False,
            ),
        ]

        result = judge.judge(
            segments=segments,
            wake_timestamp=1010.0,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
            current_text="Jarvis tell me a random topic",
        )

        assert result is not None
        assert result.directed is True

        # Treat a missing query as "" (as the other eval classes do) so that
        # case fails the content assertion with its message instead of
        # raising AttributeError on None.lower().
        query_lower = (result.query or "").lower()
        assert "random" in query_lower or "topic" in query_lower, (
            f"Expected query about 'random topic', got '{result.query}'."
        )
        assert "weather" not in query_lower, (
            f"Query contains 'weather' from processed segment: '{result.query}'"
        )

        print(f"\n✅ Correctly extracted new query: '{result.query}'")
|