Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, while keeping the mature ~39k-line Python brain intact.

Changes:
- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join with voice receive/playback, and a pluggable VNC screen broadcast (selfbot live / noVNC / screenshot).
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API.
- .env.example, scripts/ (start_bridge/start_bot/dev), a README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md.

Language decision: hybrid (Python brain + Node/bun Discord layer), because Discord blocks video from bot accounts and native screen broadcast only works via a Node selfbot library.
This commit is contained in:
962
evals/test_intent_judge.py
Normal file
962
evals/test_intent_judge.py
Normal file
@@ -0,0 +1,962 @@
|
||||
"""
|
||||
Evals for the Intent Judge LLM.
|
||||
|
||||
Deduplicated suite covering all behaviour axes from the original 59 cases; the
single-segment and multi-segment case lists below hold the current set.
|
||||
See PR description / commit message for the dedup rationale.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List, Union
|
||||
|
||||
from helpers import JUDGE_MODEL, JUDGE_BASE_URL, is_judge_llm_available
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Data
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class IntentJudgeTestCase:
    """One single-transcript scenario for the intent judge, with expectations.

    The first five fields are the inputs handed to the judge; the
    ``expected_*`` fields are what the accuracy test asserts on the verdict.
    """

    # Identifier; also used as the pytest parametrize id.
    name: str
    # Text of the single transcript segment (also passed as ``current_text``).
    transcript: str
    # Last assistant TTS output; an empty string means there was none.
    last_tts_text: str
    # Forwarded to the judge as its ``in_hot_window`` flag.
    in_hot_window: bool
    # Forwarded to the judge as ``wake_timestamp``; None when no wake word fired.
    wake_timestamp: Union[float, None]
    # Expected value of ``result.directed``.
    expected_directed: bool
    # Substring(s) that must appear in ``result.query`` (case-insensitive).
    expected_query_contains: Union[str, List[str], None]
    # Substring(s) that must NOT appear in a non-empty ``result.query``.
    expected_query_not_contains: Union[str, List[str], None] = None
    # Expected value of ``result.stop``.
    expected_stop: bool = False
|
||||
|
||||
|
||||
# Single-segment cases - one per distinct behaviour axis.
# Each entry is run through run_intent_judge(), which wraps the transcript in
# a single segment and passes the remaining fields straight to the judge.
INTENT_JUDGE_TEST_CASES = [
    # Wake word + simple question (canonical directed+extract)
    IntentJudgeTestCase(
        name="wake_word_simple_question",
        transcript="Jarvis what time is it",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="time",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at sentence end, adjacent to a named entity. Regression guard:
    # the judge previously left "Jarvis" in the query, causing the reply engine
    # to treat "Possessor Jarvis" as the film title instead of "Possessor".
    IntentJudgeTestCase(
        name="wake_word_trailing_after_named_entity",
        transcript="what do you know about the movie called Possessor Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="possessor",
        expected_query_not_contains="jarvis",
    ),
    # Wake word mid-sentence (not at start, not at end). Ensures the judge
    # removes every occurrence, not just the leading one.
    IntentJudgeTestCase(
        name="wake_word_mid_sentence",
        transcript="hey Jarvis what's the weather in London",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.3,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="jarvis",
    ),
    # Wake word + command/imperative addressed to the assistant (not a question)
    IntentJudgeTestCase(
        name="wake_word_command_timer",
        transcript="Jarvis set a timer for 5 minutes",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="timer",
        expected_query_not_contains="jarvis",
    ),
    # Wake word + statement/command to remember something
    IntentJudgeTestCase(
        name="wake_word_statement_remember",
        transcript="Jarvis remind me to call mum at 5pm",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="mum",
    ),
    # Wake word + casual share-of-information statement (no explicit command
    # or question). Regression guard: the judge previously rejected these as
    # "not directed" because the sentence was a statement about the user's
    # own action rather than a command or question, even though the wake
    # word was clearly addressed to the assistant.
    IntentJudgeTestCase(
        name="wake_word_share_statement_burger",
        transcript="Jarvis, I just ate a burger from McDonald's.",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="burger",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_share_statement_feeling",
        transcript="Jarvis I'm feeling a bit tired today",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="tired",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at the END of a declarative statement. Position of the wake
    # word must not affect directedness — this pattern must also be directed.
    IntentJudgeTestCase(
        name="wake_word_share_statement_trailing",
        transcript="My flight just got cancelled, Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="flight",
        expected_query_not_contains="jarvis",
    ),
    # Wake word at the END of a declarative statement that contains a
    # capitalised brand/product name immediately before "Jarvis". Regression:
    # gemma4:e2b misread "big Mac Jarvis" as the compound name "Mac Jarvis",
    # treating "Jarvis" as a surname rather than the wake word, and returned
    # directed=false despite its own reasoning stating it found the wake word.
    IntentJudgeTestCase(
        name="wake_word_trailing_after_capitalised_brand",
        transcript="I just ate a big Mac Jarvis",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1001.5,
        expected_directed=True,
        expected_query_contains="big Mac",
        expected_query_not_contains="jarvis",
    ),
    # Self-contained imperative with an intentionally open subject ("something",
    # "anything", "a joke") — these are valid queries and must not be treated
    # as vague references or standalone "re-issue prior question" imperatives.
    # Regression: gemma4:e2b was returning directed=false with reasoning "no
    # extractable query" on "Jarvis say something please" because it conflated
    # the open subject with a topic-less question.
    IntentJudgeTestCase(
        name="wake_word_open_imperative_say_something",
        transcript="Jarvis say something please",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="say something",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_tell_me_a_joke",
        transcript="Jarvis tell me a joke",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="joke",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_tell_me_anything",
        transcript="Jarvis tell me anything",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="anything",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_give_me_advice",
        transcript="Jarvis give me advice please",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="advice",
        expected_query_not_contains="jarvis",
    ),
    IntentJudgeTestCase(
        name="wake_word_open_imperative_surprise_me",
        transcript="Jarvis surprise me",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.5,
        expected_directed=True,
        expected_query_contains="surprise",
        expected_query_not_contains="jarvis",
    ),
    # Same-segment context synthesis (distinct from simple wake+Q)
    IntentJudgeTestCase(
        name="context_synthesis_weather_opinion",
        transcript="I think the weather is great today in London. What do you think, Jarvis?",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=True,
        expected_query_contains="weather",
    ),
    # Echo + user follow-up in hot window
    IntentJudgeTestCase(
        name="echo_plus_followup_extracted",
        transcript="London has 8 hours of daylight. That's quite cool. Tell me more.",
        last_tts_text="On this day, London receives around 7-8 hours of daylight.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="more",
    ),
    # Stop command during TTS
    IntentJudgeTestCase(
        name="stop_command_during_tts",
        transcript="stop",
        last_tts_text="Let me tell you about the history of...",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains=None,
        expected_stop=True,
    ),
    # No wake word, not hot window -> not directed
    IntentJudgeTestCase(
        name="no_wake_word_casual_speech",
        transcript="I think the weather is nice today",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Wake word only mentioned in narrative -> not directed
    IntentJudgeTestCase(
        name="mentioned_in_narrative_past_tense",
        transcript="I told my friend about Jarvis yesterday",
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Hot window simple follow-up
    IntentJudgeTestCase(
        name="hot_window_simple_followup",
        transcript="What about next week?",
        last_tts_text="The weather this weekend will be rainy.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="next week",
    ),
]
|
||||
|
||||
|
||||
@dataclass
class MultiSegmentTestCase:
    """A scenario whose transcript buffer holds several segments.

    Mirrors the single-segment case shape, but ``segments`` replaces the lone
    transcript and ``aliases`` optionally configures extra wake-word spellings.
    """

    # Identifier for the scenario.
    name: str
    # Ordered ``(text, is_during_tts)`` tuples, oldest first.
    segments: list
    # Last assistant TTS output; an empty string means there was none.
    last_tts_text: str
    # Forwarded to the judge as its ``in_hot_window`` flag.
    in_hot_window: bool
    # Forwarded to the judge as ``wake_timestamp``; None when no wake word fired.
    wake_timestamp: Union[float, None]
    # Expected ``directed`` verdict.
    expected_directed: bool
    # Substring(s) expected in the extracted query.
    expected_query_contains: Union[str, List[str], None]
    # Substring(s) that must be absent from the extracted query.
    expected_query_not_contains: Union[str, List[str], None] = None
    # Expected ``stop`` verdict.
    expected_stop: bool = False
    # Wake-word aliases passed to the judge config; None means no aliases.
    aliases: Union[List[str], None] = None
|
||||
|
||||
|
||||
# Multi-segment cases. Each ``segments`` entry is a ``(text, is_during_tts)``
# tuple; run_intent_judge_multi_segment() spaces the segments 2.0 s apart
# starting at t=1000.0 and uses the last non-TTS segment as the current text.
MULTI_SEGMENT_TEST_CASES = [
    # Real-logs scenario: echo + rejected similar + wake retry
    MultiSegmentTestCase(
        name="echo_plus_rejected_similar_plus_wake_retry",
        segments=[
            ("and relatively windy, about 11 kilometers per hour", False),
            ("Okay, well, what about any new movies tomorrow?", False),
            ("Jarvis, what about new movies tomorrow?", False),
        ],
        last_tts_text="Tomorrow's weather in Kensington looks a bit gloomy, with overcast conditions expected. It'll be quite cool, around 6°C, and relatively windy, about 11 km/h.",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="movies",
        expected_query_not_contains="weather",
    ),
    # Hot window with echo in buffer + user follow-up
    MultiSegmentTestCase(
        name="buffer_echo_then_followup_hot_window",
        segments=[
            ("The weather is sunny and warm", False),
            ("What about the weekend?", False),
        ],
        last_tts_text="The weather today is sunny and warm, around 20 degrees.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="weekend",
        expected_query_not_contains="sunny",
    ),
    # Stop command with TTS echoes in buffer
    MultiSegmentTestCase(
        name="multiple_echoes_then_interrupt",
        segments=[
            ("Let me tell you about", True),
            ("the history of", True),
            ("Jarvis stop", False),
        ],
        last_tts_text="Let me tell you about the history of ancient Rome.",
        in_hot_window=False,
        wake_timestamp=1002.0,
        expected_directed=True,
        expected_query_contains=None,
        expected_stop=True,
    ),
    # No wake word in multi-segment buffer
    MultiSegmentTestCase(
        name="no_wake_word_in_buffer",
        segments=[
            ("How are you?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=None,
        expected_directed=False,
        expected_query_contains=None,
    ),
    # Context synthesis with prior ambient speech that must be filtered
    MultiSegmentTestCase(
        name="context_synthesis_with_prior_ambient",
        segments=[
            ("Did you see the game last night?", False),
            ("Yeah it was amazing", False),
            ("The food here is excellent. Jarvis, what's the best dish to order?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="dish",
        expected_query_not_contains="game",
    ),
    # Multi-person conversation: context synthesis across speakers without explicit pronoun
    MultiSegmentTestCase(
        name="multi_person_weather_discussion",
        segments=[
            ("I wonder what the weather will be like tomorrow", False),
            ("Yeah we should check before planning the picnic", False),
            ("Jarvis what do you think", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="weather",
    ),
    # Multi-person + vague reference ("that" = iPhone from earlier segment)
    MultiSegmentTestCase(
        name="multi_person_vague_reference",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("I heard the camera is amazing", False),
            ("Jarvis how much does that cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="iphone",
    ),
    # User statement follow-up in hot window (not an echo of TTS question)
    MultiSegmentTestCase(
        name="user_followup_statement_after_question_nihilism",
        segments=[
            ("Some people find that appealing", True),
            ("While others see it as a bleak outlook", True),
            ("What are your thoughts on nihilism", True),
            ("I think it's way more ridiculous than absurdism. Absurdism is the way to go.", False),
        ],
        last_tts_text="Nihilism is an interesting philosophical position. Some people find it appealing, while others see it as a bleak outlook. What are your thoughts on nihilism?",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="absurdism",
        expected_query_not_contains="what are your thoughts",
    ),
    # Cross-segment vague reference ("that" -> dinosaurs)
    MultiSegmentTestCase(
        name="cross_segment_dinosaur_opinion",
        segments=[
            ("I think dinosaurs are cool", False),
            ("What do you think about that Jarvis", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="dinosaur",
    ),
    # Imperative resolution: "answer that" -> re-issue prior question
    MultiSegmentTestCase(
        name="cross_segment_answer_that_weather",
        segments=[
            ("Sorry, how's the weather today?", False),
            ("Jarvis, answer that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="answer that",
    ),
    # Imperative resolution with unrelated noise between Q and imperative
    MultiSegmentTestCase(
        name="cross_segment_answer_that_with_noise",
        segments=[
            ("How tall is Mount Everest", False),
            ("Charlie sands to that", False),
            ("Jarvis answer that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="everest",
        expected_query_not_contains="answer that",
    ),
    # Whisper tense variant of imperative ("answered that")
    MultiSegmentTestCase(
        name="cross_segment_answered_that_whisper_variant",
        segments=[
            ("Sorry, how's the weather today?", False),
            ("Jarvis answered that", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="weather",
        expected_query_not_contains="answered that",
    ),
    # Multi-word imperative variant
    MultiSegmentTestCase(
        name="cross_segment_go_ahead_and_answer",
        segments=[
            ("What's the capital of Portugal", False),
            ("Jarvis go ahead and answer", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="portugal",
        expected_query_not_contains="go ahead and answer",
    ),
    # Imperative superseded by new explicit question in same segment
    MultiSegmentTestCase(
        name="cross_segment_imperative_superseded_by_new_question",
        segments=[
            ("How's the weather today?", False),
            ("Jarvis, answer that — actually, what time is it?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1002.5,
        expected_directed=True,
        expected_query_contains="time",
        expected_query_not_contains="weather",
    ),
    # Cross-segment follow-up in hot window (topic extension)
    MultiSegmentTestCase(
        name="cross_segment_hot_window_followup",
        segments=[
            ("The capital of France is Paris", True),
            ("What about Germany", False),
        ],
        last_tts_text="The capital of France is Paris, known as the City of Light.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains="germany",
    ),
    # Alias (Whisper mishearing) should be treated as the wake word. Without
    # alias normalisation the small model sees "Jervis" and decides the user
    # is addressing a different person.
    # NOTE(review): "jervis" is listed twice in the aliases below — confirm
    # whether a different spelling was intended for one of them.
    MultiSegmentTestCase(
        name="alias_treated_as_wake_word",
        segments=[
            ("Jervis, what time is it in London?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1000.8,
        expected_directed=True,
        expected_query_contains="time",
        aliases=["jervis", "jaivis", "jervis", "javis"],
    ),
    # Alias mid-utterance after narrative context — the model must still
    # recognise the addressee as the assistant and resolve the vague reference.
    # NOTE(review): same duplicated "jervis" alias as the previous case.
    MultiSegmentTestCase(
        name="alias_after_narrative_context",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("I heard the camera is amazing", False),
            ("Jaivis how much does that cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.0,
        expected_directed=True,
        expected_query_contains="iphone",
        aliases=["jervis", "jaivis", "jervis", "javis"],
    ),
    # Buried target sentence amid interleaved unrelated chatter (multi-topic
    # disambiguation). Two separate topics coexist in the buffer — iPhone
    # pricing thread and an unrelated Yankees game discussion. The wake-word
    # segment contains a vague reference ("it") that must resolve to the
    # correct thread (iPhone), not the most recent unrelated topic.
    MultiSegmentTestCase(
        name="buried_target_amid_unrelated_chatter",
        segments=[
            ("The new iPhone looks pretty cool", False),
            ("Did you see the Yankees game last night", False),
            ("I heard the camera is amazing on that phone", False),
            ("Yeah that was a great play in the ninth inning", False),
            ("Jarvis how much does it cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1008.5,
        expected_directed=True,
        expected_query_contains="iphone",
        expected_query_not_contains="yankees",
    ),
    # Same buried-target disambiguation, but the wake-word question has no
    # explicit pronoun ("what's the price" instead of "how much does it cost").
    # The judge must still resolve the topic from prior segments — a query of
    # "what's the price" is not answerable alone.
    MultiSegmentTestCase(
        name="buried_target_topicless_question",
        segments=[
            ("so anyway the meeting ran really long yesterday", False),
            ("did you catch the ball game", False),
            ("the new iPhone is out", False),
            ("yeah they lost again though", False),
            ("I want the pro model", False),
            ("Jarvis what's the price", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1010.5,
        expected_directed=True,
        # Parent-noun rule: resolving to a sub-item ("pro model") must also
        # include the parent noun/brand ("iPhone") — "pro model" alone is
        # not self-contained.
        expected_query_contains=["iphone", "pro"],
        expected_query_not_contains="ball game",
    ),
    # Vague reference "they" — the AirPods are the only plural antecedent
    # that can be cost-queried, so "how much do they cost" must resolve to
    # the AirPods thread and include the brand/noun in the query.
    MultiSegmentTestCase(
        name="buried_target_plural_vague_ref_they",
        segments=[
            ("the AirPods sound great", False),
            ("yeah the bass is really punchy", False),
            ("Jarvis how much do they cost", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1006.5,
        expected_directed=True,
        expected_query_contains="airpods",
    ),
    # Hot-window override: a topic-less follow-up ("tell me more") in hot
    # window must stay directed=true even though a topic-rich earlier buffer
    # would otherwise trigger the topic-resolution heuristic. The HOT WINDOW
    # rule must win over the "topic-less question" vague-reference rule.
    MultiSegmentTestCase(
        name="hot_window_override_topicless_followup",
        segments=[
            ("the new iPhone is out", False),
            ("I want the pro model", False),
            ("tell me more", False),
        ],
        last_tts_text="The iPhone 16 Pro has a titanium frame and a new camera system.",
        in_hot_window=True,
        wake_timestamp=None,
        expected_directed=True,
        expected_query_contains=None,
    ),
    # Wake word mid-utterance after narrative buffer, addressing the assistant.
    # Real-world case: user was discussing Mata Hari in the background, then
    # turned to the assistant with "Jarvis, do you know what she's talking about,
    # about Mata Hari?". The small model mis-classified as "not directed" with
    # reasoning that contradicted the verdict. The wake word is mid-utterance
    # here but the trailing clause addresses the assistant directly ("do YOU
    # know"), so this must be DIRECTED.
    MultiSegmentTestCase(
        name="wake_word_after_narrative_addresses_assistant",
        segments=[
            ("The dude was a lie upon the lie", False),
            ("Mata Hari was never a traitor, she was an honest woman", False),
            ("Jarvis, do you know what she's talking about, about Mata Hari?", False),
        ],
        last_tts_text="",
        in_hot_window=False,
        wake_timestamp=1004.5,
        expected_directed=True,
        expected_query_contains="mata hari",
    ),
]
|
||||
|
||||
|
||||
# Cases known to fail with the small model on the current prompt.
# Track regressions / future prompt improvements here.
# Entries are case ``name`` strings; the accuracy test marks any case whose
# name is in this set as xfail instead of running it. Currently empty.
KNOWN_FAILING_CASES: set = set()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
def _as_substring_list(value):
|
||||
"""Normalise an expected_query_contains / _not_contains value to a list."""
|
||||
if value is None:
|
||||
return []
|
||||
if isinstance(value, str):
|
||||
return [value]
|
||||
return list(value)
|
||||
|
||||
|
||||
def create_transcript_segment(
    text: str,
    start_time: float = 1000.0,
    is_during_tts: bool = False,
    processed: bool = False,
):
    """Build a ``TranscriptSegment`` fixture.

    The segment always lasts 2.0 seconds from ``start_time`` and carries a
    fixed energy of 0.01; the remaining arguments are passed through as-is.
    """
    # Imported lazily so that collecting this module does not import jarvis.
    from jarvis.listening.transcript_buffer import TranscriptSegment

    segment_fields = dict(
        text=text,
        start_time=start_time,
        end_time=start_time + 2.0,
        energy=0.01,
        is_during_tts=is_during_tts,
        processed=processed,
    )
    return TranscriptSegment(**segment_fields)
|
||||
|
||||
|
||||
def run_intent_judge(case: IntentJudgeTestCase):
    """Evaluate one single-segment case with the intent judge.

    Returns whatever ``IntentJudge.judge`` returns, or ``None`` when the judge
    reports itself as unavailable.
    """
    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

    config = IntentJudgeConfig(
        assistant_name="Jarvis",
        model="gemma4:e2b",
        timeout_sec=10.0,
    )
    judge = IntentJudge(config)
    if not judge.available:
        return None

    # The whole transcript is presented as one buffer segment and also as the
    # current utterance.
    buffer = [create_transcript_segment(case.transcript)]
    # A finish time is only supplied when the case has TTS text at all.
    tts_finish_time = 999.0 if case.last_tts_text else 0.0

    return judge.judge(
        segments=buffer,
        wake_timestamp=case.wake_timestamp,
        last_tts_text=case.last_tts_text,
        last_tts_finish_time=tts_finish_time,
        in_hot_window=case.in_hot_window,
        current_text=case.transcript,
    )
|
||||
|
||||
|
||||
def run_intent_judge_multi_segment(case: "MultiSegmentTestCase"):
    """Evaluate one multi-segment case with the intent judge.

    Segments are laid out 2.0 seconds apart starting at t=1000.0. The current
    utterance is the text of the last segment not flagged as during-TTS (an
    empty string if every segment is flagged). Returns the judge's result, or
    ``None`` when the judge reports itself as unavailable.
    """
    from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

    config = IntentJudgeConfig(
        assistant_name="Jarvis",
        aliases=list(case.aliases or []),
        model="gemma4:e2b",
        timeout_sec=10.0,
    )
    judge = IntentJudge(config)
    if not judge.available:
        return None

    first_start = 1000.0
    buffer = [
        create_transcript_segment(
            text=segment_text,
            start_time=first_start + (index * 2.0),
            is_during_tts=during_tts,
        )
        for index, (segment_text, during_tts) in enumerate(case.segments)
    ]

    # Walk backwards to find the most recent non-TTS segment.
    latest_user_text = next(
        (
            segment_text
            for segment_text, during_tts in reversed(case.segments)
            if not during_tts
        ),
        "",
    )
    tts_finish_time = 999.0 if case.last_tts_text else 0.0

    return judge.judge(
        segments=buffer,
        wake_timestamp=case.wake_timestamp,
        last_tts_text=case.last_tts_text,
        last_tts_finish_time=tts_finish_time,
        in_hot_window=case.in_hot_window,
        current_text=latest_user_text,
    )
|
||||
|
||||
|
||||
def is_intent_judge_available() -> bool:
    """Report whether the local model server lists a ``gemma4`` model.

    Queries ``/api/tags`` on 127.0.0.1:11434 with a 2-second timeout. Any
    non-200 response, malformed payload or request error yields ``False``.
    """
    import requests

    try:
        response = requests.get("http://127.0.0.1:11434/api/tags", timeout=2)
        if response.status_code == 200:
            payload = response.json()
            model_names = [
                entry.get("name", "") for entry in payload.get("models", [])
            ]
            for model_name in model_names:
                if "gemma4" in model_name:
                    return True
        return False
    except Exception:
        # Best-effort probe: every failure simply means "not available".
        return False
|
||||
|
||||
|
||||
def _skip_if_not_intent_judge_phase():
    """Skip the calling test unless the eval run is in its gemma4 phase.

    The intent judge tests are pinned to gemma4:e2b, so under the multi-model
    eval matrix they would otherwise run twice. They are skipped in the
    large-model phase to save runtime and still run once in the small-model
    (gemma4) phase.
    """
    if "gemma4" in JUDGE_MODEL:
        return
    pytest.skip(f"Intent judge tests only run in the gemma4 phase (current: {JUDGE_MODEL})")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestIntentJudgeAccuracy:
    """Live accuracy evals: one judge call per single-segment case."""

    @pytest.mark.parametrize("case", INTENT_JUDGE_TEST_CASES, ids=lambda c: c.name)
    def test_intent_judge_case(self, case: IntentJudgeTestCase):
        """Run the judge on ``case`` and compare its verdict with expectations.

        Skipped outside the gemma4 phase or when the model is not served;
        cases listed in ``KNOWN_FAILING_CASES`` are marked xfail.
        """
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        if case.name in KNOWN_FAILING_CASES:
            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")

        result = run_intent_judge(case)
        if result is None:
            pytest.fail("Intent judge returned None")

        # Diagnostic dump, visible with `pytest -s` or on failure.
        banner = "=" * 60
        if case.last_tts_text:
            tts_line = f"TTS: {case.last_tts_text[:50]}..."
        else:
            tts_line = "TTS: None"
        mode = "hot_window" if case.in_hot_window else "wake_word"

        print(f"\n{banner}")
        print(f"Test Case: {case.name}")
        print(f"Transcript: {case.transcript}")
        print(tts_line)
        print(f"Mode: {mode}")
        print(banner)
        print(f"Result: directed={result.directed}, query='{result.query}', stop={result.stop}")
        print(f"Confidence: {result.confidence}")
        print(f"Reasoning: {result.reasoning}")
        print(banner)

        assert result.directed == case.expected_directed, (
            f"Expected directed={case.expected_directed}, got {result.directed}. "
            f"Reasoning: {result.reasoning}"
        )
        assert result.stop == case.expected_stop, (
            f"Expected stop={case.expected_stop}, got {result.stop}. "
            f"Reasoning: {result.reasoning}"
        )

        # Required substrings: a missing/empty query counts as "".
        required = _as_substring_list(case.expected_query_contains)
        for needle in required:
            assert needle.lower() in (result.query or "").lower(), (
                f"Expected query to contain '{needle}', "
                f"got '{result.query}'. Reasoning: {result.reasoning}"
            )

        # Forbidden substrings are only checked when a query was extracted.
        if result.query:
            forbidden = _as_substring_list(case.expected_query_not_contains)
            for needle in forbidden:
                assert needle.lower() not in result.query.lower(), (
                    f"Expected query to NOT contain '{needle}', "
                    f"got '{result.query}'. Reasoning: {result.reasoning}"
                )
|
||||
|
||||
|
||||
class TestIntentJudgePromptQuality:
    """Tests for intent judge prompt construction quality."""

    def test_hot_window_mode_indicated_in_prompt(self):
        """A user prompt built with ``in_hot_window=True`` mentions "HOT WINDOW"."""
        from jarvis.listening.intent_judge import IntentJudge

        user_prompt = IntentJudge()._build_user_prompt(
            segments=[create_transcript_segment("hello")],
            wake_timestamp=None,
            last_tts_text="Test TTS",
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        assert "HOT WINDOW" in user_prompt

    def test_tts_text_included_for_echo_detection(self):
        """The last TTS text passed in is present in the built user prompt."""
        from jarvis.listening.intent_judge import IntentJudge

        judge = IntentJudge()
        heard = [create_transcript_segment("The weather is nice")]
        spoken = "The weather today is nice and sunny"

        user_prompt = judge._build_user_prompt(
            segments=heard,
            wake_timestamp=None,
            last_tts_text=spoken,
            last_tts_finish_time=999.0,
            in_hot_window=True,
        )

        # "nice and sunny" occurs only in the TTS text, not in the segment.
        assert "nice and sunny" in user_prompt

    def test_system_prompt_has_echo_guidance(self):
        """The system prompt mentions echo and the "(during TTS)" marker."""
        from jarvis.listening.intent_judge import IntentJudge

        system_prompt = IntentJudge()._build_system_prompt()

        assert "echo" in system_prompt.lower()
        assert "(during TTS)" in system_prompt
|
||||
|
||||
|
||||
class TestIntentJudgeFallback:
    """Tests for intent judge fallback behaviour."""

    def test_returns_none_when_ollama_unavailable(self):
        """``judge()`` returns ``None`` when pointed at an unusable Ollama URL."""
        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

        # NOTE(review): 99999 is above the maximum TCP port (65535), so this
        # presumably exercises an invalid-URL failure rather than a refused
        # connection - confirm that is the intended scenario.
        unusable_config = IntentJudgeConfig(
            ollama_base_url="http://127.0.0.1:99999",
            timeout_sec=1.0,
        )
        judge = IntentJudge(unusable_config)

        outcome = judge.judge([create_transcript_segment("test")])

        assert outcome is None
|
||||
|
||||
|
||||
class TestIntentJudgeMultiSegment:
    """Evals for intent judge with realistic multi-segment transcript buffers."""

    @pytest.mark.parametrize("case", MULTI_SEGMENT_TEST_CASES, ids=lambda c: c.name)
    def test_multi_segment_case(self, case: MultiSegmentTestCase):
        """Run one multi-segment eval case through the intent judge.

        Skipped when the phase gate or the model-availability check says
        so; marked xfail (before the judge is invoked) when the case name
        is in ``KNOWN_FAILING_CASES``. Otherwise the verdict is printed and
        checked against the expected ``directed`` / ``stop`` flags and the
        expected query substrings (compared case-insensitively).
        """
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        # pytest.xfail() raises immediately, so known-failing cases never
        # reach the model call below.
        if case.name in KNOWN_FAILING_CASES:
            pytest.xfail(f"Known issue: {case.name} needs prompt improvement")

        verdict = run_intent_judge_multi_segment(case)
        if verdict is None:
            pytest.fail("Intent judge returned None")

        # Diagnostic dump of the inputs and the verdict.
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"Test Case: {case.name}")
        print("Segments:")
        for segment_text, heard_during_tts in case.segments:
            if heard_during_tts:
                suffix = " (during TTS)"
            else:
                suffix = ""
            print(f" - \"{segment_text}\"{suffix}")
        if case.last_tts_text:
            # Only the first 50 characters of the TTS text are shown.
            print(f"TTS: {case.last_tts_text[:50]}...")
        else:
            print("TTS: None")
        mode = "hot_window" if case.in_hot_window else "wake_word"
        print(f"Mode: {mode}")
        print(rule)
        print(f"Result: directed={verdict.directed}, query='{verdict.query}', stop={verdict.stop}")
        print(f"Confidence: {verdict.confidence}")
        print(f"Reasoning: {verdict.reasoning}")
        print(rule)

        assert verdict.directed == case.expected_directed, (
            f"Expected directed={case.expected_directed}, got {verdict.directed}. "
            f"Reasoning: {verdict.reasoning}"
        )
        assert verdict.stop == case.expected_stop, (
            f"Expected stop={case.expected_stop}, got {verdict.stop}. "
            f"Reasoning: {verdict.reasoning}"
        )

        # A missing query is treated as an empty string for the "contains" checks.
        extracted = verdict.query or ""
        for fragment in _as_substring_list(case.expected_query_contains):
            assert fragment.lower() in extracted.lower(), (
                f"Expected query to contain '{fragment}', "
                f"got '{verdict.query}'. Reasoning: {verdict.reasoning}"
            )

        # The "must not contain" checks only apply when a query was extracted.
        if not extracted:
            return
        for fragment in _as_substring_list(case.expected_query_not_contains):
            assert fragment.lower() not in extracted.lower(), (
                f"Expected query to NOT contain '{fragment}', "
                f"got '{verdict.query}'. Reasoning: {verdict.reasoning}"
            )
|
||||
|
||||
|
||||
class TestProcessedSegmentFiltering:
    """Tests for processed segment filtering in intent judge."""

    def test_processed_segment_not_reextracted(self):
        """A segment flagged ``processed=True`` must not leak into the new query.

        The buffer holds an already-processed weather question followed by
        an unprocessed "random topic" request; the wake timestamp and
        ``current_text`` both point at the second segment. The judge is
        expected to report the utterance as directed, with a query that
        mentions "random" or "topic" and does not mention "weather".
        """
        _skip_if_not_intent_judge_phase()
        if not is_intent_judge_available():
            pytest.skip("Intent judge model (gemma4) not available")

        from jarvis.listening.intent_judge import IntentJudge, IntentJudgeConfig

        judge = IntentJudge(IntentJudgeConfig(
            assistant_name="Jarvis",
            model="gemma4:e2b",
            timeout_sec=10.0,
        ))

        segments = [
            # Earlier request that has already been handled.
            create_transcript_segment(
                text="Jarvis what's the weather in London",
                start_time=1000.0,
                processed=True,
            ),
            # New request; its start_time matches wake_timestamp below.
            create_transcript_segment(
                text="Jarvis tell me a random topic",
                start_time=1010.0,
                processed=False,
            ),
        ]

        result = judge.judge(
            segments=segments,
            wake_timestamp=1010.0,
            last_tts_text="",
            last_tts_finish_time=0.0,
            in_hot_window=False,
            current_text="Jarvis tell me a random topic",
        )

        assert result is not None
        assert result.directed is True

        # Normalise once. A missing query then fails the assertion below with
        # its message instead of raising AttributeError on None.lower(), which
        # matches how the other tests in this file treat result.query.
        query_text = (result.query or "").lower()
        assert "random" in query_text or "topic" in query_text, (
            f"Expected query about 'random topic', got '{result.query}'."
        )
        assert "weather" not in query_text, (
            f"Query contains 'weather' from processed segment: '{result.query}'"
        )

        print(f"\n✅ Correctly extracted new query: '{result.query}'")
|
||||
Reference in New Issue
Block a user