Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
240 lines
12 KiB
Python
240 lines
12 KiB
Python
"""Tests for compound-query decomposition used by small models."""
|
||
|
||
import pytest
|
||
|
||
from jarvis.reply.compound_query import (
|
||
CJK_MIN_CLAUSE_CHARS,
|
||
DEFAULT_MIN_CLAUSE_CHARS,
|
||
MIN_CLAUSE_CHARS,
|
||
split_compound_query,
|
||
)
|
||
|
||
|
||
class TestSplitCompoundQuery:
|
||
"""Behaviour-level tests for the compound-query splitter."""
|
||
|
||
# ── English: positive cases ────────────────────────────────────────────
|
||
def test_multi_part_entity_query_splits(self):
    """Two English questions joined by 'and' come back as two clauses."""
    query = "Who directed Possessor and what other films has that director made?"
    clauses = split_compound_query(query, language="en")

    assert len(clauses) == 2
    # Each clause must begin with the text found on its side of the 'and'.
    head, tail = clauses
    assert head.startswith("Who directed Possessor")
    assert tail.startswith("what other films")
|
||
|
||
def test_and_is_case_insensitive(self):
    """An upper-case 'AND' is expected to split just like a lower-case one."""
    query = "Show me the weather AND list my reminders for today"
    clauses = split_compound_query(query, language="en")
    assert len(clauses) == 2
|
||
|
||
def test_extra_whitespace_around_and(self):
    """The query around 'and' must be split into two clauses."""
    # NOTE(review): the literal below has only single spaces around 'and',
    # while the test name suggests extra padding was intended — confirm the
    # string was not whitespace-collapsed somewhere along the way.
    query = "Tell me about Britney Spears and what her best song is"
    clauses = split_compound_query(query, language="en")
    assert len(clauses) == 2
|
||
|
||
# ── English: negative cases (idioms / short clauses) ───────────────────
|
||
def test_rock_and_roll_does_not_split(self):
    """An idiomatic 'X and Y' with a short left side must stay unsplit."""
    outcome = split_compound_query("Rock and roll history", language="en")
    assert outcome == []
|
||
|
||
def test_pros_and_cons_does_not_split(self):
    """'pros' (4 characters) on the left of 'and' must not produce a split."""
    outcome = split_compound_query("pros and cons of remote work", language="en")
    assert outcome == []
|
||
|
||
def test_short_left_side_does_not_split(self):
    """A left clause one character under MIN_CLAUSE_CHARS yields no split."""
    # Left side sits just below the threshold; right side is comfortably above.
    short = "x" * (MIN_CLAUSE_CHARS - 1)
    long = "x" * (MIN_CLAUSE_CHARS + 5)
    query = f"{short} and {long}"

    outcome = split_compound_query(query, language="en")
    assert outcome == []
|
||
|
||
def test_short_right_side_does_not_split(self):
    """A right clause one character under MIN_CLAUSE_CHARS yields no split."""
    # Mirror image of the short-left case: only the right side is too short.
    short = "x" * (MIN_CLAUSE_CHARS - 1)
    long = "x" * (MIN_CLAUSE_CHARS + 5)
    query = f"{long} and {short}"

    outcome = split_compound_query(query, language="en")
    assert outcome == []
|
||
|
||
def test_both_at_threshold_splits(self):
    """Clauses of exactly MIN_CLAUSE_CHARS on both sides are long enough."""
    at_threshold = "x" * MIN_CLAUSE_CHARS
    query = f"{at_threshold} and {at_threshold}"

    clauses = split_compound_query(query, language="en")
    assert len(clauses) == 2
|
||
|
||
def test_multiple_ands_only_first_split(self):
    """With several ' and ' occurrences, only one split is made."""
    query = "Tell me about dogs and cats and also birds please"
    clauses = split_compound_query(query, language="en")

    assert len(clauses) == 2
    # Everything after the first ' and ' — including the second one — must
    # remain together in the right-hand clause.
    remainder = clauses[1]
    assert "cats" in remainder
    assert "birds" in remainder
|
||
|
||
def test_empty_and_none_are_safe(self):
    """An empty string and None both produce an empty result."""
    empty_result = split_compound_query("", language="en")
    assert empty_result == []

    # None is deliberately passed despite the declared argument type.
    none_result = split_compound_query(None, language="en")  # type: ignore[arg-type]
    assert none_result == []
|
||
|
||
def test_no_conjunction_returns_empty(self):
    """A query that contains no conjunction at all is left unsplit."""
    outcome = split_compound_query("What is the weather today?", language="en")
    assert outcome == []
|
||
|
||
def test_bare_and_without_whitespace_does_not_split(self):
    """'and' embedded inside a word (as in 'commandline') must not split."""
    outcome = split_compound_query("commandline tools are useful", language="en")
    assert outcome == []
|
||
|
||
# ── Whitespace-separated supported languages ───────────────────────────
|
||
@pytest.mark.parametrize("language,query", [
    # Germanic / Romance
    ("es", "Quién dirigió Possessor y qué otras películas ha hecho"),
    ("fr", "Qui a réalisé Possessor et quels autres films a-t-il faits"),
    ("de", "Wer führte Regie bei Possessor und welche anderen Filme hat er"),
    ("pt", "Quem dirigiu Possessor e quais outros filmes fez o diretor"),
    ("it", "Chi ha diretto Possessor e quali altri film ha fatto"),
    ("nl", "Wie regisseerde Possessor en welke andere films maakte hij"),
    ("sv", "Vem regisserade Possessor och vilka andra filmer har han gjort"),
    ("no", "Hvem regisserte Possessor og hvilke andre filmer har han laget"),
    ("da", "Hvem instruerede Possessor og hvilke andre film har han lavet"),
    ("fi", "Kuka ohjasi Possessorin ja mitä muita elokuvia hän on tehnyt"),
    # Slavic
    ("ru", "Кто снял фильм Поссессор и какие другие фильмы он снял"),
    ("uk", "Хто зняв фільм Поссессор і які інші фільми він зробив"),
    ("pl", "Kto wyreżyserował Possessor i jakie inne filmy zrobił"),
    ("cs", "Kdo režíroval Possessor a jaké další filmy natočil"),
    ("sk", "Kto režíroval Possessor a aké ďalšie filmy natočil"),
    ("bg", "Кой режисира Поссесор и какви други филми е направил"),
    ("hr", "Tko je režirao Possessor i koje druge filmove je snimio"),
    ("sl", "Kdo je režiral Possessor in katere druge filme je posnel"),
    # Other European
    ("el", "Ποιος σκηνοθέτησε το Possessor και ποιες άλλες ταινίες έχει κάνει"),
    ("tr", "Possessor filmini kim yönetti ve başka hangi filmleri yaptı"),
    ("hu", "Ki rendezte a Possessort és milyen más filmeket csinált"),
    ("ro", "Cine a regizat Possessor și ce alte filme a făcut"),
    # Asian whitespace-separated
    ("vi", "Ai đạo diễn Possessor và đạo diễn đó đã làm phim nào khác"),
    ("id", "Siapa sutradara Possessor dan film apa lagi yang sudah dibuat"),
    ("ms", "Siapa pengarah Possessor dan filem apa lagi yang telah dibuat"),
    ("hi", "पोसेसर का निर्देशन किसने किया और निर्देशक ने और कौन सी फिल्में बनाई"),
])
def test_supported_languages_split(self, language, query):
    """Each (language, query) pair above must be split into two clauses.

    Every query is a two-part question written in the given language; the
    failure message names the language code and shows the returned parts.
    """
    parts = split_compound_query(query, language=language)
    clause_count = len(parts)
    assert clause_count == 2, f"{language}: expected split, got {parts!r}"
|
||
|
||
def test_italian_ed_variant(self):
    """The Italian 'ed' form (used before a vowel) also triggers a split."""
    query = "Parlami della storia ed anche della geografia del paese"
    clauses = split_compound_query(query, language="it")
    assert len(clauses) == 2
|
||
|
||
# ── Non-English: unsupported / unknown languages ───────────────────────
|
||
def test_unsupported_language_does_not_split(self):
    """English ' and ' must not cause a split under a Japanese language code.

    The query is deliberately contrived: English text paired with
    language="ja". The expected result is [] — i.e. English conjunction
    rules are not applied when a different language code is given.
    """
    # NOTE(review): despite the test name, 'ja' has its own passing split
    # test in this class (on a Japanese conjunction), so what is checked
    # here appears to be "no English fallback" rather than "unsupported
    # language" — confirm the intended wording.
    query = "some long query and another long query"
    outcome = split_compound_query(query, language="ja")
    assert outcome == []
|
||
|
||
def test_invalid_language_code_defaults_to_english(self):
    """A one-character code ('x') is expected to behave like English."""
    query = "Tell me about cats and also about dogs please"
    clauses = split_compound_query(query, language="x")
    # The English ' and ' still splits the query in two.
    assert len(clauses) == 2
|
||
|
||
def test_none_language_defaults_to_english(self):
    """With language=None the English ' and ' is still expected to split."""
    query = "Who is Britney Spears and what is her best song"
    clauses = split_compound_query(query, language=None)
    assert len(clauses) == 2
|
||
|
||
def test_uppercase_language_code_normalises(self):
    """An upper-case code ('ES') must split a Spanish query like 'es' does."""
    query = "Quién dirigió Possessor y qué otras películas ha hecho"
    clauses = split_compound_query(query, language="ES")
    assert len(clauses) == 2
|
||
|
||
def test_language_with_region_suffix_normalises(self):
    """A region-qualified code such as 'en-US' must be treated as English."""
    query = "Who is Britney Spears and what is her best song"
    clauses = split_compound_query(query, language="en-US")
    assert len(clauses) == 2
|
||
|
||
# ── Non-English: idioms should not false-positive ──────────────────────
|
||
def test_french_va_et_vient_short_left_side(self):
    """The idiom 'va et vient' (2-character left side) must not be split."""
    outcome = split_compound_query("va et vient", language="fr")
    assert outcome == []
|
||
|
||
# ── CJK (no whitespace around conjunctions) ────────────────────────────
|
||
def test_chinese_character_level_conjunction_splits(self):
    """A Chinese query joined by 和 (no surrounding whitespace) splits in two."""
    clauses = split_compound_query("电影的历史和音乐的发展", language="zh")

    assert len(clauses) == 2
    # The film-related text lands on the left, the music-related on the right.
    left, right = clauses
    assert "电影" in left
    assert "音乐" in right
|
||
|
||
def test_chinese_short_clauses_do_not_split(self):
    """'我和他' has one character on each side of 和 and must stay unsplit."""
    outcome = split_compound_query("我和他", language="zh")
    assert outcome == []
|
||
|
||
def test_chinese_threshold_is_lower_than_default(self):
    """CJK_MIN_CLAUSE_CHARS must be strictly below DEFAULT_MIN_CLAUSE_CHARS."""
    cjk_limit = CJK_MIN_CLAUSE_CHARS
    default_limit = DEFAULT_MIN_CLAUSE_CHARS
    assert cjk_limit < default_limit
|
||
|
||
def test_chinese_multi_char_conjunction_splits(self):
    """A Chinese query joined by the two-character 并且 splits in two."""
    query = "请告诉我关于狗的信息并且告诉我关于猫的信息"
    clauses = split_compound_query(query, language="zh")
    assert len(clauses) == 2
|
||
|
||
def test_japanese_freestanding_conjunction_splits(self):
    """A Japanese query joined by the standalone そして splits in two."""
    query = "犬について教えてそして猫についても教えて"
    clauses = split_compound_query(query, language="ja")
    assert len(clauses) == 2
|
||
|
||
def test_japanese_enclitic_particle_does_not_split(self):
    """The noun-attached particle と must not be used as a split point.

    Splitting on it would break up noun phrases such as '犬と猫', so the
    sentence below is expected to come back unsplit.
    """
    outcome = split_compound_query("犬と猫が好きです", language="ja")
    assert outcome == []
|
||
|
||
def test_korean_freestanding_conjunction_splits(self):
    """A Korean query joined by the standalone 그리고 splits in two."""
    query = "개에 대해 알려주세요 그리고 고양이에 대해서도 알려주세요"
    clauses = split_compound_query(query, language="ko")
    assert len(clauses) == 2
|
||
|
||
def test_korean_postpositional_particle_does_not_split(self):
    """The postpositional particle 와 must not be used as a split point.

    It attaches to the preceding noun, so the sentence below is expected
    to come back unsplit.
    """
    outcome = split_compound_query("개와 고양이를 좋아해요", language="ko")
    assert outcome == []
|
||
|
||
# ── Unsupported languages with enclitic conjunctions ───────────────────
|
||
@pytest.mark.parametrize("language", ["ar", "he", "th", "km", "lo"])
def test_enclitic_languages_return_empty(self, language):
    """Intentionally unsupported language codes must yield no split.

    Arabic and Hebrew attach their conjunction (و / ו) to the following
    word as a prefix, which a regex cannot split safely without a
    morphological tokenizer. Thai, Khmer and Lao have no inter-word
    whitespace, and their conjunctions overlap syllable boundaries. For
    all of these codes the expected result is [] rather than a wrong
    split, even though the query text contains an English ' and '.
    """
    query = "some long query and another long query"
    outcome = split_compound_query(query, language=language)
    assert outcome == []
|