Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/tests/test_compound_query.py
+++ b/tests/test_compound_query.py
@@ -0,0 +1,239 @@
+"""Tests for compound-query decomposition used by small models."""
+
+import pytest
+
+from jarvis.reply.compound_query import (
+    CJK_MIN_CLAUSE_CHARS,
+    DEFAULT_MIN_CLAUSE_CHARS,
+    MIN_CLAUSE_CHARS,
+    split_compound_query,
+)
+
+
+class TestSplitCompoundQuery:
+    """Behaviour-level tests for the compound-query splitter."""
+
+    # ── English: positive cases ────────────────────────────────────────────
+    def test_multi_part_entity_query_splits(self):
+        parts = split_compound_query(
+            "Who directed Possessor and what other films has that director made?",
+            language="en",
+        )
+        assert len(parts) == 2
+        assert parts[0].startswith("Who directed Possessor")
+        assert parts[1].startswith("what other films")
+
+    def test_and_is_case_insensitive(self):
+        parts = split_compound_query(
+            "Show me the weather AND list my reminders for today",
+            language="en",
+        )
+        assert len(parts) == 2
+
+    def test_extra_whitespace_around_and(self):
+        parts = split_compound_query(
+            "Tell me about Britney Spears  and  what her best song is",
+            language="en",
+        )
+        assert len(parts) == 2
+
+    # ── English: negative cases (idioms / short clauses) ───────────────────
+    def test_rock_and_roll_does_not_split(self):
+        """Short left clause guards against idiomatic 'X and Y' phrases."""
+        assert split_compound_query("Rock and roll history", language="en") == []
+
+    def test_pros_and_cons_does_not_split(self):
+        """Short left clause ('pros' = 4 chars) keeps this as a single query."""
+        assert split_compound_query("pros and cons of remote work", language="en") == []
+
+    def test_short_left_side_does_not_split(self):
+        """Boundary: left clause below MIN_CLAUSE_CHARS prevents split."""
+        short = "x" * (MIN_CLAUSE_CHARS - 1)
+        long = "x" * (MIN_CLAUSE_CHARS + 5)
+        assert split_compound_query(f"{short} and {long}", language="en") == []
+
+    def test_short_right_side_does_not_split(self):
+        short = "x" * (MIN_CLAUSE_CHARS - 1)
+        long = "x" * (MIN_CLAUSE_CHARS + 5)
+        assert split_compound_query(f"{long} and {short}", language="en") == []
+
+    def test_both_at_threshold_splits(self):
+        at_threshold = "x" * MIN_CLAUSE_CHARS
+        parts = split_compound_query(f"{at_threshold} and {at_threshold}", language="en")
+        assert len(parts) == 2
+
+    def test_multiple_ands_only_first_split(self):
+        """First ' and ' wins — keeps the splitter deterministic."""
+        parts = split_compound_query(
+            "Tell me about dogs and cats and also birds please",
+            language="en",
+        )
+        assert len(parts) == 2
+        assert "cats" in parts[1]
+        assert "birds" in parts[1]  # second ' and ' stays in right clause
+
+    def test_empty_and_none_are_safe(self):
+        assert split_compound_query("", language="en") == []
+        assert split_compound_query(None, language="en") == []  # type: ignore[arg-type]
+
+    def test_no_conjunction_returns_empty(self):
+        assert split_compound_query("What is the weather today?", language="en") == []
+
+    def test_bare_and_without_whitespace_does_not_split(self):
+        """We require whitespace boundaries to avoid splitting 'command' etc."""
+        assert split_compound_query("commandline tools are useful", language="en") == []
+
+    # ── Whitespace-separated supported languages ───────────────────────────
+    @pytest.mark.parametrize("language,query", [
+        # Germanic / Romance
+        ("es", "Quién dirigió Possessor y qué otras películas ha hecho"),
+        ("fr", "Qui a réalisé Possessor et quels autres films a-t-il faits"),
+        ("de", "Wer führte Regie bei Possessor und welche anderen Filme hat er"),
+        ("pt", "Quem dirigiu Possessor e quais outros filmes fez o diretor"),
+        ("it", "Chi ha diretto Possessor e quali altri film ha fatto"),
+        ("nl", "Wie regisseerde Possessor en welke andere films maakte hij"),
+        ("sv", "Vem regisserade Possessor och vilka andra filmer har han gjort"),
+        ("no", "Hvem regisserte Possessor og hvilke andre filmer har han laget"),
+        ("da", "Hvem instruerede Possessor og hvilke andre film har han lavet"),
+        ("fi", "Kuka ohjasi Possessorin ja mitä muita elokuvia hän on tehnyt"),
+        # Slavic
+        ("ru", "Кто снял фильм Поссессор и какие другие фильмы он снял"),
+        ("uk", "Хто зняв фільм Поссессор і які інші фільми він зробив"),
+        ("pl", "Kto wyreżyserował Possessor i jakie inne filmy zrobił"),
+        ("cs", "Kdo režíroval Possessor a jaké další filmy natočil"),
+        ("sk", "Kto režíroval Possessor a aké ďalšie filmy natočil"),
+        ("bg", "Кой режисира Поссесор и какви други филми е направил"),
+        ("hr", "Tko je režirao Possessor i koje druge filmove je snimio"),
+        ("sl", "Kdo je režiral Possessor in katere druge filme je posnel"),
+        # Other European
+        ("el", "Ποιος σκηνοθέτησε το Possessor και ποιες άλλες ταινίες έχει κάνει"),
+        ("tr", "Possessor filmini kim yönetti ve başka hangi filmleri yaptı"),
+        ("hu", "Ki rendezte a Possessort és milyen más filmeket csinált"),
+        ("ro", "Cine a regizat Possessor și ce alte filme a făcut"),
+        # Asian whitespace-separated
+        ("vi", "Ai đạo diễn Possessor và đạo diễn đó đã làm phim nào khác"),
+        ("id", "Siapa sutradara Possessor dan film apa lagi yang sudah dibuat"),
+        ("ms", "Siapa pengarah Possessor dan filem apa lagi yang telah dibuat"),
+        ("hi", "पोसेसर का निर्देशन किसने किया और निर्देशक ने और कौन सी फिल्में बनाई"),
+    ])
+    def test_supported_languages_split(self, language, query):
+        parts = split_compound_query(query, language=language)
+        assert len(parts) == 2, f"{language}: expected split, got {parts!r}"
+
+    def test_italian_ed_variant(self):
+        """Italian uses 'ed' before vowels."""
+        parts = split_compound_query(
+            "Parlami della storia ed anche della geografia del paese",
+            language="it",
+        )
+        assert len(parts) == 2
+
+    # ── Non-English: unsupported / unknown languages ───────────────────────
+    def test_unsupported_language_does_not_split(self):
+        """Unknown language codes fall back to no-decomposition rather than
+        mis-applying English rules — graceful degradation per spec."""
+        # Japanese, Korean, Chinese, Russian — not in our conjunction table.
+        # We do NOT want to split on ' and ' for these; the text below is
+        # contrived to contain English 'and' but a Japanese language code.
+        parts = split_compound_query(
+            "some long query and another long query", language="ja",
+        )
+        assert parts == []
+
+    def test_invalid_language_code_defaults_to_english(self):
+        """Single-character or malformed codes normalise to None → English default."""
+        parts = split_compound_query(
+            "Tell me about cats and also about dogs please",
+            language="x",
+        )
+        assert len(parts) == 2
+
+    def test_none_language_defaults_to_english(self):
+        """Non-voice entrypoints pass language=None; we default to English."""
+        parts = split_compound_query(
+            "Who is Britney Spears and what is her best song",
+            language=None,
+        )
+        assert len(parts) == 2
+
+    def test_uppercase_language_code_normalises(self):
+        parts = split_compound_query(
+            "Quién dirigió Possessor y qué otras películas ha hecho",
+            language="ES",
+        )
+        assert len(parts) == 2
+
+    def test_language_with_region_suffix_normalises(self):
+        """en-US style codes should normalise to 'en'."""
+        parts = split_compound_query(
+            "Who is Britney Spears and what is her best song",
+            language="en-US",
+        )
+        assert len(parts) == 2
+
+    # ── Non-English: idioms should not false-positive ──────────────────────
+    def test_french_va_et_vient_short_left_side(self):
+        """'va' is only 2 chars so it won't split — guard by length."""
+        assert split_compound_query("va et vient", language="fr") == []
+
+    # ── CJK (no whitespace around conjunctions) ────────────────────────────
+    def test_chinese_character_level_conjunction_splits(self):
+        """Chinese 和 appears between words without whitespace."""
+        parts = split_compound_query("电影的历史和音乐的发展", language="zh")
+        assert len(parts) == 2
+        assert "电影" in parts[0]
+        assert "音乐" in parts[1]
+
+    def test_chinese_short_clauses_do_not_split(self):
+        """'我和他' — 1-char clauses should not split (below CJK threshold)."""
+        assert split_compound_query("我和他", language="zh") == []
+
+    def test_chinese_threshold_is_lower_than_default(self):
+        """CJK threshold must be smaller than Latin default — Han chars pack
+        more meaning per character."""
+        assert CJK_MIN_CLAUSE_CHARS < DEFAULT_MIN_CLAUSE_CHARS
+
+    def test_chinese_multi_char_conjunction_splits(self):
+        parts = split_compound_query(
+            "请告诉我关于狗的信息并且告诉我关于猫的信息", language="zh",
+        )
+        assert len(parts) == 2
+
+    def test_japanese_freestanding_conjunction_splits(self):
+        parts = split_compound_query(
+            "犬について教えてそして猫についても教えて", language="ja",
+        )
+        assert len(parts) == 2
+
+    def test_japanese_enclitic_particle_does_not_split(self):
+        """と/や are noun-attached particles — we intentionally don't split
+        on them to avoid fragmenting noun phrases like '犬と猫'."""
+        # This phrase contains と between 犬 and 猫; our rules skip と
+        # on purpose, so this should NOT split.
+        assert split_compound_query("犬と猫が好きです", language="ja") == []
+
+    def test_korean_freestanding_conjunction_splits(self):
+        parts = split_compound_query(
+            "개에 대해 알려주세요 그리고 고양이에 대해서도 알려주세요",
+            language="ko",
+        )
+        assert len(parts) == 2
+
+    def test_korean_postpositional_particle_does_not_split(self):
+        """와/과 are postpositional particles — intentionally not split on
+        (same reason as Japanese と/や)."""
+        assert split_compound_query("개와 고양이를 좋아해요", language="ko") == []
+
+    # ── Unsupported languages with enclitic conjunctions ───────────────────
+    @pytest.mark.parametrize("language", ["ar", "he", "th", "km", "lo"])
+    def test_enclitic_languages_return_empty(self, language):
+        """Arabic / Hebrew use an enclitic conjunction prefix (و / ו) that
+        a regex can't safely split without a morphological tokenizer. Thai
+        / Khmer / Lao lack inter-word whitespace and the conjunctions
+        overlap syllable boundaries. We intentionally do not support
+        these yet — the splitter must return [] rather than mis-split.
+        """
+        parts = split_compound_query(
+            "some long query and another long query", language=language,
+        )
+        assert parts == []