Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_diary_summariser_hygiene.py
+++ b/evals/test_diary_summariser_hygiene.py
@@ -0,0 +1,240 @@
+"""
+Diary Summariser Hygiene Evaluations (Live)
+
+Verifies the summariser prompt does not preserve assistant failure/deflection
+narration in diary entries. Without this hygiene, the assistant's own past
+failures get retrieved as "conversation history" on future related queries and
+prime the model to repeat the same deflection pattern.
+
+Motivating field incident:
+  A user asked "tell me about Possessor" and the small model deflected. The
+  diary then recorded: "the assistant offered to search the web." On the next
+  day, the same user asked again, and the model imitated the recorded
+  deflection instead of calling webSearch.
+
+Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh test_diary_summariser
+"""
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_BASE_URL, JUDGE_MODEL
+
+
+# Exact deflection phrases the summariser must not preserve verbatim.
+# Language-agnostic by nature (phrases are English because the field-observed
+# summariser output was English, but the *rule* in the prompt is language-agnostic).
+_DEFLECTION_PHRASES = (
+    "could not provide",
+    "lacked",
+    "offered to search",
+    "offer to search",
+    "offered to perform",
+    "unable to provide",
+    "was unable",
+    "did not have",
+    "does not have",
+    "had no specific",
+    "no specific information",
+    "no specific details",
+    "clarified that",
+    "indicated it",
+    "initially could not",
+    "failed to provide",
+    "no information",
+    "internal knowledge",
+)
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestDiarySummariserHygieneLive:
+    """Live tests that the summariser omits assistant failure narration."""
+
+    def _summarise(self, chunks: list[str]) -> tuple[str, str]:
+        from jarvis.memory.conversation import generate_conversation_summary
+        summary, topics = generate_conversation_summary(
+            recent_chunks=chunks,
+            previous_summary=None,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=60.0,
+        )
+        return summary or "", topics or ""
+
+    def test_omits_deflection_narration_for_unknown_entity(self):
+        """A conversation where the assistant deflected on an unknown entity,
+        then eventually found an answer, must summarise only the resolved fact —
+        not the deflection."""
+        chunks = [
+            "User: Tell me about the Possessor movie.",
+            "Assistant: I don't have specific information about Possessor. Would you like me to search the web for it?",
+            "User: Yeah go ahead.",
+            "Assistant: Possessor is a 2020 science-fiction horror film directed by Brandon Cronenberg, starring Andrea Riseborough.",
+        ]
+        summary, _ = self._summarise(chunks)
+        print(f"\n  Summary: {summary}")
+
+        lowered = summary.lower()
+        hits = [p for p in _DEFLECTION_PHRASES if p in lowered]
+        if hits:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} still narrated deflections: {hits}. "
+                f"Summary: {summary}"
+            )
+
+        # Positive requirement: the resolved fact must appear.
+        assert "possessor" in lowered and (
+            "2020" in lowered or "cronenberg" in lowered or "film" in lowered or "movie" in lowered
+        ), f"Resolved fact missing from summary: {summary}"
+
+    def test_omits_deflection_when_topic_never_resolved(self):
+        """When the topic is raised but never resolved, the summary should
+        record the topic/user intent, not the assistant's deflection."""
+        chunks = [
+            "User: What do you know about the book Piranesi?",
+            "Assistant: I don't have specific information about that book.",
+            "User: No worries, let's talk about something else. What's the weather?",
+            "Assistant: It's 15 degrees and cloudy in London.",
+        ]
+        summary, _ = self._summarise(chunks)
+        print(f"\n  Summary: {summary}")
+
+        lowered = summary.lower()
+        # The topic (Piranesi) may appear, but phrases narrating the
+        # assistant's inability must not.
+        hits = [p for p in _DEFLECTION_PHRASES if p in lowered]
+        if hits:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} still narrated deflections: {hits}. "
+                f"Summary: {summary}"
+            )
+
+    def test_unrelated_topics_are_not_welded_into_one_clause(self):
+        """Regression for the Possessor/Jarvis field incident.
+
+        Two distinct topics (the 2020 Cronenberg film Possessor, and the
+        MCU AI character named Jarvis) in the same conversation must not
+        be summarised as a single welded clause like "the movie Possessor
+        and the character Jarvis, identified as the MCU AI...". Downstream
+        enrichment will treat the appositive as describing both referents
+        and mislead the next reply.
+
+        The sentence that mentions Possessor must not also contain MCU-
+        specific tokens (Marvel / Stark / Vision / Avengers), and vice
+        versa.
+        """
+        chunks = [
+            "User: Have you seen the movie Possessor?",
+            "Assistant: I don't have specific information about that film. Would you like me to search the web?",
+            "User: No, unrelated — why are you called Jarvis?",
+            "Assistant: My name is a nod to the MCU character Jarvis, the AI created by Tony Stark and later embodied by Vision.",
+        ]
+        summary, _ = self._summarise(chunks)
+        print(f"\n  Summary: {summary}")
+
+        import re
+        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', summary) if s.strip()]
+
+        # Tight phrase-level tokens — naked substrings like "vision" or "stark"
+        # collide with common English words and would false-positive.
+        mcu_tokens = (
+            "tony stark",
+            "marvel cinematic",
+            "mcu",
+            "embodied by vision",
+            "avengers",
+            "iron man",
+        )
+
+        welded = []
+        for s in sentences:
+            low = s.lower()
+            mentions_possessor = "possessor" in low
+            mentions_mcu_jarvis = any(t in low for t in mcu_tokens)
+            if mentions_possessor and mentions_mcu_jarvis:
+                welded.append(s)
+
+        if welded:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} welded Possessor with MCU-Jarvis "
+                f"details in the same sentence: {welded}. Full summary: {summary}"
+            )
+
+        # Positive requirement: both topics must survive somewhere — the rule
+        # is about separation, not suppression.
+        lowered = summary.lower()
+        assert "possessor" in lowered, f"Possessor topic dropped: {summary}"
+        assert "jarvis" in lowered, f"Jarvis topic dropped: {summary}"
+
+    def test_preserves_legitimate_user_preferences(self):
+        """Regression guard: the hygiene rule must not strip legitimate content
+        (user preferences, decisions, facts)."""
+        chunks = [
+            "User: I prefer Celsius for temperatures.",
+            "Assistant: Got it, I'll use Celsius from now on.",
+            "User: Also, I live in Hackney.",
+            "Assistant: Noted.",
+        ]
+        summary, _ = self._summarise(chunks)
+        print(f"\n  Summary: {summary}")
+
+        lowered = summary.lower()
+        assert "celsius" in lowered, f"Preference dropped from summary: {summary}"
+        assert "hackney" in lowered, f"Location dropped from summary: {summary}"
+
+    def test_omits_deflection_narration_in_turkish(self):
+        """Rule 6 of the summariser prompt promises to apply in every
+        language, with explicit Turkish examples in the prompt body. This
+        eval validates the multilingual claim end-to-end on the live
+        judge model rather than relying on prompt-content assertions
+        alone (which only prove the prompt *says* it works in any
+        language, not that it actually does).
+
+        Turkish was chosen because the prompt has explicit Turkish
+        BAD/GOOD pairs and the user of this codebase speaks Turkish.
+        Spanish would equally validate but would duplicate the same
+        signal.
+        """
+        chunks = [
+            "User: Hackney'de iyi bir restoran biliyor musun?",
+            "Assistant: Hackney'deki güncel restoranlar hakkında özel bir bilgim yok. Web'de aramamı ister misin?",
+            "User: Boşver. Bugün hava nasıl?",
+            "Assistant: Londra'da hava 12 derece ve parçalı bulutlu.",
+        ]
+        summary, _ = self._summarise(chunks)
+        print(f"\n  Summary: {summary}")
+
+        lowered = summary.lower()
+        # Turkish deflection markers: assistant denying having information.
+        # The summariser must not preserve these in Turkish either.
+        turkish_deflections = (
+            "bilgisi yok",          # "has no information"
+            "bilgisi olmadığını",   # "that it has no information"
+            "bilmediğini",          # "that it does not know"
+            "yardımcı olamadı",     # "could not help"
+            "aramamı ister",        # "would you like me to search"
+            "aramayı önerdi",       # "suggested searching"
+        )
+        hits = [p for p in turkish_deflections if p in lowered]
+        if hits:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} narrated Turkish deflections: {hits}. "
+                f"Summary: {summary}"
+            )
+
+        # Positive requirement: at least one of the surviving topics must
+        # be recorded. The user asked about a restaurant AND the weather.
+        # The rule is "drop deflections, keep topics" — the topics must
+        # persist in some recognisable form.
+        topic_present = any(t in lowered for t in (
+            "restoran",       # restaurant
+            "hackney",
+            "hava",           # weather
+            "londra",         # London
+            "12",             # the temperature
+        ))
+        assert topic_present, (
+            f"Turkish summary dropped every topic, not just deflections: {summary}"
+        )
+