Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_memory_digest_preferences.py
+++ b/evals/test_memory_digest_preferences.py
@@ -0,0 +1,129 @@
+"""
+Memory Digest — Preference-Signal Surfacing (Live)
+
+Guards that the memory digest distiller (``enrichment.digest_memory_for_query``)
+surfaces past user engagement in the same domain as a taste/preference signal
+for recommendation-style queries ("what should I watch tonight", "suggest a
+restaurant", etc.), instead of returning NONE just because the snippets never
+contain an explicitly stated preference.
+
+Motivating field incident (2026-04-20):
+  User asked "what should I watch tonight, Jarvis?". The diary contained
+  fresh entries about the user engaging with the films Titanic and Possessor.
+  The digest returned NONE → the reply model formed a generic webSearch for
+  "what should I watch tonight" → the final reply recommended the generic
+  Rotten Tomatoes top-1 result ("Big Mistakes on Netflix"), ignoring the
+  user's actual taste and re-recommending nothing-from-their-history.
+
+The general principle (encoded in the digest prompt): past interactions in
+the query's domain are preference evidence even when no preference was
+stated in plain words. This is domain-agnostic — it should hold for food,
+books, music, news, films, anywhere.
+
+Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_memory_digest_preferences.py -v
+"""
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_BASE_URL, JUDGE_MODEL
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestMemoryDigestSurfacesPreferenceSignals:
+    """Live tests that the digest surfaces engagement-as-preference signals."""
+
+    def _digest(self, query: str, diary_entries: list[str]) -> str:
+        from jarvis.reply.enrichment import digest_memory_for_query
+        return digest_memory_for_query(
+            query=query,
+            diary_entries=diary_entries,
+            graph_parts=[],
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=60.0,
+        )
+
+    def test_watch_recommendation_surfaces_recently_discussed_films(self):
+        """Reproduces the 2026-04-20 incident directly at the digest layer."""
+        diary = [
+            "[2026-04-20] The user asked about the movie Titanic; the assistant "
+            "summarised its plot and noted it is a 1997 film directed by James Cameron.",
+            "[2026-04-19] The conversation focused on the film Possessor; the "
+            "assistant said it is a 2020 sci-fi horror by Brandon Cronenberg.",
+            "[2026-04-15] The user discussed their weekend plans and mentioned "
+            "they had been busy with work projects.",
+            "[2026-04-10] The user asked about the weather in London.",
+        ]
+        digest = self._digest("what should I watch tonight?", diary)
+        print(f"\n  Digest: {digest!r}")
+
+        # Digest must not be empty — past film engagement is a preference signal.
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for a "
+                f"recommendation query despite recent film engagement. "
+                f"This is the exact regression the prompt-level fix targets."
+            )
+
+        lowered = digest.lower()
+        # At least one of the recently-engaged titles must surface.
+        surfaced = [t for t in ("titanic", "possessor") if t in lowered]
+        assert surfaced, (
+            f"Digest did not surface any recently-engaged film as a preference "
+            f"signal. Got: {digest!r}"
+        )
+
+    def test_restaurant_recommendation_surfaces_past_cuisine_interest(self):
+        """Same principle, different domain — past food engagement surfaces
+        for a restaurant recommendation query."""
+        diary = [
+            "[2026-04-18] The user asked about ramen shops near their office "
+            "and the assistant listed three in Shoreditch.",
+            "[2026-04-12] The user discussed cooking a Thai green curry and "
+            "asked how to balance the fish sauce.",
+            "[2026-04-05] The user mentioned they had a dentist appointment.",
+        ]
+        digest = self._digest("suggest a restaurant for dinner tonight", diary)
+        print(f"\n  Digest: {digest!r}")
+
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for a "
+                f"restaurant recommendation despite recent cuisine engagement."
+            )
+
+        lowered = digest.lower()
+        # At least one of the engaged cuisines/items must surface.
+        surfaced = [t for t in ("ramen", "thai", "curry") if t in lowered]
+        assert surfaced, (
+            f"Digest did not surface any recently-engaged cuisine as a "
+            f"preference signal. Got: {digest!r}"
+        )
+
+    def test_unrelated_domain_still_returns_none(self):
+        """Regression guard: the relaxation must not make the digest surface
+        everything. Snippets from a wholly different domain should still NONE
+        out for a recommendation query."""
+        diary = [
+            "[2026-04-18] The user asked about the population of Iceland; the "
+            "assistant said it is roughly 380,000.",
+            "[2026-04-12] The user asked for help debugging a Python import "
+            "cycle in their work project.",
+        ]
+        digest = self._digest("what should I watch tonight?", diary)
+        print(f"\n  Digest: {digest!r}")
+
+        # Neither snippet is in the films/entertainment domain. The digest
+        # should either return empty or at least not falsely invent a film
+        # preference from population statistics or Python debugging.
+        if digest:
+            lowered = digest.lower()
+            fabricated = any(
+                t in lowered for t in ("film", "movie", "watch", "series", "show")
+            )
+            assert not fabricated, (
+                f"Digest fabricated a film preference from unrelated snippets. "
+                f"Got: {digest!r}"
+            )