Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_memory_digest_identity.py
+++ b/evals/test_memory_digest_identity.py
@@ -0,0 +1,261 @@
+"""
+Memory Digest — Identity-Query Fact Surfacing (Live)
+
+Guards that the memory digest distiller (``enrichment.digest_memory_for_query``)
+surfaces user-stated facts about the user (location, interests, ongoing
+plans, biography) when the current query asks who the user is or what the
+assistant knows about them, rather than surfacing past Q&A topics the user
+merely asked about.
+
+Motivating field incident:
+  The user asked "what do you know about me?". The diary contained a
+  user-stated fact ("goes boxing near E3 2WS") alongside a past Q&A where
+  the user asked for the area of a rectangle. The digest surfaced the
+  rectangle question, which is not a fact about the user at all — leading
+  the reply model to miss the actual identity signal entirely.
+
+General principle (encoded in the digest prompt): for identity queries,
+user-stated facts dominate over past Q&A topics, and multiple such facts
+should be surfaced when present.
+
+Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_memory_digest_identity.py -v
+"""
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_BASE_URL, JUDGE_MODEL
+
+
+@pytest.mark.eval
+@requires_judge_llm
+class TestMemoryDigestSurfacesIdentityFacts:
+    """Live tests that the digest prefers user-stated facts for identity queries."""
+
+    def _digest(self, query: str, diary_entries: list[str]) -> str:
+        """Distil ``diary_entries`` for ``query`` via the judge model (no graph)."""
+        # Imported lazily, inside the helper rather than at module level.
+        from jarvis.reply.enrichment import digest_memory_for_query
+        distiller_kwargs = {
+            "query": query, "diary_entries": diary_entries, "graph_parts": [],
+            "ollama_base_url": JUDGE_BASE_URL, "ollama_chat_model": JUDGE_MODEL,
+            "timeout_sec": 60.0,
+        }
+        return digest_memory_for_query(**distiller_kwargs)
+
+    def test_identity_query_surfaces_user_stated_fact_over_past_qa(self):
+        """Digest-layer reproduction of the motivating field incident.
+
+        One user-stated fact (boxing near E3 2WS) sits among four past
+        Q&A entries. The filler entries push the raw block past
+        ``_DIGEST_MIN_CHARS`` (400) so the distil LLM actually runs;
+        below that the raw text passes through and this is a no-op.
+        """
+        entries = [
+            "[2026-04-10] The user said they go boxing near E3 2WS.",
+            "[2026-04-12] The user asked for the area of a rectangle 7 by 9; "
+            "the assistant said 63.",
+            "[2026-04-11] The user asked what the capital of Peru is; the "
+            "assistant said Lima. They also asked about the population and "
+            "the assistant said it is roughly 10 million in the metro area.",
+            "[2026-04-09] The user asked the assistant to convert 200 USD to "
+            "GBP; the assistant said approximately 158 GBP at the current rate.",
+            "[2026-04-08] The user asked the assistant for the boiling point "
+            "of water at sea level; the assistant said 100 degrees Celsius.",
+        ]
+        digest = self._digest("what do you know about me?", entries)
+        print(f"\n  Digest: {digest!r}")
+
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for an "
+                f"identity query despite user-stated facts being present."
+            )
+
+        text = digest.lower()
+        # Either the activity or the "E3" part of the stated location is a hit.
+        fact_markers = ("boxing", "e3")
+        # Past Q&A topics that must stay out of an identity digest: the
+        # rectangle question is the field-incident guard, while currency
+        # and boiling point are numeric/factoid Q&As with no
+        # user-preference character (the failure class the rule targets).
+        past_qa_markers = (
+            "rectangle",
+            "7 by 9",
+            "area of",
+            "usd",
+            "gbp",
+            "boiling",
+        )
+
+        fact_found = any(marker in text for marker in fact_markers)
+        leaked_topics = [marker for marker in past_qa_markers if marker in text]
+        assert fact_found, (
+            f"Digest did not surface the user-stated boxing/location fact "
+            f"for an identity query. Got: {digest!r}"
+        )
+        assert not leaked_topics, (
+            f"Digest surfaced past Q&A topics as if they were facts "
+            f"about the user. Got: {digest!r}"
+        )
+
+    def test_identity_query_surfaces_multiple_user_facts_when_present(self):
+        """Several user-stated facts should be combined in the digest, not
+        reduced to one; at least two of the three must appear."""
+        entries = [
+            "[2026-04-10] The user said they live in East London.",
+            "[2026-04-11] The user said they are vegetarian.",
+            "[2026-04-12] The user said they are learning Japanese.",
+            "[2026-04-13] The user asked about the capital of Peru; the "
+            "assistant said Lima.",
+            "[2026-04-09] The user asked the assistant to convert 200 USD to "
+            "GBP; the assistant said approximately 158 GBP at the current rate.",
+            "[2026-04-08] The user asked the boiling point of water at sea "
+            "level; the assistant said 100 degrees Celsius.",
+        ]
+        digest = self._digest("tell me about myself", entries)
+        print(f"\n  Digest: {digest!r}")
+
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for an "
+                f"identity query despite multiple user-stated facts."
+            )
+
+        text = digest.lower()
+        # Collect which of the three stated facts made it into the digest;
+        # the Peru Q&A is deliberately not part of the leak check below.
+        stated_facts = ("east london", "vegetarian", "japanese")
+        surfaced = [fact for fact in stated_facts if fact in text]
+        # Factoid Q&A keywords that must not appear in an identity digest.
+        leaked = [kw for kw in ("usd", "gbp", "boiling") if kw in text]
+
+        assert len(surfaced) >= 2, (
+            f"Digest surfaced fewer than 2 of the 3 user-stated facts for "
+            f"an identity query. Got: {digest!r}"
+        )
+        assert not leaked, (
+            f"Digest leaked a past Q&A topic into an identity-query "
+            f"digest. Got: {digest!r}"
+        )
+
+    def test_identity_query_with_only_past_qa_returns_none_or_no_false_facts(self):
+        """Regression guard: with NO user-stated facts in the diary, the
+        digest must not invent a user fact from past Q&A topics."""
+        diary = [
+            "[2026-04-12] The user asked for the area of a rectangle 7 by 9; "
+            "the assistant said 63.",
+            "[2026-04-13] The user asked about the capital of Peru; the "
+            "assistant said Lima.",
+            "[2026-04-11] The user asked the assistant to convert 200 USD to "
+            "GBP; the assistant said approximately 158 GBP at the current rate.",
+            "[2026-04-10] The user asked the boiling point of water at sea "
+            "level; the assistant said 100 degrees Celsius.",
+            "[2026-04-09] The user asked for the capital of Australia; the "
+            "assistant said Canberra.",
+        ]
+        digest = self._digest("what do you know about me?", diary)
+        print(f"\n  Digest: {digest!r}")
+
+        # An empty digest is an acceptable outcome here; normalise it so a
+        # falsy (None/empty) result cannot raise on ``.lower()``.
+        lowered = (digest or "").lower()
+        invented_claims = (
+            "user likes math",
+            "user is interested in math",
+            "user likes geography",
+            "user is interested in peru",
+        )
+        fabricated_user_fact = any(p in lowered for p in invented_claims)
+        assert not fabricated_user_fact, (
+            f"Digest fabricated a user-preference claim from past Q&A "
+            f"topics. Got: {digest!r}"
+        )
+
+    def test_identity_query_does_not_trigger_recommendation_engagement_rule(self):
+        """Cross-rule guard against the recommendation-engagement rule.
+
+        That rule treats past interactions as preference signals for
+        queries like 'what should I watch'. On an IDENTITY query the same
+        film diary must not become facts about the user: past Q&A topics
+        stay out unless the snippet says the user is into that topic."""
+        entries = [
+            "[2026-04-20] The user asked about the movie Titanic; the "
+            "assistant summarised its plot and noted it is a 1997 film "
+            "directed by James Cameron.",
+            "[2026-04-19] The conversation focused on the film Possessor; "
+            "the assistant said it is a 2020 sci-fi horror by Brandon "
+            "Cronenberg.",
+            "[2026-04-10] The user said they live in East London and work "
+            "as a software engineer.",
+        ]
+        digest = self._digest("what do you know about me?", entries)
+        print(f"\n  Digest: {digest!r}")
+
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for an "
+                f"identity query despite user-stated facts present."
+            )
+
+        text = digest.lower()
+        # The only user-stated entry gives a location and an occupation;
+        # either one appearing in the digest counts as surfaced.
+        fact_markers = ("east london", "software engineer", "engineer")
+        assert any(marker in text for marker in fact_markers), (
+            f"Digest did not surface the user-stated location/occupation "
+            f"fact for an identity query. Got: {digest!r}"
+        )
+
+        # The film Q&As must NOT be presented as user facts. The identity
+        # rule's "not a fact unless the snippet says the user is into it"
+        # clause must override the recommendation-engagement rule here,
+        # so none of these taste/preference framings may appear.
+        taste_phrases = (
+            "the user likes",
+            "the user enjoys",
+            "the user is a fan",
+            "the user is into",
+            "taste signal",
+            "already covered",
+        )
+
+        film_framings = [phrase for phrase in taste_phrases if phrase in text]
+        assert not film_framings, (
+            f"Digest applied the recommendation-engagement rule to an "
+            f"identity query: films framed as user taste/preference. "
+            f"Got: {digest!r}"
+        )
+
+    def test_recommendation_query_still_surfaces_engagement_when_user_facts_present(self):
+        """Reverse cross-rule guard: on a recommendation query, the
+        presence of user-stated facts must not hide engagement signals.
+        The identity rule's 'prefer user-stated facts' must leave the
+        recommendation rule's engagement-as-preference behaviour intact."""
+        entries = [
+            "[2026-04-20] The user asked about the movie Titanic; the "
+            "assistant summarised its plot and noted it is a 1997 film "
+            "directed by James Cameron.",
+            "[2026-04-19] The conversation focused on the film Possessor; "
+            "the assistant said it is a 2020 sci-fi horror by Brandon "
+            "Cronenberg.",
+            "[2026-04-10] The user said they live in East London.",
+        ]
+        digest = self._digest("what should I watch tonight?", entries)
+        print(f"\n  Digest: {digest!r}")
+
+        if not digest:
+            pytest.xfail(
+                f"Small judge model {JUDGE_MODEL} returned NONE for a "
+                f"recommendation query despite engagement signals present."
+            )
+
+        text = digest.lower()
+        # Naming either film the user engaged with is enough to pass.
+        film_titles = ("titanic", "possessor")
+        mentioned = [title for title in film_titles if title in text]
+        assert mentioned, (
+            f"Digest suppressed engagement-as-preference signals on a "
+            f"recommendation query, likely because the identity rule "
+            f"dominated. Got: {digest!r}"
+        )