Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/evals/test_merge_consolidation.py
+++ b/evals/test_merge_consolidation.py
@@ -0,0 +1,645 @@
+"""
+Merge consolidation evaluations.
+
+`merge_node_data` advertises three behaviours beyond the supersession
+case covered in `test_recency_superseding.py`:
+
+  1. Near-duplicate dedupe — different wordings of the same fact
+     collapse to one canonical line.
+  2. Pattern consolidation — repeated activities fold into patterns
+     ("ate sushi Mon", "ate sushi Thu" → "regularly eats sushi").
+  3. Independence — an unrelated new fact must NOT silently drop an
+     existing unrelated line. (The most dangerous failure mode: a
+     hallucinated contradiction would erase real data.)
+
+Plus a check that the batched signature works end-to-end with a real
+picker model (the round-1 batching has unit tests but no eval).
+
+Run:
+    EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh merge_consolidation
+"""
+
+from dataclasses import dataclass
+from typing import List
+
+import pytest
+
+from conftest import requires_judge_llm
+from helpers import JUDGE_MODEL, JUDGE_BASE_URL
+
+from jarvis.memory.graph_ops import merge_node_data
+
+
+# =============================================================================
+# Test data
+# =============================================================================
+
+@dataclass
+class DedupeCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Substrings that must remain in the merged data.
+    must_contain: List[str]
+    # Substrings that should NOT appear (forbidden duplicates).
+    must_not_contain: List[str]
+    # Maximum line count after merge — caps near-dup explosion.
+    max_lines: int
+
+
+DEDUPE_CASES = [
+    pytest.param(
+        DedupeCase(
+            description="Same fact, different wording",
+            existing_data="The user lives in London.",
+            new_facts=["The user is based in London."],
+            must_contain=["london"],
+            must_not_contain=[],
+            max_lines=1,
+        ),
+        id="lives-in vs based-in London",
+    ),
+    pytest.param(
+        DedupeCase(
+            description="Job title rephrased",
+            existing_data="The user works as a software engineer.",
+            new_facts=["The user's job is software engineering."],
+            must_contain=["software"],
+            must_not_contain=[],
+            max_lines=1,
+        ),
+        id="job rephrased",
+    ),
+]
+
+
+@dataclass
+class PatternCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Keyword that should appear in the consolidated pattern line
+    # (e.g. "regularly", "often", "frequently", "every").
+    pattern_keywords: List[str]
+    # Subject the pattern is about (must remain).
+    subject_keyword: str
+    # Cap on lines — pattern consolidation should shrink, not grow.
+    max_lines: int
+
+
+@dataclass
+class PatternBoundaryCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Substrings that MUST still be present in the merged output —
+    # these are distinct one-off events that should not collapse
+    # into a fake pattern.
+    must_keep_distinct: List[str]
+
+
+PATTERN_BOUNDARY_CASES = [
+    pytest.param(
+        PatternBoundaryCase(
+            description="One-off events should not be patternised",
+            existing_data=(
+                "[2025-08-12] The user attended a wedding in Edinburgh.\n"
+                "[2025-11-03] The user gave a conference talk in Berlin."
+            ),
+            new_facts=["[2026-04-25] The user moved house to Manchester."],
+            # Three distinct, unrelated one-time events. Folding them
+            # into "regularly travels" or similar would invent a
+            # pattern that isn't there.
+            must_keep_distinct=["edinburgh", "berlin", "manchester"],
+        ),
+        id="distinct one-off events",
+        # Originally xfail(strict=False) — captured a regression where
+        # `gemma4:e2b` clustered date-prefixed entries with a new
+        # dated entry and silently dropped the older two. The case
+        # now passes 3/3 reps on the small model after the
+        # META-NARRATIVE rule landed. The causal link is not
+        # verified, but the eval is the right place to catch a
+        # regression so the marker is dropped and the case stands as
+        # a regular PASS.
+    ),
+]
+
+
+PATTERN_CASES = [
+    pytest.param(
+        PatternCase(
+            description="Repeated sushi meals",
+            existing_data=(
+                "[2026-04-07] The user ate sushi for lunch.\n"
+                "[2026-04-14] The user had sushi again.\n"
+                "[2026-04-21] The user ordered sushi for dinner."
+            ),
+            new_facts=["[2026-04-25] The user ate sushi today."],
+            pattern_keywords=["regularly", "often", "frequently", "weekly", "every", "tend"],
+            subject_keyword="sushi",
+            max_lines=3,
+        ),
+        id="sushi pattern",
+    ),
+]
+
+
+@dataclass
+class IndependenceCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Substrings that MUST survive — the new fact is unrelated and
+    # has no business dropping these.
+    must_keep: List[str]
+    # Substrings the new fact should add.
+    must_add: List[str]
+
+
+INDEPENDENCE_CASES = [
+    pytest.param(
+        IndependenceCase(
+            description="Vegetarian + unrelated meal mention",
+            # Note: "user is vegetarian" + "user ate a Big Mac" is a
+            # genuine contradiction the picker may legitimately
+            # surface or pick a side on. Use clearly-orthogonal facts
+            # instead so the eval is unambiguous.
+            existing_data=(
+                "The user has a peanut allergy.\n"
+                "The user prefers tea over coffee."
+            ),
+            new_facts=["The user enjoys hiking on weekends."],
+            must_keep=["peanut", "tea"],
+            must_add=["hiking"],
+        ),
+        id="independent facts coexist",
+    ),
+    pytest.param(
+        IndependenceCase(
+            description="Job + new hobby",
+            existing_data="The user works as a software engineer at Equals Money.",
+            new_facts=["The user is learning to play the guitar."],
+            must_keep=["software", "equals money"],
+            must_add=["guitar"],
+        ),
+        id="job survives unrelated hobby fact",
+    ),
+]
+
+
+@dataclass
+class MetaNarrativeCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Substrings that must NOT remain after the merge — these are
+    # extractor-artefact lines from earlier prompt versions
+    # (assistant-narrating, capability denials) and have no place
+    # in a knowledge node.
+    must_drop_substrings: List[str]
+    # Substrings that MUST remain — genuine knowledge or directives
+    # that should not get over-pruned by the meta-narrative rule.
+    must_keep_substrings: List[str]
+
+
+META_NARRATIVE_CASES = [
+    pytest.param(
+        MetaNarrativeCase(
+            description=(
+                "Capability-denial line in Directives is dropped, "
+                "real directive survives"
+            ),
+            # Mirrors the real bug report: a self-denial leaked into
+            # Directives via an older extractor prompt and persisted
+            # because no rewrite-on-write rule covered meta-narrative.
+            # Consolidate-all (empty new_facts) should now scrub it
+            # without touching the genuine British English directive.
+            existing_data=(
+                "Always reply in British English.\n"
+                "The assistant is unable to navigate to a web page."
+            ),
+            new_facts=[],
+            must_drop_substrings=[
+                "unable to navigate",
+                "the assistant is unable",
+            ],
+            must_keep_substrings=["british english"],
+        ),
+        id="capability denial dropped, directive kept",
+    ),
+    pytest.param(
+        MetaNarrativeCase(
+            description=(
+                "Assistant-narrating WORLD line is dropped during "
+                "self-consolidation"
+            ),
+            # The extractor's BANNED FACT FORMS list catches these at
+            # write-time now, but lines emitted before #291 landed
+            # still sit in nodes. Merge prompt must drop them too.
+            existing_data=(
+                "Possessor (2020) is directed by Brandon Cronenberg.\n"
+                "The assistant suggested grilled salmon for dinner."
+            ),
+            new_facts=[],
+            must_drop_substrings=[
+                "the assistant suggested",
+                "grilled salmon",
+            ],
+            must_keep_substrings=["possessor", "cronenberg"],
+        ),
+        id="assistant-suggested line dropped, lookup survives",
+    ),
+    pytest.param(
+        MetaNarrativeCase(
+            description=(
+                "Polluted node receiving a new fact: meta-narrative "
+                "drops AND the new fact lands"
+            ),
+            # Production path: a diary flush routes one new fact to a
+            # node that already holds an older capability-denial line.
+            # The merge must drop the denial AND incorporate the new
+            # fact — capturing the worst case where the META rule
+            # could steal attention from incorporation tracking.
+            existing_data=(
+                "Always reply in British English.\n"
+                "The assistant is unable to navigate to a web page."
+            ),
+            new_facts=["Keep replies under three sentences."],
+            must_drop_substrings=[
+                "unable to navigate",
+                "the assistant is unable",
+            ],
+            must_keep_substrings=[
+                "british english",
+                "three sentences",
+            ],
+        ),
+        id="polluted node + new fact: drop and incorporate",
+    ),
+    pytest.param(
+        MetaNarrativeCase(
+            description=(
+                "No meta-narrative present — merge must not invent "
+                "drops (over-pruning guard)"
+            ),
+            # Counter-test for over-zealous interpretation of the new
+            # rule. A clean Directives node with two genuine
+            # imperatives must come through self-consolidation
+            # untouched. If this fails the rule is too aggressive.
+            existing_data=(
+                "Always reply in British English.\n"
+                "Keep replies under three sentences."
+            ),
+            new_facts=[],
+            must_drop_substrings=[],
+            must_keep_substrings=["british english", "three sentences"],
+        ),
+        id="genuine directives untouched",
+    ),
+]
+
+
+@dataclass
+class BatchedCase:
+    description: str
+    existing_data: str
+    new_facts: List[str]
+    # Each entry: list of substring alternatives — at least one must
+    # appear in the merged data. Captures "the model phrased it
+    # however it wanted, but the fact survived".
+    expected_signals: List[List[str]]
+
+
+BATCHED_CASES = [
+    pytest.param(
+        BatchedCase(
+            description="Three independent new facts in one call",
+            existing_data="The user lives in London.",
+            new_facts=[
+                "The user has a dog named Biscuit.",
+                "The user prefers oat milk.",
+                "The user is allergic to peanuts.",
+            ],
+            expected_signals=[
+                ["london"],
+                ["biscuit", "dog"],
+                ["oat milk", "oat"],
+                ["peanut"],
+            ],
+        ),
+        id="batched 3 new facts",
+    ),
+]
+
+
+def _line_count(data: str) -> int:
+    return len([l for l in data.split("\n") if l.strip()])
+
+
+# =============================================================================
+# Tests
+# =============================================================================
+
+@pytest.mark.eval
+class TestNearDuplicateDedupe:
+    """Different wordings of the same fact must collapse to one line."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", DEDUPE_CASES)
+    def test_near_duplicates_collapse(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+        line_count = _line_count(merged)
+
+        print(f"\n  📝 dedupe '{case.description}':\n     {merged[:300]}")
+        print(f"     success={result.success} lines={line_count}")
+
+        for kw in case.must_contain:
+            assert kw.lower() in merged_lower, (
+                f"[{case.description}] expected '{kw}' to survive merge.\n{merged}"
+            )
+        for kw in case.must_not_contain:
+            assert kw.lower() not in merged_lower, (
+                f"[{case.description}] forbidden '{kw}' leaked into merge.\n{merged}"
+            )
+        assert line_count <= case.max_lines, (
+            f"[{case.description}] merge produced {line_count} lines, expected ≤ {case.max_lines} "
+            f"(near-duplicates should collapse).\n{merged}"
+        )
+
+
+@pytest.mark.eval
+class TestPatternConsolidation:
+    """Repeated activities should fold into patterns rather than
+    accumulate as a stack of dated entries."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", PATTERN_CASES)
+    def test_repeated_activities_consolidate(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+        line_count = _line_count(merged)
+
+        print(f"\n  📝 pattern '{case.description}':\n     {merged[:300]}")
+        print(f"     success={result.success} lines={line_count}")
+
+        assert case.subject_keyword.lower() in merged_lower, (
+            f"[{case.description}] subject '{case.subject_keyword}' lost from merge.\n{merged}"
+        )
+        has_pattern = any(kw in merged_lower for kw in case.pattern_keywords)
+        assert has_pattern, (
+            f"[{case.description}] expected pattern wording (any of {case.pattern_keywords}) "
+            f"after consolidating repeated activities.\n{merged}"
+        )
+        assert line_count <= case.max_lines, (
+            f"[{case.description}] {line_count} lines remain — repeated activities should "
+            f"have consolidated to ≤ {case.max_lines}.\n{merged}"
+        )
+
+
+@pytest.mark.eval
+class TestPatternBoundary:
+    """Counter-example to `TestPatternConsolidation`: distinct one-off
+    events MUST NOT be folded into a fabricated pattern. Pattern
+    consolidation should fire on repetition, not on coincidence."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", PATTERN_BOUNDARY_CASES)
+    def test_distinct_one_offs_stay_distinct(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+
+        print(f"\n  📝 pattern-boundary '{case.description}':\n     {merged[:300]}")
+        print(f"     success={result.success}")
+
+        for kw in case.must_keep_distinct:
+            assert kw.lower() in merged_lower, (
+                f"[{case.description}] distinct event '{kw}' was folded away — "
+                f"the picker invented a pattern from one-offs.\n{merged}"
+            )
+
+
+@pytest.mark.eval
+class TestIndependenceOfUnrelatedFacts:
+    """An unrelated new fact must NOT drop an existing unrelated line.
+    Silent erasure of real data is the most dangerous failure mode of
+    the rewrite-on-write merge — the hallucination guard catches
+    runaway growth, but only this eval catches runaway shrinkage."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", INDEPENDENCE_CASES)
+    def test_independent_facts_coexist(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+
+        print(f"\n  📝 independence '{case.description}':\n     {merged[:300]}")
+        print(f"     success={result.success}")
+
+        for kw in case.must_keep:
+            assert kw.lower() in merged_lower, (
+                f"[{case.description}] existing fact containing '{kw}' was silently "
+                f"dropped by an unrelated new fact — independence violated.\n{merged}"
+            )
+        for kw in case.must_add:
+            assert kw.lower() in merged_lower, (
+                f"[{case.description}] new fact containing '{kw}' did not land.\n{merged}"
+            )
+
+
+@pytest.mark.eval
+class TestMetaNarrativePruning:
+    """Lines that narrate the assistant's own behaviour, capabilities,
+    or denials are extractor artefacts from earlier prompt versions,
+    not user knowledge. The merge step must drop them during normal
+    rewrite-on-write AND during the consolidate-all sweep. Counterpart
+    to the extractor's BANNED FACT FORMS list — that catches them at
+    write-time, this catches the historical leftovers."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", META_NARRATIVE_CASES)
+    def test_meta_narrative_dropped_real_facts_kept(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+
+        print(f"\n  📝 meta-narrative '{case.description}':\n     {merged[:300]}")
+        print(f"     success={result.success}")
+
+        for kw in case.must_drop_substrings:
+            assert kw.lower() not in merged_lower, (
+                f"[{case.description}] meta-narrative line containing "
+                f"'{kw}' survived the merge — the rule did not fire.\n{merged}"
+            )
+        for kw in case.must_keep_substrings:
+            assert kw.lower() in merged_lower, (
+                f"[{case.description}] genuine fact containing '{kw}' was "
+                f"over-pruned — the rule is too aggressive.\n{merged}"
+            )
+
+        # When new_facts is non-empty the merge must report at least
+        # one incorporation. A regression where the META rule steals
+        # attention from incorporation tracking would surface here as
+        # `incorporated_indices == []` despite the fact landing in
+        # the merged data — exactly the failure mode `_match_key`'s
+        # tolerant punctuation strip was added to prevent.
+        if case.new_facts:
+            assert len(result.incorporated_indices) >= 1, (
+                f"[{case.description}] new fact landed in merged data "
+                f"but incorporated_indices is empty — orchestrator "
+                f"would under-report the flush.\n"
+                f"merged={merged}\nresult={result}"
+            )
+
+
+@pytest.mark.eval
+class TestBatchedMerge:
+    """Multiple new facts in one merge call must all land. Pins the
+    round-1 batched signature against a real picker model."""
+
+    @requires_judge_llm
+    @pytest.mark.parametrize("case", BATCHED_CASES)
+    def test_all_batched_facts_land(self, case, graph_store):
+        case = case.values[0] if hasattr(case, 'values') else case
+
+        node = graph_store.create_node(
+            name="T",
+            description=case.description,
+            data=case.existing_data,
+            parent_id="root",
+        )
+
+        result = merge_node_data(
+            store=graph_store,
+            node_id=node.id,
+            new_facts=case.new_facts,
+            ollama_base_url=JUDGE_BASE_URL,
+            ollama_chat_model=JUDGE_MODEL,
+            timeout_sec=30.0,
+        )
+
+        merged = graph_store.get_node(node.id).data
+        merged_lower = merged.lower()
+        line_count = _line_count(merged)
+
+        print(f"\n  📝 batched '{case.description}':\n     {merged[:400]}")
+        print(f"     success={result.success} lines={line_count} "
+              f"incorporated={result.incorporated_indices}")
+
+        for alternatives in case.expected_signals:
+            assert any(alt.lower() in merged_lower for alt in alternatives), (
+                f"[{case.description}] none of {alternatives} survived the batched merge.\n"
+                f"{merged}"
+            )
+
+        # Lower bound on lines: at minimum the merged data should
+        # contain a line per surviving fact. Upper bound is enforced
+        # by the in-product hallucination guard, not this eval — a
+        # cap here is brittle since legitimate consolidation could
+        # cross it on a paraphrase the model picks differently.
+        assert line_count >= len(case.expected_signals) - 1, (
+            f"[{case.description}] {line_count} lines suspiciously low for "
+            f"{len(case.expected_signals)} signals — facts may have been silently merged.\n"
+            f"{merged}"
+        )
+
+        # Pin the round-1 batched reporting fix: every input fact
+        # whose substance survived should be tracked in
+        # `incorporated_indices`. An empty list when facts clearly
+        # landed means the orchestrator under-reports flushes — the
+        # exact regression `_match_key`'s tolerant punctuation strip
+        # was added to prevent. Allow strict equality OR coverage of
+        # all input indices, since the picker may legitimately
+        # consolidate two new facts into one line.
+        assert len(result.incorporated_indices) >= 1, (
+            f"[{case.description}] incorporated_indices is empty despite facts landing — "
+            f"reporting drift back. {result.incorporated_indices}"
+        )