Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
124
evals/test_planner_personalisation.py
Normal file
124
evals/test_planner_personalisation.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
Planner — Personalisation Detection (Live)
|
||||
|
||||
Guards that the task-list planner emits a ``searchMemory`` directive as
|
||||
the first step for queries that implicitly depend on the user's own
|
||||
interests, tastes, or history — even when the user did not use the word
|
||||
"preference" or "history" in the query.
|
||||
|
||||
Motivating field incident (2026-04-24):
|
||||
User asked "Tell me some news that might interest me, Jarvis." The
|
||||
planner emitted ``webSearch query='current news'`` with no
|
||||
``searchMemory`` step, so the engine skipped memory enrichment and the
|
||||
reply was a generic BBC front-page summary with no personalisation.
|
||||
|
||||
The planner's rule 2 already lists "preferences" as a trigger, but
|
||||
gemma4:e2b doesn't pattern-match phrases like "interest me", "suggest
|
||||
something for me", "what should I…" onto that category without concrete
|
||||
examples. This eval asserts the prompt teaches the connection — adding
|
||||
examples that name the exact linguistic shape of a personalisation
|
||||
request.
|
||||
|
||||
Run: EVAL_JUDGE_MODEL=gemma4:e2b pytest evals/test_planner_personalisation.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from conftest import requires_judge_llm
|
||||
from helpers import JUDGE_BASE_URL, JUDGE_MODEL
|
||||
|
||||
|
||||
def _cfg():
    """Build the minimal config object handed to ``plan_query`` as ``cfg``.

    The Ollama endpoint and chat model are taken from the eval helpers'
    judge settings. The planner, tool-router and intent-judge model
    overrides are left as empty strings (presumably so the planner falls
    back to the chat model — confirm against ``jarvis.reply.planner``).

    Returns:
        A ``SimpleNamespace`` exposing the settings as attributes.
    """
    from types import SimpleNamespace

    settings = {
        "ollama_base_url": JUDGE_BASE_URL,
        "ollama_chat_model": JUDGE_MODEL,
        "planner_model": "",
        "tool_router_model": "",
        "intent_judge_model": "",
        "planner_enabled": True,
        "planner_timeout_sec": 20.0,
    }
    return SimpleNamespace(**settings)
|
||||
|
||||
|
||||
_TOOL_CATALOG = [
|
||||
("webSearch", "Search the web for current facts and events."),
|
||||
("getWeather", "Current weather and forecast for a location."),
|
||||
("stop", "End the turn and reply to the user."),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.eval
@requires_judge_llm
class TestPlannerEmitsSearchMemoryForPersonalisedQueries:
    """Live regression guard for the "interest me" personalisation pattern.

    Positive cases check that the planner puts a ``searchMemory`` step first
    for queries that implicitly depend on the user's own tastes. Negative
    cases check that plain general-knowledge queries do not request memory.
    """

    # Queries that depend on the user's interests without ever saying
    # "preference" or "history".
    _PERSONALISED_QUERIES = [
        "tell me some news that might interest me",
        "suggest something I'd enjoy watching tonight",
        "what should I cook for dinner",
        "recommend a book I'd like",
    ]

    # Queries answerable from general knowledge alone.
    _GENERAL_KNOWLEDGE_QUERIES = [
        "what is the capital of France",
        "who is Britney Spears",
        "what's 2 plus 2",
    ]

    @staticmethod
    def _plan_and_report(query):
        """Plan ``query`` with the eval config and echo the result to stdout.

        Returns whatever ``plan_query`` returns for an empty dialogue
        context and the module's small tool catalog.
        """
        from jarvis.reply.planner import plan_query

        steps = plan_query(
            cfg=_cfg(),
            query=query,
            dialogue_context="",
            tools=_TOOL_CATALOG,
        )
        print(f"\n Query: {query!r}")
        print(f" Plan: {steps}")
        return steps

    @pytest.mark.parametrize(
        "query",
        _PERSONALISED_QUERIES,
        ids=lambda text: text[:40],
    )
    def test_personalised_query_plans_memory_lookup_first(self, query):
        """A personalised query must yield a plan that starts with searchMemory."""
        from jarvis.reply.planner import is_search_memory_step
        from jarvis.reply.planner import plan_requires_memory

        steps = self._plan_and_report(query)

        assert steps, (
            f"Planner returned an empty plan for {query!r} — expected a "
            "multi-step plan starting with a searchMemory directive."
        )
        assert plan_requires_memory(steps), (
            "Planner did not request memory for personalised query "
            f"{query!r}. Plan: {steps}. The user's own interests are "
            "exactly what rule 2 of the planner prompt lists as a "
            "trigger for searchMemory."
        )
        # Position matters, not just presence: the lookup has to come first.
        assert is_search_memory_step(steps[0]), (
            "searchMemory must be the FIRST step so memory enrichment "
            f"runs before any tool call. Plan: {steps}"
        )

    @pytest.mark.parametrize(
        "query",
        _GENERAL_KNOWLEDGE_QUERIES,
        ids=lambda text: text[:40],
    )
    def test_general_knowledge_query_does_not_request_memory(self, query):
        """General-knowledge queries must not produce a searchMemory directive.

        Each unnecessary searchMemory costs an extra memory-enrichment LLM
        call downstream, so the planner should leave it out here.
        """
        from jarvis.reply.planner import plan_requires_memory

        steps = self._plan_and_report(query)

        assert steps, f"Planner returned empty plan for {query!r}"
        assert not plan_requires_memory(steps), (
            "Planner wrongly requested searchMemory for a general-"
            f"knowledge query {query!r}. That wastes a memory-enrichment "
            f"LLM call on every such turn. Plan: {steps}"
        )
|
||||
Reference in New Issue
Block a user