Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
227 lines
9.1 KiB
Python
227 lines
9.1 KiB
Python
"""
|
|
Knowledge Graph Branch Routing Evaluations
|
|
|
|
Validates the extractor's per-fact branch classification (USER / DIRECTIVES
|
|
/ WORLD). The warm profile injected into every reply is the User +
|
|
Directives branches concatenated — misclassification here either leaks
|
|
directives out of the warm blob (the assistant forgets a standing rule)
|
|
or dumps world trivia into the blob (every reply carries irrelevant
|
|
background). Both are nasty, silent regressions, so the classification
|
|
accuracy needs its own eval.
|
|
|
|
Cases are deliberately adversarial around the swap-test boundary:
|
|
- User statements about themselves that a naive classifier might read
|
|
as a directive ("I prefer short answers" → USER, not DIRECTIVES —
|
|
it's a preference about the user, not an instruction).
|
|
- Imperatives to the assistant that a naive classifier might read as
|
|
user preferences ("always reply briefly" → DIRECTIVES, not USER).
|
|
- World facts where the user is also the subject of the request but
|
|
the fact itself is external attribution.
|
|
|
|
Run:
|
|
EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh graph_branch_routing
|
|
EVAL_JUDGE_MODEL=gpt-oss:20b ./scripts/run_evals.sh graph_branch_routing
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Optional, Tuple, Union
|
|
|
|
import pytest
|
|
|
|
from conftest import requires_judge_llm
|
|
from helpers import MockConfig
|
|
|
|
from jarvis.memory.graph import BRANCH_DIRECTIVES, BRANCH_USER, BRANCH_WORLD
|
|
from jarvis.memory.graph_ops import extract_graph_memories
|
|
|
|
|
|
# =============================================================================
|
|
# Test Data
|
|
# =============================================================================
|
|
|
|
|
|
@dataclass
class RoutingCase:
    """A summary and the branches we expect each keyword-identified fact
    to be routed into.

    One instance describes a single parametrized eval case: the text fed to
    the extractor plus the (keyword, branch) pairs the extracted facts must
    satisfy.
    """

    # Conversation summary handed to the extractor as its ``summary`` input.
    summary: str
    # Optional date string forwarded to the extractor as ``date_utc``;
    # ``None`` when the case does not pin a date.
    date_utc: Optional[str] = None
    # Each expectation is ``(keyword_or_alternatives, expected_branch_id)``.
    # If the first item is a tuple, any one of its strings satisfies the
    # match — use this when the model may paraphrase. Matching is
    # case-insensitive substring on fact text.
    expectations: List[Tuple[Union[str, Tuple[str, ...]], str]] = field(
        default_factory=list,
    )
|
|
|
|
|
|
# Parametrization table for the routing eval. Each entry is a ``pytest.param``
# wrapping one ``RoutingCase``; the ``id`` is the human-readable case name
# shown in the test report.
ROUTING_CASES = [
    # ── Clear USER facts ────────────────────────────────────────────────
    pytest.param(
        RoutingCase(
            summary=(
                "The user mentioned they live in Brighton and have two "
                "cats, Miso and Kuma. They've been vegetarian for five "
                "years and work as a backend engineer."
            ),
            date_utc="2026-04-20",
            expectations=[
                ("Brighton", BRANCH_USER),
                ("Miso", BRANCH_USER),
                ("vegetarian", BRANCH_USER),
                ("engineer", BRANCH_USER),
            ],
        ),
        id="USER: identity, location, pets, diet, job",
    ),
    # ── Clear DIRECTIVES ─────────────────────────────────────────────────
    pytest.param(
        RoutingCase(
            summary=(
                "The user told me to always answer in British English, "
                "to keep replies under three sentences, and to never "
                "apologise or say sorry. They also asked me to address "
                "them as Boss going forward."
            ),
            date_utc="2026-04-20",
            expectations=[
                ("British English", BRANCH_DIRECTIVES),
                ("three sentences", BRANCH_DIRECTIVES),
                ("apologise", BRANCH_DIRECTIVES),
                ("Boss", BRANCH_DIRECTIVES),
            ],
        ),
        id="DIRECTIVES: tone, length, forbidden phrases, address form",
    ),
    # ── Clear WORLD facts ────────────────────────────────────────────────
    pytest.param(
        RoutingCase(
            summary=(
                "The user asked about Trenches Boxing Club. I found that "
                "it's on Mare Street in Hackney, offers evening classes "
                "on weekdays from 6-8pm at 15 pounds per session. I also "
                "confirmed that Possessor is a 2020 sci-fi horror film "
                "directed by Brandon Cronenberg."
            ),
            date_utc="2026-04-20",
            expectations=[
                ("Trenches", BRANCH_WORLD),
                ("Mare Street", BRANCH_WORLD),
                ("Possessor", BRANCH_WORLD),
                ("Cronenberg", BRANCH_WORLD),
            ],
        ),
        id="WORLD: local business details, film attribution",
    ),
    # ── Adversarial: preference vs directive ────────────────────────────
    pytest.param(
        RoutingCase(
            summary=(
                "The user said they prefer Thai food over Italian when "
                "eating out. They also told me to keep all food "
                "recommendations under five options, because longer "
                "lists overwhelm them."
            ),
            date_utc="2026-04-20",
            expectations=[
                # Preference about the user's own tastes → USER
                ("Thai", BRANCH_USER),
                # Instruction about assistant behaviour → DIRECTIVES
                ("five options", BRANCH_DIRECTIVES),
            ],
        ),
        id="Adversarial: food preference (USER) vs list-length rule (DIRECTIVES)",
    ),
    # ── Adversarial: mixed summary ──────────────────────────────────────
    pytest.param(
        RoutingCase(
            summary=(
                "The user has been vegetarian for three years and lives "
                "in central London. They told me to stop suggesting fish "
                "dishes when they ask about food — they consider "
                "pescatarian suggestions unhelpful. I confirmed that "
                "Mildreds in Covent Garden is a fully vegetarian "
                "restaurant with a Michelin Bib Gourmand rating."
            ),
            date_utc="2026-04-20",
            expectations=[
                ("Mildreds", BRANCH_WORLD),
                ("vegetarian for three years", BRANCH_USER),
                # Model phrases the directive either as "pescatarian
                # suggestions unhelpful" or "fish dishes" — accept
                # either; the classification is what matters.
                (("pescatarian", "fish"), BRANCH_DIRECTIVES),
            ],
        ),
        id="Adversarial: all three branches in one summary",
    ),
]
|
|
|
|
|
|
# =============================================================================
|
|
# Helpers
|
|
# =============================================================================
|
|
|
|
|
|
def _run_extraction(case: RoutingCase, config: MockConfig) -> list[tuple[str, str]]:
    """Run the graph-memory extractor on a single routing case.

    Args:
        case: The routing case whose ``summary`` and ``date_utc`` are fed to
            the extractor.
        config: Test configuration supplying the Ollama base URL, chat model
            name and chat timeout.

    Returns:
        Whatever ``extract_graph_memories`` returns. Callers in this module
        unpack each item as a ``(branch_id, fact_text)`` pair.
    """
    return extract_graph_memories(
        summary=case.summary,
        ollama_base_url=config.ollama_base_url,
        ollama_chat_model=config.ollama_chat_model,
        timeout_sec=config.llm_chat_timeout_sec,
        # Thinking mode is explicitly disabled for this eval.
        thinking=False,
        date_utc=case.date_utc,
    )
|
|
|
|
|
|
def _find_branch_for_keyword(
|
|
facts: list[tuple[str, str]],
|
|
keyword: Union[str, Tuple[str, ...]],
|
|
) -> Optional[str]:
|
|
"""Return the branch_id of the first fact whose text contains keyword
|
|
(case-insensitive), or None if no fact matches. If keyword is a tuple,
|
|
any of its strings satisfies the match."""
|
|
alternatives = (keyword,) if isinstance(keyword, str) else keyword
|
|
lowered = [k.lower() for k in alternatives]
|
|
for branch_id, fact in facts:
|
|
fact_lower = fact.lower()
|
|
if any(k in fact_lower for k in lowered):
|
|
return branch_id
|
|
return None
|
|
|
|
|
|
# =============================================================================
|
|
# Tests
|
|
# =============================================================================
|
|
|
|
|
|
class TestGraphBranchRouting:
    """Branch classification accuracy for the knowledge extractor."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", ROUTING_CASES)
    def test_routes_facts_to_expected_branches(
        self, mock_config, case: RoutingCase,
    ):
        """Extract facts from the case summary and check every expectation.

        For each ``(keyword, expected_branch)`` pair the test requires that
        some extracted fact contains the keyword and that the first such
        fact carries the expected branch id.
        """
        facts = _run_extraction(case, mock_config)

        # Print for report visibility
        print(f"Extracted {len(facts)} facts:")
        for branch_id, fact in facts:
            print(f" [{branch_id}] {fact}")

        # Every expectation must be satisfied
        for keyword, expected_branch in case.expectations:
            actual_branch = _find_branch_for_keyword(facts, keyword)
            # A missing fact and a misrouted fact are reported separately.
            assert actual_branch is not None, (
                f"Expected a fact containing {keyword!r} (for branch "
                f"{expected_branch!r}), but no extracted fact matched. "
                f"Facts: {facts}"
            )
            assert actual_branch == expected_branch, (
                f"Keyword {keyword!r}: expected branch "
                f"{expected_branch!r}, got {actual_branch!r}. Facts: "
                f"{facts}"
            )
|