Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

View File

@@ -0,0 +1,458 @@
"""
Knowledge Extraction Evaluations
Tests the quality of knowledge extraction from conversation summaries.
Ensures the extraction prompt correctly handles:
1. Assistant self-references (should NOT be extracted)
2. Stale temporal snapshots (should NOT be extracted)
3. Common knowledge (should NOT be extracted)
4. Novel knowledge (SHOULD be extracted)
5. Proper reframing (requests → knowledge, not interaction descriptions)
Run:
EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh knowledge
EVAL_JUDGE_MODEL=gpt-oss:20b ./scripts/run_evals.sh knowledge
"""
import json
import re
from dataclasses import dataclass, field
from typing import List, Optional
import pytest
from conftest import requires_judge_llm
from helpers import (
MockConfig,
JUDGE_MODEL,
JUDGE_BASE_URL,
call_judge_llm,
JudgeVerdict,
)
from jarvis.memory.graph_ops import extract_graph_memories
# =============================================================================
# Test Data
# =============================================================================
@dataclass
class ExtractionTestCase:
"""A conversation summary with expected extraction outcomes."""
summary: str
date_utc: Optional[str] = None
# Facts that SHOULD appear (checked by keyword matching)
should_extract_keywords: List[str] = field(default_factory=list)
# Patterns that should NOT appear in any extracted fact
should_not_extract_patterns: List[str] = field(default_factory=list)
# Minimum number of facts expected
min_facts: int = 0
# Maximum number of facts expected (0 = no upper limit)
max_facts: int = 0
# ── Cases where extraction should produce good novel knowledge ──────────
GOOD_EXTRACTION_CASES = [
pytest.param(
ExtractionTestCase(
summary=(
"The user asked about boxing gyms in Hackney. I found that "
"Trenches Boxing Club offers evening classes on weekdays from "
"6-8pm, priced at 15 pounds per session. The user mentioned "
"they've been living in Hackney for 2 years."
),
date_utc="2026-04-10",
should_extract_keywords=["Trenches", "Hackney", "boxing"],
min_facts=2,
),
id="Novel knowledge: local business details and user location",
),
pytest.param(
ExtractionTestCase(
summary=(
"The user follows an 1800 kcal daily meal plan with a target "
"of 150g protein. They mentioned preferring air-fried chicken "
"breast with a soy-oyster-teriyaki glaze — a recipe they've "
"been perfecting over the past month."
),
date_utc="2026-04-08",
should_extract_keywords=["1800", "protein"],
min_facts=2,
),
id="Novel knowledge: user diet plan and preferred recipe",
),
pytest.param(
ExtractionTestCase(
summary=(
"The user is planning to move from London to Tbilisi, Georgia "
"in June 2026. They've already secured a flat in Vera district "
"for 800 USD per month. They work remotely as a software "
"engineer for a UK-based startup called Equals Money."
),
date_utc="2026-04-12",
should_extract_keywords=["Tbilisi", "Equals Money"],
min_facts=3,
),
id="Novel knowledge: relocation plans and employment",
),
pytest.param(
ExtractionTestCase(
summary=(
"Kullanıcı Kadıköy'deki Çiya Sofrası restoranını sordu. "
"Öğle yemeği menüsü 250 TL civarında, özellikle kuzu tandır "
"ve enginar yemeği çok beğeniliyormuş. Kullanıcı İstanbul'da "
"Kadıköy semtinde yaşıyor ve haftada 3 kez dışarıda yemek yiyor."
),
date_utc="2026-04-11",
should_extract_keywords=["Çiya", "Kadıköy"],
min_facts=2,
),
id="Novel knowledge: non-English summary (Turkish)",
),
]
# ── Cases where specific patterns should NOT appear ─────────────────────
BAD_PATTERN_CASES = [
pytest.param(
ExtractionTestCase(
summary=(
"The user asked about healthy meal options. I recommended "
"adding more vegetables and lean protein to their diet. I "
"suggested trying grilled salmon with quinoa and steamed "
"broccoli. The user thanked me for the suggestions."
),
date_utc="2026-04-10",
should_not_extract_patterns=[
r"(?i)assistant",
r"(?i)recommend",
r"(?i)suggest",
r"(?i)I told",
r"(?i)I advised",
],
max_facts=1, # Possibly 0 — there's no novel knowledge here
),
id="Reject: assistant self-references (recommendations are not knowledge)",
),
pytest.param(
ExtractionTestCase(
summary=(
"The user asked for the current weather. The temperature in "
"London is 20 degrees Celsius with partly cloudy skies. Wind "
"is coming from the southwest at 15 km/h. It's currently "
"3:45 PM on a Sunday afternoon."
),
date_utc="2026-04-06",
should_not_extract_patterns=[
r"(?i)current(ly)? (weather|temperature|time|date)",
r"(?i)20.*(degree|celsius|°)",
r"(?i)3:45",
r"(?i)wind.*southwest",
r"(?i)partly cloudy",
],
max_facts=1, # Maybe "user is in London" but nothing else
),
id="Reject: stale temporal snapshots (weather, time of day)",
),
]
# ── Cases testing proper reframing ──────────────────────────────────────
REFRAMING_CASES = [
pytest.param(
ExtractionTestCase(
summary=(
"The user asked about vegetarian restaurants near Covent "
"Garden. I found Mildreds, which serves plant-based dishes "
"and has 4.5 stars on Google. The user mentioned they've been "
"vegetarian for 3 years. They also asked about Dishoom but "
"decided against it since it's not fully vegetarian."
),
date_utc="2026-04-10",
should_extract_keywords=["Mildreds", "vegetarian"],
should_not_extract_patterns=[
r"(?i)user asked about",
r"(?i)user enquired",
r"(?i)user wanted to know",
],
min_facts=2,
),
id="Reframing: requests become knowledge, not interaction descriptions",
),
pytest.param(
ExtractionTestCase(
summary=(
"The user mentioned they started a new job at Equals Money "
"on March 1st 2026 as a senior backend engineer. They're "
"working with Python and FastAPI. Their team lead is someone "
"called Hakan."
),
date_utc="2026-04-05",
should_extract_keywords=["Equals Money", "March"],
should_not_extract_patterns=[
r"(?i)user mentioned",
r"(?i)user said",
r"(?i)user told",
],
min_facts=2,
),
id="Reframing: life events framed as facts with temporal context",
),
]
# =============================================================================
# Helpers
# =============================================================================
def _run_extraction(case: ExtractionTestCase, config: MockConfig) -> list[str]:
"""Run extract_graph_memories with the given case and config.
Returns a flat list of fact strings. The extractor now returns
``(branch_id, fact)`` tuples; these evals predate branch tagging
and only care about the fact text. The new branch-routing evals
live in ``test_graph_branch_routing.py``.
"""
tagged = extract_graph_memories(
summary=case.summary,
ollama_base_url=config.ollama_base_url,
ollama_chat_model=config.ollama_chat_model,
timeout_sec=config.llm_chat_timeout_sec,
thinking=False,
date_utc=case.date_utc,
)
return [fact for _branch, fact in tagged]
def _fact_matches_keyword(facts: list[str], keyword: str) -> bool:
"""Check if any extracted fact contains the keyword (case-insensitive)."""
keyword_lower = keyword.lower()
return any(keyword_lower in fact.lower() for fact in facts)
def _any_fact_matches_pattern(facts: list[str], pattern: str) -> bool:
"""Check if any extracted fact matches a regex pattern."""
compiled = re.compile(pattern)
return any(compiled.search(fact) for fact in facts)
def _judge_extraction_quality(
summary: str,
facts: list[str],
date_utc: Optional[str] = None,
) -> JudgeVerdict:
"""Use LLM-as-judge to evaluate overall extraction quality."""
system_prompt = (
"You are evaluating knowledge extraction quality. Given a conversation "
"summary and the facts extracted from it, score the extraction.\n\n"
"Score on these criteria (0-10 each):\n"
"1. NOVELTY: Are the extracted facts genuinely novel (not common "
"knowledge the model already knows)?\n"
"2. SELF_CONTAINED: Is each fact a self-contained statement useful "
"without the original conversation?\n"
"3. NO_ASSISTANT_VOICE: Are facts written as knowledge, NOT as "
"descriptions of what the assistant said/recommended?\n"
"4. NO_STALE_DATA: Are transient details (weather, time of day) "
"correctly excluded?\n"
"5. COMPLETENESS: Were important novel facts captured?\n\n"
"Output your evaluation in this EXACT format:\n"
"NOVELTY: [0-10]\n"
"SELF_CONTAINED: [0-10]\n"
"NO_ASSISTANT_VOICE: [0-10]\n"
"NO_STALE_DATA: [0-10]\n"
"COMPLETENESS: [0-10]\n"
"OVERALL: [PASS/FAIL]\n"
"REASONING: [One paragraph explaining your verdict]"
)
facts_text = "\n".join(f"- {f}" for f in facts) if facts else "(no facts extracted)"
date_info = f"\nDate context: {date_utc}" if date_utc else ""
user_prompt = (
f"Conversation summary:{date_info}\n{summary}\n\n"
f"Extracted facts:\n{facts_text}"
)
response = call_judge_llm(system_prompt, user_prompt, timeout_sec=120.0)
if not response:
return JudgeVerdict(
is_passed=False,
score=0.0,
reasoning="Judge LLM unavailable",
)
# Parse structured response
from helpers import _parse_judge_response
return _parse_judge_response(response)
# =============================================================================
# Test Classes
# =============================================================================
class TestKnowledgeExtractionQuality:
"""Tests that good novel knowledge is correctly extracted."""
@requires_judge_llm
@pytest.mark.parametrize("case", GOOD_EXTRACTION_CASES)
def test_extracts_novel_knowledge(self, mock_config, case: ExtractionTestCase):
"""Verify that novel knowledge is extracted with expected keywords."""
facts = _run_extraction(case, mock_config)
# Should extract at least min_facts
assert len(facts) >= case.min_facts, (
f"Expected at least {case.min_facts} facts, got {len(facts)}: {facts}"
)
# Check that expected keywords appear in at least one fact
for keyword in case.should_extract_keywords:
assert _fact_matches_keyword(facts, keyword), (
f"Expected keyword '{keyword}' in extracted facts: {facts}"
)
# Print for report visibility
print(f"Extracted {len(facts)} facts:")
for f in facts:
print(f" - {f}")
class TestKnowledgeExtractionRejection:
"""Tests that noise, stale data, and common knowledge are rejected."""
@requires_judge_llm
@pytest.mark.parametrize("case", BAD_PATTERN_CASES)
def test_rejects_bad_patterns(self, mock_config, case: ExtractionTestCase):
"""Verify that known bad patterns are not present in extracted facts."""
facts = _run_extraction(case, mock_config)
# Check max_facts constraint
if case.max_facts > 0:
assert len(facts) <= case.max_facts, (
f"Expected at most {case.max_facts} facts, got {len(facts)}: {facts}"
)
# Check that bad patterns don't appear
for pattern in case.should_not_extract_patterns:
assert not _any_fact_matches_pattern(facts, pattern), (
f"Bad pattern '{pattern}' found in extracted facts: {facts}"
)
# Print for report visibility
print(f"Extracted {len(facts)} facts (expected <= {case.max_facts}):")
for f in facts:
print(f" - {f}")
class TestKnowledgeExtractionReframing:
"""Tests that interaction descriptions are reframed as knowledge."""
@requires_judge_llm
@pytest.mark.parametrize("case", REFRAMING_CASES)
def test_reframes_as_knowledge(self, mock_config, case: ExtractionTestCase):
"""Verify facts are written as knowledge, not interaction descriptions."""
facts = _run_extraction(case, mock_config)
# Should extract enough facts
assert len(facts) >= case.min_facts, (
f"Expected at least {case.min_facts} facts, got {len(facts)}: {facts}"
)
# Should contain expected keywords
for keyword in case.should_extract_keywords:
assert _fact_matches_keyword(facts, keyword), (
f"Expected keyword '{keyword}' in extracted facts: {facts}"
)
# Should NOT contain interaction-description patterns
for pattern in case.should_not_extract_patterns:
assert not _any_fact_matches_pattern(facts, pattern), (
f"Interaction-description pattern '{pattern}' found in: {facts}"
)
# Print for report visibility
print(f"Extracted {len(facts)} facts:")
for f in facts:
print(f" - {f}")
class TestKnowledgeExtractionJudge:
"""LLM-as-judge evaluations of overall extraction quality."""
@requires_judge_llm
@pytest.mark.parametrize("case", GOOD_EXTRACTION_CASES)
def test_judge_extraction_quality(self, mock_config, case: ExtractionTestCase):
"""Judge evaluates overall extraction quality on good summaries."""
facts = _run_extraction(case, mock_config)
verdict = _judge_extraction_quality(
summary=case.summary,
facts=facts,
date_utc=case.date_utc,
)
# Print for report
print(f"Score: {verdict.score:.2f}")
print(f"Reasoning: {verdict.reasoning}")
for criterion, score in verdict.criteria_scores.items():
print(f" {criterion}: {score:.1f}")
# Accept if the judge passes OR the score is above 0.7 —
# the judge can be overly strict on completeness for minor details
assert verdict.is_passed or verdict.score >= 0.7, (
f"Judge failed extraction quality (score={verdict.score:.2f}): "
f"{verdict.reasoning}\nFacts: {facts}"
)
@requires_judge_llm
def test_judge_empty_conversation_returns_empty(self, mock_config):
"""Empty or trivial conversations should produce no facts."""
case = ExtractionTestCase(
summary="The user said hello and I greeted them back. Nothing else was discussed.",
date_utc="2026-04-12",
)
facts = _run_extraction(case, mock_config)
assert len(facts) == 0, (
f"Expected 0 facts from trivial conversation, got {len(facts)}: {facts}"
)
print("Correctly extracted 0 facts from trivial conversation")
@requires_judge_llm
def test_judge_mixed_summary_filters_noise(self, mock_config):
"""A summary with both novel knowledge and noise should only extract the novel parts."""
case = ExtractionTestCase(
summary=(
"The user asked about the weather — it's 22 degrees and sunny "
"in Hackney right now. I recommended they go for a walk in "
"Victoria Park. The user mentioned they just adopted a cat "
"named Miso from Battersea Dogs & Cats Home last week. They "
"also asked what time it is."
),
date_utc="2026-04-10",
)
facts = _run_extraction(case, mock_config)
# Should capture the cat adoption (novel, specific)
assert _fact_matches_keyword(facts, "Miso") or _fact_matches_keyword(facts, "cat"), (
f"Should have extracted cat adoption fact: {facts}"
)
# Should NOT capture weather snapshot
assert not _any_fact_matches_pattern(facts, r"(?i)22.*(degree|celsius|°)"), (
f"Should not have extracted weather snapshot: {facts}"
)
# Should NOT capture assistant recommendation
assert not _any_fact_matches_pattern(facts, r"(?i)(recommend|suggest).*walk"), (
f"Should not have extracted assistant recommendation: {facts}"
)
print(f"Extracted {len(facts)} facts from mixed summary:")
for f in facts:
print(f" - {f}")