Files
javis_bot/evals/test_knowledge_extraction.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

459 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Knowledge Extraction Evaluations
Tests the quality of knowledge extraction from conversation summaries.
Ensures the extraction prompt correctly handles:
1. Assistant self-references (should NOT be extracted)
2. Stale temporal snapshots (should NOT be extracted)
3. Common knowledge (should NOT be extracted)
4. Novel knowledge (SHOULD be extracted)
5. Proper reframing (requests → knowledge, not interaction descriptions)
Run:
EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh knowledge
EVAL_JUDGE_MODEL=gpt-oss:20b ./scripts/run_evals.sh knowledge
"""
import json
import re
from dataclasses import dataclass, field
from typing import List, Optional
import pytest
from conftest import requires_judge_llm
from helpers import (
MockConfig,
JUDGE_MODEL,
JUDGE_BASE_URL,
call_judge_llm,
JudgeVerdict,
)
from jarvis.memory.graph_ops import extract_graph_memories
# =============================================================================
# Test Data
# =============================================================================
@dataclass
class ExtractionTestCase:
    """A conversation summary together with the expectations checked on its extraction.

    The test classes in this module run the extractor on ``summary`` and
    then use the remaining fields to decide which assertions to apply to
    the resulting list of fact strings.
    """

    # Conversation summary text handed to the extractor.
    summary: str
    # Date string forwarded to the extractor as ``date_utc``; None when not supplied.
    date_utc: Optional[str] = None
    # Keywords that SHOULD each appear in at least one fact (keyword matching).
    # Builtin ``list[str]`` is used to match the helper signatures in this file.
    should_extract_keywords: list[str] = field(default_factory=list)
    # Regex patterns that should NOT match any extracted fact.
    should_not_extract_patterns: list[str] = field(default_factory=list)
    # Minimum number of facts expected.
    min_facts: int = 0
    # Maximum number of facts expected (0 = no upper limit).
    max_facts: int = 0
# ── Cases where extraction should produce good novel knowledge ──────────
GOOD_EXTRACTION_CASES = [
    # A local business plus a personal detail about where the user lives.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user asked about boxing gyms in Hackney. I found that "
                "Trenches Boxing Club offers evening classes on weekdays from "
                "6-8pm, priced at 15 pounds per session. The user mentioned "
                "they've been living in Hackney for 2 years."
            ),
            date_utc="2026-04-10",
            should_extract_keywords=["Trenches", "Hackney", "boxing"],
            min_facts=2,
        ),
        id="Novel knowledge: local business details and user location",
    ),
    # Personal diet targets and a favourite recipe.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user follows an 1800 kcal daily meal plan with a target "
                "of 150g protein. They mentioned preferring air-fried chicken "
                "breast with a soy-oyster-teriyaki glaze — a recipe they've "
                "been perfecting over the past month."
            ),
            date_utc="2026-04-08",
            should_extract_keywords=["1800", "protein"],
            min_facts=2,
        ),
        id="Novel knowledge: user diet plan and preferred recipe",
    ),
    # Relocation plan, housing and employer.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user is planning to move from London to Tbilisi, Georgia "
                "in June 2026. They've already secured a flat in Vera district "
                "for 800 USD per month. They work remotely as a software "
                "engineer for a UK-based startup called Equals Money."
            ),
            date_utc="2026-04-12",
            should_extract_keywords=["Tbilisi", "Equals Money"],
            min_facts=3,
        ),
        id="Novel knowledge: relocation plans and employment",
    ),
    # Same kind of content, but the summary is written in Turkish.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "Kullanıcı Kadıköy'deki Çiya Sofrası restoranını sordu. "
                "Öğle yemeği menüsü 250 TL civarında, özellikle kuzu tandır "
                "ve enginar yemeği çok beğeniliyormuş. Kullanıcı İstanbul'da "
                "Kadıköy semtinde yaşıyor ve haftada 3 kez dışarıda yemek yiyor."
            ),
            date_utc="2026-04-11",
            should_extract_keywords=["Çiya", "Kadıköy"],
            min_facts=2,
        ),
        id="Novel knowledge: non-English summary (Turkish)",
    ),
]
# ── Cases where specific patterns should NOT appear ─────────────────────
BAD_PATTERN_CASES = [
    # Everything in this summary is assistant advice; none of it is knowledge.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user asked about healthy meal options. I recommended "
                "adding more vegetables and lean protein to their diet. I "
                "suggested trying grilled salmon with quinoa and steamed "
                "broccoli. The user thanked me for the suggestions."
            ),
            date_utc="2026-04-10",
            should_not_extract_patterns=[
                r"(?i)assistant",
                r"(?i)recommend",
                r"(?i)suggest",
                r"(?i)I told",
                r"(?i)I advised",
            ],
            # A single fact is tolerated, though zero is the likely outcome
            # because the summary holds no novel knowledge.
            max_facts=1,
        ),
        id="Reject: assistant self-references (recommendations are not knowledge)",
    ),
    # Weather and clock readings go stale immediately.
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user asked for the current weather. The temperature in "
                "London is 20 degrees Celsius with partly cloudy skies. Wind "
                "is coming from the southwest at 15 km/h. It's currently "
                "3:45 PM on a Sunday afternoon."
            ),
            date_utc="2026-04-06",
            should_not_extract_patterns=[
                r"(?i)current(ly)? (weather|temperature|time|date)",
                r"(?i)20.*(degree|celsius|°)",
                r"(?i)3:45",
                r"(?i)wind.*southwest",
                r"(?i)partly cloudy",
            ],
            # At most something like "user is in London"; nothing else.
            max_facts=1,
        ),
        id="Reject: stale temporal snapshots (weather, time of day)",
    ),
]
# ── Cases testing proper reframing ──────────────────────────────────────
REFRAMING_CASES = [
    # A request about restaurants should surface as facts about the
    # restaurant and the user, not as "user asked about ...".
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user asked about vegetarian restaurants near Covent "
                "Garden. I found Mildreds, which serves plant-based dishes "
                "and has 4.5 stars on Google. The user mentioned they've been "
                "vegetarian for 3 years. They also asked about Dishoom but "
                "decided against it since it's not fully vegetarian."
            ),
            date_utc="2026-04-10",
            should_extract_keywords=["Mildreds", "vegetarian"],
            should_not_extract_patterns=[
                r"(?i)user asked about",
                r"(?i)user enquired",
                r"(?i)user wanted to know",
            ],
            min_facts=2,
        ),
        id="Reframing: requests become knowledge, not interaction descriptions",
    ),
    # A life event should be stated as a fact, not as "user mentioned ...".
    pytest.param(
        ExtractionTestCase(
            summary=(
                "The user mentioned they started a new job at Equals Money "
                "on March 1st 2026 as a senior backend engineer. They're "
                "working with Python and FastAPI. Their team lead is someone "
                "called Hakan."
            ),
            date_utc="2026-04-05",
            should_extract_keywords=["Equals Money", "March"],
            should_not_extract_patterns=[
                r"(?i)user mentioned",
                r"(?i)user said",
                r"(?i)user told",
            ],
            min_facts=2,
        ),
        id="Reframing: life events framed as facts with temporal context",
    ),
]
# =============================================================================
# Helpers
# =============================================================================
def _run_extraction(case: ExtractionTestCase, config: MockConfig) -> list[str]:
    """Extract memories for ``case`` and return only the fact text.

    ``extract_graph_memories`` yields ``(branch_id, fact)`` pairs. The evals
    in this module were written before branch tagging existed and inspect
    nothing but the fact text, so the branch id is discarded here. Branch
    routing is evaluated separately in ``test_graph_branch_routing.py``.

    Args:
        case: Supplies the summary and optional ``date_utc``.
        config: Supplies the Ollama base URL, chat model and chat timeout.

    Returns:
        The fact strings, in the order the extractor produced them.
    """
    branch_fact_pairs = extract_graph_memories(
        summary=case.summary,
        ollama_base_url=config.ollama_base_url,
        ollama_chat_model=config.ollama_chat_model,
        timeout_sec=config.llm_chat_timeout_sec,
        thinking=False,
        date_utc=case.date_utc,
    )

    fact_texts = []
    for _branch_id, fact_text in branch_fact_pairs:
        fact_texts.append(fact_text)
    return fact_texts
def _fact_matches_keyword(facts: list[str], keyword: str) -> bool:
    """Return True if any extracted fact contains ``keyword``, ignoring case.

    Both sides are NFC-normalized and case-folded before the substring test.
    Plain ``str.lower()`` misses two situations that matter for the
    non-English cases in this suite: text that arrives in decomposed form
    (e.g. ``C`` + combining cedilla instead of ``Ç``) and characters whose
    case-insensitive form is more than a lowercase mapping (e.g. ``ß`` vs
    ``SS``).

    Args:
        facts: Extracted fact strings to search.
        keyword: Substring that must occur in at least one fact.

    Returns:
        True when at least one fact contains the keyword; False otherwise,
        including when ``facts`` is empty.
    """
    # Function-scope import so the helper does not depend on the module's
    # top-level import block.
    import unicodedata

    def _canonical(text: str) -> str:
        # Normalize, fold case, then normalize again because case folding
        # can itself produce a non-normalized sequence.
        folded = unicodedata.normalize("NFC", text).casefold()
        return unicodedata.normalize("NFC", folded)

    needle = _canonical(keyword)
    return any(needle in _canonical(fact) for fact in facts)
def _any_fact_matches_pattern(facts: list[str], pattern: str) -> bool:
    """Report whether ``pattern`` matches anywhere inside at least one fact.

    The pattern is compiled before any fact is examined, so an invalid
    regex raises ``re.error`` even when ``facts`` is empty. Matching uses
    ``search``, so the pattern may hit at any position in a fact.
    """
    find_in = re.compile(pattern).search
    for fact in facts:
        if find_in(fact) is not None:
            return True
    return False
def _judge_extraction_quality(
    summary: str,
    facts: list[str],
    date_utc: Optional[str] = None,
) -> JudgeVerdict:
    """Have the judge LLM grade the facts extracted from ``summary``.

    Args:
        summary: The conversation summary the facts were extracted from.
        facts: Extracted fact strings; an empty list is rendered to the
            judge as "(no facts extracted)".
        date_utc: Optional date string added to the prompt as date context.

    Returns:
        The verdict parsed from the judge's reply, or a failing verdict with
        score 0.0 when the judge returns nothing.
    """
    system_prompt = (
        "You are evaluating knowledge extraction quality. Given a conversation "
        "summary and the facts extracted from it, score the extraction.\n\n"
        "Score on these criteria (0-10 each):\n"
        "1. NOVELTY: Are the extracted facts genuinely novel (not common "
        "knowledge the model already knows)?\n"
        "2. SELF_CONTAINED: Is each fact a self-contained statement useful "
        "without the original conversation?\n"
        "3. NO_ASSISTANT_VOICE: Are facts written as knowledge, NOT as "
        "descriptions of what the assistant said/recommended?\n"
        "4. NO_STALE_DATA: Are transient details (weather, time of day) "
        "correctly excluded?\n"
        "5. COMPLETENESS: Were important novel facts captured?\n\n"
        "Output your evaluation in this EXACT format:\n"
        "NOVELTY: [0-10]\n"
        "SELF_CONTAINED: [0-10]\n"
        "NO_ASSISTANT_VOICE: [0-10]\n"
        "NO_STALE_DATA: [0-10]\n"
        "COMPLETENESS: [0-10]\n"
        "OVERALL: [PASS/FAIL]\n"
        "REASONING: [One paragraph explaining your verdict]"
    )

    # Render the facts as a bullet list, or a placeholder when there are none.
    if facts:
        facts_text = "\n".join([f"- {fact}" for fact in facts])
    else:
        facts_text = "(no facts extracted)"

    # The date line is only included when a date was supplied.
    date_info = ""
    if date_utc:
        date_info = f"\nDate context: {date_utc}"

    user_prompt = (
        f"Conversation summary:{date_info}\n{summary}\n\n"
        f"Extracted facts:\n{facts_text}"
    )

    response = call_judge_llm(system_prompt, user_prompt, timeout_sec=120.0)
    if response:
        # Imported only once there is a reply to parse.
        from helpers import _parse_judge_response

        return _parse_judge_response(response)

    # An empty reply is reported as a failed verdict.
    return JudgeVerdict(
        is_passed=False,
        score=0.0,
        reasoning="Judge LLM unavailable",
    )
# =============================================================================
# Test Classes
# =============================================================================
class TestKnowledgeExtractionQuality:
    """Checks that summaries in GOOD_EXTRACTION_CASES yield the expected facts."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", GOOD_EXTRACTION_CASES)
    def test_extracts_novel_knowledge(self, mock_config, case: ExtractionTestCase):
        """Extraction meets the minimum fact count and mentions every keyword."""
        facts = _run_extraction(case, mock_config)
        fact_count = len(facts)

        # Lower bound on the number of facts returned.
        assert fact_count >= case.min_facts, (
            f"Expected at least {case.min_facts} facts, got {fact_count}: {facts}"
        )

        # Each expected keyword has to show up in at least one fact.
        for expected in case.should_extract_keywords:
            assert _fact_matches_keyword(facts, expected), (
                f"Expected keyword '{expected}' in extracted facts: {facts}"
            )

        # Echo the facts so they appear in the eval report.
        print(f"Extracted {fact_count} facts:")
        for fact in facts:
            print(f" - {fact}")
class TestKnowledgeExtractionRejection:
    """Checks that facts extracted from BAD_PATTERN_CASES avoid known-bad patterns."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", BAD_PATTERN_CASES)
    def test_rejects_bad_patterns(self, mock_config, case: ExtractionTestCase):
        """Verify that known bad patterns are not present in extracted facts."""
        facts = _run_extraction(case, mock_config)

        # ``max_facts == 0`` means "no upper limit", so the bound is only
        # enforced when a positive limit was configured.
        has_limit = case.max_facts > 0
        if has_limit:
            assert len(facts) <= case.max_facts, (
                f"Expected at most {case.max_facts} facts, got {len(facts)}: {facts}"
            )

        # None of the forbidden regexes may match any fact.
        for pattern in case.should_not_extract_patterns:
            assert not _any_fact_matches_pattern(facts, pattern), (
                f"Bad pattern '{pattern}' found in extracted facts: {facts}"
            )

        # Echo the facts for the eval report. Without a limit the line says
        # so, instead of the misleading "expected <= 0".
        limit_note = f"expected <= {case.max_facts}" if has_limit else "no upper limit"
        print(f"Extracted {len(facts)} facts ({limit_note}):")
        for fact in facts:
            print(f" - {fact}")
class TestKnowledgeExtractionReframing:
    """Checks that extracted facts read as knowledge rather than as a chat log."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", REFRAMING_CASES)
    def test_reframes_as_knowledge(self, mock_config, case: ExtractionTestCase):
        """Facts carry the expected keywords and no interaction-style phrasing."""
        facts = _run_extraction(case, mock_config)
        fact_count = len(facts)

        # Enough facts must have been extracted.
        assert fact_count >= case.min_facts, (
            f"Expected at least {case.min_facts} facts, got {fact_count}: {facts}"
        )

        # Every expected keyword appears somewhere in the facts.
        for expected in case.should_extract_keywords:
            assert _fact_matches_keyword(facts, expected), (
                f"Expected keyword '{expected}' in extracted facts: {facts}"
            )

        # Phrases such as "user asked about" describe the interaction and
        # must not survive into any fact.
        for forbidden in case.should_not_extract_patterns:
            assert not _any_fact_matches_pattern(facts, forbidden), (
                f"Interaction-description pattern '{forbidden}' found in: {facts}"
            )

        # Echo the facts so they appear in the eval report.
        print(f"Extracted {fact_count} facts:")
        for fact in facts:
            print(f" - {fact}")
class TestKnowledgeExtractionJudge:
    """LLM-as-judge evaluation of extraction quality, plus two noise-filter checks."""

    @requires_judge_llm
    @pytest.mark.parametrize("case", GOOD_EXTRACTION_CASES)
    def test_judge_extraction_quality(self, mock_config, case: ExtractionTestCase):
        """Judge evaluates overall extraction quality on good summaries."""
        facts = _run_extraction(case, mock_config)
        verdict = _judge_extraction_quality(
            summary=case.summary,
            facts=facts,
            date_utc=case.date_utc,
        )

        # Printed before the assertion so the scores are reported even when
        # the test fails.
        print(f"Score: {verdict.score:.2f}")
        print(f"Reasoning: {verdict.reasoning}")
        for criterion, score in verdict.criteria_scores.items():
            print(f" {criterion}: {score:.1f}")

        # Accept if the judge passes OR the score is at least 0.7 —
        # the judge can be overly strict on completeness for minor details.
        assert verdict.is_passed or verdict.score >= 0.7, (
            f"Judge failed extraction quality (score={verdict.score:.2f}): "
            f"{verdict.reasoning}\nFacts: {facts}"
        )

    @requires_judge_llm
    def test_judge_empty_conversation_returns_empty(self, mock_config):
        """Empty or trivial conversations should produce no facts."""
        case = ExtractionTestCase(
            summary="The user said hello and I greeted them back. Nothing else was discussed.",
            date_utc="2026-04-12",
        )
        facts = _run_extraction(case, mock_config)

        assert len(facts) == 0, (
            f"Expected 0 facts from trivial conversation, got {len(facts)}: {facts}"
        )
        print("Correctly extracted 0 facts from trivial conversation")

    @requires_judge_llm
    def test_judge_mixed_summary_filters_noise(self, mock_config):
        """A summary with both novel knowledge and noise should only extract the novel parts."""
        case = ExtractionTestCase(
            summary=(
                "The user asked about the weather — it's 22 degrees and sunny "
                "in Hackney right now. I recommended they go for a walk in "
                "Victoria Park. The user mentioned they just adopted a cat "
                "named Miso from Battersea Dogs & Cats Home last week. They "
                "also asked what time it is."
            ),
            date_utc="2026-04-10",
        )
        facts = _run_extraction(case, mock_config)

        # Should capture the cat adoption (novel, specific). Whole-word
        # matching is required: a plain substring test for "cat" would also
        # pass on unrelated words such as "location".
        assert _any_fact_matches_pattern(facts, r"(?i)\b(miso|cats?)\b"), (
            f"Should have extracted cat adoption fact: {facts}"
        )

        # Should NOT capture the weather snapshot.
        assert not _any_fact_matches_pattern(facts, r"(?i)22.*(degree|celsius|°)"), (
            f"Should not have extracted weather snapshot: {facts}"
        )

        # Should NOT capture the assistant's recommendation.
        assert not _any_fact_matches_pattern(facts, r"(?i)(recommend|suggest).*walk"), (
            f"Should not have extracted assistant recommendation: {facts}"
        )

        print(f"Extracted {len(facts)} facts from mixed summary:")
        for fact in facts:
            print(f" - {fact}")