Files
javis_bot/tests/test_evaluator.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

534 lines
21 KiB
Python

"""Unit tests for the agentic-loop turn evaluator."""
from unittest.mock import patch
import pytest
from jarvis.reply.evaluator import evaluate_turn, EvaluatorResult, _parse_result
class TestParseResult:
    """Checks how ``_parse_result`` converts raw evaluator output into a result.

    Covers well-formed JSON, JSON wrapped in markdown fences or surrounding
    prose, the fail-open path for unusable payloads, and the optional
    structured ``tool_call`` field.
    """

    def test_parses_terminal_true(self):
        """A well-formed terminal verdict parses with an empty nudge."""
        raw = '{"terminal": true, "nudge": "", "reason": "done"}'
        parsed = _parse_result(raw)
        assert parsed.terminal is True
        assert parsed.nudge == ""

    def test_parses_continue_with_nudge(self):
        """A continue verdict keeps its nudge text and reason."""
        raw = (
            '{"terminal": false, "nudge": "Call openApp with target=YouTube", '
            '"reason": "agent offered instead of acting"}'
        )
        parsed = _parse_result(raw)
        assert parsed.terminal is False
        assert parsed.nudge == "Call openApp with target=YouTube"
        assert "offered" in parsed.reason

    def test_fails_open_to_terminal_on_garbage(self):
        """Non-JSON input yields a terminal result tagged as failed-open."""
        parsed = _parse_result("not JSON at all")
        assert parsed.terminal is True
        assert parsed.reason == "evaluator_failed_open"

    def test_strips_markdown_fences(self):
        """JSON wrapped in a fenced ``json`` code block is still parsed."""
        raw = '```json\n{"terminal": true, "nudge": "", "reason": "ok"}\n```'
        parsed = _parse_result(raw)
        assert parsed.terminal is True

    def test_extracts_embedded_json(self):
        """A JSON object surrounded by prose is located and parsed."""
        raw = 'Here: {"terminal": false, "nudge": "use X", "reason": "r"} done'
        parsed = _parse_result(raw)
        assert parsed.terminal is False
        assert parsed.nudge == "use X"

    def test_missing_terminal_field_fails_open_to_terminal(self):
        """A payload without ``terminal`` is treated as failed-open."""
        parsed = _parse_result('{"nudge": "x", "reason": "y"}')
        assert parsed.terminal is True
        assert parsed.reason == "evaluator_failed_open"

    def test_non_bool_terminal_fails_open_to_terminal(self):
        """A string where a boolean ``terminal`` is expected ends the loop."""
        parsed = _parse_result('{"terminal": "yes", "nudge": "", "reason": ""}')
        assert parsed.terminal is True

    def test_parses_tool_call_field(self):
        """Evaluator can return a structured `tool_call` with name + args
        alongside the free-form nudge. This lets the engine execute the
        tool directly instead of relying on the chat model to obey a
        textual nudge — critical for small models that ignore nudges."""
        raw = (
            '{"terminal": false, "nudge": "call webSearch", '
            '"reason": "prose", "tool_call": {"name": "webSearch", '
            '"arguments": {"search_query": "overview of China"}}}'
        )
        parsed = _parse_result(raw)
        assert parsed.terminal is False
        call = parsed.tool_call
        assert call is not None
        assert call["name"] == "webSearch"
        assert call["arguments"] == {"search_query": "overview of China"}

    def test_tool_call_absent_is_none(self):
        """Without a ``tool_call`` key the parsed attribute is ``None``."""
        raw = '{"terminal": false, "nudge": "do the thing", "reason": "prose"}'
        parsed = _parse_result(raw)
        assert parsed.tool_call is None

    def test_tool_call_missing_name_is_rejected(self):
        """Malformed tool_call (no string name) must be dropped, not crash."""
        raw = (
            '{"terminal": false, "nudge": "x", "reason": "y", '
            '"tool_call": {"arguments": {}}}'
        )
        parsed = _parse_result(raw)
        assert parsed.tool_call is None

    def test_tool_call_non_dict_arguments_normalised_to_empty(self):
        """Non-dict ``arguments`` are replaced by ``{}``; the name is kept."""
        raw = (
            '{"terminal": false, "nudge": "x", "reason": "y", '
            '"tool_call": {"name": "stop", "arguments": "junk"}}'
        )
        parsed = _parse_result(raw)
        call = parsed.tool_call
        assert call is not None
        assert call["name"] == "stop"
        assert call["arguments"] == {}
class TestEvaluateTurn:
    """Tests for ``evaluate_turn`` with the outbound LLM call patched out.

    Every test replaces ``jarvis.reply.evaluator.call_llm_direct`` so no
    model is contacted. Tests that inspect the prompt go through
    ``_capture_llm_kwargs``, which records the keyword arguments the
    evaluator hands to the LLM call.
    """

    # Dotted path of the LLM entry point that every test patches.
    _LLM_TARGET = "jarvis.reply.evaluator.call_llm_direct"
    # Canned reply used when a test only cares about what was sent.
    _TERMINAL_REPLY = '{"terminal": true, "nudge": "", "reason": ""}'

    def _cfg(self, **overrides):
        """Build a minimal stand-in config object.

        Args:
            **overrides: Attribute names and values set on the instance,
                either replacing a default or adding a new attribute
                (e.g. ``evaluator_model``).

        Returns:
            An object exposing the default attributes below plus overrides.
        """

        class _StubConfig:
            ollama_base_url = "http://x"
            ollama_chat_model = "m"
            llm_digest_timeout_sec = 5.0
            llm_thinking_enabled = False

        stub = _StubConfig()
        for attr_name, attr_value in overrides.items():
            setattr(stub, attr_name, attr_value)
        return stub

    def _capture_llm_kwargs(self, *args, **kwargs):
        """Run ``evaluate_turn`` and return the kwargs sent to the LLM call.

        Args:
            *args: Positional arguments forwarded to ``evaluate_turn``.
            **kwargs: Keyword arguments forwarded to ``evaluate_turn``.

        Returns:
            A dict of the keyword arguments the patched
            ``call_llm_direct`` received. It is empty if the call was
            never made, and holds the merged kwargs if it was made more
            than once.
        """
        captured = {}

        def _capture(**llm_kwargs):
            captured.update(llm_kwargs)
            return self._TERMINAL_REPLY

        with patch(self._LLM_TARGET, side_effect=_capture):
            evaluate_turn(*args, **kwargs)
        return captured

    def test_terminal_path(self):
        """A terminal verdict from the LLM is returned with an empty nudge."""
        with patch(
            self._LLM_TARGET,
            return_value='{"terminal": true, "nudge": "", "reason": "done"}',
        ):
            res = evaluate_turn(
                "what's 2+2?", "4.", [("calc", "do maths")], 1, self._cfg()
            )
        assert res.terminal is True
        assert res.nudge == ""

    def test_continue_with_nudge(self):
        """A continue verdict surfaces the nudge naming the tool to call."""
        with patch(
            self._LLM_TARGET,
            return_value=(
                '{"terminal": false, "nudge": "Invoke openApp with '
                'target=YouTube", "reason": "offered instead of acted"}'
            ),
        ):
            res = evaluate_turn(
                "open youtube",
                "I can navigate you to YouTube homepage.",
                [("openApp", "Open an application"), ("stop", "stop sentinel")],
                1,
                self._cfg(),
            )
        assert res.terminal is False
        assert "openApp" in res.nudge

    def test_parse_failure_fails_open_to_terminal(self):
        """An unparseable LLM reply yields a failed-open terminal result."""
        with patch(self._LLM_TARGET, return_value="not a valid response"):
            res = evaluate_turn("q", "r", [], 1, self._cfg())
        assert res.terminal is True
        assert res.reason == "evaluator_failed_open"

    def test_timeout_or_exception_fails_open_to_terminal(self):
        """A ``TimeoutError`` from the LLM call is absorbed as failed-open."""
        with patch(self._LLM_TARGET, side_effect=TimeoutError("slow")):
            res = evaluate_turn("q", "r", [], 1, self._cfg())
        assert res.terminal is True
        assert res.reason == "evaluator_failed_open"

    def test_missing_config_fails_open_to_terminal(self):
        """Empty base URL and chat model produce a failed-open result.

        The LLM call is deliberately left unpatched here.
        """
        cfg = self._cfg(ollama_base_url="", ollama_chat_model="")
        res = evaluate_turn("q", "r", [], 1, cfg)
        assert res.terminal is True
        assert res.reason == "evaluator_failed_open"

    def test_connection_error_fails_open_to_terminal(self):
        """A ``ConnectionError`` from the LLM call still ends the loop."""
        with patch(self._LLM_TARGET, side_effect=ConnectionError("ollama down")):
            res = evaluate_turn("q", "r", [], 1, self._cfg())
        assert res.terminal is True

    def test_redacts_email_in_prompt(self):
        """Assistant response echoing an email is scrubbed before the LLM call."""
        captured = self._capture_llm_kwargs(
            "who is alice?",
            "Her email is alice@example.com and she lives in London.",
            [],
            1,
            self._cfg(),
        )
        sent = captured.get("user_content", "")
        assert "alice@example.com" not in sent
        assert "[REDACTED_EMAIL]" in sent

    def test_available_tools_appear_in_prompt(self):
        """Tool names and descriptions are rendered into the user content."""
        captured = self._capture_llm_kwargs(
            "open youtube",
            "I can help you find YouTube.",
            [
                ("openApp", "Open an application by name"),
                ("webSearch", "Search the web"),
            ],
            1,
            self._cfg(),
        )
        sent = captured.get("user_content", "")
        assert "openApp" in sent
        assert "Open an application by name" in sent
        assert "webSearch" in sent

    def test_tool_schema_appears_in_prompt(self):
        """Regression: without parameter names the evaluator tends to emit
        hallucinated argument keys (``query`` instead of ``search_query``),
        causing direct-exec to fail schema validation in a loop."""
        schema = {
            "type": "object",
            "properties": {
                "search_query": {"type": "string"},
            },
            "required": ["search_query"],
        }
        captured = self._capture_llm_kwargs(
            "tube strikes today",
            "I cannot check real-time info.",
            [("webSearch", "Search the web", schema)],
            1,
            self._cfg(),
        )
        sent = captured.get("user_content", "")
        assert "webSearch(search_query: string required)" in sent, (
            f"Expected parameter signature in prompt; got: {sent[:400]!r}"
        )

    def test_tool_schema_omitted_falls_back_to_name_only(self):
        """Two-tuple form must still work for back-compat."""
        captured = self._capture_llm_kwargs(
            "q",
            "r",
            [("webSearch", "Search the web")],
            1,
            self._cfg(),
        )
        sent = captured.get("user_content", "")
        assert "webSearch" in sent
        # No hallucinated param signature when schema absent.
        assert "webSearch(" not in sent

    def test_invoked_tools_appear_in_prompt(self):
        """Regression: without this context the evaluator cannot tell that
        a tool has already run, and keeps re-requesting it when the chat
        model replies in prose after a successful direct-exec."""
        captured = self._capture_llm_kwargs(
            user_query="open youtube",
            assistant_response_summary="I'll help with that.",
            available_tools=[
                (
                    "chrome-devtools__navigate_page",
                    "Navigate to a URL in Chrome",
                ),
            ],
            turns_used=2,
            cfg=self._cfg(),
            invoked_tools=[
                (
                    "chrome-devtools__navigate_page",
                    '{"url": "youtube.com"}',
                    '{"status": "ok", "url": "https://youtube.com"}',
                ),
            ],
        )
        sent = captured.get("user_content", "")
        assert "TOOLS ALREADY INVOKED THIS REPLY" in sent, (
            f"Evaluator prompt must include an invoked-tools block. "
            f"Got: {sent[:400]!r}"
        )
        assert "chrome-devtools__navigate_page" in sent
        assert "youtube.com" in sent, (
            "Args of invoked tools must appear in the prompt so the "
            "evaluator can match them against the user's request and "
            "avoid re-requesting the same call."
        )

    def test_invoked_tools_default_is_empty(self):
        """When the caller omits invoked_tools (engine paths predating the
        parameter, tests), the prompt still renders with a clear
        '(none yet this reply)' marker instead of crashing."""
        captured = self._capture_llm_kwargs("q", "r", [], 1, self._cfg())
        sent = captured.get("user_content", "")
        assert "TOOLS ALREADY INVOKED THIS REPLY" in sent
        assert "none yet" in sent

    def test_evaluator_model_override_used(self):
        """A non-empty ``evaluator_model`` is the model passed to the LLM."""
        cfg = self._cfg(
            evaluator_model="dedicated-evaluator",
            intent_judge_model="judge-model",
            ollama_chat_model="chat-model",
        )
        captured = self._capture_llm_kwargs("q", "r", [], 1, cfg)
        assert captured.get("chat_model") == "dedicated-evaluator"

    def test_evaluator_model_falls_back_to_intent_judge(self):
        """With an empty ``evaluator_model`` the intent-judge model is used."""
        cfg = self._cfg(
            evaluator_model="",
            intent_judge_model="judge-model",
            ollama_chat_model="chat-model",
        )
        captured = self._capture_llm_kwargs("q", "r", [], 1, cfg)
        assert captured.get("chat_model") == "judge-model"
class TestEvaluatorGarbledTurnGuidance:
    """The evaluator prompt must tell the judge model to reject garbled
    agent turns (raw tool protocol markers, special tokens, truncated
    JSON) with a continue so a retry can produce a real reply.

    Without this clause, the judge sees ``tool_code\\nprint(...)<unused88>``
    as "prose", returns terminal, and the engine ships the garbage
    straight to the user. The deterministic malformed guard in the engine
    handles the known shapes; this clause is defence-in-depth for novel
    leaks the guard has not learned yet.
    """

    @staticmethod
    def _system_prompt():
        """Import and return the evaluator's system prompt text.

        The import stays function-scoped so that a missing constant fails
        the individual tests rather than collection of the whole module.
        """
        from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT

        return _EVALUATOR_SYSTEM_PROMPT

    def test_prompt_mentions_garbled_marker_recognition(self):
        """The prompt names garbled/malformed turns and their known markers."""
        prompt = self._system_prompt()
        lowered = prompt.lower()
        assert any(word in lowered for word in ("garbled", "malformed")), (
            "Evaluator prompt must explicitly instruct the judge to "
            "recognise garbled / malformed agent turns and return continue "
            "so the engine can recover instead of shipping the junk."
        )
        # The explicit shapes we want the judge on the lookout for.
        # These are matched case-sensitively against the raw prompt.
        for marker in ("tool_code", "tool_output", "<unused"):
            assert marker in prompt, (
                f"Evaluator prompt should name {marker!r} as an example of "
                f"a garbled agent turn — naming shapes helps small judge "
                f"models spot them."
            )

    def test_prompt_instructs_salvaging_failed_tool_calls(self):
        """When the garbled turn encodes a failed tool-call attempt
        (e.g. ``tool_code\\nprint(google_search.search(query="..."))`` or
        bare ``tool_calls: [{"name": "webSearch", ...}]`` JSON), the
        evaluator should extract the intended tool + arguments and name
        them in the nudge so the next turn goes through the normal
        tool-call path. Saves a turn vs. a generic "produce prose"
        nudge, and keeps allow-list/schema/redaction guards intact
        because the retry is a real tool call, not a direct execution
        of parsed text.
        """
        lowered = self._system_prompt().lower()
        assert any(word in lowered for word in ("salvage", "extract")), (
            "Evaluator prompt should instruct the judge to extract / "
            "salvage the intended tool call from a garbled turn when "
            "possible, rather than only nudging 'produce prose'."
        )
        # The nudge should name the intended tool + args, not just say
        # "try again". Pin a keyword that signals this shape.
        naming_phrases = ("name the tool", "name the intended tool")
        assert any(phrase in lowered for phrase in naming_phrases), (
            "Evaluator prompt should tell the judge to name the "
            "intended tool (and arguments) in the nudge when the "
            "garbled turn encodes a failed tool-call attempt."
        )
class TestEvaluatorTerminalBias:
    """For simple single-part queries whose grounded answer is already in
    the turn, the evaluator must return terminal on the FIRST grounded
    reply. Without explicit guidance, a small judge model defaults to
    'continue' on every ambiguous turn and the agentic loop burns through
    ``agentic_max_turns``, which fires the digest summariser and leaks
    the 'I could not fully finish your request' caveat onto an otherwise
    correct answer.

    Field evidence: "how's the weather today" → getWeather called →
    grounded reply produced → evaluator keeps saying continue → 8 turns
    burned → digest caveat prepended. Correctness-wise the answer is
    there; UX-wise the assistant sounds confused.

    The prompt must carry BOTH signals:

    1. A single-part query with a grounded answer is terminal — even
       if the judge can't prove a tool ran, facts that address the ask
       are sufficient.
    2. Multi-part queries still need every part addressed before
       going terminal, so chained-research flows (two webSearch calls,
       parallel comparisons) do not regress.
    """

    @staticmethod
    def _system_prompt():
        """Import and return the evaluator's system prompt text.

        The import stays function-scoped so that a missing constant fails
        the individual tests rather than collection of the whole module.
        """
        from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT

        return _EVALUATOR_SYSTEM_PROMPT

    def test_prompt_biases_terminal_on_single_part_grounded_reply(self):
        """The prompt names single-part queries and the grounded-facts cue."""
        lowered = self._system_prompt().lower()
        single_part_spellings = ("single-part", "single part")
        assert any(term in lowered for term in single_part_spellings), (
            "Evaluator prompt should distinguish single-part queries "
            "(one ask) from multi-part queries — small judge models "
            "need the category named explicitly to apply the right bias."
        )
        # The reply-shaped anchor: when the turn contains facts that
        # answer the ask, terminal.
        fact_anchors = ("concrete facts", "concrete data", "facts that address")
        assert any(anchor in lowered for anchor in fact_anchors), (
            "Evaluator prompt should tell the judge that a reply "
            "containing concrete facts that address the user's ask is "
            "terminal, even when the judge can't prove a tool ran."
        )

    def test_prompt_instructs_structured_tool_call_field(self):
        """When the judge has named a specific tool + arguments in the
        nudge, the prompt must also tell it to emit them as a structured
        `tool_call: {"name": "...", "arguments": {...}}` JSON field. The
        engine uses that structured form to execute the tool directly,
        bypassing small models that ignore free-form nudges."""
        prompt = self._system_prompt()
        assert "tool_call" in prompt, (
            "Evaluator prompt must tell the judge to emit a structured "
            "`tool_call` object alongside the free-form nudge so the "
            "engine can execute the call directly."
        )

    def test_prompt_biases_terminal_when_required_tool_already_invoked(self):
        """Field regression: after a direct-exec of
        chrome-devtools__navigate_page, the chat model replied in prose,
        and the evaluator kept returning continue-with-the-same-tool_call
        because it couldn't see the tool had already run. The prompt must
        explicitly tell the judge to consult TOOLS ALREADY INVOKED and
        return terminal when the action has been performed."""
        lowered = self._system_prompt().lower()
        consults_history = any(
            phrase in lowered for phrase in ("already invoked", "already ran")
        )
        assert consults_history, (
            "Prompt must tell the judge to consult the invoked-tools "
            "history so it can distinguish 'not yet tried' from "
            "'already ran successfully'."
        )
        action_done_phrases = ("already ran", "already been invoked")
        names_completed_action = any(
            phrase in lowered for phrase in action_done_phrases
        )
        assert "terminal" in lowered and names_completed_action, (
            "Prompt must bias terminal when a tool covering the user's "
            "action has already been invoked successfully."
        )

    def test_prompt_still_continues_on_unaddressed_multi_part(self):
        """The terminal bias for single-part queries must not cannibalise
        multi-part flows. Prompt must explicitly tell the judge that
        when the query has multiple parts and at least one is
        unaddressed, return continue."""
        lowered = self._system_prompt().lower()
        multi_part_spellings = ("multi-part", "multi part")
        assert any(term in lowered for term in multi_part_spellings), (
            "Evaluator prompt should name the multi-part case so the "
            "terminal bias does not swallow chained-research flows."
        )
        unaddressed_phrases = (
            "unaddressed",
            "not addressed",
            "not yet addressed",
            "still unanswered",
        )
        assert any(phrase in lowered for phrase in unaddressed_phrases), (
            "Evaluator prompt should tell the judge to return continue "
            "when a multi-part query has at least one unaddressed part."
        )