Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact. - bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot) - bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API - .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
533
tests/test_evaluator.py
Normal file
533
tests/test_evaluator.py
Normal file
@@ -0,0 +1,533 @@
|
||||
"""Unit tests for the agentic-loop turn evaluator."""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from jarvis.reply.evaluator import evaluate_turn, EvaluatorResult, _parse_result
|
||||
|
||||
|
||||
class TestParseResult:
|
||||
def test_parses_terminal_true(self):
|
||||
res = _parse_result('{"terminal": true, "nudge": "", "reason": "done"}')
|
||||
assert res.terminal is True
|
||||
assert res.nudge == ""
|
||||
|
||||
def test_parses_continue_with_nudge(self):
|
||||
res = _parse_result(
|
||||
'{"terminal": false, "nudge": "Call openApp with target=YouTube", '
|
||||
'"reason": "agent offered instead of acting"}'
|
||||
)
|
||||
assert res.terminal is False
|
||||
assert res.nudge == "Call openApp with target=YouTube"
|
||||
assert "offered" in res.reason
|
||||
|
||||
def test_fails_open_to_terminal_on_garbage(self):
|
||||
res = _parse_result("not JSON at all")
|
||||
assert res.terminal is True
|
||||
assert res.reason == "evaluator_failed_open"
|
||||
|
||||
def test_strips_markdown_fences(self):
|
||||
res = _parse_result(
|
||||
'```json\n{"terminal": true, "nudge": "", "reason": "ok"}\n```'
|
||||
)
|
||||
assert res.terminal is True
|
||||
|
||||
def test_extracts_embedded_json(self):
|
||||
res = _parse_result(
|
||||
'Here: {"terminal": false, "nudge": "use X", "reason": "r"} done'
|
||||
)
|
||||
assert res.terminal is False
|
||||
assert res.nudge == "use X"
|
||||
|
||||
def test_missing_terminal_field_fails_open_to_terminal(self):
|
||||
res = _parse_result('{"nudge": "x", "reason": "y"}')
|
||||
assert res.terminal is True
|
||||
assert res.reason == "evaluator_failed_open"
|
||||
|
||||
def test_non_bool_terminal_fails_open_to_terminal(self):
|
||||
res = _parse_result('{"terminal": "yes", "nudge": "", "reason": ""}')
|
||||
assert res.terminal is True
|
||||
|
||||
def test_parses_tool_call_field(self):
|
||||
"""Evaluator can return a structured `tool_call` with name + args
|
||||
alongside the free-form nudge. This lets the engine execute the
|
||||
tool directly instead of relying on the chat model to obey a
|
||||
textual nudge — critical for small models that ignore nudges."""
|
||||
res = _parse_result(
|
||||
'{"terminal": false, "nudge": "call webSearch", '
|
||||
'"reason": "prose", "tool_call": {"name": "webSearch", '
|
||||
'"arguments": {"search_query": "overview of China"}}}'
|
||||
)
|
||||
assert res.terminal is False
|
||||
assert res.tool_call is not None
|
||||
assert res.tool_call["name"] == "webSearch"
|
||||
assert res.tool_call["arguments"] == {"search_query": "overview of China"}
|
||||
|
||||
def test_tool_call_absent_is_none(self):
|
||||
res = _parse_result(
|
||||
'{"terminal": false, "nudge": "do the thing", "reason": "prose"}'
|
||||
)
|
||||
assert res.tool_call is None
|
||||
|
||||
def test_tool_call_missing_name_is_rejected(self):
|
||||
"""Malformed tool_call (no string name) must be dropped, not crash."""
|
||||
res = _parse_result(
|
||||
'{"terminal": false, "nudge": "x", "reason": "y", '
|
||||
'"tool_call": {"arguments": {}}}'
|
||||
)
|
||||
assert res.tool_call is None
|
||||
|
||||
def test_tool_call_non_dict_arguments_normalised_to_empty(self):
|
||||
res = _parse_result(
|
||||
'{"terminal": false, "nudge": "x", "reason": "y", '
|
||||
'"tool_call": {"name": "stop", "arguments": "junk"}}'
|
||||
)
|
||||
assert res.tool_call is not None
|
||||
assert res.tool_call["name"] == "stop"
|
||||
assert res.tool_call["arguments"] == {}
|
||||
|
||||
|
||||
class TestEvaluateTurn:
|
||||
def _cfg(self, **overrides):
|
||||
class _C:
|
||||
ollama_base_url = "http://x"
|
||||
ollama_chat_model = "m"
|
||||
llm_digest_timeout_sec = 5.0
|
||||
llm_thinking_enabled = False
|
||||
c = _C()
|
||||
for k, v in overrides.items():
|
||||
setattr(c, k, v)
|
||||
return c
|
||||
|
||||
def test_terminal_path(self):
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
return_value='{"terminal": true, "nudge": "", "reason": "done"}',
|
||||
):
|
||||
res = evaluate_turn(
|
||||
"what's 2+2?", "4.", [("calc", "do maths")], 1, self._cfg()
|
||||
)
|
||||
assert res.terminal is True
|
||||
assert res.nudge == ""
|
||||
|
||||
def test_continue_with_nudge(self):
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
return_value=(
|
||||
'{"terminal": false, "nudge": "Invoke openApp with '
|
||||
'target=YouTube", "reason": "offered instead of acted"}'
|
||||
),
|
||||
):
|
||||
res = evaluate_turn(
|
||||
"open youtube",
|
||||
"I can navigate you to YouTube homepage.",
|
||||
[("openApp", "Open an application"), ("stop", "stop sentinel")],
|
||||
1,
|
||||
self._cfg(),
|
||||
)
|
||||
assert res.terminal is False
|
||||
assert "openApp" in res.nudge
|
||||
|
||||
def test_parse_failure_fails_open_to_terminal(self):
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
return_value="not a valid response",
|
||||
):
|
||||
res = evaluate_turn("q", "r", [], 1, self._cfg())
|
||||
assert res.terminal is True
|
||||
assert res.reason == "evaluator_failed_open"
|
||||
|
||||
def test_timeout_or_exception_fails_open_to_terminal(self):
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=TimeoutError("slow"),
|
||||
):
|
||||
res = evaluate_turn("q", "r", [], 1, self._cfg())
|
||||
assert res.terminal is True
|
||||
assert res.reason == "evaluator_failed_open"
|
||||
|
||||
def test_missing_config_fails_open_to_terminal(self):
|
||||
cfg = self._cfg(ollama_base_url="", ollama_chat_model="")
|
||||
res = evaluate_turn("q", "r", [], 1, cfg)
|
||||
assert res.terminal is True
|
||||
assert res.reason == "evaluator_failed_open"
|
||||
|
||||
def test_connection_error_fails_open_to_terminal(self):
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=ConnectionError("ollama down"),
|
||||
):
|
||||
res = evaluate_turn("q", "r", [], 1, self._cfg())
|
||||
assert res.terminal is True
|
||||
|
||||
def test_redacts_email_in_prompt(self):
|
||||
"""Assistant response echoing an email is scrubbed before the LLM call."""
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn(
|
||||
"who is alice?",
|
||||
"Her email is alice@example.com and she lives in London.",
|
||||
[],
|
||||
1,
|
||||
self._cfg(),
|
||||
)
|
||||
sent = captured.get("user_content", "")
|
||||
assert "alice@example.com" not in sent
|
||||
assert "[REDACTED_EMAIL]" in sent
|
||||
|
||||
def test_available_tools_appear_in_prompt(self):
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn(
|
||||
"open youtube",
|
||||
"I can help you find YouTube.",
|
||||
[
|
||||
("openApp", "Open an application by name"),
|
||||
("webSearch", "Search the web"),
|
||||
],
|
||||
1,
|
||||
self._cfg(),
|
||||
)
|
||||
sent = captured.get("user_content", "")
|
||||
assert "openApp" in sent
|
||||
assert "Open an application by name" in sent
|
||||
assert "webSearch" in sent
|
||||
|
||||
def test_tool_schema_appears_in_prompt(self):
|
||||
"""Regression: without parameter names the evaluator tends to emit
|
||||
hallucinated argument keys (``query`` instead of ``search_query``),
|
||||
causing direct-exec to fail schema validation in a loop."""
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"search_query": {"type": "string"},
|
||||
},
|
||||
"required": ["search_query"],
|
||||
}
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn(
|
||||
"tube strikes today",
|
||||
"I cannot check real-time info.",
|
||||
[("webSearch", "Search the web", schema)],
|
||||
1,
|
||||
self._cfg(),
|
||||
)
|
||||
sent = captured.get("user_content", "")
|
||||
assert "webSearch(search_query: string required)" in sent, (
|
||||
f"Expected parameter signature in prompt; got: {sent[:400]!r}"
|
||||
)
|
||||
|
||||
def test_tool_schema_omitted_falls_back_to_name_only(self):
|
||||
"""Two-tuple form must still work for back-compat."""
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn(
|
||||
"q",
|
||||
"r",
|
||||
[("webSearch", "Search the web")],
|
||||
1,
|
||||
self._cfg(),
|
||||
)
|
||||
sent = captured.get("user_content", "")
|
||||
assert "webSearch" in sent
|
||||
# No hallucinated param signature when schema absent.
|
||||
assert "webSearch(" not in sent
|
||||
|
||||
def test_invoked_tools_appear_in_prompt(self):
|
||||
"""Regression: without this context the evaluator cannot tell that
|
||||
a tool has already run, and keeps re-requesting it when the chat
|
||||
model replies in prose after a successful direct-exec."""
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn(
|
||||
user_query="open youtube",
|
||||
assistant_response_summary="I'll help with that.",
|
||||
available_tools=[
|
||||
(
|
||||
"chrome-devtools__navigate_page",
|
||||
"Navigate to a URL in Chrome",
|
||||
),
|
||||
],
|
||||
turns_used=2,
|
||||
cfg=self._cfg(),
|
||||
invoked_tools=[
|
||||
(
|
||||
"chrome-devtools__navigate_page",
|
||||
'{"url": "youtube.com"}',
|
||||
'{"status": "ok", "url": "https://youtube.com"}',
|
||||
),
|
||||
],
|
||||
)
|
||||
sent = captured.get("user_content", "")
|
||||
assert "TOOLS ALREADY INVOKED THIS REPLY" in sent, (
|
||||
f"Evaluator prompt must include an invoked-tools block. "
|
||||
f"Got: {sent[:400]!r}"
|
||||
)
|
||||
assert "chrome-devtools__navigate_page" in sent
|
||||
assert "youtube.com" in sent, (
|
||||
"Args of invoked tools must appear in the prompt so the "
|
||||
"evaluator can match them against the user's request and "
|
||||
"avoid re-requesting the same call."
|
||||
)
|
||||
|
||||
def test_invoked_tools_default_is_empty(self):
|
||||
"""When the caller omits invoked_tools (engine paths predating the
|
||||
parameter, tests), the prompt still renders with a clear
|
||||
'(none yet this reply)' marker instead of crashing."""
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn("q", "r", [], 1, self._cfg())
|
||||
sent = captured.get("user_content", "")
|
||||
assert "TOOLS ALREADY INVOKED THIS REPLY" in sent
|
||||
assert "none yet" in sent
|
||||
|
||||
def test_evaluator_model_override_used(self):
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
cfg = self._cfg(
|
||||
evaluator_model="dedicated-evaluator",
|
||||
intent_judge_model="judge-model",
|
||||
ollama_chat_model="chat-model",
|
||||
)
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn("q", "r", [], 1, cfg)
|
||||
assert captured.get("chat_model") == "dedicated-evaluator"
|
||||
|
||||
def test_evaluator_model_falls_back_to_intent_judge(self):
|
||||
captured = {}
|
||||
|
||||
def _capture(**kwargs):
|
||||
captured.update(kwargs)
|
||||
return '{"terminal": true, "nudge": "", "reason": ""}'
|
||||
|
||||
cfg = self._cfg(
|
||||
evaluator_model="",
|
||||
intent_judge_model="judge-model",
|
||||
ollama_chat_model="chat-model",
|
||||
)
|
||||
with patch(
|
||||
"jarvis.reply.evaluator.call_llm_direct",
|
||||
side_effect=_capture,
|
||||
):
|
||||
evaluate_turn("q", "r", [], 1, cfg)
|
||||
assert captured.get("chat_model") == "judge-model"
|
||||
|
||||
|
||||
class TestEvaluatorGarbledTurnGuidance:
|
||||
"""The evaluator prompt must tell the judge model to reject garbled
|
||||
agent turns (raw tool protocol markers, special tokens, truncated
|
||||
JSON) with a continue so a retry can produce a real reply.
|
||||
|
||||
Without this clause, the judge sees ``tool_code\\nprint(...)<unused88>``
|
||||
as "prose", returns terminal, and the engine ships the garbage
|
||||
straight to the user. The deterministic malformed guard in the engine
|
||||
handles the known shapes; this clause is defence-in-depth for novel
|
||||
leaks the guard has not learned yet.
|
||||
"""
|
||||
|
||||
def test_prompt_mentions_garbled_marker_recognition(self):
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower()
|
||||
assert "garbled" in prompt_lower or "malformed" in prompt_lower, (
|
||||
"Evaluator prompt must explicitly instruct the judge to "
|
||||
"recognise garbled / malformed agent turns and return continue "
|
||||
"so the engine can recover instead of shipping the junk."
|
||||
)
|
||||
# The explicit shapes we want the judge on the lookout for.
|
||||
for marker in ("tool_code", "tool_output", "<unused"):
|
||||
assert marker in _EVALUATOR_SYSTEM_PROMPT, (
|
||||
f"Evaluator prompt should name {marker!r} as an example of "
|
||||
f"a garbled agent turn — naming shapes helps small judge "
|
||||
f"models spot them."
|
||||
)
|
||||
|
||||
def test_prompt_instructs_salvaging_failed_tool_calls(self):
|
||||
"""When the garbled turn encodes a failed tool-call attempt
|
||||
(e.g. ``tool_code\\nprint(google_search.search(query="..."))`` or
|
||||
bare ``tool_calls: [{"name": "webSearch", ...}]`` JSON), the
|
||||
evaluator should extract the intended tool + arguments and name
|
||||
them in the nudge so the next turn goes through the normal
|
||||
tool-call path. Saves a turn vs. a generic "produce prose"
|
||||
nudge, and keeps allow-list/schema/redaction guards intact
|
||||
because the retry is a real tool call, not a direct execution
|
||||
of parsed text.
|
||||
"""
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower()
|
||||
assert "salvage" in prompt_lower or "extract" in prompt_lower, (
|
||||
"Evaluator prompt should instruct the judge to extract / "
|
||||
"salvage the intended tool call from a garbled turn when "
|
||||
"possible, rather than only nudging 'produce prose'."
|
||||
)
|
||||
# The nudge should name the intended tool + args, not just say
|
||||
# "try again". Pin a keyword that signals this shape.
|
||||
assert (
|
||||
"name the tool" in prompt_lower
|
||||
or "name the intended tool" in prompt_lower
|
||||
), (
|
||||
"Evaluator prompt should tell the judge to name the "
|
||||
"intended tool (and arguments) in the nudge when the "
|
||||
"garbled turn encodes a failed tool-call attempt."
|
||||
)
|
||||
|
||||
|
||||
class TestEvaluatorTerminalBias:
|
||||
"""For simple single-part queries whose grounded answer is already in
|
||||
the turn, the evaluator must return terminal on the FIRST grounded
|
||||
reply. Without explicit guidance, a small judge model defaults to
|
||||
'continue' on every ambiguous turn and the agentic loop burns through
|
||||
``agentic_max_turns``, which fires the digest summariser and leaks
|
||||
the 'I could not fully finish your request' caveat onto an otherwise
|
||||
correct answer.
|
||||
|
||||
Field evidence: "how's the weather today" → getWeather called →
|
||||
grounded reply produced → evaluator keeps saying continue → 8 turns
|
||||
burned → digest caveat prepended. Correctness-wise the answer is
|
||||
there; UX-wise the assistant sounds confused.
|
||||
|
||||
The prompt must carry BOTH signals:
|
||||
1. A single-part query with a grounded answer is terminal — even
|
||||
if the judge can't prove a tool ran, facts that address the ask
|
||||
are sufficient.
|
||||
2. Multi-part queries still need every part addressed before
|
||||
going terminal, so chained-research flows (two webSearch calls,
|
||||
parallel comparisons) do not regress.
|
||||
"""
|
||||
|
||||
def test_prompt_biases_terminal_on_single_part_grounded_reply(self):
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower()
|
||||
assert "single-part" in prompt_lower or "single part" in prompt_lower, (
|
||||
"Evaluator prompt should distinguish single-part queries "
|
||||
"(one ask) from multi-part queries — small judge models "
|
||||
"need the category named explicitly to apply the right bias."
|
||||
)
|
||||
# The reply-shaped anchor: when the turn contains facts that
|
||||
# answer the ask, terminal.
|
||||
assert (
|
||||
"concrete facts" in prompt_lower
|
||||
or "concrete data" in prompt_lower
|
||||
or "facts that address" in prompt_lower
|
||||
), (
|
||||
"Evaluator prompt should tell the judge that a reply "
|
||||
"containing concrete facts that address the user's ask is "
|
||||
"terminal, even when the judge can't prove a tool ran."
|
||||
)
|
||||
|
||||
def test_prompt_instructs_structured_tool_call_field(self):
|
||||
"""When the judge has named a specific tool + arguments in the
|
||||
nudge, the prompt must also tell it to emit them as a structured
|
||||
`tool_call: {"name": "...", "arguments": {...}}` JSON field. The
|
||||
engine uses that structured form to execute the tool directly,
|
||||
bypassing small models that ignore free-form nudges."""
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
assert "tool_call" in _EVALUATOR_SYSTEM_PROMPT, (
|
||||
"Evaluator prompt must tell the judge to emit a structured "
|
||||
"`tool_call` object alongside the free-form nudge so the "
|
||||
"engine can execute the call directly."
|
||||
)
|
||||
|
||||
def test_prompt_biases_terminal_when_required_tool_already_invoked(self):
|
||||
"""Field regression: after a direct-exec of
|
||||
chrome-devtools__navigate_page, the chat model replied in prose,
|
||||
and the evaluator kept returning continue-with-the-same-tool_call
|
||||
because it couldn't see the tool had already run. The prompt must
|
||||
explicitly tell the judge to consult TOOLS ALREADY INVOKED and
|
||||
return terminal when the action has been performed."""
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower()
|
||||
assert "already invoked" in prompt_lower or "already ran" in prompt_lower, (
|
||||
"Prompt must tell the judge to consult the invoked-tools "
|
||||
"history so it can distinguish 'not yet tried' from "
|
||||
"'already ran successfully'."
|
||||
)
|
||||
assert "terminal" in prompt_lower and (
|
||||
"already ran" in prompt_lower or "already been invoked" in prompt_lower
|
||||
), (
|
||||
"Prompt must bias terminal when a tool covering the user's "
|
||||
"action has already been invoked successfully."
|
||||
)
|
||||
|
||||
def test_prompt_still_continues_on_unaddressed_multi_part(self):
|
||||
"""The terminal bias for single-part queries must not cannibalise
|
||||
multi-part flows. Prompt must explicitly tell the judge that
|
||||
when the query has multiple parts and at least one is
|
||||
unaddressed, return continue."""
|
||||
from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT
|
||||
|
||||
prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower()
|
||||
assert "multi-part" in prompt_lower or "multi part" in prompt_lower, (
|
||||
"Evaluator prompt should name the multi-part case so the "
|
||||
"terminal bias does not swallow chained-research flows."
|
||||
)
|
||||
assert (
|
||||
"unaddressed" in prompt_lower
|
||||
or "not addressed" in prompt_lower
|
||||
or "not yet addressed" in prompt_lower
|
||||
or "still unanswered" in prompt_lower
|
||||
), (
|
||||
"Evaluator prompt should tell the judge to return continue "
|
||||
"when a multi-part query has at least one unaddressed part."
|
||||
)
|
||||
Reference in New Issue
Block a user