"""Unit tests for the agentic-loop turn evaluator.""" from unittest.mock import patch import pytest from jarvis.reply.evaluator import evaluate_turn, EvaluatorResult, _parse_result class TestParseResult: def test_parses_terminal_true(self): res = _parse_result('{"terminal": true, "nudge": "", "reason": "done"}') assert res.terminal is True assert res.nudge == "" def test_parses_continue_with_nudge(self): res = _parse_result( '{"terminal": false, "nudge": "Call openApp with target=YouTube", ' '"reason": "agent offered instead of acting"}' ) assert res.terminal is False assert res.nudge == "Call openApp with target=YouTube" assert "offered" in res.reason def test_fails_open_to_terminal_on_garbage(self): res = _parse_result("not JSON at all") assert res.terminal is True assert res.reason == "evaluator_failed_open" def test_strips_markdown_fences(self): res = _parse_result( '```json\n{"terminal": true, "nudge": "", "reason": "ok"}\n```' ) assert res.terminal is True def test_extracts_embedded_json(self): res = _parse_result( 'Here: {"terminal": false, "nudge": "use X", "reason": "r"} done' ) assert res.terminal is False assert res.nudge == "use X" def test_missing_terminal_field_fails_open_to_terminal(self): res = _parse_result('{"nudge": "x", "reason": "y"}') assert res.terminal is True assert res.reason == "evaluator_failed_open" def test_non_bool_terminal_fails_open_to_terminal(self): res = _parse_result('{"terminal": "yes", "nudge": "", "reason": ""}') assert res.terminal is True def test_parses_tool_call_field(self): """Evaluator can return a structured `tool_call` with name + args alongside the free-form nudge. This lets the engine execute the tool directly instead of relying on the chat model to obey a textual nudge — critical for small models that ignore nudges.""" res = _parse_result( '{"terminal": false, "nudge": "call webSearch", ' '"reason": "prose", "tool_call": {"name": "webSearch", ' '"arguments": {"search_query": "overview of China"}}}' ) assert res.terminal is False assert res.tool_call is not None assert res.tool_call["name"] == "webSearch" assert res.tool_call["arguments"] == {"search_query": "overview of China"} def test_tool_call_absent_is_none(self): res = _parse_result( '{"terminal": false, "nudge": "do the thing", "reason": "prose"}' ) assert res.tool_call is None def test_tool_call_missing_name_is_rejected(self): """Malformed tool_call (no string name) must be dropped, not crash.""" res = _parse_result( '{"terminal": false, "nudge": "x", "reason": "y", ' '"tool_call": {"arguments": {}}}' ) assert res.tool_call is None def test_tool_call_non_dict_arguments_normalised_to_empty(self): res = _parse_result( '{"terminal": false, "nudge": "x", "reason": "y", ' '"tool_call": {"name": "stop", "arguments": "junk"}}' ) assert res.tool_call is not None assert res.tool_call["name"] == "stop" assert res.tool_call["arguments"] == {} class TestEvaluateTurn: def _cfg(self, **overrides): class _C: ollama_base_url = "http://x" ollama_chat_model = "m" llm_digest_timeout_sec = 5.0 llm_thinking_enabled = False c = _C() for k, v in overrides.items(): setattr(c, k, v) return c def test_terminal_path(self): with patch( "jarvis.reply.evaluator.call_llm_direct", return_value='{"terminal": true, "nudge": "", "reason": "done"}', ): res = evaluate_turn( "what's 2+2?", "4.", [("calc", "do maths")], 1, self._cfg() ) assert res.terminal is True assert res.nudge == "" def test_continue_with_nudge(self): with patch( "jarvis.reply.evaluator.call_llm_direct", return_value=( '{"terminal": false, "nudge": "Invoke openApp with ' 'target=YouTube", "reason": "offered instead of acted"}' ), ): res = evaluate_turn( "open youtube", "I can navigate you to YouTube homepage.", [("openApp", "Open an application"), ("stop", "stop sentinel")], 1, self._cfg(), ) assert res.terminal is False assert "openApp" in res.nudge def test_parse_failure_fails_open_to_terminal(self): with patch( "jarvis.reply.evaluator.call_llm_direct", return_value="not a valid response", ): res = evaluate_turn("q", "r", [], 1, self._cfg()) assert res.terminal is True assert res.reason == "evaluator_failed_open" def test_timeout_or_exception_fails_open_to_terminal(self): with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=TimeoutError("slow"), ): res = evaluate_turn("q", "r", [], 1, self._cfg()) assert res.terminal is True assert res.reason == "evaluator_failed_open" def test_missing_config_fails_open_to_terminal(self): cfg = self._cfg(ollama_base_url="", ollama_chat_model="") res = evaluate_turn("q", "r", [], 1, cfg) assert res.terminal is True assert res.reason == "evaluator_failed_open" def test_connection_error_fails_open_to_terminal(self): with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=ConnectionError("ollama down"), ): res = evaluate_turn("q", "r", [], 1, self._cfg()) assert res.terminal is True def test_redacts_email_in_prompt(self): """Assistant response echoing an email is scrubbed before the LLM call.""" captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn( "who is alice?", "Her email is alice@example.com and she lives in London.", [], 1, self._cfg(), ) sent = captured.get("user_content", "") assert "alice@example.com" not in sent assert "[REDACTED_EMAIL]" in sent def test_available_tools_appear_in_prompt(self): captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn( "open youtube", "I can help you find YouTube.", [ ("openApp", "Open an application by name"), ("webSearch", "Search the web"), ], 1, self._cfg(), ) sent = captured.get("user_content", "") assert "openApp" in sent assert "Open an application by name" in sent assert "webSearch" in sent def test_tool_schema_appears_in_prompt(self): """Regression: without parameter names the evaluator tends to emit hallucinated argument keys (``query`` instead of ``search_query``), causing direct-exec to fail schema validation in a loop.""" captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' schema = { "type": "object", "properties": { "search_query": {"type": "string"}, }, "required": ["search_query"], } with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn( "tube strikes today", "I cannot check real-time info.", [("webSearch", "Search the web", schema)], 1, self._cfg(), ) sent = captured.get("user_content", "") assert "webSearch(search_query: string required)" in sent, ( f"Expected parameter signature in prompt; got: {sent[:400]!r}" ) def test_tool_schema_omitted_falls_back_to_name_only(self): """Two-tuple form must still work for back-compat.""" captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn( "q", "r", [("webSearch", "Search the web")], 1, self._cfg(), ) sent = captured.get("user_content", "") assert "webSearch" in sent # No hallucinated param signature when schema absent. assert "webSearch(" not in sent def test_invoked_tools_appear_in_prompt(self): """Regression: without this context the evaluator cannot tell that a tool has already run, and keeps re-requesting it when the chat model replies in prose after a successful direct-exec.""" captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn( user_query="open youtube", assistant_response_summary="I'll help with that.", available_tools=[ ( "chrome-devtools__navigate_page", "Navigate to a URL in Chrome", ), ], turns_used=2, cfg=self._cfg(), invoked_tools=[ ( "chrome-devtools__navigate_page", '{"url": "youtube.com"}', '{"status": "ok", "url": "https://youtube.com"}', ), ], ) sent = captured.get("user_content", "") assert "TOOLS ALREADY INVOKED THIS REPLY" in sent, ( f"Evaluator prompt must include an invoked-tools block. " f"Got: {sent[:400]!r}" ) assert "chrome-devtools__navigate_page" in sent assert "youtube.com" in sent, ( "Args of invoked tools must appear in the prompt so the " "evaluator can match them against the user's request and " "avoid re-requesting the same call." ) def test_invoked_tools_default_is_empty(self): """When the caller omits invoked_tools (engine paths predating the parameter, tests), the prompt still renders with a clear '(none yet this reply)' marker instead of crashing.""" captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn("q", "r", [], 1, self._cfg()) sent = captured.get("user_content", "") assert "TOOLS ALREADY INVOKED THIS REPLY" in sent assert "none yet" in sent def test_evaluator_model_override_used(self): captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' cfg = self._cfg( evaluator_model="dedicated-evaluator", intent_judge_model="judge-model", ollama_chat_model="chat-model", ) with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn("q", "r", [], 1, cfg) assert captured.get("chat_model") == "dedicated-evaluator" def test_evaluator_model_falls_back_to_intent_judge(self): captured = {} def _capture(**kwargs): captured.update(kwargs) return '{"terminal": true, "nudge": "", "reason": ""}' cfg = self._cfg( evaluator_model="", intent_judge_model="judge-model", ollama_chat_model="chat-model", ) with patch( "jarvis.reply.evaluator.call_llm_direct", side_effect=_capture, ): evaluate_turn("q", "r", [], 1, cfg) assert captured.get("chat_model") == "judge-model" class TestEvaluatorGarbledTurnGuidance: """The evaluator prompt must tell the judge model to reject garbled agent turns (raw tool protocol markers, special tokens, truncated JSON) with a continue so a retry can produce a real reply. Without this clause, the judge sees ``tool_code\\nprint(...)`` as "prose", returns terminal, and the engine ships the garbage straight to the user. The deterministic malformed guard in the engine handles the known shapes; this clause is defence-in-depth for novel leaks the guard has not learned yet. """ def test_prompt_mentions_garbled_marker_recognition(self): from jarvis.reply.evaluator import _EVALUATOR_SYSTEM_PROMPT prompt_lower = _EVALUATOR_SYSTEM_PROMPT.lower() assert "garbled" in prompt_lower or "malformed" in prompt_lower, ( "Evaluator prompt must explicitly instruct the judge to " "recognise garbled / malformed agent turns and return continue " "so the engine can recover instead of shipping the junk." ) # The explicit shapes we want the judge on the lookout for. for marker in ("tool_code", "tool_output", "