Files
javis_bot/tests/test_text_tool_call_parser.py
javis-bot c4abf63f38
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
2026-06-09 14:51:05 +09:00

231 lines
9.3 KiB
Python

"""Unit tests for the lenient text-based tool-call parser.
Small models emit tool calls in several shapes that the native Ollama
tool_calls API doesn't recognise. The engine's ``_extract_text_tool_call``
must parse these so the model's compliance succeeds regardless of shape.
The gemma-native ``tool_code`` branch was removed in the evaluator-driven
loop refactor — the model is now responsible for producing a valid tool
call, and the evaluator / toolSearchTool path replaces the safety net.
"""
import pytest
def _extract(content: str, tool_name: str = "webSearch"):
    """Feed ``content`` to the engine's text tool-call parser.

    The parser is looked up on ``jarvis.reply.engine`` at call time and is
    invoked with ``content`` plus a one-element set holding ``tool_name``.

    Args:
        content: Raw model output to scan for a text-form tool call.
        tool_name: The only name placed in the set handed to the parser.

    Returns:
        Whatever ``_extract_text_tool_call`` returns. The tests in this file
        unpack it into three values and use the first as the tool name and
        the second as the parsed arguments.
    """
    import jarvis.reply.engine as engine_mod

    # Fail with an actionable hint rather than a bare AttributeError when
    # the parser is not exposed at module level.
    assert hasattr(engine_mod, "_extract_text_tool_call"), (
        "Expose _extract_text_tool_call at module level for test coverage."
    )
    parse = engine_mod._extract_text_tool_call
    allowed_names = {tool_name}
    return parse(content, allowed_names)
class TestCanonicalToolCallsArrayLiteral:
    """Form 1: a `tool_calls: [...]` JSON array written into the content."""

    def test_extracts_name_and_string_args(self):
        """A plain-string ``arguments`` value still yields a non-empty dict."""
        payload = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "webSearch", "arguments": "Possessor movie"}}]'
        )
        parsed_name, parsed_args, _ = _extract(payload)
        assert parsed_name == "webSearch"
        # Non-empty first, then the type check — same order as a combined
        # ``x and isinstance(x, dict)`` would evaluate.
        assert parsed_args
        assert isinstance(parsed_args, dict)

    def test_extracts_name_and_dict_args(self):
        """A dict-valued ``arguments`` entry is surfaced key-for-key."""
        payload = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "webSearch", '
            '"arguments": {"search_query": "Piranesi book"}}}]'
        )
        parsed_name, parsed_args, _ = _extract(payload)
        assert parsed_name == "webSearch"
        query = parsed_args.get("search_query")
        assert query == "Piranesi book"
class TestMalformedCanonicalToolCallsLenientFallback:
    """Form 1b: almost-valid JSON from small models that drops closing braces.

    Without the lenient fallback the raw line leaks as the reply.
    """

    def test_parses_despite_missing_closing_braces(self):
        """The tool name and its argument survive the unbalanced payload."""
        broken_payload = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "getWeather", '
            '"arguments": "{\\"location\\": \\"Tbilisi, Georgia\\"}}"]'
        )
        parsed_name, parsed_args, _ = _extract(
            broken_payload, tool_name="getWeather"
        )
        assert parsed_name == "getWeather"
        location = parsed_args.get("location")
        assert location == "Tbilisi, Georgia"

    def test_lenient_fallback_rejects_unknown_tool_names(self):
        """A name outside the allowed set must not be returned as a call."""
        broken_payload = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "fileSystem_write", '
            '"arguments": "{\\"path\\": \\"/tmp/x\\"}}"]'
        )
        # Only "webSearch" is allowed here; the payload names another tool.
        parsed_name, _parsed_args, _ = _extract(
            broken_payload, tool_name="webSearch"
        )
        assert parsed_name is None
class TestSimplifiedColonForm:
    """Form 2: the colon-separated `toolName: key: value` shape."""

    def test_parses_tool_name_and_arg(self):
        """The leading token is the tool and the rest is one key/value pair."""
        line = "webSearch: search_query: Possessor movie"
        parsed_name, parsed_args, _ = _extract(line)
        assert parsed_name == "webSearch"
        query = parsed_args.get("search_query")
        assert query == "Possessor movie"

    def test_rejects_unknown_tool_name(self):
        """Colon-shaped prose with a non-allowed prefix yields no tool name."""
        line = "Note: something: arbitrary prose"
        parsed_name, _parsed_args, _ = _extract(line)
        assert parsed_name is None
class TestFunctionCallForm:
    """Form 3: the call-expression `toolName(...)` shape."""

    def test_parses_json_object_inside_parens(self):
        """A JSON object between the parentheses becomes the arguments."""
        call_text = 'webSearch({"search_query": "Possessor"})'
        parsed_name, parsed_args, _ = _extract(call_text)
        assert parsed_name == "webSearch"
        query = parsed_args.get("search_query")
        assert query == "Possessor"

    def test_parses_bare_string_inside_parens(self):
        """A bare quoted string must show up as some argument value."""
        call_text = 'webSearch("Possessor")'
        parsed_name, parsed_args, _ = _extract(call_text)
        assert parsed_name == "webSearch"
        # The key the parser picks is not pinned; only the value is checked.
        matching_values = [
            value for value in parsed_args.values() if value == "Possessor"
        ]
        assert matching_values
class TestNoFalsePositiveOnProse:
    """Ordinary conversational text must not be read as a tool call."""

    def test_plain_conversational_reply_is_not_parsed_as_tool_call(self):
        """A sentence with no tool-call shape yields ``None`` for the name."""
        reply = "I can help you find information about movies."
        parsed_name, _parsed_args, _ = _extract(reply)
        assert parsed_name is None
def _is_malformed(content: str) -> bool:
    """Ask the engine's malformed-output guard about ``content``.

    The guard is looked up on ``jarvis.reply.engine`` at call time.

    Args:
        content: Candidate model output to check.

    Returns:
        The result of ``_is_malformed_model_output(content)``, unchanged.
    """
    import jarvis.reply.engine as engine_mod

    # Fail with an actionable hint rather than a bare AttributeError when
    # the guard is not exposed at module level.
    assert hasattr(engine_mod, "_is_malformed_model_output"), (
        "Expose _is_malformed_model_output at module level for test coverage."
    )
    guard = engine_mod._is_malformed_model_output
    return guard(content)
class TestMalformedModelOutputGuard:
    """Checks for ``_is_malformed_model_output``, the content gate.

    The guard screens content before it can reach the user. The flagged
    samples are leak shapes captured in the field from small models
    (gemma4:e2b/e4b) after tool results.
    """

    # (content, label) pairs the guard must flag; the label only feeds the
    # assertion message.
    _MALFORMED_SAMPLES = [
        ("tool_calls: []", "bare tool_calls literal"),
        ("tool_calls: [{}]", "tool_calls with stub entry"),
        ("tool_code\n print(google_search.search(query='x'))\n ", "gemma tool_code block"),
        ("tool_output\n[{'snippet': 'x'}]", "gemma tool_output block"),
        ("Okay, here is your answer <unused88>", "unused sentinel inline"),
        ("Reply ends with <unused10>.", "different unused sentinel"),
        ("{\"forecast\": 14, \"high\": 15", "truncated JSON (no closing brace)"),
        ('{"openapi": "3.0.0", "paths": {}}', "OpenAPI spec dump"),
        ('{"location": "Hackney", "forecast": "cloudy"}', "weather JSON dump"),
    ]

    # Replies the guard must let through untouched.
    _ACCEPTABLE_SAMPLES = [
        "Sure, the capital of France is Paris.",
        "I found three results: Blinding Lights, Anti-Hero, and Levitating.",
        "I couldn't read the page contents this time. Want me to retry?",
        # Opens with { yet is properly closed AND carries a conversational
        # field.
        '{"response": "Here you go."}',
    ]

    @pytest.mark.parametrize("content,label", _MALFORMED_SAMPLES)
    def test_detects_malformed_shape(self, content, label):
        """Every captured leak shape is reported as malformed."""
        flagged = _is_malformed(content)
        assert flagged, f"Should flag: {label!r} -> {content!r}"

    @pytest.mark.parametrize("content", _ACCEPTABLE_SAMPLES)
    def test_allows_normal_prose(self, content):
        """Ordinary replies pass the guard."""
        flagged = _is_malformed(content)
        assert not flagged, f"Should not flag prose: {content!r}"
class TestTextToolCallGuidancePrompt:
    """Pin the text-based tool-call guidance injected for gemma-class models.

    The guidance must explicitly name and forbid the shapes gemma is known
    to leak when confused.

    Gemma is not a natively tool-calling model; tool calling is bolted on
    through a prompt that teaches the `tool_calls: [...]` literal shape.
    Its pre-training includes a different protocol — Google's
    code-interpreter `tool_code` / `tool_output` fenced blocks and
    `<unusedNN>` sentinel tokens — and a confused model falls back to
    emitting those. The engine's deterministic guard catches them
    downstream, but naming them as forbidden in the prompt steers the model
    away up front, which is cheaper than catching and retrying.

    These tests guard against drift: reshuffling the guidance and dropping
    the forbidden-shape clause makes them fail.
    """

    # (token, label) pairs that the guidance text must mention by name.
    _FORBIDDEN_SHAPES = [
        ("tool_code", "gemma code-interpreter block"),
        ("tool_output", "gemma tool-output block"),
        ("<unused", "gemma unused-sentinel token"),
    ]

    def _guidance(self, allowed_names=("webSearch", "stop", "toolSearchTool")):
        """Return the engine's guidance text built for ``allowed_names``.

        The names are copied into a fresh list before being handed to
        ``_text_tool_call_guidance``.
        """
        import jarvis.reply.engine as engine_mod

        assert hasattr(engine_mod, "_text_tool_call_guidance"), (
            "Expose _text_tool_call_guidance(allowed_names) at module "
            "level so the tool-call prompt block is unit-testable."
        )
        build_guidance = engine_mod._text_tool_call_guidance
        return build_guidance(list(allowed_names))

    def test_guidance_teaches_tool_calls_array_shape(self):
        """The `tool_calls:` literal must appear in the guidance."""
        guidance = self._guidance()
        assert "tool_calls:" in guidance, (
            "Guidance must teach the `tool_calls: [...]` literal shape "
            "the parser expects."
        )

    def test_guidance_lists_allowed_tool_names(self):
        """Each allowed tool name is spelled out in the guidance."""
        tool_names = ["webSearch", "stop", "openApp"]
        guidance = self._guidance(tool_names)
        for name in tool_names:
            assert name in guidance, f"{name} should appear in the allow-list"

    @pytest.mark.parametrize("forbidden,label", _FORBIDDEN_SHAPES)
    def test_guidance_names_and_forbids_gemma_native_shapes(
        self, forbidden, label
    ):
        """Each gemma-native token is mentioned by name in the guidance."""
        guidance = self._guidance()
        assert forbidden in guidance, (
            f"Guidance should explicitly name {forbidden!r} ({label}) as "
            f"a forbidden shape so the model is steered away from "
            f"emitting it. Naming specific tokens beats vague 'do not "
            f"emit raw protocol' instructions for small models."
        )

    def test_guidance_marks_gemma_shapes_as_forbidden_not_examples(self):
        """The forbidden shapes should sit in a forbidding context
        ('do not', 'never', 'will fail', 'forbidden'), not be offered as
        positive examples for the model to copy.
        """
        guidance = self._guidance()
        # Locate the first `tool_code` mention and inspect up to 200
        # characters on either side of where it starts for a negation.
        position = guidance.find("tool_code")
        assert position >= 0
        window_start = max(0, position - 200)
        window_end = position + 200
        context = guidance[window_start:window_end].lower()
        negations = (
            "do not",
            "don't",
            "never",
            "will fail",
            "forbidden",
            "not accepted",
        )
        assert any(marker in context for marker in negations), (
            "The `tool_code` mention must be in a forbidding context, "
            "not a positive example. Showing gemma's native protocol as "
            "an example would reinforce the exact behaviour we want to "
            "stop."
        )