Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
231 lines
9.3 KiB
Python
231 lines
9.3 KiB
Python
"""Unit tests for the lenient text-based tool-call parser.

Small models emit tool calls in several shapes that the native Ollama
tool_calls API doesn't recognise. The engine's ``_extract_text_tool_call``
must parse these so the model's compliance succeeds regardless of shape.

The gemma-native ``tool_code`` branch was removed in the evaluator-driven
loop refactor — the model is now responsible for producing a valid tool
call, and the evaluator / toolSearchTool path replaces the safety net.
"""
|
|
|
|
import pytest
|
|
|
|
|
|
def _extract(content: str, tool_name: str = "webSearch"):
    """Run the engine's text tool-call parser against ``content``.

    Args:
        content: Raw model output to parse.
        tool_name: The single tool name placed in the allowed-names set
            that is handed to the parser.

    Returns:
        Whatever ``jarvis.reply.engine._extract_text_tool_call`` returns.
        The tests in this module unpack it as a 3-tuple whose first two
        items are the tool name and its arguments.
    """
    import jarvis.reply.engine as engine_mod

    # Fail with an actionable message if the parser is no longer reachable
    # as a module-level attribute.
    assert hasattr(engine_mod, "_extract_text_tool_call"), (
        "Expose _extract_text_tool_call at module level for test coverage."
    )
    return engine_mod._extract_text_tool_call(content, {tool_name})
|
|
|
|
|
|
class TestCanonicalToolCallsArrayLiteral:
    """Form 1: `tool_calls: [...]` JSON array in content."""

    def test_extracts_name_and_string_args(self):
        """A bare-string ``arguments`` value still yields a non-empty dict."""
        content = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "webSearch", "arguments": "Possessor movie"}}]'
        )
        name, args, _ = _extract(content)
        assert name == "webSearch"
        # Only the shape is pinned here, not the key the string is stored under.
        assert args and isinstance(args, dict)

    def test_extracts_name_and_dict_args(self):
        """A JSON-object ``arguments`` value is returned with its keys intact."""
        content = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "webSearch", '
            '"arguments": {"search_query": "Piranesi book"}}}]'
        )
        name, args, _ = _extract(content)
        assert name == "webSearch"
        assert args.get("search_query") == "Piranesi book"
|
|
|
|
|
|
class TestMalformedCanonicalToolCallsLenientFallback:
    """Form 1b: small models emit almost-valid JSON that drops closing braces.

    Without the lenient fallback the raw line leaks as the reply.
    """

    def test_parses_despite_missing_closing_braces(self):
        """The tool name and JSON-string arguments survive unbalanced braces."""
        # The payload below is deliberately unbalanced: the outer objects are
        # never closed before the final `]`.
        content = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "getWeather", '
            '"arguments": "{\\"location\\": \\"Tbilisi, Georgia\\"}}"]'
        )
        name, args, _ = _extract(content, tool_name="getWeather")
        assert name == "getWeather"
        assert args.get("location") == "Tbilisi, Georgia"

    def test_lenient_fallback_rejects_unknown_tool_names(self):
        """A malformed call naming a tool outside the allowed set is not parsed."""
        content = (
            'tool_calls: [{"id": "call_1", "type": "function", '
            '"function": {"name": "fileSystem_write", '
            '"arguments": "{\\"path\\": \\"/tmp/x\\"}}"]'
        )
        # Only "webSearch" is allowed, so "fileSystem_write" must be refused.
        name, _args, _ = _extract(content, tool_name="webSearch")
        assert name is None
|
|
|
|
|
|
class TestSimplifiedColonForm:
    """Form 2: `toolName: key: value`."""

    def test_parses_tool_name_and_arg(self):
        """The leading allowed tool name and the ``key: value`` pair are parsed."""
        content = "webSearch: search_query: Possessor movie"
        name, args, _ = _extract(content)
        assert name == "webSearch"
        assert args.get("search_query") == "Possessor movie"

    def test_rejects_unknown_tool_name(self):
        """Colon-separated prose whose first token is not an allowed tool is ignored."""
        content = "Note: something: arbitrary prose"
        name, _args, _ = _extract(content)
        assert name is None
|
|
|
|
|
|
class TestFunctionCallForm:
    """Form 3: `toolName(...)`."""

    def test_parses_json_object_inside_parens(self):
        """A JSON object between the parentheses becomes the argument dict."""
        content = 'webSearch({"search_query": "Possessor"})'
        name, args, _ = _extract(content)
        assert name == "webSearch"
        assert args.get("search_query") == "Possessor"

    def test_parses_bare_string_inside_parens(self):
        """A bare quoted string between the parentheses ends up as some arg value."""
        content = 'webSearch("Possessor")'
        name, args, _ = _extract(content)
        assert name == "webSearch"
        # The key the parser chooses is not pinned; only the value is checked.
        assert any(v == "Possessor" for v in args.values())
|
|
|
|
|
|
class TestNoFalsePositiveOnProse:
    """Ordinary conversational text must not be mistaken for a tool call."""

    def test_plain_conversational_reply_is_not_parsed_as_tool_call(self):
        """A plain sentence with no tool-call shape yields no tool name."""
        content = "I can help you find information about movies."
        name, _args, _ = _extract(content)
        assert name is None
|
|
|
|
|
|
def _is_malformed(content: str) -> bool:
    """Ask the engine's malformed-output guard whether ``content`` is flagged.

    Args:
        content: Candidate model output to classify.

    Returns:
        The result of ``jarvis.reply.engine._is_malformed_model_output``
        for ``content``.
    """
    import jarvis.reply.engine as engine_mod

    # Fail with an actionable message if the guard is no longer reachable
    # as a module-level attribute.
    assert hasattr(engine_mod, "_is_malformed_model_output"), (
        "Expose _is_malformed_model_output at module level for test coverage."
    )
    return engine_mod._is_malformed_model_output(content)
|
|
|
|
|
|
class TestMalformedModelOutputGuard:
    """``_is_malformed_model_output`` gates content before it can reach the
    user. Covers the field-captured leak shapes we have observed from
    small models (gemma4:e2b/e4b) after tool results."""

    @pytest.mark.parametrize(
        "content,label",
        [
            ("tool_calls: []", "bare tool_calls literal"),
            ("tool_calls: [{}]", "tool_calls with stub entry"),
            ("tool_code\n print(google_search.search(query='x'))\n ", "gemma tool_code block"),
            ("tool_output\n[{'snippet': 'x'}]", "gemma tool_output block"),
            ("Okay, here is your answer <unused88>", "unused sentinel inline"),
            ("Reply ends with <unused10>.", "different unused sentinel"),
            ("{\"forecast\": 14, \"high\": 15", "truncated JSON (no closing brace)"),
            ('{"openapi": "3.0.0", "paths": {}}', "OpenAPI spec dump"),
            ('{"location": "Hackney", "forecast": "cloudy"}', "weather JSON dump"),
        ],
    )
    def test_detects_malformed_shape(self, content, label):
        """Each known leak shape must be flagged; ``label`` only names the case."""
        assert _is_malformed(content), f"Should flag: {label!r} -> {content!r}"

    @pytest.mark.parametrize(
        "content",
        [
            "Sure, the capital of France is Paris.",
            "I found three results: Blinding Lights, Anti-Hero, and Levitating.",
            "I couldn't read the page contents this time. Want me to retry?",
            # Starts with { but closes properly AND has a conversational field.
            '{"response": "Here you go."}',
        ],
    )
    def test_allows_normal_prose(self, content):
        """Ordinary replies must pass through the guard unflagged."""
        assert not _is_malformed(content), f"Should not flag prose: {content!r}"
|
|
|
|
|
|
class TestTextToolCallGuidancePrompt:
    """The text-based tool-call guidance injected for gemma-class models must
    explicitly name and forbid the shapes we know gemma leaks when confused.

    Gemma is not a natively tool-calling model — we bolt tool calling on via
    a prompt that teaches the `tool_calls: [...]` literal shape. Gemma's
    pre-training includes a different protocol (Google's code-interpreter
    `tool_code` / `tool_output` fenced blocks and `<unusedNN>` sentinel
    tokens), and when confused the model falls back to emitting those
    instead. The engine's deterministic guard catches them downstream, but
    the prompt itself should name them as forbidden so the model is steered
    away from emitting them in the first place — cheaper than catching and
    retrying.

    This test pins the prompt against drift: if someone reshuffles the
    guidance and drops the forbidden-shape clause, this test fails.
    """

    def _guidance(self, allowed_names=("webSearch", "stop", "toolSearchTool")):
        """Return the guidance text the engine builds for ``allowed_names``.

        The names are passed to ``_text_tool_call_guidance`` as a list.
        """
        import jarvis.reply.engine as engine_mod

        # Fail with an actionable message if the prompt builder is no longer
        # reachable as a module-level attribute.
        assert hasattr(engine_mod, "_text_tool_call_guidance"), (
            "Expose _text_tool_call_guidance(allowed_names) at module "
            "level so the tool-call prompt block is unit-testable."
        )
        return engine_mod._text_tool_call_guidance(list(allowed_names))

    def test_guidance_teaches_tool_calls_array_shape(self):
        """The guidance must contain the ``tool_calls:`` literal."""
        text = self._guidance()
        assert "tool_calls:" in text, (
            "Guidance must teach the `tool_calls: [...]` literal shape "
            "the parser expects."
        )

    def test_guidance_lists_allowed_tool_names(self):
        """Every allowed tool name passed in must appear in the guidance."""
        text = self._guidance(["webSearch", "stop", "openApp"])
        for name in ("webSearch", "stop", "openApp"):
            assert name in text, f"{name} should appear in the allow-list"

    @pytest.mark.parametrize(
        "forbidden,label",
        [
            ("tool_code", "gemma code-interpreter block"),
            ("tool_output", "gemma tool-output block"),
            ("<unused", "gemma unused-sentinel token"),
        ],
    )
    def test_guidance_names_and_forbids_gemma_native_shapes(
        self, forbidden, label
    ):
        """Each gemma-native token must be mentioned by name in the guidance."""
        text = self._guidance()
        assert forbidden in text, (
            f"Guidance should explicitly name {forbidden!r} ({label}) as "
            "a forbidden shape so the model is steered away from "
            "emitting it. Naming specific tokens beats vague 'do not "
            "emit raw protocol' instructions for small models."
        )

    def test_guidance_marks_gemma_shapes_as_forbidden_not_examples(self):
        """The forbidden shapes should appear in a forbidding context
        ('do not', 'never', 'will fail', 'forbidden'), not as positive
        examples the model should copy.
        """
        text = self._guidance()
        # Find the paragraph mentioning tool_code and check it sits near
        # a negation.
        idx = text.find("tool_code")
        assert idx >= 0
        # Look up to 200 characters either side of the first mention,
        # lower-cased so the negation match is case-insensitive.
        window = text[max(0, idx - 200) : idx + 200].lower()
        assert any(
            neg in window
            for neg in ("do not", "don't", "never", "will fail", "forbidden", "not accepted")
        ), (
            "The `tool_code` mention must be in a forbidding context, "
            "not a positive example. Showing gemma's native protocol as "
            "an example would reinforce the exact behaviour we want to "
            "stop."
        )
|