Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled

Transform isair/jarvis into a Discord-controlled voice assistant running on
the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral),
  voice channel join + voice receive/playback, pluggable VNC screen broadcast
  (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS
  behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite,
  docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because
Discord blocks bot video; native screen broadcast only works via a Node
selfbot library.
This commit is contained in:
javis-bot
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions

View File

@@ -0,0 +1,170 @@
"""
End-to-end eval — two-turn flow where the user supplies a missing tool
argument on the second turn.
Field trace (2026-05-03, gemma4:e2b):
Turn 1: "how's the weather tomorrow Jarvis?"
→ location not configured → getWeather reports "no location set"
→ assistant asks the user for a location.
Turn 2: "I'm in London"
→ small router picks webSearch (not getWeather), planner does
`webSearch query='weather in london tomorrow'`, DDG bot-challenges,
Wikipedia fallback matches "Edge of Tomorrow" (the 2014 Tom Cruise
film) on the keyword "tomorrow", and the assistant parrots the film
summary as the weather answer.
The fix lives at the engine level: when the previous assistant turn
invoked a tool and the current user query is a short follow-up
(≤ ~80 chars), the previous tool name is unioned back into the allow-list
so the chat model can continue the original tool chain with the new info.
This eval drives the full reply engine over both turns and asserts that
``getWeather`` is invoked twice — once with empty args (turn 1) and once
with ``location='London'`` (turn 2) — and that the final reply mentions
the London forecast, not "Edge of Tomorrow".
Run: EVAL_JUDGE_MODEL=gemma4:e2b ./scripts/run_evals.sh followup_supplies_missing_tool_arg
"""
from unittest.mock import patch
import pytest
from conftest import requires_judge_llm
from helpers import (
ToolCallCapture,
assert_not_fallback_reply,
JUDGE_MODEL,
)
# Canned getWeather payload: the mocked tool runner in this module returns it
# whenever getWeather is called with a non-empty location.
_LONDON_FORECAST = "\n".join(
    (
        "Weather for London, UK:",
        "Today: 15°C, partly cloudy. High 17°C, low 10°C.",
        "Tomorrow: 14°C, light rain, high 16°C, low 9°C.",
    )
)
def _make_get_weather_runner(capture: ToolCallCapture):
    """Build a stand-in for ``run_tool_with_retries`` that scripts tool results.

    Every invocation is first recorded on ``capture`` (tool name plus its
    arguments, with missing arguments recorded as an empty dict). The result
    then depends on the tool:

    * ``getWeather`` without a usable ``location`` → ``success=False`` and a
      message asking the user which city to check (the turn-1 shape).
    * ``getWeather`` with any non-blank ``location`` → ``success=True`` and
      the canned London forecast.
    * ``webSearch`` → ``success=True`` with an "Edge of Tomorrow" film
      extract, so a misroute cannot be honestly turned into a forecast.
    * any other tool → ``success=True`` with the text ``"OK"``.

    Args:
        capture: Recorder that receives each ``(tool_name, tool_args)`` pair.

    Returns:
        A callable with the ``(db, cfg, tool_name, tool_args, **kwargs)``
        signature, suitable as a ``side_effect`` for the patched runner.
    """
    from jarvis.tools.types import ToolExecutionResult

    # Reply used when getWeather is called without a location.
    missing_location_text = (
        "I couldn't auto-detect your location. Please "
        "tell me which city to check the weather for."
    )
    # Decoy payload for a misrouted webSearch: deliberately about the film,
    # so the model cannot produce a London forecast from it without
    # confabulating.
    film_extract_text = (
        "UNTRUSTED WEB EXTRACT:\n"
        "Edge of Tomorrow is a 2014 American science fiction "
        "action film directed by Doug Liman, starring Tom Cruise."
    )

    def _scripted_tool_runner(db, cfg, tool_name, tool_args, **kwargs):
        """Record the call, then return the scripted result for the tool."""
        supplied_args = tool_args or {}
        capture.record(tool_name, supplied_args)

        if tool_name == "webSearch":
            return ToolExecutionResult(
                success=True,
                reply_text=film_extract_text,
            )

        if tool_name != "getWeather":
            return ToolExecutionResult(success=True, reply_text="OK")

        # getWeather: a blank or missing location counts as "not supplied".
        city = (supplied_args.get("location") or "").strip()
        if city:
            return ToolExecutionResult(
                success=True,
                reply_text=_LONDON_FORECAST,
            )
        return ToolExecutionResult(
            success=False,
            reply_text=missing_location_text,
        )

    return _scripted_tool_runner
@pytest.mark.eval
@requires_judge_llm
class TestFollowupSuppliesMissingToolArg:
    """Two-turn eval for the tool carry-over guard in the reply engine."""

    def test_short_followup_continues_previous_tool_chain(
        self, mock_config, eval_db, eval_dialogue_memory,
    ):
        """Run both turns and check getWeather is retried with the location.

        The tool runner is patched with the scripted mock, so the only
        successful weather result is the canned London forecast. The test
        passes when getWeather is called at least twice, at least one call
        carries a location containing "london", and the turn-2 reply
        mentions London without echoing the "Edge of Tomorrow" decoy.
        """
        from jarvis.reply.engine import run_reply_engine

        mock_config.ollama_base_url = "http://localhost:11434"
        mock_config.ollama_chat_model = JUDGE_MODEL
        # With geoip off, the user's turn-2 message is the only source of a
        # location.
        mock_config.location_enabled = False

        recorder = ToolCallCapture()
        scripted_runner = _make_get_weather_runner(recorder)
        utterances = (
            "how's the weather tomorrow Jarvis?",
            "I'm in London",
        )

        # Both turns share the same db and dialogue memory so turn 2 sees
        # the tool call made on turn 1.
        with patch(
            "jarvis.reply.engine.run_tool_with_retries",
            side_effect=scripted_runner,
        ):
            first_reply, second_reply = [
                run_reply_engine(
                    db=eval_db, cfg=mock_config, tts=None,
                    text=utterance,
                    dialogue_memory=eval_dialogue_memory,
                )
                for utterance in utterances
            ]

        # Diagnostic dump for whoever reads the eval output.
        print(f"\n Followup Carry-over ({JUDGE_MODEL}):")
        print(f" Turn 1 reply: {(first_reply or '')[:200]}")
        print(f" Turn 2 reply: {(second_reply or '')[:200]}")
        print(f" Tools called: {recorder.tool_names()}")
        for call in recorder.calls:
            print(f" - {call['name']}({call['args']})")

        for label, reply in (("turn-1", first_reply), ("turn-2", second_reply)):
            assert_not_fallback_reply(reply, context=label)

        weather_calls = [
            call for call in recorder.calls if call["name"] == "getWeather"
        ]
        assert len(weather_calls) >= 2, (
            "Expected getWeather to be invoked at least twice (once with "
            "empty args on turn 1, once with location='London' on turn 2). "
            f"Tools observed: {recorder.tool_names()}. Calls: {recorder.calls}"
        )

        def _asks_for_london(call):
            """Return True when the call's location argument names London."""
            requested = call["args"].get("location") or ""
            return "london" in requested.lower()

        # At least one getWeather call must carry the user-supplied location.
        london_calls = list(filter(_asks_for_london, weather_calls))
        assert london_calls, (
            "getWeather was never re-invoked with location='London' on "
            "turn 2 — the carry-over guard did not preserve the previous "
            "tool's place in the allow-list. All getWeather calls: "
            f"{[call['args'] for call in weather_calls]}"
        )

        # The field-trace failure was a webSearch detour ending in the film
        # summary. Even if webSearch fires, the reply must be about London
        # weather and not the film.
        final_text = (second_reply or "").lower()
        assert "edge of tomorrow" not in final_text, (
            "Reply parroted the Wikipedia fallback for 'Edge of Tomorrow'. "
            f"Reply: {(second_reply or '')[:400]}"
        )
        assert "london" in final_text, (
            "Turn-2 reply does not mention London weather. "
            f"Reply: {(second_reply or '')[:400]}"
        )