Add Discord-native hybrid front-end for Jarvis (bot + bridge)
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Some checks failed
Release / semantic-release (push) Successful in 59s
tests / Unit tests (Linux, Python 3.11) (push) Successful in 13m45s
Release / build-linux (push) Failing after 7m47s
Release / build-windows (push) Has been cancelled
Release / build-macos (arm64, macos-latest) (push) Has been cancelled
Release / build-macos (x64, macos-15-intel) (push) Has been cancelled
Release / release-main (push) Has been cancelled
Release / release-develop (push) Has been cancelled
Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, and pluggable VNC screen broadcast (selfbot live / noVNC / screenshot).
- bridge/ (Python, Flask): wraps the jarvis STT + run_reply_engine + Piper TTS pipeline behind a thin localhost HTTP API.
- .env.example, scripts/ (start_bridge / start_bot / dev), a README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md.

Language decision: hybrid (Python brain + Node/bun Discord layer), because Discord blocks video from bot accounts; native screen broadcast only works via a Node selfbot library.
This commit is contained in:
974
tests/test_dictation.py
Normal file
974
tests/test_dictation.py
Normal file
@@ -0,0 +1,974 @@
|
||||
"""
|
||||
Tests for the dictation engine (hold-to-dictate feature).
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
from unittest.mock import patch, MagicMock, PropertyMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_engine(**overrides):
    """Build a ``DictationEngine`` from test defaults merged with *overrides*.

    Every keyword argument supplied by the caller replaces (or adds to) the
    default constructor arguments before the engine is instantiated.
    """
    from src.jarvis.dictation.dictation_engine import DictationEngine

    engine_kwargs = {
        "whisper_model_ref": lambda: MagicMock(),
        "whisper_backend_ref": lambda: "faster-whisper",
        "mlx_repo_ref": lambda: None,
        "hotkey": "ctrl+shift+d",
        "sample_rate": 16000,
        "on_dictation_start": None,
        "on_dictation_end": None,
        # A new lock object is created on every call.
        "transcribe_lock": threading.Lock(),
    }
    engine_kwargs.update(overrides)
    return DictationEngine(**engine_kwargs)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Beep generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBeepGeneration:
    """Checks on the bytes produced by the beep generators."""

    def test_start_beep_is_valid_wav(self):
        """The start beep carries the RIFF and WAVE header markers."""
        from src.jarvis.dictation.dictation_engine import _get_start_beep

        data = _get_start_beep()
        assert data[:4] == b"RIFF"
        assert data[8:12] == b"WAVE"

    def test_stop_beep_is_valid_wav(self):
        """The stop beep carries the RIFF and WAVE header markers."""
        from src.jarvis.dictation.dictation_engine import _get_stop_beep

        data = _get_stop_beep()
        assert data[:4] == b"RIFF"
        assert data[8:12] == b"WAVE"

    def test_start_and_stop_beeps_differ(self):
        """Start and stop beeps must not be the same byte string."""
        from src.jarvis.dictation.dictation_engine import (
            _get_start_beep,
            _get_stop_beep,
        )

        start_bytes = _get_start_beep()
        stop_bytes = _get_stop_beep()
        assert start_bytes != stop_bytes

    def test_generate_beep_wav_custom_params(self):
        """Custom frequency/duration still yields a RIFF blob with a payload."""
        from src.jarvis.dictation.dictation_engine import _generate_beep_wav

        data = _generate_beep_wav(freq=1000, duration=0.05)
        assert data[:4] == b"RIFF"
        # Must be longer than the 44 bytes the original test treats as the header.
        assert len(data) > 44
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hotkey parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHotkeyParsing:
    """Tests for turning a hotkey string into (modifiers, trigger)."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_pynput(self):
        """Skip every test in this class when pynput cannot be imported."""
        try:
            import pynput  # noqa: F401
        except ImportError:
            pytest.skip("pynput not installed")

    @staticmethod
    def _load_parser():
        """Import ``parse_hotkey`` lazily and return it."""
        from src.jarvis.dictation.dictation_engine import parse_hotkey

        return parse_hotkey

    def test_parse_ctrl_shift_d(self):
        """Two modifiers plus a letter yields two modifiers and a trigger."""
        parse = self._load_parser()
        modifiers, trigger_key = parse("ctrl+shift+d")
        assert len(modifiers) == 2
        assert trigger_key is not None

    def test_parse_modifier_only_combo(self):
        """A modifier-only hotkey like 'ctrl+cmd' should be valid."""
        parse = self._load_parser()
        modifiers, trigger_key = parse("ctrl+cmd")
        assert len(modifiers) == 2
        assert trigger_key is None

    def test_parse_ctrl_alt(self):
        """macOS/Linux default: ctrl+alt should parse as two modifiers."""
        parse = self._load_parser()
        modifiers, trigger_key = parse("ctrl+alt")
        assert len(modifiers) == 2
        assert trigger_key is None

    def test_parse_ctrl_win(self):
        """'win' modifier alias should map to the same key as 'cmd'."""
        parse = self._load_parser()
        win_result = parse("ctrl+win")
        cmd_result = parse("ctrl+cmd")
        assert win_result[0] == cmd_result[0]
        assert win_result[1] is None
        assert cmd_result[1] is None

    def test_parse_empty_string_raises(self):
        """An empty hotkey string is rejected with ValueError."""
        parse = self._load_parser()
        with pytest.raises(ValueError):
            parse("")

    def test_parse_unknown_key_raises(self):
        """An unrecognised key name is rejected with ValueError."""
        parse = self._load_parser()
        with pytest.raises(ValueError):
            parse("ctrl+nonexistentkey")

    def test_parse_alt_modifier(self):
        """'alt+x' yields one modifier and a trigger."""
        parse = self._load_parser()
        modifiers, trigger_key = parse("alt+x")
        assert len(modifiers) == 1
        assert trigger_key is not None

    def test_parse_single_letter(self):
        """A single letter without modifiers should work as trigger."""
        parse = self._load_parser()
        # No modifiers at all: the letter alone is the trigger.
        modifiers, trigger_key = parse("f")
        assert len(modifiers) == 0
        assert trigger_key is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Engine lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEngineLifecycle:
    """Tests for DictationEngine start/stop behaviour."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_deps(self):
        """Skip every test in this class unless pynput and sounddevice import."""
        try:
            import pynput  # noqa: F401
            import sounddevice  # noqa: F401
        except ImportError:
            pytest.skip("pynput or sounddevice not installed")

    @staticmethod
    def _install_fake_keys(fake_kb):
        """Give the patched pynput keyboard module stub Key/KeyCode members."""
        fake_kb.Key = MagicMock()
        fake_kb.KeyCode = MagicMock()
        fake_kb.Key.ctrl_l = MagicMock()
        fake_kb.Key.shift = MagicMock()

    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine.sys")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_start_creates_listener(self, fake_kb, fake_sys, fake_platform):
        """start() starts a listener once; stop() clears the started flag."""
        # Force a platform where pynput is allowed (avoid macOS 26+ guard)
        fake_sys.platform = "linux"
        listener = MagicMock()
        fake_kb.Listener.return_value = listener
        self._install_fake_keys(fake_kb)

        engine = _make_engine()
        engine.start()

        assert engine._started is True
        listener.start.assert_called_once()

        engine.stop()
        assert engine._started is False

    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard", None)
    def test_start_without_pynput_is_noop(self):
        """Engine should gracefully skip when pynput is missing."""
        from src.jarvis.dictation.dictation_engine import DictationEngine

        # _make_engine cannot be used here because parse_hotkey needs pynput,
        # so build a bare instance and exercise only the start() guard.
        engine = DictationEngine.__new__(DictationEngine)
        engine._started = False
        engine._listener = None
        engine._recording = False

        engine.start()
        assert engine._started is False

    @patch("src.jarvis.dictation.dictation_engine.sd", None)
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_start_without_sounddevice_is_noop(self, fake_kb):
        """Engine should gracefully skip when sounddevice is missing."""
        self._install_fake_keys(fake_kb)

        engine = _make_engine()
        engine.start()
        assert engine._started is False

    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine.sys")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_start_skips_on_macos_26(self, fake_kb, fake_sys, fake_platform):
        """pynput crashes on macOS 26+ (TSM thread assertion). Engine must skip."""
        fake_sys.platform = "darwin"
        fake_platform.mac_ver.return_value = ("26.2", ("", "", ""), "")
        self._install_fake_keys(fake_kb)

        engine = _make_engine()
        engine.start()

        assert engine._started is False
        fake_kb.Listener.assert_not_called()

    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine.sys")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_start_allowed_on_macos_15(self, fake_kb, fake_sys, fake_platform):
        """pynput should still work on macOS 15 (Sequoia) and earlier."""
        fake_sys.platform = "darwin"
        fake_platform.mac_ver.return_value = ("15.4", ("", "", ""), "")
        listener = MagicMock()
        fake_kb.Listener.return_value = listener
        self._install_fake_keys(fake_kb)

        engine = _make_engine()
        engine.start()

        assert engine._started is True
        listener.start.assert_called_once()
        engine.stop()

    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine.sys")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_start_allowed_on_windows(self, fake_kb, fake_sys, fake_platform):
        """Windows should not be affected by the macOS guard."""
        fake_sys.platform = "win32"
        listener = MagicMock()
        fake_kb.Listener.return_value = listener
        self._install_fake_keys(fake_kb)

        engine = _make_engine()
        engine.start()

        assert engine._started is True
        listener.start.assert_called_once()
        engine.stop()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recording state machine
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRecordingStateMachine:
    """Tests for the recording start/stop logic.

    Elapsed-time measurements and polling deadlines use ``time.monotonic()``
    rather than ``time.time()`` so that a wall-clock adjustment during the
    run cannot make a timing assertion pass or fail spuriously.
    """

    @pytest.fixture(autouse=True)
    def _skip_if_no_deps(self):
        """Skip every test here unless pynput, sounddevice and numpy import."""
        try:
            import pynput  # noqa: F401
            import sounddevice  # noqa: F401
            import numpy  # noqa: F401
        except ImportError:
            pytest.skip("required dependencies not installed")

    @staticmethod
    def _wait_until(predicate, timeout=5.0, poll=0.05):
        """Poll *predicate* until it is truthy or *timeout* seconds elapse.

        Returns the final value of ``predicate()`` so callers can assert on it.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline and not predicate():
            time.sleep(poll)
        return predicate()

    def test_start_recording_checks_whisper_model(self):
        """Should not start recording if Whisper model is None (non-mlx)."""
        engine = _make_engine(whisper_model_ref=lambda: None)
        engine._start_recording()
        assert engine._recording is False

    def test_start_recording_allows_mlx_without_model(self):
        """MLX backend uses repo reference, not model object."""
        engine = _make_engine(
            whisper_model_ref=lambda: None,
            whisper_backend_ref=lambda: "mlx",
            mlx_repo_ref=lambda: "mlx-community/whisper-small-mlx",
        )
        with patch("src.jarvis.dictation.dictation_engine.sd") as mock_sd, \
                patch("src.jarvis.dictation.dictation_engine._play_beep"):
            mock_sd.InputStream.return_value = MagicMock()
            engine._start_recording()
            assert engine._recording is True
            # Cleanup while the audio/beep patches are still active.
            engine._stop_recording(discard=True)

    def test_stop_recording_discard_clears_frames(self):
        """Discarding a recording drops buffered frames and clears the flag."""
        engine = _make_engine()
        engine._recording = True
        engine._audio_frames = [MagicMock()]
        engine._stream = MagicMock()

        engine._stop_recording(discard=True)

        assert engine._audio_frames == []
        assert engine._recording is False

    def test_stop_recording_returns_fast_on_slow_stream_close(self):
        """The non-discard path must not block the caller on stream.close().

        Rationale: ``_stop_recording`` is invoked from the pynput low-level
        keyboard hook callback. Windows silently removes low-level keyboard
        hooks that take more than ~5 s to return, which leaves pynput in an
        inconsistent state that can crash the process when the paste thread
        subsequently calls Controller.press/tap/release (issue #184).

        The listener callback must return in a handful of milliseconds even
        if closing the audio device is slow.
        """
        import numpy as np

        def slow_close(*_args, **_kwargs):
            time.sleep(1.0)

        slow_stream = MagicMock()
        slow_stream.stop.side_effect = slow_close
        slow_stream.close.side_effect = slow_close

        engine = _make_engine()
        engine._recording = True
        engine._stream = slow_stream
        # Short (< 0.3 s) audio so transcribe_and_paste exits quickly.
        engine._audio_frames = [np.zeros(1600, dtype=np.float32)]

        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
            # Monotonic clock: immune to wall-clock jumps during the test.
            t0 = time.monotonic()
            engine._stop_recording()
            elapsed = time.monotonic() - t0

            # The caller (simulating the pynput hook) must return quickly.
            # 200 ms is generous headroom vs. the ~5 s Windows
            # LowLevelHooksTimeout — the method should actually return in
            # microseconds, since it just flips a bool and spawns a daemon
            # thread.
            assert elapsed < 0.2, (
                f"_stop_recording blocked for {elapsed:.2f}s in the listener "
                "thread — stream.close() must be off the hot path"
            )

            # The stream must still be closed eventually, off-thread.
            closed = self._wait_until(lambda: slow_stream.close.called)
            assert closed, "stream.close() never ran"

    def test_stop_recording_idempotent_under_concurrent_calls(self):
        """Rapid double-release of the hotkey must not double-close the stream.

        On Windows ``ctrl+cmd`` the user releases two keys in quick succession;
        both releases can fire the listener callback before either has finished.
        Only one teardown should reach the stream.
        """
        import numpy as np

        engine = _make_engine()
        engine._recording = True
        stream_mock = MagicMock()
        engine._stream = stream_mock
        engine._audio_frames = [np.zeros(1600, dtype=np.float32)]

        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
            # Two near-simultaneous calls from the listener.
            callers = [
                threading.Thread(target=engine._stop_recording)
                for _ in range(2)
            ]
            for caller in callers:
                caller.start()
            for caller in callers:
                caller.join()

            # Wait for the spawned teardown thread to run close().
            self._wait_until(lambda: stream_mock.close.called)
            # Settle window: give a duplicate teardown (if one was wrongly
            # spawned) a chance to reach the stream before counting, so the
            # assertion cannot pass just because it ran too early.
            time.sleep(0.2)

            # Only one of the two calls should have reached the stream.
            assert stream_mock.close.call_count == 1

    def test_max_duration_callback_still_stops_recording(self):
        """Hitting the 60s cap must still close the stream and fire the end
        callback, even though the new teardown path runs off-thread.

        ``_audio_callback`` spawns a daemon thread that calls
        ``_stop_recording()``; that then dispatches ``_finalise_and_transcribe``
        which closes the stream and eventually invokes ``_on_dictation_end``
        (via ``_transcribe_and_paste``'s finally).
        """
        import numpy as np

        end_called = threading.Event()
        engine = _make_engine(
            on_dictation_end=lambda: end_called.set(),
            whisper_model_ref=lambda: None,  # short-circuits transcribe
            whisper_backend_ref=lambda: "faster-whisper",
        )
        stream_mock = MagicMock()
        engine._recording = True
        engine._stream = stream_mock
        # Pre-fill up to the limit so one more frame triggers the cap.
        engine._max_frames = 100
        engine._audio_frames = [np.zeros(100, dtype=np.float32)]

        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
            indata = np.random.randn(1600, 1).astype(np.float32)
            engine._audio_callback(indata, 1600, None, None)
            # _stop_recording runs in a daemon thread; wait for close().
            assert end_called.wait(timeout=5.0), "on_dictation_end never fired"

        assert stream_mock.close.called, "stream.close() never ran"
        assert engine._recording is False

    def test_finalise_fires_on_dictation_end_when_beep_raises(self):
        """A failure in ``_play_beep`` must not strand the listener paused.

        ``_on_dictation_end`` is normally fired from
        ``_transcribe_and_paste``'s finally, but that step is never reached
        if ``_close_stream`` or ``_play_beep`` raises. ``_finalise_and_transcribe``
        must therefore guarantee the callback fires on any error.
        """
        import numpy as np

        end_called = threading.Event()
        engine = _make_engine(on_dictation_end=lambda: end_called.set())

        with patch(
            "src.jarvis.dictation.dictation_engine._play_beep",
            side_effect=RuntimeError("beep broken"),
        ):
            engine._finalise_and_transcribe(
                stream=None,
                audio_frames=[np.zeros(1600, dtype=np.float32)],
                start_time=time.time(),
            )

        assert end_called.is_set(), (
            "_on_dictation_end must fire even when _play_beep raises"
        )

    def test_on_dictation_callbacks_called(self):
        """Start/end callbacks should be invoked."""
        start_called = threading.Event()
        end_called = threading.Event()

        engine = _make_engine(
            on_dictation_start=lambda: start_called.set(),
            on_dictation_end=lambda: end_called.set(),
        )

        with patch("src.jarvis.dictation.dictation_engine.sd") as mock_sd, \
                patch("src.jarvis.dictation.dictation_engine._play_beep"):
            mock_sd.InputStream.return_value = MagicMock()

            engine._start_recording()
            assert start_called.is_set()

            engine._stop_recording(discard=True)
            assert end_called.is_set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transcription
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTranscription:
    """Tests for ``DictationEngine._transcribe`` across backends."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_deps(self):
        """Skip every test in this class when numpy cannot be imported."""
        try:
            import numpy  # noqa: F401
        except ImportError:
            pytest.skip("numpy not installed")

    @staticmethod
    def _faster_whisper_engine(model):
        """Return an engine whose faster-whisper model reference yields *model*."""
        return _make_engine(
            whisper_model_ref=lambda: model,
            whisper_backend_ref=lambda: "faster-whisper",
        )

    def test_transcribe_faster_whisper(self):
        """Segment text is returned with surrounding whitespace removed."""
        import numpy as np

        segment = MagicMock()
        segment.text = " hello world "
        model = MagicMock()
        model.transcribe.return_value = ([segment], MagicMock())

        engine = self._faster_whisper_engine(model)
        samples = np.zeros(16000, dtype=np.float32)

        assert engine._transcribe(samples) == "hello world"

    def test_transcribe_empty_returns_empty(self):
        """No segments from the model produces an empty string."""
        import numpy as np

        model = MagicMock()
        model.transcribe.return_value = ([], MagicMock())

        engine = self._faster_whisper_engine(model)
        samples = np.zeros(16000, dtype=np.float32)

        assert engine._transcribe(samples) == ""

    def test_transcribe_no_model_returns_empty(self):
        """A missing faster-whisper model produces an empty string."""
        import numpy as np

        engine = self._faster_whisper_engine(None)
        samples = np.zeros(16000, dtype=np.float32)

        assert engine._transcribe(samples) == ""

    def test_transcribe_mlx(self):
        """The MLX backend returns the ``text`` field of the mlx_whisper result."""
        import sys
        import numpy as np

        fake_mlx = MagicMock()
        fake_mlx.transcribe.return_value = {"text": "hello from mlx"}

        # Patch sys.modules so `import mlx_whisper` inside the method resolves
        with patch.dict(sys.modules, {"mlx_whisper": fake_mlx}):
            engine = _make_engine(
                whisper_model_ref=lambda: None,
                whisper_backend_ref=lambda: "mlx",
                mlx_repo_ref=lambda: "mlx-community/whisper-small-mlx",
            )
            samples = np.zeros(16000, dtype=np.float32)
            text = engine._transcribe(samples)
            assert text == "hello from mlx"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Clipboard helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClipboard:
    """Tests for clipboard/paste helper functions.

    Every test patches the platform clipboard writers and the keyboard
    module so that no test can touch the real clipboard or emit a real
    keystroke.
    """

    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine._clipboard_windows")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_clipboard_paste_windows(self, mock_kb, mock_clip_win, mock_platform):
        """On Windows the text is handed to ``_clipboard_windows`` exactly once."""
        from src.jarvis.dictation.dictation_engine import _clipboard_paste

        mock_platform.system.return_value = "Windows"
        mock_ctrl = MagicMock()
        mock_kb.Controller.return_value = mock_ctrl
        mock_kb.Key.ctrl = MagicMock()

        _clipboard_paste("hello")
        mock_clip_win.assert_called_once_with("hello")

    @patch("src.jarvis.dictation.dictation_engine._paste_cgevent", return_value=True)
    @patch("src.jarvis.dictation.dictation_engine._check_macos_accessibility", return_value=True)
    @patch("src.jarvis.dictation.dictation_engine.platform")
    @patch("src.jarvis.dictation.dictation_engine._clipboard_macos")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_clipboard_paste_macos(
        self, mock_kb, mock_clip_mac, mock_platform, mock_ax, mock_cge
    ):
        """On macOS the text goes to ``_clipboard_macos`` and the paste helper runs once."""
        from src.jarvis.dictation.dictation_engine import _clipboard_paste

        mock_platform.system.return_value = "Darwin"
        mock_ctrl = MagicMock()
        mock_kb.Controller.return_value = mock_ctrl
        mock_kb.Key.cmd = MagicMock()

        _clipboard_paste("hello mac")
        mock_clip_mac.assert_called_once_with("hello mac")
        # Guard: the *real* CGEvent paste and Accessibility check are replaced
        # by mocks above — unpatched, they would emit a real Cmd+V into
        # whatever window has focus and pop open System Settings. Here we
        # confirm the (mocked) paste step was invoked exactly once.
        mock_cge.assert_called_once()

    @patch("src.jarvis.dictation.dictation_engine._paste_cgevent")
    @patch("src.jarvis.dictation.dictation_engine._clipboard_macos")
    @patch("src.jarvis.dictation.dictation_engine._clipboard_windows")
    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
    def test_clipboard_paste_empty_string_is_noop(
        self, mock_kb, mock_clip_win, mock_clip_mac, mock_cge
    ):
        """Empty or ``None`` text must return without touching the clipboard.

        The clipboard writers, the CGEvent paste helper and the keyboard
        module are patched so that, if the early-return guard ever regresses,
        the test fails instead of silently writing to the real clipboard or
        sending a real paste keystroke.
        """
        from src.jarvis.dictation.dictation_engine import _clipboard_paste

        # Should return immediately without error
        for empty_text in ("", None):
            _clipboard_paste(empty_text)

        mock_clip_win.assert_not_called()
        mock_clip_mac.assert_not_called()
        mock_cge.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Audio callback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAudioCallback:
    """Tests for how ``_audio_callback`` buffers incoming audio blocks."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_numpy(self):
        """Skip every test in this class when numpy cannot be imported."""
        try:
            import numpy  # noqa: F401
        except ImportError:
            pytest.skip("numpy not installed")

    def test_callback_accumulates_frames(self):
        """While recording, one callback appends one 1600-sample frame."""
        import numpy as np

        engine = _make_engine()
        engine._recording = True
        engine._audio_frames = []
        engine._max_frames = 1_000_000

        block = np.random.randn(1600, 1).astype(np.float32)
        engine._audio_callback(block, 1600, None, None)

        buffered = engine._audio_frames
        assert len(buffered) == 1
        assert len(buffered[0]) == 1600

    def test_callback_ignores_when_not_recording(self):
        """When not recording, incoming blocks are dropped."""
        import numpy as np

        engine = _make_engine()
        engine._recording = False
        engine._audio_frames = []

        block = np.random.randn(1600, 1).astype(np.float32)
        engine._audio_callback(block, 1600, None, None)

        assert len(engine._audio_frames) == 0

    def test_callback_respects_max_duration(self):
        """Once the frame cap is reached, further blocks are not buffered."""
        import numpy as np

        engine = _make_engine()
        engine._recording = True
        # Buffer already holds as many samples as the cap allows.
        engine._max_frames = 100
        engine._audio_frames = [np.zeros(100, dtype=np.float32)]

        block = np.random.randn(1600, 1).astype(np.float32)
        with patch.object(engine, "_stop_recording"):
            engine._audio_callback(block, 1600, None, None)

        # Should not accumulate more frames
        assert len(engine._audio_frames) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Transcribe-and-paste pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTranscribeAndPaste:
    """Tests for the full transcribe → paste pipeline."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_numpy(self):
        """Skip every test in this class when numpy cannot be imported."""
        try:
            import numpy  # noqa: F401
        except ImportError:
            pytest.skip("numpy not installed")

    @staticmethod
    def _engine_with_end_flag():
        """Return a default engine plus an Event set by its end callback."""
        engine = _make_engine()
        finished = threading.Event()
        engine._on_dictation_end = lambda: finished.set()
        return engine, finished

    def test_short_audio_skipped(self):
        """Audio shorter than 0.3s should be skipped."""
        import numpy as np

        engine, finished = self._engine_with_end_flag()

        # 0.1s of audio at 16kHz = 1600 samples (< 4800 needed for 0.3s)
        engine._transcribe_and_paste([np.zeros(1600, dtype=np.float32)])
        assert finished.is_set()

    def test_empty_frames_handled(self):
        """An empty frame list still fires the end callback."""
        engine, finished = self._engine_with_end_flag()

        engine._transcribe_and_paste([])
        assert finished.is_set()

    @patch("src.jarvis.dictation.dictation_engine._clipboard_paste")
    def test_successful_transcription_pastes(self, paste_spy):
        """Transcribed text is forwarded to ``_clipboard_paste``."""
        import numpy as np

        segment = MagicMock()
        segment.text = "hello world"
        model = MagicMock()
        model.transcribe.return_value = ([segment], MagicMock())

        engine = _make_engine(
            whisper_model_ref=lambda: model,
            whisper_backend_ref=lambda: "faster-whisper",
        )

        half_second = [np.zeros(8000, dtype=np.float32)]  # 0.5s
        engine._transcribe_and_paste(half_second)
        paste_spy.assert_called_once_with("hello world")

    @patch("src.jarvis.dictation.dictation_engine._clipboard_paste")
    def test_empty_transcription_does_not_paste(self, paste_spy):
        """No segments means nothing is pasted."""
        import numpy as np

        model = MagicMock()
        model.transcribe.return_value = ([], MagicMock())

        engine = _make_engine(
            whisper_model_ref=lambda: model,
            whisper_backend_ref=lambda: "faster-whisper",
        )

        half_second = [np.zeros(8000, dtype=np.float32)]
        engine._transcribe_and_paste(half_second)
        paste_spy.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config integration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestConfigIntegration:
    """Checks that the configuration layer exposes the dictation settings."""

    def test_settings_has_dictation_fields(self):
        """``Settings`` accepts both dictation parameters."""
        import inspect
        from src.jarvis.config import Settings

        params = inspect.signature(Settings).parameters
        assert "dictation_enabled" in params
        assert "dictation_hotkey" in params

    def test_default_config_has_dictation(self):
        """Defaults enable dictation and pick a per-platform hotkey."""
        import sys
        from src.jarvis.config import get_default_config

        config = get_default_config()
        assert config["dictation_enabled"] is True
        # Platform-aware default (aligned with WisprFlow)
        hotkey = config["dictation_hotkey"]
        if sys.platform == "win32":
            assert hotkey == "ctrl+cmd"
        else:
            assert hotkey == "ctrl+alt"

    def test_load_settings_includes_dictation(self):
        """load_settings should produce Settings with dictation fields."""
        from src.jarvis.config import load_settings

        loaded = load_settings()
        assert hasattr(loaded, "dictation_enabled")
        assert hasattr(loaded, "dictation_hotkey")
        assert isinstance(loaded.dictation_enabled, bool)
        assert isinstance(loaded.dictation_hotkey, str)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Face widget DICTATING state
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFaceWidgetDictatingState:
    """Checks the dictation-related members of ``JarvisState``."""

    def test_jarvis_state_has_dictating(self):
        """``DICTATING`` exists and its value is ``"dictating"``."""
        from src.desktop_app.face_widget import JarvisState

        assert hasattr(JarvisState, "DICTATING")
        member = JarvisState.DICTATING
        assert member.value == "dictating"

    def test_dictating_state_round_trips(self):
        """State manager should accept DICTATING state."""
        from src.desktop_app.face_widget import JarvisState

        looked_up = JarvisState("dictating")
        assert looked_up == JarvisState.DICTATING

    def test_jarvis_state_has_dictation_processing(self):
        """``DICTATION_PROCESSING`` exists with value ``"dictation_processing"``."""
        from src.desktop_app.face_widget import JarvisState

        assert hasattr(JarvisState, "DICTATION_PROCESSING")
        member = JarvisState.DICTATION_PROCESSING
        assert member.value == "dictation_processing"

    def test_dictation_processing_state_round_trips(self):
        """Looking the state up by value returns ``DICTATION_PROCESSING``."""
        from src.desktop_app.face_widget import JarvisState

        looked_up = JarvisState("dictation_processing")
        assert looked_up == JarvisState.DICTATION_PROCESSING
|
||||
|
||||
|
||||
class TestDictationProcessingCallback:
    """Verifies the processing callback fires between recording stop and
    transcription, so the face can switch to a distinct 'processing' state
    once the user's voice input has been accepted."""

    def test_processing_callback_fires_before_end_callback(self):
        """The processing callback must be recorded before the end callback
        when _finalise_and_transcribe runs with no audio frames."""
        from src.jarvis.dictation import dictation_engine as de

        call_order = []

        def record_processing():
            call_order.append("processing")

        def record_end():
            call_order.append("end")

        engine = _make_engine(
            on_dictation_processing_start=record_processing,
            on_dictation_end=record_end,
        )

        # Only stream teardown and the beep are stubbed; the engine's own
        # _transcribe_and_paste is left in place so the end callback is
        # reached through the real call chain.
        with patch.object(de, "_close_stream"):
            with patch.object(de, "_play_beep"):
                engine._finalise_and_transcribe(
                    stream=MagicMock(),
                    audio_frames=[],
                    start_time=time.time(),
                )

        assert call_order == ["processing", "end"]

    def test_processing_callback_optional(self):
        """Engine must work when no processing callback is supplied."""
        from src.jarvis.dictation import dictation_engine as de

        engine = _make_engine(on_dictation_processing_start=None)

        with patch.object(de, "_close_stream"):
            with patch.object(de, "_play_beep"):
                with patch.object(engine, "_transcribe_and_paste"):
                    # Passing means this call does not raise.
                    engine._finalise_and_transcribe(
                        stream=MagicMock(),
                        audio_frames=[],
                        start_time=time.time(),
                    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Thread safety
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestThreadSafety:
    """Tests for thread-safe transcription locking."""

    @pytest.fixture(autouse=True)
    def _skip_if_no_numpy(self):
        """Skip every test in this class when numpy cannot be imported."""
        try:
            import numpy  # noqa: F401
        except ImportError:
            pytest.skip("numpy not installed")

    def test_transcribe_acquires_lock(self):
        """Transcription should acquire the shared lock.

        The lock is held by the test while a worker thread calls
        ``engine._transcribe``; the worker must not finish until the lock
        is released.
        """
        import numpy as np

        lock = threading.Lock()
        mock_model = MagicMock()
        mock_model.transcribe.return_value = ([], MagicMock())

        engine = _make_engine(
            whisper_model_ref=lambda: mock_model,
            whisper_backend_ref=lambda: "faster-whisper",
            transcribe_lock=lock,
        )

        result_holder = [None]
        done = threading.Event()

        def do_transcribe():
            result_holder[0] = engine._transcribe(np.zeros(16000, dtype=np.float32))
            done.set()

        # Daemon thread: if the worker never gets past the lock, it must not
        # keep the interpreter alive after the test has already failed.
        worker = threading.Thread(target=do_transcribe, daemon=True)

        # Acquire the lock externally — transcribe should block
        lock.acquire()
        try:
            worker.start()

            # Give thread a moment — it should be blocked
            time.sleep(0.1)
            assert not done.is_set()
        finally:
            # Always release, even when the assertion above fails, so the
            # worker is never left blocked on a lock nobody will free.
            lock.release()

        # Release happened above — thread should complete
        assert done.wait(timeout=2.0)
        assert result_holder[0] == ""
        worker.join(timeout=1.0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Listener pause flag
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestListenerPauseFlag:
    """Tests for the dictation pause flag on VoiceListener."""

    @pytest.fixture()
    def listener(self):
        """Build a VoiceListener whose collaborators are all mocks."""
        from src.jarvis.listening.listener import VoiceListener

        # Only these config attributes are pinned to concrete values; every
        # other attribute falls back to MagicMock's auto-created children.
        cfg = MagicMock(
            sample_rate=16000,
            vad_enabled=False,
            wake_aliases=[],
            stop_commands=["stop"],
        )
        return VoiceListener(MagicMock(), cfg, MagicMock(), MagicMock())

    def test_voice_listener_has_dictation_active_flag(self, listener):
        """A fresh VoiceListener has _dictation_active set to False."""
        assert hasattr(listener, "_dictation_active")
        flag = listener._dictation_active
        assert flag is False

    def test_voice_listener_has_transcribe_lock(self, listener):
        """A fresh VoiceListener exposes transcribe_lock as a threading lock."""
        assert hasattr(listener, "transcribe_lock")
        # threading.Lock is a factory function, so the concrete lock type is
        # taken from an instance.
        lock_type = type(threading.Lock())
        assert isinstance(listener.transcribe_lock, lock_type)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# format_hotkey_display
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFormatHotkeyDisplay:
    """Tests for platform-aware hotkey display formatting.

    Each test replaces the ``platform`` module seen by ``dictation_engine``
    with a mock and sets what ``platform.system()`` reports.
    """

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_windows_cmd_shows_win(self, mock_platform):
        """On Windows, "cmd" is displayed as "Win"."""
        mock_platform.system.return_value = "Windows"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+cmd")
        assert rendered == "Ctrl + Win"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_windows_super_shows_win(self, mock_platform):
        """On Windows, "super" is displayed as "Win"."""
        mock_platform.system.return_value = "Windows"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+super")
        assert rendered == "Ctrl + Win"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_windows_win_shows_win(self, mock_platform):
        """On Windows, "win" is displayed as "Win"."""
        mock_platform.system.return_value = "Windows"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+win")
        assert rendered == "Ctrl + Win"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_macos_cmd_shows_cmd(self, mock_platform):
        """On Darwin, "cmd" is displayed as "Cmd"."""
        mock_platform.system.return_value = "Darwin"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+cmd")
        assert rendered == "Ctrl + Cmd"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_macos_alt_shows_option(self, mock_platform):
        """On Darwin, "alt" is displayed as "Option"."""
        mock_platform.system.return_value = "Darwin"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+alt")
        assert rendered == "Ctrl + Option"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_ctrl_shift_d(self, mock_platform):
        """A three-part hotkey is capitalised and joined with " + "."""
        mock_platform.system.return_value = "Windows"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+shift+d")
        assert rendered == "Ctrl + Shift + D"

    @patch("src.jarvis.dictation.dictation_engine.platform")
    def test_linux_alt_stays_alt(self, mock_platform):
        """On Linux, "alt" is displayed as "Alt"."""
        mock_platform.system.return_value = "Linux"
        from src.jarvis.dictation.dictation_engine import format_hotkey_display

        rendered = format_hotkey_display("ctrl+alt")
        assert rendered == "Ctrl + Alt"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _clipboard_windows ctypes correctness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClipboardWindowsCtypes:
    """Verify _clipboard_windows sets proper ctypes return types."""

    @pytest.mark.skipif(
        __import__("sys").platform != "win32",
        reason="Windows-only clipboard API",
    )
    def test_clipboard_windows_roundtrip(self):
        """Write to clipboard and read back to verify ctypes bindings.

        The text is written with ``_clipboard_windows`` and then read back
        directly through user32/kernel32 with explicit argtypes/restype
        declared for every call whose result is used.
        """
        import ctypes
        from ctypes import wintypes
        from src.jarvis.dictation.dictation_engine import _clipboard_windows

        test_text = "dictation test 🎙️"
        _clipboard_windows(test_text)

        # Read back from clipboard
        user32 = ctypes.windll.user32
        kernel32 = ctypes.windll.kernel32
        user32.OpenClipboard.argtypes = [wintypes.HWND]
        user32.OpenClipboard.restype = wintypes.BOOL
        user32.GetClipboardData.argtypes = [wintypes.UINT]
        user32.GetClipboardData.restype = wintypes.HANDLE
        user32.CloseClipboard.restype = wintypes.BOOL
        kernel32.GlobalLock.argtypes = [wintypes.HANDLE]
        kernel32.GlobalLock.restype = ctypes.c_void_p
        kernel32.GlobalUnlock.argtypes = [wintypes.HANDLE]
        kernel32.GlobalUnlock.restype = wintypes.BOOL

        CF_UNICODETEXT = 13
        assert user32.OpenClipboard(None)
        try:
            h = user32.GetClipboardData(CF_UNICODETEXT)
            assert h, "GetClipboardData returned NULL"
            ptr = kernel32.GlobalLock(h)
            assert ptr, "GlobalLock returned NULL"
            try:
                result = ctypes.wstring_at(ptr)
            finally:
                # Unlock even if reading the string raises, so the handle
                # is not left locked when the test fails.
                kernel32.GlobalUnlock(h)
            assert result == test_text
        finally:
            user32.CloseClipboard()
|
||||
Reference in New Issue
Block a user