Add Discord-native hybrid front-end for Jarvis (bot + bridge)

Transform isair/jarvis into a Discord-controlled voice assistant running on the Ubuntu VNC desktop, keeping the mature ~39k-line Python brain intact.

- bot/ (Node + bun, discord.js): /자비스 slash commands (ephemeral), voice channel join + voice receive/playback, pluggable VNC screen broadcast (selfbot live / noVNC / screenshot)
- bridge/ (Python, Flask): wraps jarvis STT + run_reply_engine + Piper TTS behind a thin localhost HTTP API
- .env.example, scripts/ (start_bridge/start_bot/dev), README rewrite, docs/language-comparison.md and docs/vnc-xfce-setup.md

Language decision: hybrid (Python brain + Node/bun Discord layer) because Discord blocks bot video; native screen broadcast only works via a Node selfbot library.
2026-06-09 14:51:05 +09:00
parent a5bf8d1826
commit c4abf63f38
308 changed files with 94135 additions and 1 deletions
--- a/tests/test_dictation.py
+++ b/tests/test_dictation.py
@@ -0,0 +1,974 @@
+"""
+Tests for the dictation engine (hold-to-dictate feature).
+"""
+
+import threading
+import time
+from unittest.mock import patch, MagicMock, PropertyMock
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_engine(**overrides):
+    """Build a ``DictationEngine`` wired with test doubles.
+
+    Any keyword argument supplied by the caller replaces (or extends) the
+    baseline constructor arguments listed below.
+    """
+    from src.jarvis.dictation.dictation_engine import DictationEngine
+
+    engine_kwargs = {
+        "whisper_model_ref": lambda: MagicMock(),
+        "whisper_backend_ref": lambda: "faster-whisper",
+        "mlx_repo_ref": lambda: None,
+        "hotkey": "ctrl+shift+d",
+        "sample_rate": 16000,
+        "on_dictation_start": None,
+        "on_dictation_end": None,
+        "transcribe_lock": threading.Lock(),
+        # Caller-supplied values win over the baseline above.
+        **overrides,
+    }
+    return DictationEngine(**engine_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Beep generation
+# ---------------------------------------------------------------------------
+
+class TestBeepGeneration:
+    """Checks that the generated beep payloads look like WAV files."""
+
+    @staticmethod
+    def _assert_riff_wave(payload):
+        """Assert the RIFF / WAVE markers sit at their expected byte offsets."""
+        assert payload[:4] == b"RIFF"
+        assert payload[8:12] == b"WAVE"
+
+    def test_start_beep_is_valid_wav(self):
+        from src.jarvis.dictation.dictation_engine import _get_start_beep
+        self._assert_riff_wave(_get_start_beep())
+
+    def test_stop_beep_is_valid_wav(self):
+        from src.jarvis.dictation.dictation_engine import _get_stop_beep
+        self._assert_riff_wave(_get_stop_beep())
+
+    def test_start_and_stop_beeps_differ(self):
+        from src.jarvis.dictation.dictation_engine import _get_start_beep, _get_stop_beep
+        start_payload = _get_start_beep()
+        stop_payload = _get_stop_beep()
+        assert start_payload != stop_payload
+
+    def test_generate_beep_wav_custom_params(self):
+        from src.jarvis.dictation.dictation_engine import _generate_beep_wav
+        payload = _generate_beep_wav(freq=1000, duration=0.05)
+        assert payload[:4] == b"RIFF"
+        # More than 44 bytes, i.e. the payload is not just a bare header.
+        assert len(payload) > 44
+
+
+# ---------------------------------------------------------------------------
+# Hotkey parsing
+# ---------------------------------------------------------------------------
+
+class TestHotkeyParsing:
+    """Covers ``parse_hotkey``: hotkey string in, (modifiers, trigger) out."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_pynput(self):
+        # Every test in this class is skipped when pynput cannot be imported.
+        try:
+            import pynput  # noqa: F401
+        except ImportError:
+            pytest.skip("pynput not installed")
+
+    @staticmethod
+    def _parser():
+        """Import ``parse_hotkey`` lazily (after the skip check) and return it."""
+        from src.jarvis.dictation.dictation_engine import parse_hotkey
+        return parse_hotkey
+
+    def test_parse_ctrl_shift_d(self):
+        modifiers, trigger_key = self._parser()("ctrl+shift+d")
+        assert len(modifiers) == 2
+        assert trigger_key is not None
+
+    def test_parse_modifier_only_combo(self):
+        """A hotkey made only of modifiers (e.g. 'ctrl+cmd') is accepted."""
+        modifiers, trigger_key = self._parser()("ctrl+cmd")
+        assert len(modifiers) == 2
+        assert trigger_key is None
+
+    def test_parse_ctrl_alt(self):
+        """The macOS/Linux default 'ctrl+alt' yields two modifiers, no trigger."""
+        modifiers, trigger_key = self._parser()("ctrl+alt")
+        assert len(modifiers) == 2
+        assert trigger_key is None
+
+    def test_parse_ctrl_win(self):
+        """The 'win' alias must resolve to the same modifiers as 'cmd'."""
+        parse = self._parser()
+        win_modifiers, win_trigger = parse("ctrl+win")
+        cmd_modifiers, cmd_trigger = parse("ctrl+cmd")
+        assert win_modifiers == cmd_modifiers
+        assert win_trigger is None
+        assert cmd_trigger is None
+
+    def test_parse_empty_string_raises(self):
+        parse = self._parser()
+        with pytest.raises(ValueError):
+            parse("")
+
+    def test_parse_unknown_key_raises(self):
+        parse = self._parser()
+        with pytest.raises(ValueError):
+            parse("ctrl+nonexistentkey")
+
+    def test_parse_alt_modifier(self):
+        modifiers, trigger_key = self._parser()("alt+x")
+        assert len(modifiers) == 1
+        assert trigger_key is not None
+
+    def test_parse_single_letter(self):
+        """A lone letter parses as a trigger with an empty modifier set."""
+        modifiers, trigger_key = self._parser()("f")
+        assert len(modifiers) == 0
+        assert trigger_key is not None
+
+
+# ---------------------------------------------------------------------------
+# Engine lifecycle
+# ---------------------------------------------------------------------------
+
+class TestEngineLifecycle:
+    """Tests for DictationEngine start/stop behaviour."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_deps(self):
+        # Skip the whole class when the optional runtime dependencies are absent.
+        try:
+            import pynput  # noqa: F401
+            import sounddevice  # noqa: F401
+        except ImportError:
+            pytest.skip("pynput or sounddevice not installed")
+
+    @staticmethod
+    def _stub_keyboard(mock_kb, with_listener=False):
+        """Populate the mocked pynput keyboard module with stand-in attributes.
+
+        When *with_listener* is true, ``mock_kb.Listener`` is primed to return
+        a fresh mock, which is handed back so the test can assert on it.
+        Otherwise ``None`` is returned and ``Listener`` is left untouched.
+        """
+        listener = None
+        if with_listener:
+            listener = MagicMock()
+            mock_kb.Listener.return_value = listener
+        mock_kb.Key = MagicMock()
+        mock_kb.KeyCode = MagicMock()
+        mock_kb.Key.ctrl_l = MagicMock()
+        mock_kb.Key.shift = MagicMock()
+        return listener
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine.sys")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_start_creates_listener(self, mock_kb, mock_sys, mock_platform):
+        # Force a platform where pynput is allowed (avoid macOS 26+ guard)
+        mock_sys.platform = "linux"
+        mock_listener_instance = self._stub_keyboard(mock_kb, with_listener=True)
+
+        engine = _make_engine()
+        engine.start()
+
+        assert engine._started is True
+        mock_listener_instance.start.assert_called_once()
+
+        engine.stop()
+        assert engine._started is False
+
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard", None)
+    def test_start_without_pynput_is_noop(self):
+        """Engine should gracefully skip when pynput is missing."""
+        from src.jarvis.dictation.dictation_engine import DictationEngine
+        # We can't use _make_engine because parse_hotkey needs pynput.
+        # Directly test the start() guard on a bare instance instead.
+        engine = DictationEngine.__new__(DictationEngine)
+        engine._started = False
+        engine._listener = None
+        engine._recording = False
+        engine.start()
+        assert engine._started is False
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine.sys")
+    @patch("src.jarvis.dictation.dictation_engine.sd", None)
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_start_without_sounddevice_is_noop(self, mock_kb, mock_sys, mock_platform):
+        """Engine should gracefully skip when sounddevice is missing."""
+        # Pin the platform like the sibling tests do.  Without this, a
+        # macOS 26+ host could make start() bail out via the macOS guard and
+        # the test would pass without exercising the sounddevice check.
+        mock_sys.platform = "linux"
+        self._stub_keyboard(mock_kb)
+
+        engine = _make_engine()
+        engine.start()
+        assert engine._started is False
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine.sys")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_start_skips_on_macos_26(self, mock_kb, mock_sys, mock_platform):
+        """pynput crashes on macOS 26+ (TSM thread assertion). Engine must skip."""
+        mock_sys.platform = "darwin"
+        mock_platform.mac_ver.return_value = ("26.2", ("", "", ""), "")
+        self._stub_keyboard(mock_kb)
+
+        engine = _make_engine()
+        engine.start()
+        assert engine._started is False
+        mock_kb.Listener.assert_not_called()
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine.sys")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_start_allowed_on_macos_15(self, mock_kb, mock_sys, mock_platform):
+        """pynput should still work on macOS 15 (Sequoia) and earlier."""
+        mock_sys.platform = "darwin"
+        mock_platform.mac_ver.return_value = ("15.4", ("", "", ""), "")
+        mock_listener = self._stub_keyboard(mock_kb, with_listener=True)
+
+        engine = _make_engine()
+        engine.start()
+        assert engine._started is True
+        mock_listener.start.assert_called_once()
+        engine.stop()
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine.sys")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_start_allowed_on_windows(self, mock_kb, mock_sys, mock_platform):
+        """Windows should not be affected by the macOS guard."""
+        mock_sys.platform = "win32"
+        mock_listener = self._stub_keyboard(mock_kb, with_listener=True)
+
+        engine = _make_engine()
+        engine.start()
+        assert engine._started is True
+        mock_listener.start.assert_called_once()
+        engine.stop()
+
+
+# ---------------------------------------------------------------------------
+# Recording state machine
+# ---------------------------------------------------------------------------
+
+class TestRecordingStateMachine:
+    """Tests for the recording start/stop logic."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_deps(self):
+        # Skip the whole class when any optional dependency is missing.
+        try:
+            import pynput  # noqa: F401
+            import sounddevice  # noqa: F401
+            import numpy  # noqa: F401
+        except ImportError:
+            pytest.skip("required dependencies not installed")
+
+    @staticmethod
+    def _wait_until(predicate, timeout=5.0, interval=0.05):
+        """Poll *predicate* until it is truthy or *timeout* seconds elapse.
+
+        The deadline is tracked on the monotonic clock so a wall-clock
+        adjustment during the test cannot shorten or extend the wait.
+        Returns the final truthiness of *predicate*.
+        """
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            if predicate():
+                return True
+            time.sleep(interval)
+        return bool(predicate())
+
+    def test_start_recording_checks_whisper_model(self):
+        """Should not start recording if Whisper model is None (non-mlx)."""
+        engine = _make_engine(whisper_model_ref=lambda: None)
+        engine._start_recording()
+        assert engine._recording is False
+
+    def test_start_recording_allows_mlx_without_model(self):
+        """MLX backend uses repo reference, not model object."""
+        engine = _make_engine(
+            whisper_model_ref=lambda: None,
+            whisper_backend_ref=lambda: "mlx",
+            mlx_repo_ref=lambda: "mlx-community/whisper-small-mlx",
+        )
+        with patch("src.jarvis.dictation.dictation_engine.sd") as mock_sd, \
+             patch("src.jarvis.dictation.dictation_engine._play_beep"):
+            mock_stream = MagicMock()
+            mock_sd.InputStream.return_value = mock_stream
+            engine._start_recording()
+            assert engine._recording is True
+            # Cleanup
+            engine._stop_recording(discard=True)
+
+    def test_stop_recording_discard_clears_frames(self):
+        """Discarding a recording drops buffered audio and resets the flag."""
+        engine = _make_engine()
+        engine._recording = True
+        engine._audio_frames = [MagicMock()]
+        engine._stream = MagicMock()
+        engine._stop_recording(discard=True)
+        assert engine._audio_frames == []
+        assert engine._recording is False
+
+    def test_stop_recording_returns_fast_on_slow_stream_close(self):
+        """The non-discard path must not block the caller on stream.close().
+
+        Rationale: ``_stop_recording`` is invoked from the pynput low-level
+        keyboard hook callback.  Windows silently removes low-level keyboard
+        hooks that take more than ~5 s to return, which leaves pynput in an
+        inconsistent state that can crash the process when the paste thread
+        subsequently calls Controller.press/tap/release (issue #184).
+
+        The listener callback must return in a handful of milliseconds even
+        if closing the audio device is slow.
+        """
+        import numpy as np
+        slow_stream = MagicMock()
+
+        def slow_close(*_args, **_kwargs):
+            time.sleep(1.0)
+
+        slow_stream.stop.side_effect = slow_close
+        slow_stream.close.side_effect = slow_close
+
+        engine = _make_engine()
+        engine._recording = True
+        engine._stream = slow_stream
+        # Short (< 0.3 s) audio so transcribe_and_paste exits quickly.
+        engine._audio_frames = [np.zeros(1600, dtype=np.float32)]
+
+        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
+            # perf_counter is immune to wall-clock jumps, which would
+            # otherwise make this latency assertion flaky.
+            t0 = time.perf_counter()
+            engine._stop_recording()
+            elapsed = time.perf_counter() - t0
+
+        # The caller (simulating the pynput hook) must return quickly.
+        # 200 ms is generous headroom vs. the ~5 s Windows LowLevelHooksTimeout
+        # — the method should actually return in microseconds, since it just
+        # flips a bool and spawns a daemon thread.
+        assert elapsed < 0.2, (
+            f"_stop_recording blocked for {elapsed:.2f}s in the listener "
+            "thread — stream.close() must be off the hot path"
+        )
+
+        # The stream must still be closed eventually, off-thread.
+        assert self._wait_until(lambda: slow_stream.close.called), (
+            "stream.close() never ran"
+        )
+
+    def test_stop_recording_idempotent_under_concurrent_calls(self):
+        """Rapid double-release of the hotkey must not double-close the stream.
+
+        On Windows ``ctrl+cmd`` the user releases two keys in quick succession;
+        both releases can fire the listener callback before either has finished.
+        Only one teardown should reach the stream.
+        """
+        import numpy as np
+        engine = _make_engine()
+        engine._recording = True
+        stream_mock = MagicMock()
+        engine._stream = stream_mock
+        engine._audio_frames = [np.zeros(1600, dtype=np.float32)]
+
+        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
+            # Two near-simultaneous calls from the listener.
+            t1 = threading.Thread(target=engine._stop_recording)
+            t2 = threading.Thread(target=engine._stop_recording)
+            t1.start()
+            t2.start()
+            t1.join()
+            t2.join()
+
+            # Wait for the spawned teardown thread to run close().
+            self._wait_until(lambda: stream_mock.close.called)
+            # Settle period: a buggy duplicate teardown thread may still be
+            # on its way to close(); asserting the instant the first call
+            # lands would let that regression slip through.
+            time.sleep(0.2)
+
+        # Only one of the two calls should have reached the stream.
+        assert stream_mock.close.call_count == 1
+
+    def test_max_duration_callback_still_stops_recording(self):
+        """Hitting the 60s cap must still close the stream and fire the end
+        callback, even though the new teardown path runs off-thread.
+
+        ``_audio_callback`` spawns a daemon thread that calls
+        ``_stop_recording()``; that then dispatches ``_finalise_and_transcribe``
+        which closes the stream and eventually invokes ``_on_dictation_end``
+        (via ``_transcribe_and_paste``'s finally).
+        """
+        import numpy as np
+        end_called = threading.Event()
+        engine = _make_engine(
+            on_dictation_end=lambda: end_called.set(),
+            whisper_model_ref=lambda: None,  # short-circuits transcribe
+            whisper_backend_ref=lambda: "faster-whisper",
+        )
+        stream_mock = MagicMock()
+        engine._recording = True
+        engine._stream = stream_mock
+        # Pre-fill up to the limit so one more frame triggers the cap.
+        engine._max_frames = 100
+        engine._audio_frames = [np.zeros(100, dtype=np.float32)]
+
+        with patch("src.jarvis.dictation.dictation_engine._play_beep"):
+            indata = np.random.randn(1600, 1).astype(np.float32)
+            engine._audio_callback(indata, 1600, None, None)
+            # _stop_recording runs in a daemon thread; wait for close().
+            assert end_called.wait(timeout=5.0), "on_dictation_end never fired"
+
+        assert stream_mock.close.called, "stream.close() never ran"
+        assert engine._recording is False
+
+    def test_finalise_fires_on_dictation_end_when_beep_raises(self):
+        """A failure in ``_play_beep`` must not strand the listener paused.
+
+        ``_on_dictation_end`` is normally fired from
+        ``_transcribe_and_paste``'s finally, but that step is never reached
+        if ``_close_stream`` or ``_play_beep`` raises.  ``_finalise_and_transcribe``
+        must therefore guarantee the callback fires on any error.
+        """
+        import numpy as np
+        end_called = threading.Event()
+        engine = _make_engine(on_dictation_end=lambda: end_called.set())
+
+        with patch(
+            "src.jarvis.dictation.dictation_engine._play_beep",
+            side_effect=RuntimeError("beep broken"),
+        ):
+            engine._finalise_and_transcribe(
+                stream=None,
+                audio_frames=[np.zeros(1600, dtype=np.float32)],
+                # Wall-clock on purpose: this value is handed to the engine,
+                # so it stays in whatever time base the engine expects.
+                start_time=time.time(),
+            )
+
+        assert end_called.is_set(), (
+            "_on_dictation_end must fire even when _play_beep raises"
+        )
+
+    def test_on_dictation_callbacks_called(self):
+        """Start/end callbacks should be invoked."""
+        start_called = threading.Event()
+        end_called = threading.Event()
+
+        engine = _make_engine(
+            on_dictation_start=lambda: start_called.set(),
+            on_dictation_end=lambda: end_called.set(),
+        )
+
+        with patch("src.jarvis.dictation.dictation_engine.sd") as mock_sd, \
+             patch("src.jarvis.dictation.dictation_engine._play_beep"):
+            mock_stream = MagicMock()
+            mock_sd.InputStream.return_value = mock_stream
+            engine._start_recording()
+            assert start_called.is_set()
+
+            engine._stop_recording(discard=True)
+            assert end_called.is_set()
+
+
+# ---------------------------------------------------------------------------
+# Transcription
+# ---------------------------------------------------------------------------
+
+class TestTranscription:
+    """Exercises ``DictationEngine._transcribe`` for each backend."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_deps(self):
+        # numpy is needed to build the audio buffers fed to _transcribe.
+        try:
+            import numpy  # noqa: F401
+        except ImportError:
+            pytest.skip("numpy not installed")
+
+    @staticmethod
+    def _silence():
+        """Return 16000 float32 zeros to use as the audio input."""
+        import numpy as np
+        return np.zeros(16000, dtype=np.float32)
+
+    @staticmethod
+    def _faster_whisper_engine(model):
+        """Build an engine on the faster-whisper backend that serves *model*."""
+        return _make_engine(
+            whisper_model_ref=lambda: model,
+            whisper_backend_ref=lambda: "faster-whisper",
+        )
+
+    def test_transcribe_faster_whisper(self):
+        segment = MagicMock()
+        segment.text = " hello world "
+        model = MagicMock()
+        model.transcribe.return_value = ([segment], MagicMock())
+
+        engine = self._faster_whisper_engine(model)
+
+        # Surrounding whitespace in the segment text is expected to be stripped.
+        transcript = engine._transcribe(self._silence())
+        assert transcript == "hello world"
+
+    def test_transcribe_empty_returns_empty(self):
+        model = MagicMock()
+        model.transcribe.return_value = ([], MagicMock())
+
+        engine = self._faster_whisper_engine(model)
+
+        transcript = engine._transcribe(self._silence())
+        assert transcript == ""
+
+    def test_transcribe_no_model_returns_empty(self):
+        engine = self._faster_whisper_engine(None)
+
+        transcript = engine._transcribe(self._silence())
+        assert transcript == ""
+
+    def test_transcribe_mlx(self):
+        import sys
+        fake_mlx_whisper = MagicMock()
+        fake_mlx_whisper.transcribe.return_value = {"text": "hello from mlx"}
+
+        # Register the fake in sys.modules so that `import mlx_whisper`
+        # inside the method under test resolves to it.
+        with patch.dict(sys.modules, {"mlx_whisper": fake_mlx_whisper}):
+            engine = _make_engine(
+                whisper_model_ref=lambda: None,
+                whisper_backend_ref=lambda: "mlx",
+                mlx_repo_ref=lambda: "mlx-community/whisper-small-mlx",
+            )
+
+            transcript = engine._transcribe(self._silence())
+            assert transcript == "hello from mlx"
+
+
+# ---------------------------------------------------------------------------
+# Clipboard helpers
+# ---------------------------------------------------------------------------
+
+class TestClipboard:
+    """Covers the platform dispatch inside ``_clipboard_paste``."""
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine._clipboard_windows")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_clipboard_paste_windows(self, kb_module, set_clipboard, platform_module):
+        from src.jarvis.dictation.dictation_engine import _clipboard_paste
+        platform_module.system.return_value = "Windows"
+        kb_module.Controller.return_value = MagicMock()
+        kb_module.Key.ctrl = MagicMock()
+
+        _clipboard_paste("hello")
+        set_clipboard.assert_called_once_with("hello")
+
+    @patch("src.jarvis.dictation.dictation_engine._paste_cgevent", return_value=True)
+    @patch("src.jarvis.dictation.dictation_engine._check_macos_accessibility", return_value=True)
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    @patch("src.jarvis.dictation.dictation_engine._clipboard_macos")
+    @patch("src.jarvis.dictation.dictation_engine.pynput_keyboard")
+    def test_clipboard_paste_macos(
+        self, kb_module, set_clipboard, platform_module, accessibility_check, cgevent_paste
+    ):
+        from src.jarvis.dictation.dictation_engine import _clipboard_paste
+        platform_module.system.return_value = "Darwin"
+        kb_module.Controller.return_value = MagicMock()
+        kb_module.Key.cmd = MagicMock()
+
+        _clipboard_paste("hello mac")
+        set_clipboard.assert_called_once_with("hello mac")
+        # Guard: the real CGEvent paste and Accessibility check are patched
+        # out so they never fire during tests — they would emit a real Cmd+V
+        # into whatever window has focus and pop open System Settings.  The
+        # paste must have gone through the patched CGEvent helper exactly once.
+        cgevent_paste.assert_called_once()
+
+    def test_clipboard_paste_empty_string_is_noop(self):
+        from src.jarvis.dictation.dictation_engine import _clipboard_paste
+        # Both falsy inputs must return immediately without raising.
+        _clipboard_paste("")
+        _clipboard_paste(None)
+
+
+# ---------------------------------------------------------------------------
+# Audio callback
+# ---------------------------------------------------------------------------
+
+class TestAudioCallback:
+    """Checks how ``_audio_callback`` buffers incoming audio chunks."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_numpy(self):
+        # The callback is fed numpy arrays, so numpy is mandatory here.
+        try:
+            import numpy  # noqa: F401
+        except ImportError:
+            pytest.skip("numpy not installed")
+
+    @staticmethod
+    def _chunk():
+        """Return one random (1600, 1) float32 input block."""
+        import numpy as np
+        return np.random.randn(1600, 1).astype(np.float32)
+
+    def test_callback_accumulates_frames(self):
+        engine = _make_engine()
+        engine._recording = True
+        engine._audio_frames = []
+        engine._max_frames = 1_000_000
+
+        engine._audio_callback(self._chunk(), 1600, None, None)
+
+        buffered = engine._audio_frames
+        assert len(buffered) == 1
+        assert len(buffered[0]) == 1600
+
+    def test_callback_ignores_when_not_recording(self):
+        engine = _make_engine()
+        engine._recording = False
+        engine._audio_frames = []
+
+        engine._audio_callback(self._chunk(), 1600, None, None)
+        assert len(engine._audio_frames) == 0
+
+    def test_callback_respects_max_duration(self):
+        import numpy as np
+        engine = _make_engine()
+        engine._recording = True
+        # Start with the buffer already at the configured maximum.
+        engine._max_frames = 100
+        engine._audio_frames = [np.zeros(100, dtype=np.float32)]
+
+        chunk = self._chunk()
+        with patch.object(engine, "_stop_recording"):
+            engine._audio_callback(chunk, 1600, None, None)
+            # The extra chunk must not have been appended.
+            assert len(engine._audio_frames) == 1
+
+
+# ---------------------------------------------------------------------------
+# Transcribe-and-paste pipeline
+# ---------------------------------------------------------------------------
+
+class TestTranscribeAndPaste:
+    """Covers ``_transcribe_and_paste``: buffered frames in, paste call out."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_numpy(self):
+        try:
+            import numpy  # noqa: F401
+        except ImportError:
+            pytest.skip("numpy not installed")
+
+    @staticmethod
+    def _engine_with_end_flag():
+        """Return ``(engine, event)`` where *event* is set by the end callback."""
+        engine = _make_engine()
+        finished = threading.Event()
+        engine._on_dictation_end = lambda: finished.set()
+        return engine, finished
+
+    @staticmethod
+    def _engine_for_model(model):
+        """Build a faster-whisper engine whose model reference yields *model*."""
+        return _make_engine(
+            whisper_model_ref=lambda: model,
+            whisper_backend_ref=lambda: "faster-whisper",
+        )
+
+    def test_short_audio_skipped(self):
+        """Audio shorter than 0.3s should be skipped."""
+        import numpy as np
+        engine, finished = self._engine_with_end_flag()
+
+        # 1600 samples at 16 kHz is 0.1 s, below the 4800 needed for 0.3 s.
+        too_short = [np.zeros(1600, dtype=np.float32)]
+        engine._transcribe_and_paste(too_short)
+        assert finished.is_set()
+
+    def test_empty_frames_handled(self):
+        engine, finished = self._engine_with_end_flag()
+
+        engine._transcribe_and_paste([])
+        assert finished.is_set()
+
+    @patch("src.jarvis.dictation.dictation_engine._clipboard_paste")
+    def test_successful_transcription_pastes(self, paste_spy):
+        import numpy as np
+        segment = MagicMock()
+        segment.text = "hello world"
+        model = MagicMock()
+        model.transcribe.return_value = ([segment], MagicMock())
+
+        engine = self._engine_for_model(model)
+
+        half_second = [np.zeros(8000, dtype=np.float32)]  # 0.5s
+        engine._transcribe_and_paste(half_second)
+        paste_spy.assert_called_once_with("hello world")
+
+    @patch("src.jarvis.dictation.dictation_engine._clipboard_paste")
+    def test_empty_transcription_does_not_paste(self, paste_spy):
+        import numpy as np
+        model = MagicMock()
+        model.transcribe.return_value = ([], MagicMock())
+
+        engine = self._engine_for_model(model)
+
+        half_second = [np.zeros(8000, dtype=np.float32)]
+        engine._transcribe_and_paste(half_second)
+        paste_spy.assert_not_called()
+        mock_paste.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Config integration
+# ---------------------------------------------------------------------------
+
+class TestConfigIntegration:
+    """Verifies the dictation options are exposed by the config layer."""
+
+    def test_settings_has_dictation_fields(self):
+        from src.jarvis.config import Settings
+        import inspect
+        accepted = inspect.signature(Settings).parameters
+        for field_name in ("dictation_enabled", "dictation_hotkey"):
+            assert field_name in accepted
+
+    def test_default_config_has_dictation(self):
+        import sys
+        from src.jarvis.config import get_default_config
+        config = get_default_config()
+        assert config["dictation_enabled"] is True
+        # Platform-aware default (aligned with WisprFlow)
+        expected_hotkey = "ctrl+cmd" if sys.platform == "win32" else "ctrl+alt"
+        assert config["dictation_hotkey"] == expected_hotkey
+
+    def test_load_settings_includes_dictation(self):
+        """load_settings must yield typed dictation attributes."""
+        from src.jarvis.config import load_settings
+        loaded = load_settings()
+        assert hasattr(loaded, "dictation_enabled")
+        assert hasattr(loaded, "dictation_hotkey")
+        assert isinstance(loaded.dictation_enabled, bool)
+        assert isinstance(loaded.dictation_hotkey, str)
+
+
+# ---------------------------------------------------------------------------
+# Face widget DICTATING state
+# ---------------------------------------------------------------------------
+
+class TestFaceWidgetDictatingState:
+    """Checks the dictation-related members of ``JarvisState``."""
+
+    @staticmethod
+    def _state_enum():
+        """Import ``JarvisState`` lazily and return it."""
+        from src.desktop_app.face_widget import JarvisState
+        return JarvisState
+
+    def test_jarvis_state_has_dictating(self):
+        states = self._state_enum()
+        assert hasattr(states, "DICTATING")
+        assert states.DICTATING.value == "dictating"
+
+    def test_dictating_state_round_trips(self):
+        """Looking the member up by its value returns ``DICTATING``."""
+        states = self._state_enum()
+        looked_up = states("dictating")
+        assert looked_up == states.DICTATING
+
+    def test_jarvis_state_has_dictation_processing(self):
+        states = self._state_enum()
+        assert hasattr(states, "DICTATION_PROCESSING")
+        assert states.DICTATION_PROCESSING.value == "dictation_processing"
+
+    def test_dictation_processing_state_round_trips(self):
+        states = self._state_enum()
+        looked_up = states("dictation_processing")
+        assert looked_up == states.DICTATION_PROCESSING
+
+
+class TestDictationProcessingCallback:
+    """Checks that the processing callback runs after recording stops and
+    ahead of the end callback, giving the face a distinct 'processing'
+    state once the user's voice input has been accepted."""
+
+    def test_processing_callback_fires_before_end_callback(self):
+        """Ordering check across the finalise → transcribe → paste chain:
+        'processing' must be recorded before 'end'."""
+        from src.jarvis.dictation import dictation_engine as de
+
+        call_log = []
+
+        engine = _make_engine(
+            on_dictation_processing_start=lambda: call_log.append("processing"),
+            on_dictation_end=lambda: call_log.append("end"),
+        )
+
+        # Only stream teardown and the beep are stubbed.  The genuine
+        # _transcribe_and_paste runs; given empty frames it short-circuits
+        # and still fires _on_dictation_end from its finally block — that
+        # wiring is what this test verifies.
+        close_stub = patch.object(de, "_close_stream")
+        beep_stub = patch.object(de, "_play_beep")
+        with close_stub, beep_stub:
+            engine._finalise_and_transcribe(
+                stream=MagicMock(),
+                audio_frames=[],
+                start_time=time.time(),
+            )
+
+        assert call_log == ["processing", "end"]
+
+    def test_processing_callback_optional(self):
+        """The engine must cope with no processing callback being supplied."""
+        from src.jarvis.dictation import dictation_engine as de
+
+        engine = _make_engine(on_dictation_processing_start=None)
+
+        close_stub = patch.object(de, "_close_stream")
+        beep_stub = patch.object(de, "_play_beep")
+        pipeline_stub = patch.object(engine, "_transcribe_and_paste")
+        with close_stub, beep_stub, pipeline_stub:
+            # Passes as long as this call does not raise.
+            engine._finalise_and_transcribe(
+                stream=MagicMock(),
+                audio_frames=[],
+                start_time=time.time(),
+            )
+
+
+# ---------------------------------------------------------------------------
+# Thread safety
+# ---------------------------------------------------------------------------
+
+class TestThreadSafety:
+    """Tests for thread-safe transcription locking."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_numpy(self):
+        try:
+            import numpy  # noqa: F401
+        except ImportError:
+            pytest.skip("numpy not installed")
+
+    def test_transcribe_acquires_lock(self):
+        """Transcription should acquire the shared lock."""
+        import numpy as np
+        lock = threading.Lock()
+        mock_model = MagicMock()
+        mock_model.transcribe.return_value = ([], MagicMock())
+
+        engine = _make_engine(
+            whisper_model_ref=lambda: mock_model,
+            whisper_backend_ref=lambda: "faster-whisper",
+            transcribe_lock=lock,
+        )
+
+        # Acquire the lock externally — transcribe should block
+        lock.acquire()
+        result_holder = [None]
+        done = threading.Event()
+
+        def do_transcribe():
+            result_holder[0] = engine._transcribe(np.zeros(16000, dtype=np.float32))
+            done.set()
+
+        t = threading.Thread(target=do_transcribe)
+        t.start()
+
+        # Give thread a moment — it should be blocked
+        time.sleep(0.1)
+        assert not done.is_set()
+
+        # Release the lock — thread should complete
+        lock.release()
+        done.wait(timeout=2.0)
+        assert done.is_set()
+        assert result_holder[0] == ""
+        t.join(timeout=1.0)
+
+
+# ---------------------------------------------------------------------------
+# Listener pause flag
+# ---------------------------------------------------------------------------
+
+class TestListenerPauseFlag:
+    """Tests for the dictation pause flag on VoiceListener."""
+
+    @pytest.fixture()
+    def listener(self):
+        """Create a VoiceListener with mock dependencies."""
+        from src.jarvis.listening.listener import VoiceListener
+        cfg = MagicMock()
+        cfg.sample_rate = 16000
+        cfg.vad_enabled = False
+        cfg.wake_aliases = []
+        cfg.stop_commands = ["stop"]
+        return VoiceListener(MagicMock(), cfg, MagicMock(), MagicMock())
+
+    def test_voice_listener_has_dictation_active_flag(self, listener):
+        """VoiceListener should initialise _dictation_active = False."""
+        assert hasattr(listener, "_dictation_active")
+        assert listener._dictation_active is False
+
+    def test_voice_listener_has_transcribe_lock(self, listener):
+        """VoiceListener should expose a transcribe_lock."""
+        assert hasattr(listener, "transcribe_lock")
+        assert isinstance(listener.transcribe_lock, type(threading.Lock()))
+
+
+# ---------------------------------------------------------------------------
+# format_hotkey_display
+# ---------------------------------------------------------------------------
+
+class TestFormatHotkeyDisplay:
+    """Tests for platform-aware hotkey display formatting."""
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_windows_cmd_shows_win(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Windows"
+        assert format_hotkey_display("ctrl+cmd") == "Ctrl + Win"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_windows_super_shows_win(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Windows"
+        assert format_hotkey_display("ctrl+super") == "Ctrl + Win"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_windows_win_shows_win(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Windows"
+        assert format_hotkey_display("ctrl+win") == "Ctrl + Win"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_macos_cmd_shows_cmd(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Darwin"
+        assert format_hotkey_display("ctrl+cmd") == "Ctrl + Cmd"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_macos_alt_shows_option(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Darwin"
+        assert format_hotkey_display("ctrl+alt") == "Ctrl + Option"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_ctrl_shift_d(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Windows"
+        assert format_hotkey_display("ctrl+shift+d") == "Ctrl + Shift + D"
+
+    @patch("src.jarvis.dictation.dictation_engine.platform")
+    def test_linux_alt_stays_alt(self, mock_platform):
+        from src.jarvis.dictation.dictation_engine import format_hotkey_display
+        mock_platform.system.return_value = "Linux"
+        assert format_hotkey_display("ctrl+alt") == "Ctrl + Alt"
+
+
+# ---------------------------------------------------------------------------
+# _clipboard_windows ctypes correctness
+# ---------------------------------------------------------------------------
+
+class TestClipboardWindowsCtypes:
+    """Verify _clipboard_windows sets proper ctypes return types."""
+
+    @pytest.mark.skipif(
+        __import__("sys").platform != "win32",
+        reason="Windows-only clipboard API",
+    )
+    def test_clipboard_windows_roundtrip(self):
+        """Write to clipboard and read back to verify ctypes bindings."""
+        import ctypes
+        from ctypes import wintypes
+        from src.jarvis.dictation.dictation_engine import _clipboard_windows
+
+        test_text = "dictation test 🎙️"
+        _clipboard_windows(test_text)
+
+        # Read back from clipboard
+        user32 = ctypes.windll.user32
+        kernel32 = ctypes.windll.kernel32
+        user32.OpenClipboard.argtypes = [wintypes.HWND]
+        user32.OpenClipboard.restype = wintypes.BOOL
+        user32.GetClipboardData.argtypes = [wintypes.UINT]
+        user32.GetClipboardData.restype = wintypes.HANDLE
+        user32.CloseClipboard.restype = wintypes.BOOL
+        kernel32.GlobalLock.argtypes = [wintypes.HANDLE]
+        kernel32.GlobalLock.restype = ctypes.c_void_p
+        kernel32.GlobalUnlock.argtypes = [wintypes.HANDLE]
+        kernel32.GlobalUnlock.restype = wintypes.BOOL
+
+        CF_UNICODETEXT = 13
+        assert user32.OpenClipboard(None)
+        try:
+            h = user32.GetClipboardData(CF_UNICODETEXT)
+            assert h, "GetClipboardData returned NULL"
+            ptr = kernel32.GlobalLock(h)
+            assert ptr, "GlobalLock returned NULL"
+            result = ctypes.wstring_at(ptr)
+            kernel32.GlobalUnlock(h)
+            assert result == test_text
+        finally:
+            user32.CloseClipboard()