// Patch summary: adds diagnostic hooks to the realtime audio segmenter and wires
// them into the loopback runner so that a silent or mis-configured capture
// source becomes visible in the logs.
//
// NOTE(review): in this copy the patch's line breaks are collapsed into two
// physical lines, so it presumably cannot be applied as-is. Some generic type
// arguments also look stripped (`Promise {`, `ReturnType | null`,
// `Promise | null`) -- confirm against the repository before relying on the
// context lines.
//
// src/audio/realtime-segmenter.ts
//   - RealtimeSegmenterOptions gains four optional callbacks next to the
//     existing required onSegment: onLevel(peak), onSpeechStart(peak),
//     onSpeechDiscarded(samples) and onSpeechReady(samples).
//   - onLevel?.(peak) is invoked just before the `!this.speechActive` check.
//   - onSpeechStart?.(peak) is invoked right after the pre-roll buffer has been
//     moved into the speech buffer and the pre-roll buffer emptied.
//   - When speechPcm.length < minSpeechSamples * 2, onSpeechDiscarded?.() is
//     called and the method returns without calling onSegment; otherwise
//     onSpeechReady?.() is called immediately before onSegment(speechPcm).
//     Both receive speechPcm.length / 2 (presumably 2 bytes per 16-bit sample,
//     going by the `pcm16` parameter name -- TODO confirm).
//
// src/index.ts (inside runLoopback)
//   - New local counters: receivedChunks, receivedBytes, maxPeak, lastChunkAt,
//     lastLevelLogAt, sawSpeechStart, emittedSegmentCount.
//   - Logs "STT warmup finished" once `await stt.warmup()` has resolved.
//   - onLevel handler: tracks the largest peak seen, and when at least 3000 ms
//     have passed since lastLevelLogAt it logs "Audio input heartbeat" and then
//     resets maxPeak to 0. lastLevelLogAt starts at 0, so the first onLevel call
//     already emits a heartbeat. The chunk and byte counters are not reset in
//     the visible hunks, so only `peak` is per-window.
//   - onSpeechStart sets sawSpeechStart (never cleared in the visible hunks) and
//     logs the peak; onSpeechDiscarded logs the sample count; onSpeechReady
//     increments emittedSegmentCount and logs it together with the sample count.
//   - onSegment now also logs the queue length and the segment size in bytes
//     after pushing to transcriptionQueue and before `void runNext()`.
//   - The capture stdout "data" handler updates receivedChunks, receivedBytes
//     and lastChunkAt before forwarding the chunk to segmenter.pushChunk().
//   - A setInterval watchdog fires every 5000 ms and is unref()'d. While
//     shuttingDown is unset it warns (messages are in Korean) either that no
//     capture PCM data has arrived yet and AUDIO_SOURCE may be wrong or not a
//     loopback device (lastChunkAt === 0), or that no new PCM chunk arrived
//     during the last 5 seconds (now - lastChunkAt >= 5000).
diff --git a/src/audio/realtime-segmenter.ts b/src/audio/realtime-segmenter.ts index 028d045..f753748 100644 --- a/src/audio/realtime-segmenter.ts +++ b/src/audio/realtime-segmenter.ts @@ -1,5 +1,9 @@ interface RealtimeSegmenterOptions { onSegment: (pcm16: Buffer) => void; + onLevel?: (peak: number) => void; + onSpeechStart?: (peak: number) => void; + onSpeechDiscarded?: (samples: number) => void; + onSpeechReady?: (samples: number) => void; } export class RealtimeSegmenter { @@ -44,6 +48,8 @@ export class RealtimeSegmenter { } } + this.options.onLevel?.(peak); + if (!this.speechActive) { appendWithCap(this.preRoll, frame, this.preRollSamples); if (peak >= this.speechStartThreshold) { @@ -60,6 +66,7 @@ export class RealtimeSegmenter { this.silenceFrames = 0; this.speech.splice(0, this.speech.length, ...this.preRoll); this.preRoll.splice(0, this.preRoll.length); + this.options.onSpeechStart?.(peak); } this.speech.push(...frame); @@ -81,9 +88,11 @@ export class RealtimeSegmenter { this.speechCandidateFrames = 0; if (speechPcm.length < this.minSpeechSamples * 2) { + this.options.onSpeechDiscarded?.(speechPcm.length / 2); return; } + this.options.onSpeechReady?.(speechPcm.length / 2); this.options.onSegment(speechPcm); } } diff --git a/src/index.ts b/src/index.ts index f021d9b..d3246ea 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,13 @@ async function runLoopback(): Promise { const stt = new FasterWhisperSttService(config, logger); let capture = null as ReturnType | null; let shuttingDown: Promise | null = null; + let receivedChunks = 0; + let receivedBytes = 0; + let maxPeak = 0; + let lastChunkAt = 0; + let lastLevelLogAt = 0; + let sawSpeechStart = false; + let emittedSegmentCount = 0; const shutdown = async (exitCode: number, reason: string, error?: unknown): Promise => { if (shuttingDown) { @@ -60,6 +67,7 @@ async function runLoopback(): Promise { }); await stt.warmup(); + logger.info("STT warmup finished"); const transcriptionQueue: Buffer[] = []; let 
transcribing = false; @@ -93,14 +101,50 @@ async function runLoopback(): Promise { }; const segmenter = new RealtimeSegmenter({ + onLevel: (peak) => { + if (peak > maxPeak) { + maxPeak = peak; + } + + const now = Date.now(); + if (now - lastLevelLogAt >= 3000) { + lastLevelLogAt = now; + logger.info("Audio input heartbeat", { + chunks: receivedChunks, + bytes: receivedBytes, + peak: maxPeak, + speech_started: sawSpeechStart, + emitted_segments: emittedSegmentCount, + }); + maxPeak = 0; + } + }, + onSpeechStart: (peak) => { + sawSpeechStart = true; + logger.info("Speech start detected", { peak }); + }, + onSpeechDiscarded: (samples) => { + logger.info("Discarded short speech segment", { samples }); + }, + onSpeechReady: (samples) => { + emittedSegmentCount += 1; + logger.info("Speech segment ready", { index: emittedSegmentCount, samples }); + }, onSegment: (pcm16) => { transcriptionQueue.push(pcm16); + logger.info("Queued segment for STT", { + queue: transcriptionQueue.length, + bytes: pcm16.length, + }); void runNext(); }, }); capture = spawnLoopbackCapture(config, logger); capture.stdout.on("data", (chunk: Buffer) => { + receivedChunks += 1; + receivedBytes += chunk.length; + lastChunkAt = Date.now(); segmenter.pushChunk(chunk); }); capture.stderr.on("data", (chunk: Buffer) => { @@ -123,6 +167,18 @@ async function runLoopback(): Promise { console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`); console.log(`model: ${config.WHISPER_MODEL}`); console.log(`language: ${config.WHISPER_LANGUAGE}`); + + setInterval(() => { + const now = Date.now(); + if (lastChunkAt === 0 && !shuttingDown) { + logger.warn("아직 캡처 PCM 데이터가 들어오지 않았습니다. AUDIO_SOURCE 가 잘못됐거나 loopback 입력이 아닌 장치일 수 있습니다."); + return; + } + + if (lastChunkAt > 0 && now - lastChunkAt >= 5000 && !shuttingDown) { + logger.warn("최근 5초 동안 새 PCM chunk 가 들어오지 않았습니다."); + } + }, 5000).unref(); } async function main(): Promise {