From 962ff7037b6ac9670a5c7057309606bf0e427e4c Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sat, 2 May 2026 21:24:41 +0900 Subject: [PATCH] Tune realtime STT defaults --- .env.example | 10 +++++++++- README.md | 12 ++++++++++-- src/audio/realtime-segmenter.ts | 34 +++++++++++++++++++++++++-------- src/config.ts | 9 ++++++++- src/index.ts | 31 +++++++++++++++++++++++++----- 5 files changed, 79 insertions(+), 17 deletions(-) diff --git a/.env.example b/.env.example index 35db3cf..8483955 100644 --- a/.env.example +++ b/.env.example @@ -10,7 +10,15 @@ WHISPER_MODEL=large-v3-turbo WHISPER_LANGUAGE=ko WHISPER_DEVICE=auto WHISPER_COMPUTE_TYPE=auto -WHISPER_BEAM_SIZE=1 +WHISPER_BEAM_SIZE=2 + +SEGMENT_START_THRESHOLD=900 +SEGMENT_CONTINUE_THRESHOLD=450 +SEGMENT_START_FRAMES=2 +SEGMENT_END_FRAMES=24 +SEGMENT_PREROLL_SAMPLES=3200 +SEGMENT_MIN_SPEECH_SAMPLES=7200 +SEGMENT_MAX_SPEECH_SAMPLES=160000 DEBUG_TRANSCRIPTS=true LOG_LEVEL=info diff --git a/README.md b/README.md index 487d39a..e078575 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,13 @@ bun run start:loopback - `WHISPER_COMPUTE_TYPE` - `auto`, `float16`, `int8_float16`, `int8`, `float32` - `WHISPER_BEAM_SIZE` - - 기본값 `1` + - 기본값 `2` +- `SEGMENT_END_FRAMES` + - 기본값 `24` + - 끝을 조금 더 늦게 잘라서 문장이 잘게 끊기는 현상을 줄입니다 +- `SEGMENT_MAX_SPEECH_SAMPLES` + - 기본값 `160000` + - 너무 긴 구간은 강제로 끊어서 지연이 과하게 커지는 걸 막습니다 ## 메모 @@ -77,7 +83,9 @@ WHISPER_MODEL=large-v3-turbo WHISPER_LANGUAGE=ko WHISPER_DEVICE=auto WHISPER_COMPUTE_TYPE=auto -WHISPER_BEAM_SIZE=1 +WHISPER_BEAM_SIZE=2 +SEGMENT_END_FRAMES=24 +SEGMENT_MAX_SPEECH_SAMPLES=160000 DEBUG_TRANSCRIPTS=true LOG_LEVEL=info ``` diff --git a/src/audio/realtime-segmenter.ts b/src/audio/realtime-segmenter.ts index f753748..3310200 100644 --- a/src/audio/realtime-segmenter.ts +++ b/src/audio/realtime-segmenter.ts @@ -4,6 +4,13 @@ interface RealtimeSegmenterOptions { onSpeechStart?: (peak: number) => void; onSpeechDiscarded?: (samples: number) => void; onSpeechReady?: (samples: number) => void; + preRollSamples?: number; + speechStartThreshold?: number; + speechContinueThreshold?: number; + speechStartFrames?: number; + speechEndFrames?: number; + minSpeechSamples?: number; + maxSpeechSamples?: number; } export class RealtimeSegmenter { @@ -12,18 +19,27 @@ export class RealtimeSegmenter { private readonly speech: number[] = []; private readonly frameSamples = 320; - private readonly preRollSamples = 3200; - private readonly speechStartThreshold = 900; - private readonly speechContinueThreshold = 450; - private readonly speechStartFrames = 2; - private readonly speechEndFrames = 18; - private readonly minSpeechSamples = 6400; + private readonly preRollSamples: number; + private readonly speechStartThreshold: number; + private readonly speechContinueThreshold: number; + private readonly speechStartFrames: number; + private readonly speechEndFrames: number; + private readonly minSpeechSamples: number; + private readonly maxSpeechSamples: number; private speechActive = false; private speechCandidateFrames = 0; private silenceFrames = 0; - constructor(private readonly options: RealtimeSegmenterOptions) {} + constructor(private readonly options: RealtimeSegmenterOptions) { + this.preRollSamples = options.preRollSamples ?? 3200; + this.speechStartThreshold = options.speechStartThreshold ?? 900; + this.speechContinueThreshold = options.speechContinueThreshold ?? 450; + this.speechStartFrames = options.speechStartFrames ?? 2; + this.speechEndFrames = options.speechEndFrames ?? 24; + this.minSpeechSamples = options.minSpeechSamples ?? 7200; + this.maxSpeechSamples = options.maxSpeechSamples ?? 160000; + } pushChunk(chunk: Buffer): void { for (let offset = 0; offset + 1 < chunk.length; offset += 2) { @@ -78,7 +94,9 @@ export class RealtimeSegmenter { } if (this.silenceFrames < this.speechEndFrames) { - return; + if (this.speech.length < this.maxSpeechSamples) { + return; + } } const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech)); diff --git a/src/config.ts b/src/config.ts index 9ed5b55..2cde868 100644 --- a/src/config.ts +++ b/src/config.ts @@ -19,7 +19,14 @@ const envSchema = z.object({ WHISPER_LANGUAGE: z.string().min(1).default("ko"), WHISPER_DEVICE: z.enum(["auto", "cuda", "cpu"]).default("auto"), WHISPER_COMPUTE_TYPE: z.string().min(1).default("auto"), - WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1), + WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(2), + SEGMENT_START_THRESHOLD: z.coerce.number().int().min(100).max(10000).default(900), + SEGMENT_CONTINUE_THRESHOLD: z.coerce.number().int().min(50).max(10000).default(450), + SEGMENT_START_FRAMES: z.coerce.number().int().min(1).max(10).default(2), + SEGMENT_END_FRAMES: z.coerce.number().int().min(4).max(60).default(24), + SEGMENT_PREROLL_SAMPLES: z.coerce.number().int().min(320).max(16000).default(3200), + SEGMENT_MIN_SPEECH_SAMPLES: z.coerce.number().int().min(1600).max(64000).default(7200), + SEGMENT_MAX_SPEECH_SAMPLES: z.coerce.number().int().min(16000).max(320000).default(160000), DEBUG_TRANSCRIPTS: z .string() .optional() diff --git a/src/index.ts b/src/index.ts index d3246ea..bf56e12 100644 --- a/src/index.ts +++ b/src/index.ts @@ -69,8 +69,9 @@ async function runLoopback(): Promise { await stt.warmup(); logger.info("STT warmup finished"); - const transcriptionQueue: Buffer[] = []; + const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = []; let transcribing = false; + let nextSegmentIndex = 1; const runNext = async (): Promise => { if (transcribing) { @@ -83,11 +84,17 @@ async function runLoopback(): Promise { transcribing = true; try { - const text = await stt.transcribePcm16(next); + const startedAt = Date.now(); + const text = await stt.transcribePcm16(next.pcm16); + logger.info("STT latency", { + index: next.index, + wait_ms: startedAt - next.queuedAt, + transcribe_ms: Date.now() - startedAt, + }); if (!text) { logger.info("빈 전사 결과"); } else { - logger.info("Transcript", text); + logger.info("Transcript", { index: next.index, text }); if (config.DEBUG_TRANSCRIPTS) { console.log(`\n[text] ${text}\n`); } @@ -101,6 +108,13 @@ async function runLoopback(): Promise { }; const segmenter = new RealtimeSegmenter({ + preRollSamples: config.SEGMENT_PREROLL_SAMPLES, + speechStartThreshold: config.SEGMENT_START_THRESHOLD, + speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD, + speechStartFrames: config.SEGMENT_START_FRAMES, + speechEndFrames: config.SEGMENT_END_FRAMES, + minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES, + maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES, onLevel: (peak) => { if (peak > maxPeak) { maxPeak = peak; @@ -128,11 +142,17 @@ async function runLoopback(): Promise { }, onSpeechReady: (samples) => { emittedSegmentCount += 1; - logger.info("Speech segment ready", { index: emittedSegmentCount, samples }); + logger.info("Speech segment ready", { index: emittedSegmentCount, samples, ms: Math.round((samples / 16000) * 1000) }); }, onSegment: (pcm16) => { - transcriptionQueue.push(pcm16); + const index = nextSegmentIndex++; + transcriptionQueue.push({ + pcm16, + queuedAt: Date.now(), + index, + }); logger.info("Queued segment for STT", { + index, queue: transcriptionQueue.length, bytes: pcm16.length, }); @@ -167,6 +187,7 @@ async function runLoopback(): Promise { console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`); console.log(`model: ${config.WHISPER_MODEL}`); console.log(`language: ${config.WHISPER_LANGUAGE}`); + console.log(`beam: ${config.WHISPER_BEAM_SIZE}`); setInterval(() => { const now = Date.now();