From 962ff7037b6ac9670a5c7057309606bf0e427e4c Mon Sep 17 00:00:00 2001
From: claude-bot <claude-bot@tkrmagid.kr>
Date: Sat, 2 May 2026 21:24:41 +0900
Subject: [PATCH] Tune realtime STT defaults and expose segmenter settings via env

---
 .env.example                    | 10 +++++++++-
 README.md                       | 12 ++++++++++--
 src/audio/realtime-segmenter.ts | 34 +++++++++++++++++++++++++--------
 src/config.ts                   |  9 ++++++++-
 src/index.ts                    | 31 +++++++++++++++++++++++++-----
 5 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/.env.example b/.env.example
index 35db3cf..8483955 100644
--- a/.env.example
+++ b/.env.example
@@ -10,7 +10,15 @@ WHISPER_MODEL=large-v3-turbo
 WHISPER_LANGUAGE=ko
 WHISPER_DEVICE=auto
 WHISPER_COMPUTE_TYPE=auto
-WHISPER_BEAM_SIZE=1
+WHISPER_BEAM_SIZE=2
+
+SEGMENT_START_THRESHOLD=900
+SEGMENT_CONTINUE_THRESHOLD=450
+SEGMENT_START_FRAMES=2
+SEGMENT_END_FRAMES=24
+SEGMENT_PREROLL_SAMPLES=3200
+SEGMENT_MIN_SPEECH_SAMPLES=7200
+SEGMENT_MAX_SPEECH_SAMPLES=160000
 
 DEBUG_TRANSCRIPTS=true
 LOG_LEVEL=info
diff --git a/README.md b/README.md
index 487d39a..e078575 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,13 @@ bun run start:loopback
 - `WHISPER_COMPUTE_TYPE`
   - `auto`, `float16`, `int8_float16`, `int8`, `float32`
 - `WHISPER_BEAM_SIZE`
-  - 기본값 `1`
+  - 기본값 `2`
+- `SEGMENT_END_FRAMES`
+  - 기본값 `24` (프레임당 320샘플, 16kHz 기준 약 480ms의 무음)
+  - 끝을 조금 더 늦게 잘라서 문장이 잘게 끊기는 현상을 줄입니다
+- `SEGMENT_MAX_SPEECH_SAMPLES`
+  - 기본값 `160000` (16kHz 기준 10초)
+  - 너무 긴 구간은 강제로 끊어서 지연이 과하게 커지는 걸 막습니다
 
 ## 메모
 
@@ -77,7 +83,9 @@ WHISPER_MODEL=large-v3-turbo
 WHISPER_LANGUAGE=ko
 WHISPER_DEVICE=auto
 WHISPER_COMPUTE_TYPE=auto
-WHISPER_BEAM_SIZE=1
+WHISPER_BEAM_SIZE=2
+SEGMENT_END_FRAMES=24
+SEGMENT_MAX_SPEECH_SAMPLES=160000
 DEBUG_TRANSCRIPTS=true
 LOG_LEVEL=info
 ```
diff --git a/src/audio/realtime-segmenter.ts b/src/audio/realtime-segmenter.ts
index f753748..3310200 100644
--- a/src/audio/realtime-segmenter.ts
+++ b/src/audio/realtime-segmenter.ts
@@ -4,6 +4,13 @@ interface RealtimeSegmenterOptions {
   onSpeechStart?: (peak: number) => void;
   onSpeechDiscarded?: (samples: number) => void;
   onSpeechReady?: (samples: number) => void;
+  preRollSamples?: number;
+  speechStartThreshold?: number;
+  speechContinueThreshold?: number;
+  speechStartFrames?: number;
+  speechEndFrames?: number;
+  minSpeechSamples?: number;
+  maxSpeechSamples?: number;
 }
 
 export class RealtimeSegmenter {
@@ -12,18 +19,27 @@ export class RealtimeSegmenter {
   private readonly speech: number[] = [];
 
   private readonly frameSamples = 320;
-  private readonly preRollSamples = 3200;
-  private readonly speechStartThreshold = 900;
-  private readonly speechContinueThreshold = 450;
-  private readonly speechStartFrames = 2;
-  private readonly speechEndFrames = 18;
-  private readonly minSpeechSamples = 6400;
+  private readonly preRollSamples: number;
+  private readonly speechStartThreshold: number;
+  private readonly speechContinueThreshold: number;
+  private readonly speechStartFrames: number;
+  private readonly speechEndFrames: number;
+  private readonly minSpeechSamples: number;
+  private readonly maxSpeechSamples: number;
 
   private speechActive = false;
   private speechCandidateFrames = 0;
   private silenceFrames = 0;
 
-  constructor(private readonly options: RealtimeSegmenterOptions) {}
+  constructor(private readonly options: RealtimeSegmenterOptions) {
+    this.preRollSamples = options.preRollSamples ?? 3200;
+    this.speechStartThreshold = options.speechStartThreshold ?? 900;
+    this.speechContinueThreshold = options.speechContinueThreshold ?? 450;
+    this.speechStartFrames = options.speechStartFrames ?? 2;
+    this.speechEndFrames = options.speechEndFrames ?? 24;
+    this.minSpeechSamples = options.minSpeechSamples ?? 7200;
+    this.maxSpeechSamples = options.maxSpeechSamples ?? 160000;
+  }
 
   pushChunk(chunk: Buffer): void {
     for (let offset = 0; offset + 1 < chunk.length; offset += 2) {
@@ -78,7 +94,9 @@ export class RealtimeSegmenter {
     }
 
     if (this.silenceFrames < this.speechEndFrames) {
-      return;
+      if (this.speech.length < this.maxSpeechSamples) {
+        return;
+      }
     }
 
     const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
diff --git a/src/config.ts b/src/config.ts
index 9ed5b55..2cde868 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -19,7 +19,14 @@ const envSchema = z.object({
   WHISPER_LANGUAGE: z.string().min(1).default("ko"),
   WHISPER_DEVICE: z.enum(["auto", "cuda", "cpu"]).default("auto"),
   WHISPER_COMPUTE_TYPE: z.string().min(1).default("auto"),
-  WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
+  WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(2),
+  SEGMENT_START_THRESHOLD: z.coerce.number().int().min(100).max(10000).default(900),
+  SEGMENT_CONTINUE_THRESHOLD: z.coerce.number().int().min(50).max(10000).default(450),
+  SEGMENT_START_FRAMES: z.coerce.number().int().min(1).max(10).default(2),
+  SEGMENT_END_FRAMES: z.coerce.number().int().min(4).max(60).default(24),
+  SEGMENT_PREROLL_SAMPLES: z.coerce.number().int().min(320).max(16000).default(3200),
+  SEGMENT_MIN_SPEECH_SAMPLES: z.coerce.number().int().min(1600).max(64000).default(7200),
+  SEGMENT_MAX_SPEECH_SAMPLES: z.coerce.number().int().min(16000).max(320000).default(160000),
   DEBUG_TRANSCRIPTS: z
     .string()
     .optional()
diff --git a/src/index.ts b/src/index.ts
index d3246ea..bf56e12 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -69,8 +69,9 @@ async function runLoopback(): Promise<void> {
   await stt.warmup();
   logger.info("STT warmup finished");
 
-  const transcriptionQueue: Buffer[] = [];
+  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
   let transcribing = false;
+  let nextSegmentIndex = 1;
 
   const runNext = async (): Promise<void> => {
     if (transcribing) {
@@ -83,11 +84,17 @@ async function runLoopback(): Promise<void> {
 
     transcribing = true;
     try {
-      const text = await stt.transcribePcm16(next);
+      const startedAt = Date.now();
+      const text = await stt.transcribePcm16(next.pcm16);
+      logger.info("STT latency", {
+        index: next.index,
+        wait_ms: startedAt - next.queuedAt,
+        transcribe_ms: Date.now() - startedAt,
+      });
       if (!text) {
         logger.info("빈 전사 결과");
       } else {
-        logger.info("Transcript", text);
+        logger.info("Transcript", { index: next.index, text });
         if (config.DEBUG_TRANSCRIPTS) {
           console.log(`\n[text] ${text}\n`);
         }
@@ -101,6 +108,13 @@ async function runLoopback(): Promise<void> {
   };
 
   const segmenter = new RealtimeSegmenter({
+    preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
+    speechStartThreshold: config.SEGMENT_START_THRESHOLD,
+    speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
+    speechStartFrames: config.SEGMENT_START_FRAMES,
+    speechEndFrames: config.SEGMENT_END_FRAMES,
+    minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
+    maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
     onLevel: (peak) => {
       if (peak > maxPeak) {
         maxPeak = peak;
@@ -128,11 +142,17 @@ async function runLoopback(): Promise<void> {
     },
     onSpeechReady: (samples) => {
       emittedSegmentCount += 1;
-      logger.info("Speech segment ready", { index: emittedSegmentCount, samples });
+      logger.info("Speech segment ready", { index: emittedSegmentCount, samples, ms: Math.round((samples / 16000) * 1000) });
     },
     onSegment: (pcm16) => {
-      transcriptionQueue.push(pcm16);
+      const index = nextSegmentIndex++;
+      transcriptionQueue.push({
+        pcm16,
+        queuedAt: Date.now(),
+        index,
+      });
       logger.info("Queued segment for STT", {
+        index,
         queue: transcriptionQueue.length,
         bytes: pcm16.length,
       });
@@ -167,6 +187,7 @@ async function runLoopback(): Promise<void> {
   console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
   console.log(`model: ${config.WHISPER_MODEL}`);
   console.log(`language: ${config.WHISPER_LANGUAGE}`);
+  console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);
 
   setInterval(() => {
     const now = Date.now();