Tune realtime STT defaults

2026-05-02 21:24:41 +09:00
parent 11cfd7cc04
commit 962ff7037b
5 changed files with 79 additions and 17 deletions
--- a/src/index.ts
+++ b/src/index.ts
@@ -69,8 +69,9 @@ async function runLoopback(): Promise<void> {
  await stt.warmup();
  logger.info("STT warmup finished");

-  const transcriptionQueue: Buffer[] = [];
+  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
+  let nextSegmentIndex = 1;

  const runNext = async (): Promise<void> => {
    if (transcribing) {
@@ -83,11 +84,17 @@ async function runLoopback(): Promise<void> {

    transcribing = true;
    try {
-      const text = await stt.transcribePcm16(next);
+      const startedAt = Date.now();
+      const text = await stt.transcribePcm16(next.pcm16);
+      logger.info("STT latency", {
+        index: next.index,
+        wait_ms: startedAt - next.queuedAt,
+        transcribe_ms: Date.now() - startedAt,
+      });
      if (!text) {
        logger.info("빈 전사 결과");
      } else {
-        logger.info("Transcript", text);
+        logger.info("Transcript", { index: next.index, text });
        if (config.DEBUG_TRANSCRIPTS) {
          console.log(`\n[text] ${text}\n`);
        }
@@ -101,6 +108,13 @@ async function runLoopback(): Promise<void> {
  };

  const segmenter = new RealtimeSegmenter({
+    preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
+    speechStartThreshold: config.SEGMENT_START_THRESHOLD,
+    speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
+    speechStartFrames: config.SEGMENT_START_FRAMES,
+    speechEndFrames: config.SEGMENT_END_FRAMES,
+    minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
+    maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
    onLevel: (peak) => {
      if (peak > maxPeak) {
        maxPeak = peak;
@@ -128,11 +142,17 @@ async function runLoopback(): Promise<void> {
    },
    onSpeechReady: (samples) => {
      emittedSegmentCount += 1;
-      logger.info("Speech segment ready", { index: emittedSegmentCount, samples });
+      logger.info("Speech segment ready", { index: emittedSegmentCount, samples, ms: Math.round((samples / 16000) * 1000) });
    },
    onSegment: (pcm16) => {
-      transcriptionQueue.push(pcm16);
+      const index = nextSegmentIndex++;
+      transcriptionQueue.push({
+        pcm16,
+        queuedAt: Date.now(),
+        index,
+      });
      logger.info("Queued segment for STT", {
+        index,
        queue: transcriptionQueue.length,
        bytes: pcm16.length,
      });
@@ -167,6 +187,7 @@ async function runLoopback(): Promise<void> {
  console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
  console.log(`model: ${config.WHISPER_MODEL}`);
  console.log(`language: ${config.WHISPER_LANGUAGE}`);
+  console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);

  setInterval(() => {
    const now = Date.now();