// realtime_voice_bot/src/index.ts

import process from "node:process";
import { createInterface } from "node:readline";

import { loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
import { MeloTtsService } from "./services/melo-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js";

/** Run mode taken from the first CLI argument; "test-stt" is used when none is given. */
const [, , modeArgument] = process.argv;
const mode = modeArgument ?? "test-stt";

/**
 * Runs the live capture pipeline: audio chunks from the loopback capture
 * process are fed to a segmenter, each emitted segment is transcribed, and —
 * depending on `options` — the transcript is answered by the LLM and the
 * answer is spoken through TTS.
 *
 * The returned promise resolves once all services are warmed up and the
 * capture process is wired; the actual work then happens in event callbacks
 * until `shutdown` terminates the process via `process.exit`.
 *
 * @param options.enableLlm When true, transcripts are passed to the LLM.
 * @param options.enableTts When true (and `config.TTS_ENABLED` is set), LLM
 *   replies are spoken. A failed TTS warmup disables TTS for this run.
 */
async function runSttTest(options: { enableLlm: boolean; enableTts: boolean }): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const stt = new FasterWhisperSttService(config, logger);
  const llm = options.enableLlm ? new OllamaLlmService(config, logger) : null;
  let tts = options.enableTts && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
  let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
  // Set on the first shutdown request; later requests await the same promise.
  let shuttingDown: Promise<void> | null = null;
  // While true, captured chunks are counted but not fed to the segmenter.
  let suppressCapture = false;
  // Counters and markers used only for diagnostics logging.
  let receivedChunks = 0;
  let receivedBytes = 0;
  let maxPeak = 0;
  let lastChunkAt = 0;
  let lastLevelLogAt = 0;
  let sawSpeechStart = false;
  let emittedSegmentCount = 0;

  /**
   * Stops the capture process, destroys the STT/TTS services and exits with
   * `exitCode`. Concurrent callers share one teardown; only the first caller
   * reaches `process.exit`.
   */
  const shutdown = async (exitCode: number, reason: string, error?: unknown): Promise<void> => {
    if (shuttingDown) {
      return await shuttingDown;
    }

    shuttingDown = (async () => {
      if (error) {
        logger.error(`Shutting down: ${reason}`, error);
      } else {
        logger.info("Shutting down", reason);
      }

      if (capture && !capture.killed && capture.exitCode === null) {
        capture.kill("SIGTERM");
      }

      // Destroy failures are logged, never rethrown, so teardown always completes.
      await stt.destroy().catch((destroyError) => {
        logger.warn("STT destroy failed", destroyError);
      });
      if (tts) {
        await tts.destroy().catch((destroyError) => {
          logger.warn("TTS destroy failed", destroyError);
        });
      }
    })();

    await shuttingDown;
    process.exit(exitCode);
  };

  process.once("SIGINT", () => {
    void shutdown(0, "SIGINT");
  });
  process.once("SIGTERM", () => {
    void shutdown(0, "SIGTERM");
  });
  process.once("uncaughtException", (error) => {
    void shutdown(1, "uncaughtException", error);
  });
  process.once("unhandledRejection", (reason) => {
    void shutdown(1, "unhandledRejection", reason);
  });
  // Last-chance cleanup: the destroy() calls are started but not awaited here.
  process.once("exit", () => {
    if (capture && !capture.killed && capture.exitCode === null) {
      capture.kill("SIGKILL");
    }
    void stt.destroy();
    if (tts) {
      void tts.destroy();
    }
  });

  console.log("STT 준비중...");
  await stt.warmup();
  logger.info("STT warmup finished");
  console.log("STT 준비 완료");
  if (llm) {
    console.log("LLM 준비중...");
    await llm.warmup();
    logger.info("LLM warmup finished");
    console.log("LLM 준비 완료");
  }
  if (tts) {
    console.log("TTS 준비중...");
    try {
      await tts.warmup();
      logger.info("TTS warmup finished", {
        image: config.TTS_IMAGE,
        language: config.TTS_LANGUAGE,
        speaker: config.TTS_SPEAKER,
      });
      console.log("TTS 준비 완료");
    } catch (error) {
      // A TTS warmup failure is not fatal: the run continues without speech output.
      logger.warn("TTS warmup failed", error);
      console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
      // Dispose the service before dropping the reference; once `tts` is null,
      // neither shutdown() nor the exit hook can reach it any more.
      await tts.destroy().catch((destroyError) => {
        logger.warn("TTS destroy failed", destroyError);
      });
      tts = null;
    }
  }

  // Segments waiting for transcription; processed strictly one at a time.
  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
  let nextSegmentIndex = 1;

  /**
   * Takes the next queued segment (if no other is in flight), transcribes it
   * and runs the optional LLM/TTS stages. Re-invokes itself from `finally` so
   * the queue keeps draining. Errors are logged and do not stop the loop.
   */
  const runNext = async (): Promise<void> => {
    if (transcribing) {
      return;
    }
    const next = transcriptionQueue.shift();
    if (!next) {
      return;
    }

    transcribing = true;
    try {
      const startedAt = Date.now();
      const text = await stt.transcribePcm16(next.pcm16);
      logger.info("STT latency", {
        index: next.index,
        wait_ms: startedAt - next.queuedAt,
        transcribe_ms: Date.now() - startedAt,
      });
      if (!text) {
        logger.info("빈 전사 결과");
      } else {
        logger.info("Transcript", { index: next.index, text });
        // In debug mode the transcript is echoed only when DEBUG_TRANSCRIPTS is set.
        if (config.DEBUG) {
          if (config.DEBUG_TRANSCRIPTS) {
            console.log(`\n[text] ${text}\n`);
          }
        } else {
          console.log(`사용자> ${text}`);
        }

        if (llm) {
          // Ask the LLM whether this utterance warrants a reply at all.
          const assessmentStartedAt = Date.now();
          const assessment = await llm.assessReplyNeed(text);
          logger.info("Reply assessment", {
            index: next.index,
            should_reply: assessment.shouldReply,
            likely_needs_lookup: assessment.likelyNeedsLookup,
            reason: assessment.reason,
            assessment_ms: Date.now() - assessmentStartedAt,
          });

          if (!assessment.shouldReply) {
            if (config.DEBUG) {
              console.log(`[skip] ${assessment.reason}\n`);
            }
            // The `finally` below still runs and schedules the next segment.
            return;
          }

          const llmStartedAt = Date.now();
          const reply = await llm.generateReply(text, {
            onProgress: (message) => {
              if (config.DEBUG) {
                console.log(`[assistant] ${message}`);
                return;
              }
              console.log(`답변> ${message}`);
            },
          });
          logger.info("LLM latency", {
            index: next.index,
            llm_ms: Date.now() - llmStartedAt,
          });
          logger.info("LLM reply", { index: next.index, text: reply });

          if (config.DEBUG) {
            if (config.DEBUG_TRANSCRIPTS) {
              console.log(`[assistant] ${reply}\n`);
            }
          } else {
            console.log(`답변> ${reply}`);
          }

          if (tts) {
            // Stop feeding the segmenter and reset it while the reply is spoken;
            // capture resumes in `finally` regardless of playback success.
            suppressCapture = true;
            segmenter.reset();
            try {
              await tts.speak(reply);
            } catch (error) {
              logger.warn("TTS playback failed", error);
            } finally {
              suppressCapture = false;
              sawSpeechStart = false;
              maxPeak = 0;
            }
          }
        }
      }
    } catch (error) {
      logger.error("STT/LLM failed", error);
    } finally {
      transcribing = false;
      void runNext();
    }
  };

  // Declared after runNext, which references it; runNext is only ever invoked
  // from the onSegment callback below, i.e. after this initialization.
  const segmenter = new RealtimeSegmenter({
    preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
    speechStartThreshold: config.SEGMENT_START_THRESHOLD,
    speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
    speechStartFrames: config.SEGMENT_START_FRAMES,
    speechEndFrames: config.SEGMENT_END_FRAMES,
    minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
    maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
    onLevel: (peak) => {
      if (peak > maxPeak) {
        maxPeak = peak;
      }

      // Emit a heartbeat at most every 3 seconds, then restart peak tracking.
      const now = Date.now();
      if (now - lastLevelLogAt >= 3000) {
        lastLevelLogAt = now;
        logger.info("Audio input heartbeat", {
          chunks: receivedChunks,
          bytes: receivedBytes,
          peak: maxPeak,
          speech_started: sawSpeechStart,
          emitted_segments: emittedSegmentCount,
        });
        maxPeak = 0;
      }
    },
    onSpeechStart: (peak) => {
      sawSpeechStart = true;
      logger.info("Speech start detected", { peak });
    },
    onSpeechDiscarded: (samples) => {
      logger.info("Discarded short speech segment", { samples });
    },
    onSpeechReady: (samples) => {
      emittedSegmentCount += 1;
      logger.info("Speech segment ready", {
        index: emittedSegmentCount,
        samples,
        // Duration in ms, computed with a divisor of 16000 samples per second.
        ms: Math.round((samples / 16000) * 1000),
      });
    },
    onSegment: (pcm16) => {
      const index = nextSegmentIndex++;
      transcriptionQueue.push({
        pcm16,
        queuedAt: Date.now(),
        index,
      });
      logger.info("Queued segment for STT", {
        index,
        queue: transcriptionQueue.length,
        bytes: pcm16.length,
      });
      void runNext();
    },
  });

  capture = spawnLoopbackCapture(config, logger);
  capture.stdout.on("data", (chunk: Buffer) => {
    // Statistics are updated even while capture is suppressed.
    receivedChunks += 1;
    receivedBytes += chunk.length;
    lastChunkAt = Date.now();
    if (suppressCapture) {
      return;
    }
    segmenter.pushChunk(chunk);
  });
  capture.stderr.on("data", (chunk: Buffer) => {
    const text = chunk.toString().trim();
    if (text) {
      logger.debug("[capture]", text);
    }
  });
  capture.on("error", (error) => {
    void shutdown(1, "capture-error", error);
  });
  capture.on("exit", (code, signal) => {
    logger.warn("capture exited", { code, signal });
    // An exit we did not initiate ourselves is treated as a failure.
    if (!shuttingDown) {
      void shutdown(1, "capture-exit");
    }
  });

  if (config.DEBUG) {
    if (options.enableLlm && options.enableTts) {
      console.log("실시간 출력장치 STT+LLM+TTS 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else if (options.enableLlm) {
      console.log("실시간 출력장치 STT+LLM 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else {
      console.log("실시간 출력장치 STT 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    }
    console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
    console.log(`model: ${config.WHISPER_MODEL}`);
    console.log(`language: ${config.WHISPER_LANGUAGE}`);
    console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);
    if (options.enableLlm) {
      console.log(`llm: ${config.OLLAMA_MODEL}`);
    }
    if (options.enableTts) {
      console.log(`tts: ${config.TTS_IMAGE}`);
    }
  }

  // Watchdog: every 5 seconds, warn when no audio has ever arrived or when the
  // stream has been silent for 5 seconds. unref() keeps the timer from holding
  // the process open on its own.
  setInterval(() => {
    const now = Date.now();
    if (lastChunkAt === 0 && !shuttingDown) {
      logger.warn("아직 캡처 PCM 데이터가 들어오지 않았습니다. AUDIO_SOURCE 가 잘못됐거나 loopback 입력이 아닌 장치일 수 있습니다.");
      return;
    }

    if (lastChunkAt > 0 && now - lastChunkAt >= 5000 && !shuttingDown) {
      logger.warn("최근 5초 동안 새 PCM chunk 가 들어오지 않았습니다.");
    }
  }, 5000).unref();
}

/**
 * Interactive console for talking to the LLM directly.
 *
 * Reads lines from stdin, sends each non-empty line to the LLM and prints the
 * reply. `/exit` closes the session and `/reset` clears the conversation.
 *
 * Lines are processed strictly one after another, so replies never overlap
 * even when input arrives faster than the model answers (e.g. piped stdin).
 * When the input closes, lines already received are answered before the
 * process exits with code 0. Ctrl+C on a terminal exits immediately.
 */
async function runLlmCli(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const llm = new OllamaLlmService(config, logger);

  await llm.warmup();

  console.log(`LLM CLI 테스트를 시작합니다. model=${config.OLLAMA_MODEL}`);
  console.log("/exit 로 종료, /reset 으로 대화 초기화");

  const rl = createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: "you> ",
  });

  // True once the readline interface has emitted "close".
  let closed = false;
  // True once "/exit" was entered; lines queued behind it are ignored.
  let exitRequested = false;
  // Tail of the sequential processing chain; never rejects.
  let pending: Promise<void> = Promise.resolve();

  // Prompting a closed interface is an error, so only prompt while open.
  const promptIfOpen = (): void => {
    if (!closed) {
      rl.prompt();
    }
  };

  /** Handles a single input line: command dispatch or one LLM round trip. */
  const handleLine = async (line: string): Promise<void> => {
    if (exitRequested) {
      return;
    }

    const text = line.trim();

    if (!text) {
      promptIfOpen();
      return;
    }

    if (text === "/exit") {
      exitRequested = true;
      rl.close();
      return;
    }

    if (text === "/reset") {
      llm.resetConversation();
      console.log("assistant> 대화 문맥을 초기화했습니다.");
      promptIfOpen();
      return;
    }

    try {
      const startedAt = Date.now();
      const reply = await llm.generateReply(text, {
        onProgress: (message) => {
          console.log(`assistant> ${message}`);
        },
      });
      logger.info("LLM latency", {
        llm_ms: Date.now() - startedAt,
      });
      console.log(`assistant> ${reply}`);
    } catch (error) {
      console.error(error instanceof Error ? error.message : String(error));
    }

    promptIfOpen();
  };

  rl.prompt();

  rl.on("line", (line) => {
    // Append to the chain instead of running concurrently; the catch keeps one
    // failing line from blocking every line queued after it.
    pending = pending
      .then(() => handleLine(line))
      .catch((error: unknown) => {
        console.error(error instanceof Error ? error.message : String(error));
      });
  });

  // With a listener registered, Ctrl+C on a terminal emits "SIGINT" here;
  // exit right away so a long-running reply can still be aborted.
  rl.on("SIGINT", () => {
    process.exit(0);
  });

  rl.on("close", () => {
    closed = true;
    // Let lines that were already received finish before exiting.
    void pending.then(() => {
      process.exit(0);
    });
  });
}

/**
 * Speaks a single sentence through the TTS service.
 *
 * The sentence is built from the CLI arguments after the mode; a built-in
 * sample sentence is used when none are given. The service is always
 * destroyed afterwards — also when warmup or playback fails — so it cannot
 * outlive the test. Warmup/playback errors still propagate to the caller;
 * a failing destroy is only logged.
 */
async function runTtsTest(): Promise<void> {
  const text = process.argv.slice(3).join(" ").trim() || "안녕하세요. 로컬 티티에스 테스트입니다.";
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const tts = new MeloTtsService(config, logger);

  try {
    console.log("TTS 준비중...");
    await tts.warmup();
    console.log("TTS 준비 완료");
    console.log(`재생 문장: ${text}`);
    await tts.speak(text);
  } finally {
    await tts.destroy().catch((destroyError) => {
      logger.warn("TTS destroy failed", destroyError);
    });
  }
}

/**
 * Runs the routine that belongs to the selected `mode`.
 *
 * @throws Error when `mode` is not one of the supported run modes.
 */
async function main(): Promise<void> {
  // Builds a launcher for the capture pipeline with the given stages enabled.
  const sttPipeline = (enableLlm: boolean, enableTts: boolean) => (): Promise<void> =>
    runSttTest({ enableLlm, enableTts });

  // A Map (not a plain object) so only the listed keys can ever match.
  const routines = new Map<string, () => Promise<void>>([
    [
      "devices",
      async () => {
        await printAudioDevices();
      },
    ],
    ["test-stt", sttPipeline(false, false)],
    ["test-sttllm", sttPipeline(true, false)],
    ["test-all", sttPipeline(true, true)],
    ["test-llm", () => runLlmCli()],
    ["test-tts", () => runTtsTest()],
  ]);

  const routine = routines.get(mode);
  if (routine === undefined) {
    throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-all, test-llm, test-tts, devices`);
  }

  await routine();
}

/** Prints the failure message to stderr and terminates with exit code 1. */
const reportFatalError = (failure: unknown): never => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exit(1);
};

// Program entry: any error escaping main() is reported and ends the process.
void main().catch(reportFatalError);