Integrate LLM into STT flow with reply gating

2026-05-03 01:00:44 +09:00
parent b28f163217
commit c53dcc853d
3 changed files with 171 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
 - 메모리 버퍼 기반 간단한 저지연 발화 분리
 - 미리 로드한 `faster-whisper` 워커에 PCM 직접 전달
 - 디스크에 WAV 저장 없이 바로 전사
 - STT 결과에 대해 답변 가치 판단 후 필요할 때만 LLM 답변
 - 로컬 `Ollama` LLM 에이전트 CLI 테스트
 ## 빠른 시작
@@ -91,7 +92,7 @@ bun run test:llm
 4. `bun run devices`
 5. `.env`에서 `AUDIO_SOURCE=`에 루프백 장치 이름 입력
 6. `bun run test:stt`
-7. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사 확인
+7. 유튜브, 디스코드 통화, 동영상 같은 소리를 재생해서 전사와 LLM 답변 확인
 ## Windows LLM 테스트 순서
@@ -111,6 +112,8 @@ bun run test:llm
 동작 원칙:
 - 일반 대화는 로컬 LLM만 답변
 - 최신 정보, 뉴스, 사실 확인, 검색 요청일 때만 웹 도구 사용
 - STT 경로에서는 먼저 "대답할 가치가 있는 텍스트인지" 판정한 뒤 필요할 때만 답변
 - 웹 검색이 실제로 시작되면 결과 전에 `검색해볼게요.` 같은 진행 메시지를 먼저 출력
 ## Windows용 .env 예시
--- a/src/index.ts
+++ b/src/index.ts
@@ -14,6 +14,7 @@ async function runSttTest(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const stt = new FasterWhisperSttService(config, logger);
  const llm = new OllamaLlmService(config, logger);
  let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
  let shuttingDown: Promise<void> | null = null;
  let receivedChunks = 0;
@@ -70,6 +71,8 @@ async function runSttTest(): Promise<void> {
  await stt.warmup();
  logger.info("STT warmup finished");
  await llm.warmup();
  logger.info("LLM warmup finished");
  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
@@ -102,11 +105,52 @@ async function runSttTest(): Promise<void> {
            console.log(`\n[text] ${text}\n`);
          }
        } else {
-          console.log(text);
+          console.log(`사용자> ${text}`);
        }
        const assessmentStartedAt = Date.now();
        const assessment = await llm.assessReplyNeed(text);
        logger.info("Reply assessment", {
          index: next.index,
          should_reply: assessment.shouldReply,
          likely_needs_lookup: assessment.likelyNeedsLookup,
          reason: assessment.reason,
          assessment_ms: Date.now() - assessmentStartedAt,
        });
        if (!assessment.shouldReply) {
          if (config.DEBUG) {
            console.log(`[skip] ${assessment.reason}\n`);
          }
          return;
        }
        const llmStartedAt = Date.now();
        const reply = await llm.generateReply(text, {
          onProgress: (message) => {
            if (config.DEBUG) {
              console.log(`[assistant] ${message}`);
              return;
            }
            console.log(`답변> ${message}`);
          },
        });
        logger.info("LLM latency", {
          index: next.index,
          llm_ms: Date.now() - llmStartedAt,
        });
        logger.info("LLM reply", { index: next.index, text: reply });
        if (config.DEBUG) {
          if (config.DEBUG_TRANSCRIPTS) {
            console.log(`[assistant] ${reply}\n`);
          }
        } else {
          console.log(`답변> ${reply}`);
        }
      }
    } catch (error) {
-      logger.error("STT failed", error);
+      logger.error("STT/LLM failed", error);
    } finally {
      transcribing = false;
      void runNext();
@@ -254,7 +298,11 @@ async function runLlmCli(): Promise<void> {
    try {
      const startedAt = Date.now();
-      const reply = await llm.generateReply(text);
+      const reply = await llm.generateReply(text, {
        onProgress: (message) => {
          console.log(`assistant> ${message}`);
        },
      });
      logger.info("LLM latency", {
        llm_ms: Date.now() - startedAt,
      });
--- a/src/services/ollama-llm.ts
+++ b/src/services/ollama-llm.ts
@@ -42,6 +42,16 @@ interface OllamaToolResultMessage {
  content: string;
 }
 /** Optional hooks accepted by `generateReply` and forwarded to the agent loop. */
 interface GenerateReplyOptions {
  /**
   * Receives a short progress message (for example `검색해볼게요.`) that the agent
   * loop emits before it executes a tool call that has a progress message,
   * i.e. ahead of the final reply text.
   */
  onProgress?: (message: string) => void;
 }
 /** Verdict on whether a piece of input text deserves an assistant reply. */
 export interface ReplyAssessment {
  /** True when the assistant should answer the text. */
  shouldReply: boolean;
  /** True when answering probably needs up-to-date information or fact checking. */
  likelyNeedsLookup: boolean;
  /** Short label or explanation for the verdict, e.g. "empty", "filler", "too_short", "fallback". */
  reason: string;
 }
 // System message sent at the start of every reply conversation. The Korean text
 // tells the model to act as a local voice assistant that answers in 1-3 short
 // sentences, to prefer tools for exact time / settings / calculations, and to
 // use web_search and fetch_url only when fresh or external information is needed.
 const SYSTEM_PROMPT =
  "너는 한국어로 짧고 자연스럽게 답하는 로컬 음성 비서다. 사용자의 말에 바로 답하고, 군더더기 없는 1~3문장으로 답해라. 정확한 시간, 설정 확인, 계산이 필요하면 도구를 우선 사용해라. 최신 정보, 오늘/최근 정보, 뉴스, 검색 요청, 사실 확인, 외부 웹페이지 내용이 필요한 경우에만 web_search 와 fetch_url 을 사용해라. 내부 지식만으로 충분한 일반 대화에는 웹 도구를 쓰지 마라. 너는 도구 호출 루프 안에 있으며 필요하면 여러 번 도구를 호출할 수 있다.";
@@ -158,14 +168,40 @@ export class OllamaLlmService {
    this.logger.info("LLM warmup finished", { model: this.config.OLLAMA_MODEL, reply: reply.content });
  }
-  async generateReply(userText: string): Promise<string> {
+  async assessReplyNeed(userText: string): Promise<ReplyAssessment> {
    // Decides whether `userText` deserves a reply. Order of checks: local
    // heuristics, then a tool-less LLM classification, then a fail-open fallback.
    const heuristic = this.assessReplyNeedHeuristically(userText);
    if (heuristic) {
      // The heuristic only ever produces "do not reply" verdicts (empty, filler,
      // too short), so the model is not called for those inputs.
      return heuristic;
    }
    // Classification prompt (Korean): asks the model to judge whether a local
    // assistant should actually answer and to output JSON only, in the shape
    // {"should_reply":..., "likely_needs_lookup":..., "reason":...}.
    const prompt =
      '다음 텍스트에 로컬 비서가 실제로 대답해야 하는지 판정해라. 의미 없는 감탄사, 중얼거림, 문맥 없는 짧은 파편, 노래 가사 조각, 잡음성 문장은 false. 질문, 요청, 확인, 명령, 대화 시도는 true. 최신 정보나 사실 확인이 필요하면 likely_needs_lookup 를 true 로 해라. JSON만 출력: {"should_reply":true,"likely_needs_lookup":false,"reason":"짧게"}';
    // Tools are disabled for this call; only the text content of the answer is used.
    const reply = await this.chat([
      { role: "system", content: prompt },
      { role: "user", content: userText },
    ], { enableTools: false });
    const parsed = this.parseAssessment(reply.content);
    if (parsed) {
      return parsed;
    }
    // The model output could not be parsed: default to replying, and derive the
    // lookup hint from keywords in the text instead.
    return {
      shouldReply: true,
      likelyNeedsLookup: this.mightNeedLookup(userText),
      reason: "fallback",
    };
  }
  async generateReply(userText: string, options?: GenerateReplyOptions): Promise<string> {
    const messages: Array<OllamaChatMessage | OllamaToolResultMessage> = [
      { role: "system", content: SYSTEM_PROMPT },
      ...this.history,
      { role: "user", content: userText },
    ];
-    const reply = await this.runAgentLoop(messages);
+    const reply = await this.runAgentLoop(messages, options);
    this.history.push({ role: "user", content: userText });
    this.history.push({ role: "assistant", content: reply });
@@ -186,9 +222,14 @@ export class OllamaLlmService {
    this.history = this.history.slice(-maxMessages);
  }
-  private async runAgentLoop(messages: Array<OllamaChatMessage | OllamaToolResultMessage>): Promise<string> {
+  private async runAgentLoop(
    messages: Array<OllamaChatMessage | OllamaToolResultMessage>,
    options?: GenerateReplyOptions,
  ): Promise<string> {
    let progressEmitted = false;
    for (let step = 0; step < 6; step += 1) {
-      const response = await this.chat(messages);
+      const response = await this.chat(messages, { enableTools: true });
      const toolCalls = response.toolCalls ?? [];
      messages.push({
@@ -202,6 +243,13 @@ export class OllamaLlmService {
      }
      for (const call of toolCalls) {
        if (!progressEmitted) {
          const progressMessage = this.getProgressMessage(call.function.name);
          if (progressMessage) {
            options?.onProgress?.(progressMessage);
            progressEmitted = true;
          }
        }
        const result = await this.executeTool(call);
        this.logger.info("LLM tool call", {
          name: call.function.name,
@@ -221,6 +269,7 @@ export class OllamaLlmService {
  private async chat(
    messages: Array<OllamaChatMessage | OllamaToolResultMessage>,
    options?: { enableTools: boolean },
  ): Promise<{ content: string; toolCalls: OllamaToolCall[] }> {
    const response = await fetch(`${this.config.OLLAMA_BASE_URL}/api/chat`, {
      method: "POST",
@@ -230,7 +279,7 @@ export class OllamaLlmService {
      body: JSON.stringify({
        model: this.config.OLLAMA_MODEL,
        messages,
-        tools: TOOL_DEFINITIONS,
+        tools: options?.enableTools ? TOOL_DEFINITIONS : undefined,
        stream: false,
        think: false,
        keep_alive: this.config.OLLAMA_KEEP_ALIVE,
@@ -365,4 +414,66 @@ export class OllamaLlmService {
    }
    return fallback;
  }
  /**
   * Maps a tool name to the progress line announced before that tool runs.
   *
   * @param toolName Name of the tool the model asked to call.
   * @returns The progress message for the web tools (`web_search`, `fetch_url`),
   *          or null for every other tool.
   */
  private getProgressMessage(toolName: string): string | null {
    const usesWeb = toolName === "web_search" || toolName === "fetch_url";
    return usesWeb ? "검색해볼게요." : null;
  }
  /**
   * Extracts a reply assessment from the model's raw answer.
   *
   * The answer is expected to contain a JSON object with a boolean
   * `should_reply` (or `shouldReply`), an optional boolean `likely_needs_lookup`
   * (or `likelyNeedsLookup`) and an optional string `reason`.
   *
   * @param content Raw text returned by the model.
   * @returns The parsed assessment, or null when no JSON object with a boolean
   *          reply decision can be found. Returning null (rather than a
   *          "do not reply" verdict) lets the caller apply its own fallback
   *          instead of silently dropping the reply on malformed output.
   */
  private parseAssessment(content: string): ReplyAssessment | null {
    // Greedy span first (tolerates "}" inside the reason string), then the lazy
    // span (tolerates trailing text that happens to contain another brace).
    const candidates = [content.match(/\{[\s\S]*\}/)?.[0], content.match(/\{[\s\S]*?\}/)?.[0]];
    for (const candidate of candidates) {
      if (candidate === undefined) {
        continue;
      }
      let parsed: unknown;
      try {
        parsed = JSON.parse(candidate);
      } catch {
        continue;
      }
      if (typeof parsed !== "object" || parsed === null) {
        continue;
      }
      const fields = parsed as Record<string, unknown>;
      // Only a real boolean counts as a decision; a missing field or a string
      // such as "true" is treated as unparseable rather than as "false".
      const decisions = [fields.should_reply, fields.shouldReply].filter(
        (value) => typeof value === "boolean",
      );
      if (decisions.length === 0) {
        continue;
      }
      return {
        shouldReply: decisions.includes(true),
        likelyNeedsLookup: fields.likely_needs_lookup === true || fields.likelyNeedsLookup === true,
        reason: typeof fields.reason === "string" ? fields.reason : "parsed",
      };
    }
    return null;
  }
  /**
   * Cheap pre-filter that rejects text which obviously needs no reply, so the
   * model does not have to be consulted for it.
   *
   * @param userText Text to inspect; surrounding whitespace is ignored.
   * @returns A "do not reply" assessment whose reason is "empty", "filler" or
   *          "too_short", or null when the heuristics cannot decide.
   */
  private assessReplyNeedHeuristically(userText: string): ReplyAssessment | null {
    const text = userText.trim();

    let reason: string | null = null;
    if (text.length === 0) {
      reason = "empty";
    } else if (/^(아+|어+|음+|으+|흠+|엉+|어어+|음음+|하+|호+|와+|오+|응+|네+|예+|끝\.?)$/u.test(text)) {
      // The whole utterance is a single interjection / filler word.
      reason = "filler";
    } else if (text.length <= 2 && !/[?？]/.test(text)) {
      // One or two characters without a question mark.
      reason = "too_short";
    }

    if (reason === null) {
      return null;
    }
    return { shouldReply: false, likelyNeedsLookup: false, reason };
  }
  /**
   * Keyword check used to guess whether answering needs fresh or external data.
   *
   * @param text Text to scan.
   * @returns True when the text contains any of the Korean lookup cue words
   *          listed in the pattern below.
   */
  private mightNeedLookup(text: string): boolean {
    const lookupCue = /(최신|오늘|최근|뉴스|검색|찾아|알아봐|확인|업데이트|가격|날씨|현재|실시간)/u;
    return lookupCue.test(text);
  }
 }