From 645a5109a26b72876a141dc2e01ec24ed12dc991 Mon Sep 17 00:00:00 2001
From: claude-bot <claude-bot@tkrmagid.kr>
Date: Thu, 30 Apr 2026 18:08:38 +0900
Subject: [PATCH] Improve local startup checks and Korean STT defaults

---
 .env.example               |  4 +-
 README.md                  | 10 +++-
 src/config.ts              |  4 +-
 src/local-main.ts          |  7 +++
 src/services/llm.ts        |  1 +
 src/services/ollama-llm.ts | 97 ++++++++++++++++++++++++++++----------
 6 files changed, 92 insertions(+), 31 deletions(-)
diff --git a/.env.example b/.env.example
index 06ad168..44979bf 100644
--- a/.env.example
+++ b/.env.example
@@ -11,11 +11,11 @@ LOCAL_AI_VENV_PATH=.local-ai/.venv
 LOCAL_AI_CACHE_DIR=.local-ai/cache
 # Windows면 `python` 또는 `py -3`
 LOCAL_AI_PYTHON=
-LOCAL_STT_MODEL=tiny
+LOCAL_STT_MODEL=small
 # CUDA dll 오류가 나면 `cpu`
 LOCAL_STT_DEVICE=auto
 LOCAL_STT_COMPUTE_TYPE=auto
-LOCAL_STT_BEAM_SIZE=1
+LOCAL_STT_BEAM_SIZE=3
 LOCAL_TTS_MODEL_PATH=.local-ai/models/kokoro-v1.0.onnx
 LOCAL_TTS_VOICES_PATH=.local-ai/models/voices-v1.0.bin
 LOCAL_TTS_LANGUAGE=ko
diff --git a/README.md b/README.md
index 14447a8..687885c 100644
--- a/README.md
+++ b/README.md
@@ -122,15 +122,23 @@ Windows에서 GPU STT를 쓰려면 `LOCAL_STT_DEVICE=auto` 그대로 두고 `bun
 
 ## 속도 우선 기본값
 
-- STT 기본 모델은 `tiny`
+- STT 기본 모델은 `small`
 - LLM 기본 모델은 `qwen3:0.6b`
 - TTS 기본 보이스는 `af_heart`
 - TTS 기본 속도는 `1.12`
 
+더 빠르게 돌리고 싶으면:
+
+```env
+LOCAL_STT_MODEL=tiny
+LOCAL_STT_BEAM_SIZE=1
+```
+
 정확도가 아쉬우면:
 
 ```env
 LOCAL_STT_MODEL=small
+LOCAL_STT_BEAM_SIZE=3
 OLLAMA_MODEL=qwen3:1.7b
 ```
 
diff --git a/src/config.ts b/src/config.ts
index 6259d4c..4e20dfb 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -22,10 +22,10 @@ const envSchema = z.object({
   LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
   LOCAL_AI_CACHE_DIR: z.string().min(1).default(".local-ai/cache"),
   LOCAL_AI_PYTHON: emptyToUndefined,
-  LOCAL_STT_MODEL: z.string().min(1).default("tiny"),
+  LOCAL_STT_MODEL: z.string().min(1).default("small"),
   LOCAL_STT_DEVICE: z.string().min(1).default("auto"),
   LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"),
-  LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
+  LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3),
   LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
   LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
   LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
diff --git a/src/local-main.ts b/src/local-main.ts
index c35a9c6..7199c72 100644
--- a/src/local-main.ts
+++ b/src/local-main.ts
@@ -77,6 +77,13 @@ export async function runLocalAssistant(config: AssistantRuntimeConfig, logger:
 
   await stt.warmup();
   await tts.warmup();
+  await llm.warmup?.();
+
+  if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") {
+    logger.warn(
+      "LOCAL_STT_MODEL=tiny 는 한국어 인식률이 낮을 수 있습니다. GPU 환경이면 small 이상을 권장합니다.",
+    );
+  }
 
   const session = new LocalVoiceSession({
     config,
diff --git a/src/services/llm.ts b/src/services/llm.ts
index 5c01e73..0d2b8af 100644
--- a/src/services/llm.ts
+++ b/src/services/llm.ts
@@ -1,5 +1,6 @@
 import type { ConversationMemory, UserUtterance } from "./conversation.js";
 
 export interface LlmService {
+  warmup?(): Promise<void>; // Optional startup hook; implementations may omit it, so call it as `warmup?.()`.
   generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string>;
 }
diff --git a/src/services/ollama-llm.ts b/src/services/ollama-llm.ts
index 58992d6..a93ad2b 100644
--- a/src/services/ollama-llm.ts
+++ b/src/services/ollama-llm.ts
@@ -21,6 +21,13 @@ interface OllamaChatResponse {
   error?: string;
 }
 
+interface OllamaTagsResponse { // Fields this file reads from the /api/tags response body; all optional.
+  models?: Array<{
+    name?: string; // Compared against the configured OLLAMA_MODEL.
+    model?: string; // Second identifier, also compared against OLLAMA_MODEL.
+  }>;
+}
+
 function normalizeReply(text: string): string {
   const strippedThink = text.replace(/<think>[\s\S]*?<\/think>/gi, " ");
   const compact = strippedThink.replace(/\s+/g, " ").trim();
@@ -39,35 +46,73 @@ function normalizeReply(text: string): string {
 export class OllamaLlmService implements LlmService {
   constructor(private readonly config: AssistantRuntimeConfig) {}
 
+  /** Startup check: throws if Ollama is unreachable, answers non-2xx, or lacks OLLAMA_MODEL. */
+  async warmup(): Promise<void> {
+    const url = new URL("/api/tags", this.config.OLLAMA_BASE_URL);
+    let response: Response;
+
+    try {
+      response = await fetch(url);
+    } catch {
+      throw new Error(
+        `Ollama 서버에 연결할 수 없습니다. ${this.config.OLLAMA_BASE_URL} 확인 후 Ollama 앱 또는 \`ollama serve\` 를 실행하고 \`ollama pull ${this.config.OLLAMA_MODEL}\` 까지 끝내 주세요.`,
+      );
+    }
+    // A non-JSON body degrades to {} so the status check below still reports the failure.
+    const body = (await response.json().catch(() => ({}))) as OllamaTagsResponse & { error?: string };
+    if (!response.ok) {
+      throw new Error(body.error ?? `Ollama 상태 확인 실패: HTTP ${response.status}`);
+    }
+    // Ollama lists a model pulled without a tag as "<name>:latest", so accept that form too.
+    const wanted = this.config.OLLAMA_MODEL;
+    const accepted = /:[^/]*$/.test(wanted) ? [wanted] : [wanted, `${wanted}:latest`];
+    const exists = (body.models ?? []).some((entry) =>
+      [entry.name, entry.model].some((id) => id !== undefined && accepted.includes(id.trim())),
+    );
+
+    if (!exists) {
+      throw new Error(
+        `Ollama 모델 ${this.config.OLLAMA_MODEL} 이 없습니다. \`ollama pull ${this.config.OLLAMA_MODEL}\` 를 먼저 실행해 주세요.`,
+      );
+    }
+  }
+
   async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
     const url = new URL("/api/chat", this.config.OLLAMA_BASE_URL);
-    const response = await fetch(url, {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model: this.config.OLLAMA_MODEL,
-        messages: [
-          {
-            role: "system",
-            content: ASSISTANT_INSTRUCTIONS,
-          },
-          {
-            role: "user",
-            content: memory.buildPrompt(utterance),
-          },
-        ],
-        think: false,
-        stream: false,
-        keep_alive: this.config.OLLAMA_KEEP_ALIVE,
-        options: {
-          num_ctx: this.config.OLLAMA_NUM_CTX,
-          temperature: 0.4,
-          num_predict: 120,
+    let response: Response;
+    try {
+      response = await fetch(url, {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
         },
-      }),
-    });
+        body: JSON.stringify({
+          model: this.config.OLLAMA_MODEL,
+          messages: [
+            {
+              role: "system",
+              content: ASSISTANT_INSTRUCTIONS,
+            },
+            {
+              role: "user",
+              content: memory.buildPrompt(utterance),
+            },
+          ],
+          think: false,
+          stream: false,
+          keep_alive: this.config.OLLAMA_KEEP_ALIVE,
+          options: {
+            num_ctx: this.config.OLLAMA_NUM_CTX,
+            temperature: 0.4,
+            num_predict: 120,
+          },
+        }),
+      });
+    } catch {
+      throw new Error(
+        `Ollama 서버에 연결할 수 없습니다. ${this.config.OLLAMA_BASE_URL} 확인 후 Ollama 앱 또는 \`ollama serve\` 를 실행해 주세요.`,
+      );
+    }
 
     const body = (await response.json().catch(() => ({}))) as OllamaChatResponse;