import process from "node:process";
import { createInterface } from "node:readline";
import { loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
import { MeloTtsService } from "./services/melo-tts.js";
import { OllamaLlmService } from "./services/ollama-llm.js";

/** Run mode taken from the first CLI argument; defaults to the STT-only test. */
const mode = process.argv[2] ?? "test-stt";

/**
 * Runs the realtime capture -> segmenter -> STT pipeline, optionally followed by
 * an LLM reply and TTS playback for each transcribed segment.
 *
 * The function installs process-level signal/error handlers, warms up the
 * enabled services, spawns the capture process and then returns; the work
 * continues in event callbacks until `shutdown` calls `process.exit`.
 *
 * @param options.enableLlm - When true, each transcript is assessed and possibly answered by the LLM.
 * @param options.enableTts - When true (and `config.TTS_ENABLED` is set), replies are spoken via TTS.
 */
async function runSttTest(options: { enableLlm: boolean; enableTts: boolean }): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const stt = new FasterWhisperSttService(config, logger);
  const llm = options.enableLlm ? new OllamaLlmService(config, logger) : null;
  // `tts` is reassigned to null below if its warmup fails.
  let tts = options.enableTts && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
  let capture: ReturnType<typeof spawnLoopbackCapture> | null = null;
  // Non-null once a shutdown has started; doubles as the "already shutting down" flag.
  let shuttingDown: Promise<void> | null = null;
  // While true, captured chunks are counted but not fed to the segmenter.
  let suppressCapture = false;
  let receivedChunks = 0;
  let receivedBytes = 0;
  let maxPeak = 0;
  let lastChunkAt = 0;
  let lastLevelLogAt = 0;
  let sawSpeechStart = false;
  let emittedSegmentCount = 0;

  /**
   * Stops the capture process, destroys the services and exits the process.
   * Concurrent callers share the first teardown; only the first caller reaches
   * `process.exit`.
   */
  const shutdown = async (exitCode: number, reason: string, error?: unknown): Promise<void> => {
    if (shuttingDown) {
      return await shuttingDown;
    }

    shuttingDown = (async () => {
      if (error) {
        logger.error(`Shutting down: ${reason}`, error);
      } else {
        logger.info("Shutting down", reason);
      }

      if (capture && !capture.killed && capture.exitCode === null) {
        capture.kill("SIGTERM");
      }

      // Teardown is best-effort: a failing destroy is logged, never rethrown.
      await stt.destroy().catch((destroyError: unknown) => {
        logger.warn("STT destroy failed", destroyError);
      });

      if (tts) {
        await tts.destroy().catch((destroyError: unknown) => {
          logger.warn("TTS destroy failed", destroyError);
        });
      }
    })();

    await shuttingDown;
    process.exit(exitCode);
  };

  process.once("SIGINT", () => {
    void shutdown(0, "SIGINT");
  });

  process.once("SIGTERM", () => {
    void shutdown(0, "SIGTERM");
  });

  process.once("uncaughtException", (error) => {
    void shutdown(1, "uncaughtException", error);
  });

  process.once("unhandledRejection", (reason) => {
    void shutdown(1, "unhandledRejection", reason);
  });

  // Last-resort cleanup. Node only runs synchronous work in 'exit' listeners,
  // so the destroy() calls here are fire-and-forget.
  process.once("exit", () => {
    if (capture && !capture.killed && capture.exitCode === null) {
      capture.kill("SIGKILL");
    }
    void stt.destroy();
    if (tts) {
      void tts.destroy();
    }
  });

  console.log("STT 준비중...");
  await stt.warmup();
  logger.info("STT warmup finished");
  console.log("STT 준비 완료");

  if (llm) {
    console.log("LLM 준비중...");
    await llm.warmup();
    logger.info("LLM warmup finished");
    console.log("LLM 준비 완료");
  }

  if (tts) {
    console.log("TTS 준비중...");
    try {
      await tts.warmup();
      logger.info("TTS warmup finished", {
        image: config.TTS_IMAGE,
        language: config.TTS_LANGUAGE,
        speaker: config.TTS_SPEAKER,
      });
      console.log("TTS 준비 완료");
    } catch (error) {
      // A TTS warmup failure is not fatal: continue without speech output.
      logger.warn("TTS warmup failed", error);
      console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
      tts = null;
    }
  }

  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
  let nextSegmentIndex = 1;

  /**
   * Processes queued segments one at a time: transcribe, then (if enabled)
   * assess, reply and speak. Re-invokes itself when finished so the queue drains.
   */
  const runNext = async (): Promise<void> => {
    if (transcribing) {
      return;
    }

    const next = transcriptionQueue.shift();
    if (!next) {
      return;
    }

    transcribing = true;
    try {
      const startedAt = Date.now();
      const text = await stt.transcribePcm16(next.pcm16);
      logger.info("STT latency", {
        index: next.index,
        wait_ms: startedAt - next.queuedAt,
        transcribe_ms: Date.now() - startedAt,
      });

      if (!text) {
        logger.info("빈 전사 결과");
      } else {
        logger.info("Transcript", { index: next.index, text });
        if (config.DEBUG) {
          if (config.DEBUG_TRANSCRIPTS) {
            console.log(`\n[text] ${text}\n`);
          }
        } else {
          console.log(`사용자> ${text}`);
        }

        if (llm) {
          const assessmentStartedAt = Date.now();
          const assessment = await llm.assessReplyNeed(text);
          logger.info("Reply assessment", {
            index: next.index,
            should_reply: assessment.shouldReply,
            likely_needs_lookup: assessment.likelyNeedsLookup,
            reason: assessment.reason,
            assessment_ms: Date.now() - assessmentStartedAt,
          });

          if (!assessment.shouldReply) {
            if (config.DEBUG) {
              console.log(`[skip] ${assessment.reason}\n`);
            }
            // The `finally` below still runs and picks up the next segment.
            return;
          }

          const llmStartedAt = Date.now();
          const reply = await llm.generateReply(text, {
            onProgress: (message) => {
              if (config.DEBUG) {
                console.log(`[assistant] ${message}`);
                return;
              }
              console.log(`답변> ${message}`);
            },
          });
          logger.info("LLM latency", {
            index: next.index,
            llm_ms: Date.now() - llmStartedAt,
          });
          logger.info("LLM reply", { index: next.index, text: reply });

          if (config.DEBUG) {
            if (config.DEBUG_TRANSCRIPTS) {
              console.log(`[assistant] ${reply}\n`);
            }
          } else {
            console.log(`답변> ${reply}`);
          }

          if (tts) {
            // Drop captured audio while speaking — presumably so the assistant's
            // own voice on the loopback device is not transcribed (confirm).
            suppressCapture = true;
            segmenter.reset();
            try {
              await tts.speak(reply);
            } catch (error) {
              logger.warn("TTS playback failed", error);
            } finally {
              suppressCapture = false;
              sawSpeechStart = false;
              maxPeak = 0;
            }
          }
        }
      }
    } catch (error) {
      logger.error("STT/LLM failed", error);
    } finally {
      transcribing = false;
      void runNext();
    }
  };

  const segmenter = new RealtimeSegmenter({
    preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
    speechStartThreshold: config.SEGMENT_START_THRESHOLD,
    speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
    speechStartFrames: config.SEGMENT_START_FRAMES,
    speechEndFrames: config.SEGMENT_END_FRAMES,
    minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
    maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
    onLevel: (peak) => {
      if (peak > maxPeak) {
        maxPeak = peak;
      }

      // Emit a heartbeat at most every 3 s, then restart peak tracking.
      const now = Date.now();
      if (now - lastLevelLogAt >= 3000) {
        lastLevelLogAt = now;
        logger.info("Audio input heartbeat", {
          chunks: receivedChunks,
          bytes: receivedBytes,
          peak: maxPeak,
          speech_started: sawSpeechStart,
          emitted_segments: emittedSegmentCount,
        });
        maxPeak = 0;
      }
    },
    onSpeechStart: (peak) => {
      sawSpeechStart = true;
      logger.info("Speech start detected", { peak });
    },
    onSpeechDiscarded: (samples) => {
      logger.info("Discarded short speech segment", { samples });
    },
    onSpeechReady: (samples) => {
      emittedSegmentCount += 1;
      logger.info("Speech segment ready", {
        index: emittedSegmentCount,
        samples,
        // NOTE(review): duration assumes a 16 kHz sample rate — confirm against the capture settings.
        ms: Math.round((samples / 16000) * 1000),
      });
    },
    onSegment: (pcm16) => {
      const index = nextSegmentIndex++;
      transcriptionQueue.push({
        pcm16,
        queuedAt: Date.now(),
        index,
      });
      logger.info("Queued segment for STT", {
        index,
        queue: transcriptionQueue.length,
        bytes: pcm16.length,
      });
      void runNext();
    },
  });

  capture = spawnLoopbackCapture(config, logger);

  capture.stdout.on("data", (chunk: Buffer) => {
    receivedChunks += 1;
    receivedBytes += chunk.length;
    lastChunkAt = Date.now();
    if (suppressCapture) {
      return;
    }
    segmenter.pushChunk(chunk);
  });

  capture.stderr.on("data", (chunk: Buffer) => {
    const text = chunk.toString().trim();
    if (text) {
      logger.debug("[capture]", text);
    }
  });

  capture.on("error", (error) => {
    void shutdown(1, "capture-error", error);
  });

  capture.on("exit", (code, signal) => {
    logger.warn("capture exited", { code, signal });
    // An exit that was not triggered by our own shutdown is treated as a failure.
    if (!shuttingDown) {
      void shutdown(1, "capture-exit");
    }
  });

  if (config.DEBUG) {
    if (options.enableLlm && options.enableTts) {
      console.log("실시간 출력장치 STT+LLM+TTS 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else if (options.enableLlm) {
      console.log("실시간 출력장치 STT+LLM 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else {
      console.log("실시간 출력장치 STT 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    }
    console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
    console.log(`model: ${config.WHISPER_MODEL}`);
    console.log(`language: ${config.WHISPER_LANGUAGE}`);
    console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);
    if (options.enableLlm) {
      console.log(`llm: ${config.OLLAMA_MODEL}`);
    }
    if (options.enableTts) {
      console.log(`tts: ${config.TTS_IMAGE}`);
    }
  }

  // Watchdog: every 5 s, warn if no PCM has arrived yet or the stream has stalled.
  // unref() keeps this timer from holding the process open on its own.
  setInterval(() => {
    const now = Date.now();
    if (lastChunkAt === 0 && !shuttingDown) {
      logger.warn("아직 캡처 PCM 데이터가 들어오지 않았습니다. AUDIO_SOURCE 가 잘못됐거나 loopback 입력이 아닌 장치일 수 있습니다.");
      return;
    }

    if (lastChunkAt > 0 && now - lastChunkAt >= 5000 && !shuttingDown) {
      logger.warn("최근 5초 동안 새 PCM chunk 가 들어오지 않았습니다.");
    }
  }, 5000).unref();
}

/**
 * Interactive LLM chat on stdin/stdout.
 *
 * `/exit` closes the prompt (which exits the process with code 0) and `/reset`
 * clears the conversation via `llm.resetConversation()`. Any other non-empty
 * line is sent to the LLM and the reply is printed.
 */
async function runLlmCli(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const llm = new OllamaLlmService(config, logger);

  await llm.warmup();
  console.log(`LLM CLI 테스트를 시작합니다. model=${config.OLLAMA_MODEL}`);
  console.log("/exit 로 종료, /reset 으로 대화 초기화");

  const rl = createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: "you> ",
  });

  rl.prompt();

  rl.on("line", async (line) => {
    const text = line.trim();
    if (!text) {
      rl.prompt();
      return;
    }

    if (text === "/exit") {
      rl.close();
      return;
    }

    if (text === "/reset") {
      llm.resetConversation();
      console.log("assistant> 대화 문맥을 초기화했습니다.");
      rl.prompt();
      return;
    }

    try {
      const startedAt = Date.now();
      const reply = await llm.generateReply(text, {
        onProgress: (message) => {
          console.log(`assistant> ${message}`);
        },
      });
      logger.info("LLM latency", {
        llm_ms: Date.now() - startedAt,
      });
      console.log(`assistant> ${reply}`);
    } catch (error) {
      // A failed request is reported and the prompt continues.
      console.error(error instanceof Error ? error.message : String(error));
    }

    rl.prompt();
  });

  rl.on("close", () => {
    process.exit(0);
  });
}

/**
 * One-shot TTS check: warms up the TTS service and speaks the text given after
 * the mode argument, or a built-in sample sentence when none is given.
 *
 * The service is destroyed afterwards whether or not warmup/playback succeeded,
 * mirroring the teardown performed by `runSttTest`; warmup/playback errors still
 * propagate to the caller.
 */
async function runTtsTest(): Promise<void> {
  // `||` is intentional: an empty/whitespace-only argument falls back to the sample text.
  const text = process.argv.slice(3).join(" ").trim() || "안녕하세요. 로컬 티티에스 테스트입니다.";
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const tts = new MeloTtsService(config, logger);

  try {
    console.log("TTS 준비중...");
    await tts.warmup();
    console.log("TTS 준비 완료");
    console.log(`재생 문장: ${text}`);
    await tts.speak(text);
  } finally {
    // Best-effort teardown so the service is not left behind after the test.
    await tts.destroy().catch((destroyError: unknown) => {
      logger.warn("TTS destroy failed", destroyError);
    });
  }
}

/**
 * Dispatches to the handler for the selected run mode.
 *
 * @throws Error when `mode` is not one of the supported values.
 */
async function main(): Promise<void> {
  switch (mode) {
    case "devices":
      await printAudioDevices();
      return;
    case "test-stt":
      await runSttTest({ enableLlm: false, enableTts: false });
      return;
    case "test-sttllm":
      await runSttTest({ enableLlm: true, enableTts: false });
      return;
    case "test-all":
      await runSttTest({ enableLlm: true, enableTts: true });
      return;
    case "test-llm":
      await runLlmCli();
      return;
    case "test-tts":
      await runTtsTest();
      return;
    default:
      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-all, test-llm, test-tts, devices`);
  }
}

// Entry point: print the failure message and exit non-zero on any startup error.
void main().catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exit(1);
});