428 lines
12 KiB
TypeScript
428 lines
12 KiB
TypeScript
import process from "node:process";
|
|
import { createInterface } from "node:readline";
|
|
|
|
import { loadConfig } from "./config.js";
|
|
import { Logger } from "./logger.js";
|
|
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
|
|
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
|
|
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
|
|
import { MeloTtsService } from "./services/melo-tts.js";
|
|
import { OllamaLlmService } from "./services/ollama-llm.js";
|
|
|
|
// Execution mode comes from the first CLI argument after the script path;
// when it is absent the STT-only test is used.
const [, , requestedMode] = process.argv;
const mode = requestedMode ?? "test-stt";
|
|
|
|
/**
 * Runs the live capture pipeline: loopback PCM from the capture child process
 * is fed to a `RealtimeSegmenter`, each emitted segment is transcribed by the
 * STT service, and — depending on `options` — the transcript is passed to the
 * LLM and the reply is spoken through TTS.
 *
 * The function returns once everything is wired up. Teardown happens through
 * the process-level handlers registered here (SIGINT, SIGTERM,
 * uncaughtException, unhandledRejection) and through capture failures, all of
 * which end in `process.exit`.
 *
 * @param options.enableLlm - Create the LLM service and answer transcripts.
 * @param options.enableTts - Create the TTS service; only takes effect when
 *   `config.TTS_ENABLED` is also truthy and TTS warm-up succeeds.
 * @throws Propagates any error thrown by STT or LLM warm-up. A TTS warm-up
 *   failure is not fatal: TTS is disabled and the run continues.
 */
async function runSttTest(options: { enableLlm: boolean; enableTts: boolean }): Promise<void> {
  // Minimum spacing between "Audio input heartbeat" log lines.
  const levelLogIntervalMs = 3000;
  // Period of the capture watchdog; the stall warning text below says "5초" to match.
  const captureStallCheckMs = 5000;
  // Divisor used to turn a segment's sample count into milliseconds for logging.
  const samplesPerSecond = 16000;

  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const stt = new FasterWhisperSttService(config, logger);
  const llm = options.enableLlm ? new OllamaLlmService(config, logger) : null;
  let tts = options.enableTts && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
  let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
  // Set on the first shutdown request; later requests await the same promise.
  let shuttingDown: Promise<void> | null = null;
  // While true, captured chunks are counted but not fed to the segmenter.
  let suppressCapture = false;
  let receivedChunks = 0;
  let receivedBytes = 0;
  let maxPeak = 0;
  let lastChunkAt = 0;
  let lastLevelLogAt = 0;
  let sawSpeechStart = false;
  let emittedSegmentCount = 0;

  /**
   * Stops capture, destroys the services, then exits with `exitCode`.
   * Re-entrant calls wait for the teardown that is already in flight.
   */
  const shutdown = async (exitCode: number, reason: string, error?: unknown): Promise<void> => {
    if (shuttingDown) {
      return await shuttingDown;
    }

    shuttingDown = (async () => {
      if (error) {
        logger.error(`Shutting down: ${reason}`, error);
      } else {
        logger.info("Shutting down", reason);
      }

      if (capture && !capture.killed && capture.exitCode === null) {
        capture.kill("SIGTERM");
      }

      // Destroy failures are logged, never rethrown, so teardown always completes.
      await stt.destroy().catch((destroyError) => {
        logger.warn("STT destroy failed", destroyError);
      });
      if (tts) {
        await tts.destroy().catch((destroyError) => {
          logger.warn("TTS destroy failed", destroyError);
        });
      }
    })();

    await shuttingDown;
    process.exit(exitCode);
  };

  process.once("SIGINT", () => {
    void shutdown(0, "SIGINT");
  });
  process.once("SIGTERM", () => {
    void shutdown(0, "SIGTERM");
  });
  process.once("uncaughtException", (error) => {
    void shutdown(1, "uncaughtException", error);
  });
  process.once("unhandledRejection", (reason) => {
    void shutdown(1, "unhandledRejection", reason);
  });
  // Last-chance cleanup for exits that bypass `shutdown` (e.g. process.exit
  // from the caller). 'exit' listeners cannot wait for asynchronous work, so
  // the destroy() calls below are best-effort only.
  process.once("exit", () => {
    if (capture && !capture.killed && capture.exitCode === null) {
      capture.kill("SIGKILL");
    }
    void stt.destroy();
    if (tts) {
      void tts.destroy();
    }
  });

  console.log("STT 준비중...");
  await stt.warmup();
  logger.info("STT warmup finished");
  console.log("STT 준비 완료");

  if (llm) {
    console.log("LLM 준비중...");
    await llm.warmup();
    logger.info("LLM warmup finished");
    console.log("LLM 준비 완료");
  }

  if (tts) {
    console.log("TTS 준비중...");
    try {
      await tts.warmup();
      logger.info("TTS warmup finished", {
        image: config.TTS_IMAGE,
        language: config.TTS_LANGUAGE,
        speaker: config.TTS_SPEAKER,
      });
      console.log("TTS 준비 완료");
    } catch (error) {
      logger.warn("TTS warmup failed", error);
      console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
      // Dropping the reference makes the service unreachable for `shutdown`
      // and the 'exit' handler, so release it here. Not awaited: a slow or
      // stuck teardown must not delay the STT/LLM pipeline from starting.
      const failedTts = tts;
      tts = null;
      void failedTts.destroy().catch((destroyError) => {
        logger.warn("TTS destroy failed", destroyError);
      });
    }
  }

  const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
  let transcribing = false;
  let nextSegmentIndex = 1;

  /**
   * Processes queued segments one at a time: STT, then (optionally) reply
   * assessment, reply generation and TTS playback. Every exit path goes
   * through the `finally` block, which releases the busy flag and pulls the
   * next queued segment.
   */
  const runNext = async (): Promise<void> => {
    if (transcribing) {
      return;
    }
    const next = transcriptionQueue.shift();
    if (!next) {
      return;
    }

    transcribing = true;
    try {
      const startedAt = Date.now();
      const text = await stt.transcribePcm16(next.pcm16);
      logger.info("STT latency", {
        index: next.index,
        wait_ms: startedAt - next.queuedAt,
        transcribe_ms: Date.now() - startedAt,
      });

      if (!text) {
        logger.info("빈 전사 결과");
        return;
      }

      logger.info("Transcript", { index: next.index, text });
      if (config.DEBUG) {
        if (config.DEBUG_TRANSCRIPTS) {
          console.log(`\n[text] ${text}\n`);
        }
      } else {
        console.log(`사용자> ${text}`);
      }

      if (!llm) {
        return;
      }

      const assessmentStartedAt = Date.now();
      const assessment = await llm.assessReplyNeed(text);
      logger.info("Reply assessment", {
        index: next.index,
        should_reply: assessment.shouldReply,
        likely_needs_lookup: assessment.likelyNeedsLookup,
        reason: assessment.reason,
        assessment_ms: Date.now() - assessmentStartedAt,
      });

      if (!assessment.shouldReply) {
        if (config.DEBUG) {
          console.log(`[skip] ${assessment.reason}\n`);
        }
        return;
      }

      const llmStartedAt = Date.now();
      const reply = await llm.generateReply(text, {
        onProgress: (message) => {
          if (config.DEBUG) {
            console.log(`[assistant] ${message}`);
            return;
          }
          console.log(`답변> ${message}`);
        },
      });
      logger.info("LLM latency", {
        index: next.index,
        llm_ms: Date.now() - llmStartedAt,
      });
      logger.info("LLM reply", { index: next.index, text: reply });

      if (config.DEBUG) {
        if (config.DEBUG_TRANSCRIPTS) {
          console.log(`[assistant] ${reply}\n`);
        }
      } else {
        console.log(`답변> ${reply}`);
      }

      if (tts) {
        // Stop feeding the segmenter while the reply is being spoken and drop
        // whatever it had buffered so far.
        suppressCapture = true;
        segmenter.reset();
        try {
          await tts.speak(reply);
        } catch (error) {
          logger.warn("TTS playback failed", error);
        } finally {
          suppressCapture = false;
          sawSpeechStart = false;
          maxPeak = 0;
        }
      }
    } catch (error) {
      logger.error("STT/LLM failed", error);
    } finally {
      transcribing = false;
      void runNext();
    }
  };

  const segmenter = new RealtimeSegmenter({
    preRollSamples: config.SEGMENT_PREROLL_SAMPLES,
    speechStartThreshold: config.SEGMENT_START_THRESHOLD,
    speechContinueThreshold: config.SEGMENT_CONTINUE_THRESHOLD,
    speechStartFrames: config.SEGMENT_START_FRAMES,
    speechEndFrames: config.SEGMENT_END_FRAMES,
    minSpeechSamples: config.SEGMENT_MIN_SPEECH_SAMPLES,
    maxSpeechSamples: config.SEGMENT_MAX_SPEECH_SAMPLES,
    onLevel: (peak) => {
      if (peak > maxPeak) {
        maxPeak = peak;
      }

      // Rate-limited heartbeat; the tracked peak restarts after each log line.
      const now = Date.now();
      if (now - lastLevelLogAt >= levelLogIntervalMs) {
        lastLevelLogAt = now;
        logger.info("Audio input heartbeat", {
          chunks: receivedChunks,
          bytes: receivedBytes,
          peak: maxPeak,
          speech_started: sawSpeechStart,
          emitted_segments: emittedSegmentCount,
        });
        maxPeak = 0;
      }
    },
    onSpeechStart: (peak) => {
      sawSpeechStart = true;
      logger.info("Speech start detected", { peak });
    },
    onSpeechDiscarded: (samples) => {
      logger.info("Discarded short speech segment", { samples });
    },
    onSpeechReady: (samples) => {
      emittedSegmentCount += 1;
      logger.info("Speech segment ready", {
        index: emittedSegmentCount,
        samples,
        ms: Math.round((samples / samplesPerSecond) * 1000),
      });
    },
    onSegment: (pcm16) => {
      const index = nextSegmentIndex++;
      transcriptionQueue.push({
        pcm16,
        queuedAt: Date.now(),
        index,
      });
      logger.info("Queued segment for STT", {
        index,
        queue: transcriptionQueue.length,
        bytes: pcm16.length,
      });
      void runNext();
    },
  });

  capture = spawnLoopbackCapture(config, logger);
  capture.stdout.on("data", (chunk: Buffer) => {
    // Counters are updated even while capture is suppressed so the watchdog
    // and heartbeat still reflect that data is arriving.
    receivedChunks += 1;
    receivedBytes += chunk.length;
    lastChunkAt = Date.now();
    if (suppressCapture) {
      return;
    }
    segmenter.pushChunk(chunk);
  });
  capture.stderr.on("data", (chunk: Buffer) => {
    const text = chunk.toString().trim();
    if (text) {
      logger.debug("[capture]", text);
    }
  });
  capture.on("error", (error) => {
    void shutdown(1, "capture-error", error);
  });
  capture.on("exit", (code, signal) => {
    logger.warn("capture exited", { code, signal });
    // An exit we did not request means the pipeline has lost its input.
    if (!shuttingDown) {
      void shutdown(1, "capture-exit");
    }
  });

  if (config.DEBUG) {
    if (options.enableLlm && options.enableTts) {
      console.log("실시간 출력장치 STT+LLM+TTS 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else if (options.enableLlm) {
      console.log("실시간 출력장치 STT+LLM 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    } else {
      console.log("실시간 출력장치 STT 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
    }
    console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
    console.log(`model: ${config.WHISPER_MODEL}`);
    console.log(`language: ${config.WHISPER_LANGUAGE}`);
    console.log(`beam: ${config.WHISPER_BEAM_SIZE}`);
    if (options.enableLlm) {
      console.log(`llm: ${config.OLLAMA_MODEL}`);
    }
    if (options.enableTts) {
      console.log(`tts: ${config.TTS_IMAGE}`);
    }
  }

  // Watchdog: warns when no PCM has ever arrived, or when the stream has
  // stalled. Unref'd so the timer alone never keeps the process alive.
  setInterval(() => {
    const now = Date.now();
    if (lastChunkAt === 0 && !shuttingDown) {
      logger.warn("아직 캡처 PCM 데이터가 들어오지 않았습니다. AUDIO_SOURCE 가 잘못됐거나 loopback 입력이 아닌 장치일 수 있습니다.");
      return;
    }

    if (lastChunkAt > 0 && now - lastChunkAt >= captureStallCheckMs && !shuttingDown) {
      logger.warn("최근 5초 동안 새 PCM chunk 가 들어오지 않았습니다.");
    }
  }, captureStallCheckMs).unref();
}
|
|
|
|
/**
 * Interactive, text-only chat loop against the LLM service.
 *
 * Reads lines from stdin with a `you> ` prompt. `/exit` closes the interface,
 * `/reset` clears the conversation, blank lines are ignored, and anything
 * else is sent to the model. Closing the interface exits the process with
 * status 0.
 */
async function runLlmCli(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const llm = new OllamaLlmService(config, logger);

  await llm.warmup();

  console.log(`LLM CLI 테스트를 시작합니다. model=${config.OLLAMA_MODEL}`);
  console.log("/exit 로 종료, /reset 으로 대화 초기화");

  const rl = createInterface({ input: process.stdin, output: process.stdout, prompt: "you> " });

  // Sends one user message to the model and prints the reply; failures are
  // reported on stderr and do not end the session.
  const answer = async (question: string): Promise<void> => {
    try {
      const begunAt = Date.now();
      const reply = await llm.generateReply(question, {
        onProgress: (message) => {
          console.log(`assistant> ${message}`);
        },
      });
      logger.info("LLM latency", {
        llm_ms: Date.now() - begunAt,
      });
      console.log(`assistant> ${reply}`);
    } catch (error) {
      console.error(error instanceof Error ? error.message : String(error));
    }
  };

  // Dispatches a single input line; every path except `/exit` re-shows the prompt.
  const onLine = async (line: string): Promise<void> => {
    const text = line.trim();

    if (text === "/exit") {
      rl.close();
      return;
    }

    if (text === "/reset") {
      llm.resetConversation();
      console.log("assistant> 대화 문맥을 초기화했습니다.");
    } else if (text) {
      await answer(text);
    }

    rl.prompt();
  };

  rl.prompt();
  rl.on("line", onLine);
  rl.on("close", () => {
    process.exit(0);
  });
}
|
|
|
|
/**
 * Warms up the TTS service and speaks one sentence.
 *
 * The sentence is everything after the mode argument on the command line,
 * joined with spaces; when that is empty a built-in Korean test sentence is
 * used. The service is always destroyed afterwards, whether warm-up and
 * playback succeeded or not.
 *
 * @throws Propagates errors from `warmup()` or `speak()`; a failing
 *   `destroy()` is only logged so it cannot mask the original error.
 */
async function runTtsTest(): Promise<void> {
  // `||` is intentional: an empty string must fall back to the default sentence.
  const text = process.argv.slice(3).join(" ").trim() || "안녕하세요. 로컬 티티에스 테스트입니다.";
  const config = loadConfig();
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const tts = new MeloTtsService(config, logger);

  try {
    console.log("TTS 준비중...");
    await tts.warmup();
    console.log("TTS 준비 완료");
    console.log(`재생 문장: ${text}`);
    await tts.speak(text);
  } finally {
    // Release the service the same way the STT pipeline does on shutdown.
    await tts.destroy().catch((destroyError: unknown) => {
      logger.warn("TTS destroy failed", destroyError);
    });
  }
}
|
|
|
|
/**
 * Dispatches to the runner selected by the module-level `mode`.
 *
 * @throws Error when `mode` is not one of the supported mode names.
 */
async function main(): Promise<void> {
  // A Map (rather than an object literal) so that inherited keys such as
  // "constructor" can never be mistaken for a valid mode.
  const runners = new Map<string, () => Promise<void>>([
    [
      "devices",
      async () => {
        await printAudioDevices();
      },
    ],
    ["test-stt", () => runSttTest({ enableLlm: false, enableTts: false })],
    ["test-sttllm", () => runSttTest({ enableLlm: true, enableTts: false })],
    ["test-all", () => runSttTest({ enableLlm: true, enableTts: true })],
    ["test-llm", () => runLlmCli()],
    ["test-tts", () => runTtsTest()],
  ]);

  const run = runners.get(mode);
  if (!run) {
    throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-all, test-llm, test-tts, devices`);
  }

  await run();
}
|
|
|
|
// Entry point: run the selected mode; on failure print only the error message
// (or the stringified value for non-Error throws) and exit with status 1.
void main().catch((failure) => {
  const message = failure instanceof Error ? failure.message : String(failure);
  console.error(message);
  process.exit(1);
});
|