diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9f94d8c --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +LOCAL_AI_VENV_PATH=.local-ai/.venv +# Windows면 보통 python 또는 py -3 +LOCAL_AI_PYTHON= + +# Windows: ffmpeg dshow 장치 이름 +# Linux: pactl list sources short 에서 monitor/source 이름 +AUDIO_SOURCE= + +WHISPER_MODEL=large-v3-turbo +WHISPER_LANGUAGE=ko +WHISPER_DEVICE=auto +WHISPER_COMPUTE_TYPE=auto +WHISPER_BEAM_SIZE=1 + +DEBUG_TRANSCRIPTS=true +LOG_LEVEL=info diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6c1f2e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +node_modules +dist +.env +.local-ai +*.pyc +__pycache__ diff --git a/README.md b/README.md index bf94a48..329d2c2 100644 --- a/README.md +++ b/README.md @@ -1 +1,54 @@ # realtime_voice_bot + +출력장치로 재생되는 소리를 파일 저장 없이 바로 받아서, 메모리 버퍼에서 발화 구간을 나눈 뒤 `faster-whisper`로 STT 하는 최소 프로토타입입니다. + +## 현재 범위 + +- Node.js + TypeScript 메인 프로세스 +- 출력 오디오 실시간 캡처 +- 메모리 버퍼 기반 간단한 저지연 발화 분리 +- 미리 로드한 `faster-whisper` 워커에 PCM 직접 전달 +- 디스크에 WAV 저장 없이 바로 전사 + +## 빠른 시작 + +```bash +bun install +bun run setup:python +cp .env.example .env +``` + +장치 목록 확인: + +```bash +bun run devices +``` + +실행: + +```bash +bun run start:loopback +``` + +## 환경 변수 + +- `AUDIO_SOURCE` + - Windows: `bun run devices` 에서 보이는 `ffmpeg dshow` 오디오 장치 이름 + - Linux: `pactl list sources short` 에서 보이는 monitor/source 이름 +- `WHISPER_MODEL` + - 기본값 `large-v3-turbo` +- `WHISPER_LANGUAGE` + - 기본값 `ko` +- `WHISPER_DEVICE` + - `auto`, `cuda`, `cpu` +- `WHISPER_COMPUTE_TYPE` + - `auto`, `float16`, `int8_float16`, `int8`, `float32` +- `WHISPER_BEAM_SIZE` + - 기본값 `1` + +## 메모 + +- 이 버전은 일단 `STT`만 합니다. +- 최소 지연을 위해 파일 저장은 하지 않습니다. +- VAD는 현재 모델 기반이 아니라 진폭 기반 단순 분리입니다. +- Windows에서는 보통 출력 루프백이 가능한 장치나 `Stereo Mix`, 오디오 인터페이스 loopback 채널을 `AUDIO_SOURCE`로 잡아야 합니다. 
diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..0f50b05 --- /dev/null +++ b/bun.lock @@ -0,0 +1,28 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "realtime_voice_bot", + "dependencies": { + "dotenv": "^17.4.2", + "zod": "^4.3.6", + }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3", + }, + }, + }, + "packages": { + "@types/node": ["@types/node@25.6.0", "", { "dependencies": { "undici-types": "~7.19.0" } }, "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ=="], + + "dotenv": ["dotenv@17.4.2", "", {}, "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw=="], + + "typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="], + + "undici-types": ["undici-types@7.19.2", "", {}, "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg=="], + + "zod": ["zod@4.4.2", "", {}, "sha512-IynmDyxsEsb9RKzO3J9+4SxXnl2FTFSzNBaKKaMV6tsSk0rw9gYw9gs+JFCq/qk2LCZ78KDwyj+Z289TijSkUw=="], + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..7f80149 --- /dev/null +++ b/package.json @@ -0,0 +1,25 @@ +{ + "name": "realtime_voice_bot", + "version": "0.1.0", + "private": true, + "type": "module", + "scripts": { + "start:loopback": "bun src/index.ts loopback", + "devices": "bun src/index.ts devices", + "setup:python": "bun src/setup-python.ts", + "check": "tsc --noEmit", + "build": "tsc -p tsconfig.json" + }, + "engines": { + "bun": ">=1.3.0", + "node": ">=22.12.0" + }, + "dependencies": { + "dotenv": "^17.4.2", + "zod": "^4.3.6" + }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3" + } +} diff --git a/python/loopback_stt_worker.py b/python/loopback_stt_worker.py new file mode 100644 index 
"""Long-lived faster-whisper STT worker driven over stdin/stdout.

Protocol: one JSON object per line in each direction.

    request : {"id": <any>, "method": "ping" | "transcribe", "params": {...}}
    response: {"id": <same id>, "result": {...}}
              {"id": <same id or null>, "error": "<ExceptionType>: <message>"}

The model is loaded once at start-up and reused for every request.
"""

import base64
import json
import os
import sys
import traceback
from typing import TYPE_CHECKING, Any, Optional

import numpy as np

if TYPE_CHECKING:  # type-only; the runtime import is deferred to resolve_model()
    from faster_whisper import WhisperModel

DEFAULT_MODEL_NAME = "large-v3-turbo"
LANGUAGE = os.environ.get("WHISPER_LANGUAGE", "ko")
BEAM_SIZE = int(os.environ.get("WHISPER_BEAM_SIZE", "1"))

# Populated by get_model(); main() forces the load before serving requests.
_model: "WhisperModel | None" = None


def compute_attempts(requested_device: str, requested_compute: str) -> list[tuple[str, str]]:
    """Return the ordered (device, compute_type) pairs to try when loading.

    "auto" for the device tries CUDA before CPU; "auto" for the compute type
    picks float16 on CUDA and int8 on any other device.
    """
    if requested_device == "auto":
        if requested_compute == "auto":
            return [
                ("cuda", "float16"),
                ("cuda", "int8_float16"),
                ("cpu", "int8"),
                ("cpu", "float32"),
            ]
        return [("cuda", requested_compute), ("cpu", requested_compute)]

    if requested_compute == "auto":
        return [(requested_device, "float16" if requested_device == "cuda" else "int8")]
    return [(requested_device, requested_compute)]


def resolve_model() -> "WhisperModel":
    """Load the Whisper model, falling back through compute_attempts().

    Reads WHISPER_MODEL, WHISPER_DEVICE and WHISPER_COMPUTE_TYPE from the
    environment. The pair that succeeded is recorded on the returned model as
    ``_resolved_device`` / ``_resolved_compute_type`` so "ping" can report it.

    Raises:
        Exception: the error of the last failed attempt when none succeeds.
    """
    # Deferred so that importing this module does not require faster-whisper.
    from faster_whisper import WhisperModel

    model_name = os.environ.get("WHISPER_MODEL", DEFAULT_MODEL_NAME)
    attempts = compute_attempts(
        os.environ.get("WHISPER_DEVICE", "auto"),
        os.environ.get("WHISPER_COMPUTE_TYPE", "auto"),
    )

    last_error: Optional[Exception] = None
    for device, compute_type in attempts:
        try:
            model = WhisperModel(model_name, device=device, compute_type=compute_type)
        except Exception as error:  # noqa: BLE001 - any load failure means "try the next pair"
            # Make the fallback visible instead of swallowing it silently.
            print(
                f"model load failed (device={device}, compute_type={compute_type}): {error}",
                file=sys.stderr,
            )
            last_error = error
            continue
        setattr(model, "_resolved_device", device)
        setattr(model, "_resolved_compute_type", compute_type)
        return model

    if last_error is None:  # explicit check; an assert would vanish under -O
        raise RuntimeError("no device/compute-type combination to try")
    raise last_error


def get_model() -> "WhisperModel":
    """Return the shared model, loading it on first use."""
    global _model
    if _model is None:
        _model = resolve_model()
    return _model


def write(payload: dict[str, Any]) -> None:
    """Emit one response as a single JSON line and flush immediately."""
    sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n")
    sys.stdout.flush()


def decode_pcm16_base64(pcm16_base64: str) -> np.ndarray:
    """Decode base64 little-endian 16-bit PCM into float32 samples in [-1, 1)."""
    audio_bytes = base64.b64decode(pcm16_base64)
    # "<i2" pins little-endian so the result does not depend on host byte order.
    return np.frombuffer(audio_bytes, dtype="<i2").astype(np.float32) / 32768.0


def transcribe_pcm16_base64(pcm16_base64: str) -> str:
    """Transcribe base64-encoded PCM16 audio and return the joined text."""
    audio = decode_pcm16_base64(pcm16_base64)
    if audio.size == 0:
        # Nothing to decode; skip the model call entirely.
        return ""

    segments, _info = get_model().transcribe(
        audio,
        language=LANGUAGE,
        task="transcribe",
        beam_size=BEAM_SIZE,
        condition_on_previous_text=False,
        vad_filter=False,
        without_timestamps=True,
        word_timestamps=False,
        temperature=0.0,
    )

    texts = (segment.text.strip() for segment in segments if segment.text)
    return " ".join(text for text in texts if text).strip()


def handle_line(raw_line: str) -> Optional[dict[str, Any]]:
    """Process one request line and return the response payload.

    Returns None for blank lines. Every failure, including malformed JSON or a
    missing "method", becomes an error response instead of terminating the
    worker, so one bad line cannot take down all pending requests.
    """
    line = raw_line.strip()
    if not line:
        return None

    request_id: Any = None
    try:
        request = json.loads(line)
        if not isinstance(request, dict):
            raise TypeError("request must be a JSON object")
        request_id = request.get("id")
        method = request["method"]
        params = request.get("params") or {}

        if method == "ping":
            model = get_model()
            return {
                "id": request_id,
                "result": {
                    "model": os.environ.get("WHISPER_MODEL", DEFAULT_MODEL_NAME),
                    "device": getattr(model, "_resolved_device", "unknown"),
                    "compute_type": getattr(model, "_resolved_compute_type", "unknown"),
                },
            }

        if method == "transcribe":
            return {
                "id": request_id,
                "result": {"text": transcribe_pcm16_base64(params["pcm16_base64"])},
            }

        raise RuntimeError(f"unknown method: {method}")
    except Exception as error:  # noqa: BLE001 - protocol boundary: report, don't die
        traceback.print_exc(file=sys.stderr)
        return {"id": request_id, "error": f"{type(error).__name__}: {error}"}


def main() -> None:
    """Load the model, then serve JSON-line requests from stdin until EOF."""
    # A piped stdout defaults to the locale encoding (e.g. cp949/cp1252 on
    # Windows) while the parent process decodes UTF-8; force UTF-8 so
    # non-ASCII transcripts survive the pipe.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if callable(reconfigure):
            reconfigure(encoding="utf-8")

    get_model()  # fail fast at start-up rather than on the first request

    for raw_line in sys.stdin:
        response = handle_line(raw_line)
        if response is not None:
            write(response)


if __name__ == "__main__":
    main()
"-list_devices", "true", "-f", "dshow", "-i", "dummy"], { + stdio: ["ignore", "ignore", "inherit"], + }); + child.on("exit", (code) => { + if (code === 0 || code === 1) { + resolve(); + return; + } + reject(new Error(`ffmpeg exited with code ${code ?? "null"}`)); + }); + child.on("error", reject); + }); + } + + return new Promise((resolve, reject) => { + const pactl = spawn("pactl", ["list", "sources", "short"], { + stdio: ["ignore", "inherit", "inherit"], + }); + + pactl.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + + const wpctl = spawn("wpctl", ["status", "-n"], { + stdio: ["ignore", "inherit", "inherit"], + }); + wpctl.on("exit", (wpctlCode) => { + if (wpctlCode === 0) { + resolve(); + return; + } + reject(new Error(`pactl exited with code ${code ?? "null"} and wpctl exited with code ${wpctlCode ?? "null"}`)); + }); + wpctl.on("error", reject); + }); + + pactl.on("error", () => { + const wpctl = spawn("wpctl", ["status", "-n"], { + stdio: ["ignore", "inherit", "inherit"], + }); + wpctl.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + reject(new Error(`pactl, wpctl 둘 다 실행할 수 없습니다. code=${code ?? "null"}`)); + }); + wpctl.on("error", reject); + }); + }); +} + +export function spawnLoopbackCapture( + config: AppConfig, + logger: Logger, +): ChildProcessByStdio { + if (!config.AUDIO_SOURCE) { + throw new Error("AUDIO_SOURCE 설정이 필요합니다. 
interface RealtimeSegmenterOptions {
  /** Called synchronously with each finished utterance as 16-bit little-endian PCM. */
  onSegment: (pcm16: Buffer) => void;
}

/**
 * Splits a continuous s16le PCM byte stream into utterances with a simple
 * peak-amplitude gate (no model-based VAD).
 *
 * Audio is processed in fixed frames. Speech starts after `speechStartFrames`
 * consecutive frames whose peak reaches `speechStartThreshold`; the buffered
 * pre-roll is prepended so the onset is not clipped. Speech ends after
 * `speechEndFrames` consecutive frames below `speechContinueThreshold`, or when
 * the segment reaches `maxSpeechSamples`. Segments shorter than
 * `minSpeechSamples` are dropped.
 */
export class RealtimeSegmenter {
  private readonly pendingSamples: number[] = [];
  private readonly preRoll: number[] = [];
  private readonly speech: number[] = [];

  // Durations in the comments assume the 16 kHz mono stream that the capture
  // process requests from ffmpeg ("-ar 16000 -ac 1").
  private readonly frameSamples = 320; // 20 ms
  private readonly preRollSamples = 3200; // 200 ms
  private readonly speechStartThreshold = 900;
  private readonly speechContinueThreshold = 450;
  private readonly speechStartFrames = 2;
  private readonly speechEndFrames = 18; // 360 ms of trailing silence
  private readonly minSpeechSamples = 6400; // 400 ms
  // Hard cap so audio that never goes quiet (music, game sound) cannot grow a
  // segment without bound; 30 s at 16 kHz.
  private readonly maxSpeechSamples = 480_000;

  private speechActive = false;
  private speechCandidateFrames = 0;
  private silenceFrames = 0;
  // Low byte of a sample that was split across two chunks, if any.
  private carryByte: number | null = null;

  constructor(private readonly options: RealtimeSegmenterOptions) {}

  /**
   * Feeds raw s16le bytes. Chunks may have any length: a trailing odd byte is
   * kept and joined with the first byte of the next chunk, so sample alignment
   * is never lost.
   */
  pushChunk(chunk: Buffer): void {
    let offset = 0;

    if (this.carryByte !== null && chunk.length > 0) {
      const unsigned = (chunk[0]! << 8) | this.carryByte;
      this.pendingSamples.push(unsigned >= 0x8000 ? unsigned - 0x10000 : unsigned);
      this.carryByte = null;
      offset = 1;
    }

    for (; offset + 1 < chunk.length; offset += 2) {
      this.pendingSamples.push(chunk.readInt16LE(offset));
    }

    if (offset < chunk.length) {
      this.carryByte = chunk[offset]!;
    }

    while (true) {
      const frame = takeFrame(this.pendingSamples, this.frameSamples);
      if (!frame) {
        return;
      }
      this.processFrame(frame);
    }
  }

  /** Runs the gate state machine for one complete frame. */
  private processFrame(frame: Int16Array): void {
    let peak = 0;
    for (const sample of frame) {
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
    }

    if (!this.speechActive) {
      appendWithCap(this.preRoll, frame, this.preRollSamples);
      if (peak >= this.speechStartThreshold) {
        this.speechCandidateFrames += 1;
      } else {
        this.speechCandidateFrames = 0;
      }

      if (this.speechCandidateFrames < this.speechStartFrames) {
        return;
      }

      this.speechActive = true;
      this.silenceFrames = 0;
      // The pre-roll already ends with the current frame, so seeding from it
      // is enough; appending the frame again would duplicate 20 ms of audio.
      this.speech.splice(0, this.speech.length, ...this.preRoll);
      this.preRoll.length = 0;
    } else {
      this.speech.push(...frame);
    }

    if (peak >= this.speechContinueThreshold) {
      this.silenceFrames = 0;
    } else {
      this.silenceFrames += 1;
    }

    const endedBySilence = this.silenceFrames >= this.speechEndFrames;
    const hitMaxLength = this.speech.length >= this.maxSpeechSamples;
    if (!endedBySilence && !hitMaxLength) {
      return;
    }

    this.flushSpeech();
  }

  /** Emits the buffered utterance (if long enough) and resets the gate. */
  private flushSpeech(): void {
    const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
    this.speechActive = false;
    this.speech.length = 0;
    this.silenceFrames = 0;
    this.speechCandidateFrames = 0;

    if (speechPcm.length < this.minSpeechSamples * 2) {
      return;
    }

    this.options.onSegment(speechPcm);
  }
}

/** Removes and returns the first `size` samples, or null if fewer are queued. */
function takeFrame(source: number[], size: number): Int16Array | null {
  if (source.length < size) {
    return null;
  }
  const samples = source.splice(0, size);
  return Int16Array.from(samples);
}

/** Appends `samples` to `target`, keeping only the newest `cap` entries. */
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
  target.push(...samples);
  if (target.length > cap) {
    target.splice(0, target.length - cap);
  }
}

/** Serializes samples as little-endian 16-bit PCM regardless of host byte order. */
function int16ArrayToBuffer(input: Int16Array): Buffer {
  const output = Buffer.allocUnsafe(input.length * 2);
  for (let index = 0; index < input.length; index += 1) {
    output.writeInt16LE(input[index]!, index * 2);
  }
  return output;
}
/**
 * Runs the loopback pipeline: warm up the STT worker, capture output-device
 * audio through ffmpeg, cut it into utterances and transcribe them one at a
 * time in arrival order. Resolves once everything is wired up; the process
 * then stays alive until SIGINT/SIGTERM or a capture start failure.
 */
async function runLoopback(): Promise<void> {
  const config = loadConfig();
  const logger = new Logger(config.LOG_LEVEL);
  const stt = new FasterWhisperSttService(config, logger);

  await stt.warmup();

  // Segments are transcribed strictly one at a time, in arrival order.
  const transcriptionQueue: Buffer[] = [];
  let transcribing = false;

  const runNext = async (): Promise<void> => {
    if (transcribing) {
      return;
    }
    const next = transcriptionQueue.shift();
    if (!next) {
      return;
    }

    transcribing = true;
    try {
      const text = await stt.transcribePcm16(next);
      if (!text) {
        logger.info("빈 전사 결과");
      } else {
        logger.info("Transcript", text);
        if (config.DEBUG_TRANSCRIPTS) {
          console.log(`\n[text] ${text}\n`);
        }
      }
    } catch (error) {
      logger.warn("STT failed", error);
    } finally {
      transcribing = false;
      void runNext();
    }
  };

  const segmenter = new RealtimeSegmenter({
    onSegment: (pcm16) => {
      transcriptionQueue.push(pcm16);
      void runNext();
    },
  });

  const capture = spawnLoopbackCapture(config, logger);

  // Guarded so a second Ctrl+C, or an error arriving during shutdown, does
  // not run the teardown twice.
  let shuttingDown = false;
  const shutdown = async (exitCode: number): Promise<void> => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    if (!capture.killed) {
      capture.kill("SIGTERM");
    }
    await stt.destroy();
    process.exit(exitCode);
  };

  capture.stdout.on("data", (chunk: Buffer) => {
    segmenter.pushChunk(chunk);
  });
  capture.stderr.on("data", (chunk: Buffer) => {
    const text = chunk.toString().trim();
    if (text) {
      logger.debug("[capture]", text);
    }
  });
  // Without this listener a failed spawn (e.g. ffmpeg not installed) surfaces
  // as an uncaught 'error' event and leaves the STT worker running.
  capture.on("error", (error) => {
    logger.error("capture failed to start", error);
    void shutdown(1);
  });
  capture.on("exit", (code, signal) => {
    logger.warn("capture exited", { code, signal });
  });

  console.log("실시간 출력장치 STT를 시작합니다. Ctrl+C 로 종료합니다.");
  console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
  console.log(`model: ${config.WHISPER_MODEL}`);
  console.log(`language: ${config.WHISPER_LANGUAGE}`);

  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });
}
/**
 * Resolves the interpreter used to bootstrap the Python environment.
 *
 * Order of preference:
 *   1. `LOCAL_AI_PYTHON`, split into command + arguments (e.g. `py -3`);
 *   2. the interpreter inside the configured venv, if it already exists;
 *   3. the platform default: `python` on Windows, `python3` elsewhere.
 *
 * @throws Error when `LOCAL_AI_PYTHON` is set but yields no command.
 */
export async function resolveBasePythonCommand(config: AppConfig): Promise<{ command: string; args: string[] }> {
  const configured = config.LOCAL_AI_PYTHON?.trim();
  if (configured) {
    const [command, ...args] = splitCommand(configured);
    if (!command) {
      throw new Error("LOCAL_AI_PYTHON 값이 비어 있습니다.");
    }
    return { command, args };
  }

  const venvPath = resolveVenvPythonPath(config);
  if (await fileExists(venvPath)) {
    return { command: venvPath, args: [] };
  }

  // Previously this called resolveBasePythonCommand(config) again with the
  // same input, which never terminates. Fall back to the system interpreter.
  return { command: process.platform === "win32" ? "python" : "python3", args: [] };
}
interface RpcSuccess<T> {
  id: string;
  result: T;
}

interface RpcFailure {
  id: string;
  error: string;
}

type RpcResponse<T> = RpcSuccess<T> | RpcFailure;

function isFailure<T>(value: RpcResponse<T>): value is RpcFailure {
  return "error" in value;
}

/**
 * Owns one long-lived Python child process and talks to it with a JSON-lines
 * request/response protocol over stdin/stdout. Responses are matched to
 * requests by `id`; the child's stderr is forwarded to the logger.
 */
export class PythonJsonWorker {
  private processRef: ChildProcessWithoutNullStreams | null = null;
  // In-flight start, shared so concurrent callers cannot spawn two children.
  private starting: Promise<void> | null = null;
  private readonly pending = new Map<
    string,
    {
      resolve: (value: unknown) => void;
      reject: (reason?: unknown) => void;
    }
  >();
  private nextId = 1;

  constructor(
    private readonly config: AppConfig,
    private readonly logger: Logger,
    private readonly scriptName: string,
    private readonly logPrefix: string,
  ) {}

  /** Spawns the worker if it is not already running (idempotent). */
  async start(): Promise<void> {
    if (this.processRef) {
      return;
    }
    this.starting ??= this.spawnWorker().finally(() => {
      this.starting = null;
    });
    await this.starting;
  }

  /**
   * Sends one request and resolves with its `result`.
   * Rejects when the worker answers with `error`, exits, fails to start or is
   * destroyed before answering.
   */
  async request<T>(method: string, params: Record<string, unknown>): Promise<T> {
    await this.start();

    const child = this.processRef;
    if (!child) {
      throw new Error(`${this.logPrefix} worker is not running`);
    }

    const id = String(this.nextId++);
    const payload = JSON.stringify({
      id,
      method,
      params,
    });

    const promise = new Promise<T>((resolve, reject) => {
      this.pending.set(id, {
        resolve: (value) => resolve(value as T),
        reject,
      });
    });

    child.stdin.write(`${payload}\n`);
    return await promise;
  }

  /** Terminates the worker and rejects everything still waiting on it. */
  async destroy(): Promise<void> {
    const child = this.processRef;
    if (!child) {
      return;
    }
    // Detach first so the child's later 'exit' event is treated as stale.
    this.processRef = null;
    child.kill("SIGTERM");
    this.rejectAll(new Error(`${this.logPrefix} worker destroyed`));
  }

  private async spawnWorker(): Promise<void> {
    const { command, args } = await resolveWorkerPythonCommand(this.config);
    const scriptPath = resolveWorkerScript(this.scriptName);

    const child = spawn(command, [...args, scriptPath], {
      stdio: ["pipe", "pipe", "pipe"],
      env: {
        ...process.env,
        WHISPER_MODEL: this.config.WHISPER_MODEL,
        WHISPER_LANGUAGE: this.config.WHISPER_LANGUAGE,
        WHISPER_DEVICE: this.config.WHISPER_DEVICE,
        WHISPER_COMPUTE_TYPE: this.config.WHISPER_COMPUTE_TYPE,
        WHISPER_BEAM_SIZE: String(this.config.WHISPER_BEAM_SIZE),
      },
    });
    this.processRef = child;

    const rl = createInterface({
      input: child.stdout,
      crlfDelay: Infinity,
    });

    rl.on("line", (line) => {
      this.handleStdoutLine(line);
    });

    child.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.logger.warn(`[${this.logPrefix}] ${text}`);
      }
    });

    // A write to a dead child surfaces here; without a listener it would be
    // an uncaught exception.
    child.stdin.on("error", (error) => {
      this.logger.warn(`${this.logPrefix} stdin error`, error);
    });

    // Emitted instead of 'exit' when the process could not be spawned at all
    // (for example the Python command does not exist).
    child.on("error", (error) => {
      this.handleChildGone(child, new Error(`${this.logPrefix} worker failed: ${error.message}`));
    });

    child.on("exit", (code, signal) => {
      this.handleChildGone(
        child,
        new Error(`${this.logPrefix} worker exited code=${code ?? "null"} signal=${signal ?? "null"}`),
      );
    });
  }

  /** Clears state for `child`, ignoring events from an already-replaced child. */
  private handleChildGone(child: ChildProcessWithoutNullStreams, error: Error): void {
    if (this.processRef !== child) {
      return;
    }
    this.processRef = null;
    this.rejectAll(error);
  }

  private rejectAll(error: Error): void {
    for (const entry of this.pending.values()) {
      entry.reject(error);
    }
    this.pending.clear();
  }

  private handleStdoutLine(line: string): void {
    const trimmed = line.trim();
    if (!trimmed) {
      return;
    }

    let message: RpcResponse<unknown>;
    try {
      message = JSON.parse(trimmed) as RpcResponse<unknown>;
    } catch (error) {
      this.logger.warn(`${this.logPrefix} stdout parse failed`, error);
      return;
    }

    const pending = this.pending.get(message.id);
    if (!pending) {
      // Unknown or already-settled id; nothing is waiting for it.
      return;
    }

    this.pending.delete(message.id);
    if (isFailure(message)) {
      pending.reject(new Error(message.error));
      return;
    }
    pending.resolve(message.result);
  }
}
=> { + if (code === 0) { + resolve(); + return; + } + reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`)); + }); + child.on("error", reject); + }); +} + +async function main(): Promise { + const config = loadConfig(); + const { command, args } = await resolveBasePythonCommand(config); + const venvRoot = path.resolve(process.cwd(), config.LOCAL_AI_VENV_PATH); + const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt"); + + await mkdir(path.dirname(venvRoot), { recursive: true }); + + console.log(`가상환경 생성: ${venvRoot}`); + await run(command, [...args, "-m", "venv", venvRoot], process.cwd()); + + const venvPython = resolveVenvPythonPath(config); + await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], process.cwd()); + await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], process.cwd()); + + console.log("Python STT 환경 준비 완료"); +} + +void main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); +}); diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..7edb43f --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "strict": true, + "noEmit": false, + "rootDir": "src", + "outDir": "dist", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "types": [ + "node" + ] + }, + "include": [ + "src/**/*.ts" + ] +}