Add realtime loopback STT prototype

This commit is contained in:
2026-05-02 20:20:54 +09:00
parent 10e0dd75db
commit 5775c4809a
17 changed files with 1034 additions and 0 deletions

138
src/audio/capture.ts Normal file
View File

@@ -0,0 +1,138 @@
import { spawn, type ChildProcessByStdio } from "node:child_process";
import process from "node:process";
import type { Readable } from "node:stream";
import type { AppConfig } from "../config.js";
import type { Logger } from "../logger.js";
export function printAudioDevices(): Promise<void> {
if (process.platform === "win32") {
return new Promise<void>((resolve, reject) => {
const child = spawn("ffmpeg", ["-hide_banner", "-list_devices", "true", "-f", "dshow", "-i", "dummy"], {
stdio: ["ignore", "ignore", "inherit"],
});
child.on("exit", (code) => {
if (code === 0 || code === 1) {
resolve();
return;
}
reject(new Error(`ffmpeg exited with code ${code ?? "null"}`));
});
child.on("error", reject);
});
}
return new Promise<void>((resolve, reject) => {
const pactl = spawn("pactl", ["list", "sources", "short"], {
stdio: ["ignore", "inherit", "inherit"],
});
pactl.on("exit", (code) => {
if (code === 0) {
resolve();
return;
}
const wpctl = spawn("wpctl", ["status", "-n"], {
stdio: ["ignore", "inherit", "inherit"],
});
wpctl.on("exit", (wpctlCode) => {
if (wpctlCode === 0) {
resolve();
return;
}
reject(new Error(`pactl exited with code ${code ?? "null"} and wpctl exited with code ${wpctlCode ?? "null"}`));
});
wpctl.on("error", reject);
});
pactl.on("error", () => {
const wpctl = spawn("wpctl", ["status", "-n"], {
stdio: ["ignore", "inherit", "inherit"],
});
wpctl.on("exit", (code) => {
if (code === 0) {
resolve();
return;
}
reject(new Error(`pactl, wpctl 둘 다 실행할 수 없습니다. code=${code ?? "null"}`));
});
wpctl.on("error", reject);
});
});
}
export function spawnLoopbackCapture(
config: AppConfig,
logger: Logger,
): ChildProcessByStdio<null, Readable, Readable> {
if (!config.AUDIO_SOURCE) {
throw new Error("AUDIO_SOURCE 설정이 필요합니다. 먼저 `bun run devices` 로 장치 이름을 확인하세요.");
}
if (process.platform === "win32") {
const args = [
"-hide_banner",
"-loglevel",
"warning",
"-fflags",
"nobuffer",
"-flags",
"low_delay",
"-f",
"dshow",
"-i",
`audio=${config.AUDIO_SOURCE}`,
"-ac",
"1",
"-ar",
"16000",
"-f",
"s16le",
"pipe:1",
];
logger.info("Starting Windows loopback capture", {
source: config.AUDIO_SOURCE,
backend: "ffmpeg-dshow",
});
return spawn("ffmpeg", args, {
stdio: ["ignore", "pipe", "pipe"],
});
}
if (process.platform === "linux") {
const args = [
"-hide_banner",
"-loglevel",
"warning",
"-fflags",
"nobuffer",
"-flags",
"low_delay",
"-f",
"pulse",
"-i",
config.AUDIO_SOURCE,
"-ac",
"1",
"-ar",
"16000",
"-f",
"s16le",
"pipe:1",
];
logger.info("Starting Linux loopback capture", {
source: config.AUDIO_SOURCE,
backend: "ffmpeg-pulse",
});
return spawn("ffmpeg", args, {
stdio: ["ignore", "pipe", "pipe"],
});
}
throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
}

View File

@@ -0,0 +1,112 @@
interface RealtimeSegmenterOptions {
onSegment: (pcm16: Buffer) => void;
}
export class RealtimeSegmenter {
private readonly pendingSamples: number[] = [];
private readonly preRoll: number[] = [];
private readonly speech: number[] = [];
private readonly frameSamples = 320;
private readonly preRollSamples = 3200;
private readonly speechStartThreshold = 900;
private readonly speechContinueThreshold = 450;
private readonly speechStartFrames = 2;
private readonly speechEndFrames = 18;
private readonly minSpeechSamples = 6400;
private speechActive = false;
private speechCandidateFrames = 0;
private silenceFrames = 0;
constructor(private readonly options: RealtimeSegmenterOptions) {}
pushChunk(chunk: Buffer): void {
for (let offset = 0; offset + 1 < chunk.length; offset += 2) {
this.pendingSamples.push(chunk.readInt16LE(offset));
}
while (true) {
const frame = takeFrame(this.pendingSamples, this.frameSamples);
if (!frame) {
return;
}
this.processFrame(frame);
}
}
private processFrame(frame: Int16Array): void {
let peak = 0;
for (const sample of frame) {
const abs = Math.abs(sample);
if (abs > peak) {
peak = abs;
}
}
if (!this.speechActive) {
appendWithCap(this.preRoll, frame, this.preRollSamples);
if (peak >= this.speechStartThreshold) {
this.speechCandidateFrames += 1;
} else {
this.speechCandidateFrames = 0;
}
if (this.speechCandidateFrames < this.speechStartFrames) {
return;
}
this.speechActive = true;
this.silenceFrames = 0;
this.speech.splice(0, this.speech.length, ...this.preRoll);
this.preRoll.splice(0, this.preRoll.length);
}
this.speech.push(...frame);
if (peak >= this.speechContinueThreshold) {
this.silenceFrames = 0;
} else {
this.silenceFrames += 1;
}
if (this.silenceFrames < this.speechEndFrames) {
return;
}
const speechPcm = int16ArrayToBuffer(Int16Array.from(this.speech));
this.speechActive = false;
this.speech.splice(0, this.speech.length);
this.silenceFrames = 0;
this.speechCandidateFrames = 0;
if (speechPcm.length < this.minSpeechSamples * 2) {
return;
}
this.options.onSegment(speechPcm);
}
}
function takeFrame(source: number[], size: number): Int16Array | null {
if (source.length < size) {
return null;
}
const samples = source.splice(0, size);
return Int16Array.from(samples);
}
function appendWithCap(target: number[], samples: Int16Array, cap: number): void {
target.push(...samples);
if (target.length > cap) {
target.splice(0, target.length - cap);
}
}
function int16ArrayToBuffer(input: Int16Array): Buffer {
const output = Buffer.allocUnsafe(input.length * 2);
for (let index = 0; index < input.length; index += 1) {
output.writeInt16LE(input[index]!, index * 2);
}
return output;
}

34
src/config.ts Normal file
View File

@@ -0,0 +1,34 @@
import { config as loadDotenv } from "dotenv";
import { z } from "zod";
loadDotenv();
const emptyToUndefined = z.preprocess((value) => {
if (typeof value !== "string") {
return value;
}
const trimmed = value.trim();
return trimmed.length === 0 ? undefined : trimmed;
}, z.string().min(1).optional());
const envSchema = z.object({
LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
LOCAL_AI_PYTHON: emptyToUndefined,
AUDIO_SOURCE: emptyToUndefined,
WHISPER_MODEL: z.string().min(1).default("large-v3-turbo"),
WHISPER_LANGUAGE: z.string().min(1).default("ko"),
WHISPER_DEVICE: z.enum(["auto", "cuda", "cpu"]).default("auto"),
WHISPER_COMPUTE_TYPE: z.string().min(1).default("auto"),
WHISPER_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
DEBUG_TRANSCRIPTS: z
.string()
.optional()
.transform((value) => value === "true"),
LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});
export type AppConfig = z.infer<typeof envSchema>;
export function loadConfig(): AppConfig {
return envSchema.parse(process.env);
}

107
src/index.ts Normal file
View File

@@ -0,0 +1,107 @@
import process from "node:process";
import { loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
const mode = process.argv[2] ?? "loopback";
async function runLoopback(): Promise<void> {
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);
const stt = new FasterWhisperSttService(config, logger);
await stt.warmup();
const transcriptionQueue: Buffer[] = [];
let transcribing = false;
const runNext = async (): Promise<void> => {
if (transcribing) {
return;
}
const next = transcriptionQueue.shift();
if (!next) {
return;
}
transcribing = true;
try {
const text = await stt.transcribePcm16(next);
if (!text) {
logger.info("빈 전사 결과");
} else {
logger.info("Transcript", text);
if (config.DEBUG_TRANSCRIPTS) {
console.log(`\n[text] ${text}\n`);
}
}
} catch (error) {
logger.warn("STT failed", error);
} finally {
transcribing = false;
void runNext();
}
};
const segmenter = new RealtimeSegmenter({
onSegment: (pcm16) => {
transcriptionQueue.push(pcm16);
void runNext();
},
});
const capture = spawnLoopbackCapture(config, logger);
capture.stdout.on("data", (chunk: Buffer) => {
segmenter.pushChunk(chunk);
});
capture.stderr.on("data", (chunk: Buffer) => {
const text = chunk.toString().trim();
if (text) {
logger.debug("[capture]", text);
}
});
capture.on("exit", (code, signal) => {
logger.warn("capture exited", { code, signal });
});
console.log("실시간 출력장치 STT를 시작합니다. Ctrl+C 로 종료합니다.");
console.log(`source: ${config.AUDIO_SOURCE ?? "unset"}`);
console.log(`model: ${config.WHISPER_MODEL}`);
console.log(`language: ${config.WHISPER_LANGUAGE}`);
const shutdown = async (): Promise<void> => {
if (!capture.killed) {
capture.kill("SIGTERM");
}
await stt.destroy();
process.exit(0);
};
process.on("SIGINT", () => {
void shutdown();
});
process.on("SIGTERM", () => {
void shutdown();
});
}
async function main(): Promise<void> {
switch (mode) {
case "devices":
await printAudioDevices();
return;
case "loopback":
await runLoopback();
return;
default:
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: loopback, devices`);
}
}
void main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
});

63
src/logger.ts Normal file
View File

@@ -0,0 +1,63 @@
type LogLevel = "debug" | "info" | "warn" | "error";
const levelOrder: Record<LogLevel, number> = {
debug: 10,
info: 20,
warn: 30,
error: 40,
};
function formatParts(parts: unknown[]): string {
return parts
.map((part) => {
if (part instanceof Error) {
return `${part.name}: ${part.message}`;
}
if (typeof part === "string") {
return part;
}
return JSON.stringify(part);
})
.join(" ");
}
export class Logger {
constructor(private readonly level: LogLevel) {}
private shouldLog(target: LogLevel): boolean {
return levelOrder[target] >= levelOrder[this.level];
}
private write(target: LogLevel, ...parts: unknown[]): void {
if (!this.shouldLog(target)) {
return;
}
const line = `[${new Date().toISOString()}] [${target.toUpperCase()}] ${formatParts(parts)}`;
if (target === "error") {
console.error(line);
return;
}
if (target === "warn") {
console.warn(line);
return;
}
console.log(line);
}
debug(...parts: unknown[]): void {
this.write("debug", ...parts);
}
info(...parts: unknown[]): void {
this.write("info", ...parts);
}
warn(...parts: unknown[]): void {
this.write("warn", ...parts);
}
error(...parts: unknown[]): void {
this.write("error", ...parts);
}
}

63
src/python-runtime.ts Normal file
View File

@@ -0,0 +1,63 @@
import { constants as fsConstants } from "node:fs";
import { access } from "node:fs/promises";
import path from "node:path";
import process from "node:process";
import type { AppConfig } from "./config.js";
function splitCommand(command: string): string[] {
const parts = command.match(/(?:[^\s"]+|"[^"]*")+/g) ?? [];
return parts.map((part) => part.replace(/^"(.*)"$/, "$1"));
}
async function fileExists(target: string): Promise<boolean> {
try {
await access(target, fsConstants.X_OK);
return true;
} catch {
return false;
}
}
export async function resolvePythonCommand(config: AppConfig): Promise<{ command: string; args: string[] }> {
return await resolveWorkerPythonCommand(config);
}
export async function resolveBasePythonCommand(config: AppConfig): Promise<{ command: string; args: string[] }> {
const configured = config.LOCAL_AI_PYTHON?.trim();
if (configured) {
const [command, ...args] = splitCommand(configured);
if (!command) {
throw new Error("LOCAL_AI_PYTHON 값이 비어 있습니다.");
}
return { command, args };
}
const venvPath = resolveVenvPythonPath(config);
if (await fileExists(venvPath)) {
return { command: venvPath, args: [] };
}
return await resolveBasePythonCommand(config);
}
export async function resolveWorkerPythonCommand(config: AppConfig): Promise<{ command: string; args: string[] }> {
const venvPath = resolveVenvPythonPath(config);
if (await fileExists(venvPath)) {
return { command: venvPath, args: [] };
}
return await resolveBasePythonCommand(config);
}
export function resolveVenvPythonPath(config: AppConfig): string {
const root = path.resolve(process.cwd(), config.LOCAL_AI_VENV_PATH);
if (process.platform === "win32") {
return path.join(root, "Scripts", "python.exe");
}
return path.join(root, "bin", "python");
}
export function resolveWorkerScript(name: string): string {
return path.resolve(process.cwd(), "python", name);
}

View File

@@ -0,0 +1,40 @@
import type { AppConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { PythonJsonWorker } from "./python-json-worker.js";
interface PingResult {
model: string;
device: string;
compute_type: string;
}
interface TranscribeResult {
text: string;
}
export class FasterWhisperSttService {
private readonly worker: PythonJsonWorker;
constructor(
private readonly config: AppConfig,
private readonly logger: Logger,
) {
this.worker = new PythonJsonWorker(config, logger, "loopback_stt_worker.py", "faster-whisper");
}
async warmup(): Promise<void> {
const result = await this.worker.request<PingResult>("ping", {});
this.logger.info("STT worker ready", result);
}
async transcribePcm16(pcm16: Buffer): Promise<string> {
const result = await this.worker.request<TranscribeResult>("transcribe", {
pcm16_base64: pcm16.toString("base64"),
});
return result.text.trim();
}
async destroy(): Promise<void> {
await this.worker.destroy();
}
}

View File

@@ -0,0 +1,147 @@
import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process";
import { createInterface } from "node:readline";
import type { AppConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveWorkerPythonCommand, resolveWorkerScript } from "../python-runtime.js";
interface RpcSuccess<T> {
id: string;
result: T;
}
interface RpcFailure {
id: string;
error: string;
}
type RpcResponse<T> = RpcSuccess<T> | RpcFailure;
function isFailure<T>(value: RpcResponse<T>): value is RpcFailure {
return "error" in value;
}
export class PythonJsonWorker {
private processRef: ChildProcessWithoutNullStreams | null = null;
private readonly pending = new Map<
string,
{
resolve: (value: unknown) => void;
reject: (reason?: unknown) => void;
}
>();
private nextId = 1;
constructor(
private readonly config: AppConfig,
private readonly logger: Logger,
private readonly scriptName: string,
private readonly logPrefix: string,
) {}
async start(): Promise<void> {
if (this.processRef) {
return;
}
const { command, args } = await resolveWorkerPythonCommand(this.config);
const scriptPath = resolveWorkerScript(this.scriptName);
this.processRef = spawn(command, [...args, scriptPath], {
stdio: ["pipe", "pipe", "pipe"],
env: {
...process.env,
WHISPER_MODEL: this.config.WHISPER_MODEL,
WHISPER_LANGUAGE: this.config.WHISPER_LANGUAGE,
WHISPER_DEVICE: this.config.WHISPER_DEVICE,
WHISPER_COMPUTE_TYPE: this.config.WHISPER_COMPUTE_TYPE,
WHISPER_BEAM_SIZE: String(this.config.WHISPER_BEAM_SIZE),
},
});
const rl = createInterface({
input: this.processRef.stdout,
crlfDelay: Infinity,
});
rl.on("line", (line) => {
this.handleStdoutLine(line);
});
this.processRef.stderr.on("data", (chunk: Buffer) => {
const text = chunk.toString().trim();
if (text.length > 0) {
this.logger.warn(`[${this.logPrefix}] ${text}`);
}
});
this.processRef.on("exit", (code, signal) => {
const error = new Error(`${this.logPrefix} worker exited code=${code ?? "null"} signal=${signal ?? "null"}`);
for (const entry of this.pending.values()) {
entry.reject(error);
}
this.pending.clear();
this.processRef = null;
});
}
async request<T>(method: string, params: Record<string, unknown>): Promise<T> {
await this.start();
if (!this.processRef) {
throw new Error(`${this.logPrefix} worker is not running`);
}
const id = String(this.nextId++);
const payload = JSON.stringify({
id,
method,
params,
});
const promise = new Promise<T>((resolve, reject) => {
this.pending.set(id, {
resolve: (value) => resolve(value as T),
reject,
});
});
this.processRef.stdin.write(`${payload}\n`);
return await promise;
}
async destroy(): Promise<void> {
if (!this.processRef) {
return;
}
this.processRef.kill("SIGTERM");
this.processRef = null;
}
private handleStdoutLine(line: string): void {
const trimmed = line.trim();
if (!trimmed) {
return;
}
let message: RpcResponse<unknown>;
try {
message = JSON.parse(trimmed) as RpcResponse<unknown>;
} catch (error) {
this.logger.warn(`${this.logPrefix} stdout parse failed`, error);
return;
}
const pending = this.pending.get(message.id);
if (!pending) {
return;
}
this.pending.delete(message.id);
if (isFailure(message)) {
pending.reject(new Error(message.error));
return;
}
pending.resolve(message.result);
}
}

47
src/setup-python.ts Normal file
View File

@@ -0,0 +1,47 @@
import process from "node:process";
import { mkdir } from "node:fs/promises";
import path from "node:path";
import { spawn } from "node:child_process";
import { loadConfig } from "./config.js";
import { resolveBasePythonCommand, resolveVenvPythonPath } from "./python-runtime.js";
async function run(command: string, args: string[], cwd: string): Promise<void> {
await new Promise<void>((resolve, reject) => {
const child = spawn(command, args, {
cwd,
stdio: "inherit",
});
child.on("exit", (code) => {
if (code === 0) {
resolve();
return;
}
reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`));
});
child.on("error", reject);
});
}
async function main(): Promise<void> {
const config = loadConfig();
const { command, args } = await resolveBasePythonCommand(config);
const venvRoot = path.resolve(process.cwd(), config.LOCAL_AI_VENV_PATH);
const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt");
await mkdir(path.dirname(venvRoot), { recursive: true });
console.log(`가상환경 생성: ${venvRoot}`);
await run(command, [...args, "-m", "venv", venvRoot], process.cwd());
const venvPython = resolveVenvPythonPath(config);
await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], process.cwd());
await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], process.cwd());
console.log("Python STT 환경 준비 완료");
}
void main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
});