Add local MeloTTS support
This commit is contained in:
@@ -55,6 +55,15 @@ export class RealtimeSegmenter {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Empties the `pendingSamples`, `preRoll` and `speech` buffers in place and
 * puts the speech-tracking flag and frame counters back to their zero values.
 */
reset(): void {
  // Truncate rather than reassign so the same array instances stay in use.
  this.pendingSamples.length = 0;
  this.preRoll.length = 0;
  this.speech.length = 0;

  this.speechActive = false;
  this.speechCandidateFrames = 0;
  this.silenceFrames = 0;
}
|
||||
|
||||
private processFrame(frame: Int16Array): void {
|
||||
let peak = 0;
|
||||
for (const sample of frame) {
|
||||
|
||||
@@ -15,6 +15,17 @@ const envSchema = z.object({
|
||||
LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
|
||||
LOCAL_AI_PYTHON: emptyToUndefined,
|
||||
AUDIO_SOURCE: emptyToUndefined,
|
||||
TTS_ENABLED: z
|
||||
.string()
|
||||
.optional()
|
||||
.transform((value) => value?.trim().toLowerCase() !== "false"),
|
||||
TTS_IMAGE: z.string().min(1).default("realtime-voice-bot-melotts:v0.1.2"),
|
||||
TTS_LANGUAGE: z.string().min(1).default("KR"),
|
||||
TTS_SPEAKER: z.string().min(1).default("KR"),
|
||||
TTS_DEVICE: z.string().min(1).default("cpu"),
|
||||
TTS_SPEED: z.coerce.number().min(0.5).max(2).default(1),
|
||||
TTS_CACHE_DIR: z.string().min(1).default(".local-ai/tts-cache"),
|
||||
TTS_OUTPUT_DIR: z.string().min(1).default(".local-ai/tts-output"),
|
||||
DEBUG: z
|
||||
.string()
|
||||
.optional()
|
||||
|
||||
54
src/index.ts
54
src/index.ts
@@ -6,6 +6,7 @@ import { Logger } from "./logger.js";
|
||||
import { printAudioDevices, spawnLoopbackCapture } from "./audio/capture.js";
|
||||
import { RealtimeSegmenter } from "./audio/realtime-segmenter.js";
|
||||
import { FasterWhisperSttService } from "./services/faster-whisper-stt.js";
|
||||
import { MeloTtsService } from "./services/melo-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
|
||||
const mode = process.argv[2] ?? "test-stt";
|
||||
@@ -15,8 +16,10 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
|
||||
const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
|
||||
const stt = new FasterWhisperSttService(config, logger);
|
||||
const llm = enableLlm ? new OllamaLlmService(config, logger) : null;
|
||||
let tts = enableLlm && config.TTS_ENABLED ? new MeloTtsService(config, logger) : null;
|
||||
let capture = null as ReturnType<typeof spawnLoopbackCapture> | null;
|
||||
let shuttingDown: Promise<void> | null = null;
|
||||
let suppressCapture = false;
|
||||
let receivedChunks = 0;
|
||||
let receivedBytes = 0;
|
||||
let maxPeak = 0;
|
||||
@@ -79,6 +82,22 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
|
||||
logger.info("LLM warmup finished");
|
||||
console.log("LLM 준비 완료");
|
||||
}
|
||||
if (tts) {
|
||||
console.log("TTS 준비중...");
|
||||
try {
|
||||
await tts.warmup();
|
||||
logger.info("TTS warmup finished", {
|
||||
image: config.TTS_IMAGE,
|
||||
language: config.TTS_LANGUAGE,
|
||||
speaker: config.TTS_SPEAKER,
|
||||
});
|
||||
console.log("TTS 준비 완료");
|
||||
} catch (error) {
|
||||
logger.warn("TTS warmup failed", error);
|
||||
console.log("TTS 비활성화: bun run setup:tts 를 먼저 실행하세요.");
|
||||
tts = null;
|
||||
}
|
||||
}
|
||||
|
||||
const transcriptionQueue: Array<{ pcm16: Buffer; queuedAt: number; index: number }> = [];
|
||||
let transcribing = false;
|
||||
@@ -155,6 +174,20 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
|
||||
} else {
|
||||
console.log(`답변> ${reply}`);
|
||||
}
|
||||
|
||||
if (tts) {
|
||||
suppressCapture = true;
|
||||
segmenter.reset();
|
||||
try {
|
||||
await tts.speak(reply);
|
||||
} catch (error) {
|
||||
logger.warn("TTS playback failed", error);
|
||||
} finally {
|
||||
suppressCapture = false;
|
||||
sawSpeechStart = false;
|
||||
maxPeak = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
@@ -227,6 +260,9 @@ async function runSttTest(enableLlm: boolean): Promise<void> {
|
||||
receivedChunks += 1;
|
||||
receivedBytes += chunk.length;
|
||||
lastChunkAt = Date.now();
|
||||
if (suppressCapture) {
|
||||
return;
|
||||
}
|
||||
segmenter.pushChunk(chunk);
|
||||
});
|
||||
capture.stderr.on("data", (chunk: Buffer) => {
|
||||
@@ -330,6 +366,19 @@ async function runLlmCli(): Promise<void> {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Handler for the `test-tts` run mode: warms up the TTS service and speaks one sentence.
 *
 * The sentence is everything after the mode on the command line
 * (`process.argv[3..]` joined with spaces); when that is blank a built-in
 * Korean sample sentence is used instead.
 *
 * @throws Propagates any failure from `warmup()` or `speak()`.
 */
async function runTtsTest(): Promise<void> {
  const cliText = process.argv.slice(3).join(" ").trim();
  const text = cliText || "안녕하세요. 로컬 티티에스 테스트입니다.";

  const config = loadConfig();
  // Outside DEBUG mode only errors are logged.
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");
  const tts = new MeloTtsService(config, logger);

  console.log("TTS 준비중...");
  await tts.warmup();
  console.log("TTS 준비 완료");

  console.log(`재생 문장: ${text}`);
  await tts.speak(text);
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
switch (mode) {
|
||||
case "devices":
|
||||
@@ -344,8 +393,11 @@ async function main(): Promise<void> {
|
||||
case "test-llm":
|
||||
await runLlmCli();
|
||||
return;
|
||||
case "test-tts":
|
||||
await runTtsTest();
|
||||
return;
|
||||
default:
|
||||
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, devices`);
|
||||
throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: test-stt, test-sttllm, test-llm, test-tts, devices`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
42
src/services/audio-playback.ts
Normal file
42
src/services/audio-playback.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import process from "node:process";
|
||||
|
||||
/**
 * Spawns `command` with `args` and waits for it to finish.
 *
 * The child's stdin is ignored; its stdout and stderr are inherited from this process.
 *
 * @param command Executable to launch.
 * @param args Arguments passed to the executable.
 * @returns A promise that resolves once the child exits with status 0.
 * @throws Rejects with the spawn error when the process cannot be started, or with an
 *   `Error` naming the non-zero exit code or the terminating signal.
 */
async function run(command: string, args: string[]): Promise<void> {
  await new Promise<void>((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: ["ignore", "inherit", "inherit"],
      windowsHide: true,
    });

    child.on("error", reject);
    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }

      // `code` is null when the child was killed by a signal; report the signal
      // instead of the uninformative "exited with code null".
      const outcome =
        code === null && signal !== null
          ? `was terminated by signal ${signal}`
          : `exited with code ${code ?? "null"}`;
      reject(new Error(`${command} ${args.join(" ")} ${outcome}`));
    });
  });
}
|
||||
|
||||
/**
 * Quotes `value` as a PowerShell single-quoted string literal.
 *
 * Inside single quotes PowerShell performs no expansion; the only character that
 * needs escaping is the quote itself, written twice. PowerShell also accepts the
 * typographic quotes U+2018, U+2019, U+201A and U+201B as single-quote
 * delimiters, so those are doubled as well.
 */
function toPowerShellLiteral(value: string): string {
  return `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
}

/**
 * Plays a WAV file and resolves when playback has finished.
 *
 * Only Windows is supported: playback is delegated to `powershell.exe`, which
 * loads the file into a `System.Media.SoundPlayer` and calls `PlaySync()`.
 *
 * @param filePath Path of the WAV file to play.
 * @throws Rejects on any platform other than `win32`, and when PowerShell cannot be
 *   started or exits with a non-zero status.
 */
export async function playWavFile(filePath: string): Promise<void> {
  if (process.platform !== "win32") {
    throw new Error(`지원하지 않는 플랫폼입니다: ${process.platform}`);
  }

  // The path is embedded in the script text as a quoted literal. When -Command is
  // given a string, PowerShell appends every following command-line argument to
  // that string instead of exposing it through $args, so a trailing path argument
  // would leave $args[0] empty and turn the command into a parse error.
  const script = [
    `$player = New-Object System.Media.SoundPlayer ${toPowerShellLiteral(filePath)}`,
    "$player.Load()",
    "$player.PlaySync()",
  ].join("; ");

  await run("powershell.exe", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    script,
  ]);
}
|
||||
113
src/services/melo-tts.ts
Normal file
113
src/services/melo-tts.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { mkdir, rm } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { Logger } from "../logger.js";
|
||||
import { playWavFile } from "./audio-playback.js";
|
||||
|
||||
/**
 * Spawns `command` with `args` and waits for it to finish.
 *
 * The child's stdin is ignored and its stderr is inherited from this process.
 *
 * @param command Executable to launch.
 * @param args Arguments passed to the executable.
 * @param stdio What to do with the child's stdout: discard it (`"ignore"`, the
 *   default) or forward it to this process's stdout (`"inherit"`).
 * @returns A promise that resolves once the child exits with status 0.
 * @throws Rejects with the spawn error when the process cannot be started, or with an
 *   `Error` naming the non-zero exit code or the terminating signal.
 */
async function run(command: string, args: string[], stdio: "ignore" | "inherit" = "ignore"): Promise<void> {
  await new Promise<void>((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: ["ignore", stdio, "inherit"],
      windowsHide: true,
    });

    child.on("error", reject);
    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }

      // `code` is null when the child was killed by a signal; report the signal
      // instead of the uninformative "exited with code null".
      const outcome =
        code === null && signal !== null
          ? `was terminated by signal ${signal}`
          : `exited with code ${code ?? "null"}`;
      reject(new Error(`${command} ${args.join(" ")} ${outcome}`));
    });
  });
}
|
||||
|
||||
/**
 * Text-to-speech service that synthesizes speech by running the configured
 * MeloTTS image through the `docker` CLI and plays the resulting WAV file.
 */
export class MeloTtsService {
  constructor(
    private readonly config: AppConfig,
    private readonly logger: Logger,
  ) {}

  /** Resolves a configured directory against the current working directory. */
  private fromCwd(...segments: string[]): string {
    return path.resolve(process.cwd(), ...segments);
  }

  /**
   * Prepares the local environment for synthesis.
   *
   * Creates the cache and output directories, then runs `docker --version` and
   * `docker image inspect <TTS_IMAGE>`; either command exiting non-zero (or
   * failing to start) rejects.
   */
  async warmup(): Promise<void> {
    const { TTS_CACHE_DIR, TTS_OUTPUT_DIR, TTS_IMAGE } = this.config;

    await mkdir(this.fromCwd(TTS_CACHE_DIR), { recursive: true });
    await mkdir(this.fromCwd(TTS_OUTPUT_DIR), { recursive: true });

    await run("docker", ["--version"]);
    await run("docker", ["image", "inspect", TTS_IMAGE]);
  }

  /**
   * Synthesizes `text` to a uniquely named WAV file in the output directory,
   * plays it, and removes the file afterwards whether or not playback succeeded.
   *
   * @param text Sentence to speak; blank or whitespace-only input is a no-op.
   * @throws Propagates synthesis and playback failures.
   */
  async speak(text: string): Promise<void> {
    const utterance = text.trim();
    if (utterance.length === 0) {
      return;
    }

    const wavPath = this.fromCwd(
      this.config.TTS_OUTPUT_DIR,
      `tts-${Date.now()}-${randomUUID()}.wav`,
    );

    try {
      await this.synthesizeToFile(utterance, wavPath);
      await playWavFile(wavPath);
    } finally {
      // Best-effort cleanup: a failed delete must not mask the real outcome.
      await rm(wavPath, { force: true }).catch(() => undefined);
    }
  }

  /**
   * Runs the MeloTTS container to render `text` into `targetPath`.
   *
   * The directory containing `targetPath` is bind-mounted at `/work/output` and
   * the cache directory at `/cache`; the container is told to write
   * `/work/output/<basename of targetPath>`.
   *
   * @param text Text handed to the container's `--text` option.
   * @param targetPath Host path where the WAV file should appear.
   * @throws Rejects when warmup fails or `docker run` exits non-zero.
   */
  async synthesizeToFile(text: string, targetPath: string): Promise<void> {
    await this.warmup();

    const {
      TTS_IMAGE: image,
      TTS_LANGUAGE: language,
      TTS_SPEAKER: speaker,
      TTS_SPEED: speed,
      TTS_DEVICE: device,
    } = this.config;

    const hostOutputDir = path.dirname(targetPath);
    const hostCacheDir = this.fromCwd(this.config.TTS_CACHE_DIR);
    const containerOutput = `/work/output/${path.basename(targetPath)}`;

    await mkdir(hostOutputDir, { recursive: true });

    const dockerArgs = [
      "run",
      "--rm",
      "-v",
      `${hostOutputDir}:/work/output`,
      "-v",
      `${hostCacheDir}:/cache`,
      "-e",
      "HF_HOME=/cache/huggingface",
      "-e",
      "HF_HUB_CACHE=/cache/huggingface/hub",
      "-e",
      "TRANSFORMERS_CACHE=/cache/transformers",
      // Any device other than "cpu" requests GPU access for the container.
      ...(device !== "cpu" ? ["--gpus", "all"] : []),
      image,
      "--text",
      text,
      "--output",
      containerOutput,
      "--language",
      language,
      "--speaker",
      speaker,
      "--speed",
      String(speed),
      "--device",
      device,
    ];

    this.logger.info("Starting MeloTTS synthesis", { image, language, speaker, speed, device });

    await run("docker", dockerArgs, "inherit");
  }
}
|
||||
@@ -374,10 +374,12 @@ export class OllamaLlmService {
|
||||
"bun run setup",
|
||||
"bun run setup:stt",
|
||||
"bun run setup:llm",
|
||||
"bun run setup:tts",
|
||||
"bun run devices",
|
||||
"bun run test:stt",
|
||||
"bun run test:sttllm",
|
||||
"bun run test:llm",
|
||||
"bun run test:tts -- \"안녕하세요\"",
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
60
src/setup-tts.ts
Normal file
60
src/setup-tts.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
import process from "node:process";
|
||||
import { mkdir, rm } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
import { loadConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { MeloTtsService } from "./services/melo-tts.js";
|
||||
|
||||
/**
 * Spawns `command` with `args` and waits for it to finish.
 *
 * The child shares this process's stdin, stdout and stderr.
 *
 * @param command Executable to launch.
 * @param args Arguments passed to the executable.
 * @param cwd Working directory for the child; defaults to the current working directory.
 * @returns A promise that resolves once the child exits with status 0.
 * @throws Rejects with the spawn error when the process cannot be started, or with an
 *   `Error` naming the non-zero exit code or the terminating signal.
 */
async function run(command: string, args: string[], cwd = process.cwd()): Promise<void> {
  await new Promise<void>((resolve, reject) => {
    const child = spawn(command, args, {
      cwd,
      stdio: "inherit",
      windowsHide: true,
    });

    child.on("error", reject);
    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }

      // `code` is null when the child was killed by a signal; report the signal
      // instead of the uninformative "exited with code null".
      const outcome =
        code === null && signal !== null
          ? `was terminated by signal ${signal}`
          : `exited with code ${code ?? "null"}`;
      reject(new Error(`${command} ${args.join(" ")} ${outcome}`));
    });
  });
}
|
||||
|
||||
/**
 * Prepares the local TTS environment.
 *
 * Creates the cache and output directories, builds the Docker image tagged
 * `TTS_IMAGE` from `docker/melotts` under the current working directory, then
 * synthesizes one sample sentence to `warmup.wav` in the output directory. The
 * sample file is removed afterwards whether or not synthesis succeeded.
 *
 * @throws Propagates failures from directory creation, `docker build` and synthesis.
 */
export async function setupTts(): Promise<void> {
  const config = loadConfig();
  // Outside DEBUG mode only errors are logged.
  const logger = new Logger(config.DEBUG ? config.LOG_LEVEL : "error");

  const projectRoot = process.cwd();
  const dockerContext = path.resolve(projectRoot, "docker", "melotts");
  const cacheDir = path.resolve(projectRoot, config.TTS_CACHE_DIR);
  const outputDir = path.resolve(projectRoot, config.TTS_OUTPUT_DIR);

  for (const dir of [cacheDir, outputDir]) {
    await mkdir(dir, { recursive: true });
  }

  console.log(`MeloTTS Docker 이미지 빌드: ${config.TTS_IMAGE}`);
  await run("docker", ["build", "-t", config.TTS_IMAGE, dockerContext]);

  const tts = new MeloTtsService(config, logger);
  const sampleWav = path.join(outputDir, "warmup.wav");

  console.log("MeloTTS 모델 워밍업...");
  try {
    await tts.synthesizeToFile("안녕하세요. 로컬 티티에스 준비 테스트입니다.", sampleWav);
  } finally {
    // Best-effort cleanup of the throwaway sample.
    await rm(sampleWav, { force: true }).catch(() => undefined);
  }

  console.log("로컬 TTS 환경 준비 완료");
}
|
||||
|
||||
// Run the setup only when `import.meta.main` is set (in Bun: this file is the
// one being executed directly, not merely imported). A failure prints its
// message and exits with status 1.
if (import.meta.main) {
  void setupTts().catch((failure) => {
    const message = failure instanceof Error ? failure.message : String(failure);
    console.error(message);
    process.exit(1);
  });
}
|
||||
@@ -2,10 +2,12 @@ import process from "node:process";
|
||||
|
||||
import { setupLlm } from "./setup-llm.js";
|
||||
import { setupSttPython } from "./setup-python.js";
|
||||
import { setupTts } from "./setup-tts.js";
|
||||
|
||||
/**
 * Runs the STT, LLM and TTS setup steps one after another, in that order.
 * The first step that rejects aborts the sequence.
 */
async function main(): Promise<void> {
  for (const step of [setupSttPython, setupLlm, setupTts]) {
    await step();
  }
}
|
||||
|
||||
if (import.meta.main) {
|
||||
|
||||
Reference in New Issue
Block a user