Replace ElevenLabs with local STT and TTS
This commit is contained in:
@@ -22,9 +22,9 @@ import type { AppConfig } from "../config.js";
|
||||
import { Logger } from "../logger.js";
|
||||
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
|
||||
import type { LlmService } from "../services/llm.js";
|
||||
import type { SttService } from "../services/stt.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "../services/tts.js";
|
||||
|
||||
interface GuildVoiceSessionOptions {
|
||||
client: Client;
|
||||
@@ -33,8 +33,8 @@ interface GuildVoiceSessionOptions {
|
||||
guild: Guild;
|
||||
voiceChannel: VoiceBasedChannel;
|
||||
textChannelId?: string;
|
||||
stt: ElevenLabsSttService;
|
||||
tts: ElevenLabsTtsService;
|
||||
stt: SttService;
|
||||
tts: TtsService;
|
||||
llm: LlmService;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,15 +12,15 @@ import { Logger } from "../logger.js";
|
||||
import { requireFfmpegPath } from "./ffmpeg-path.js";
|
||||
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
|
||||
import type { LlmService } from "../services/llm.js";
|
||||
import type { SttService } from "../services/stt.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "../services/tts.js";
|
||||
|
||||
interface LocalVoiceSessionOptions {
|
||||
config: AssistantRuntimeConfig;
|
||||
logger: Logger;
|
||||
stt: ElevenLabsSttService;
|
||||
tts: ElevenLabsTtsService;
|
||||
stt: SttService;
|
||||
tts: TtsService;
|
||||
llm: LlmService;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,14 +15,21 @@ const envSchema = z.object({
|
||||
DISCORD_BOT_TOKEN: emptyToUndefined,
|
||||
DISCORD_APPLICATION_ID: emptyToUndefined,
|
||||
DISCORD_COMMAND_GUILD_ID: emptyToUndefined,
|
||||
ELEVENLABS_API_KEY: emptyToUndefined,
|
||||
ELEVENLABS_VOICE_ID: emptyToUndefined,
|
||||
ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
|
||||
ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
|
||||
OLLAMA_BASE_URL: z.string().min(1).default("http://localhost:11434"),
|
||||
OLLAMA_MODEL: z.string().min(1).default("qwen3:0.6b"),
|
||||
OLLAMA_KEEP_ALIVE: z.string().min(1).default("5m"),
|
||||
OLLAMA_NUM_CTX: z.coerce.number().int().min(512).max(32768).default(4096),
|
||||
LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
|
||||
LOCAL_AI_CACHE_DIR: z.string().min(1).default(".local-ai/cache"),
|
||||
LOCAL_AI_PYTHON: emptyToUndefined,
|
||||
LOCAL_STT_MODEL: z.string().min(1).default("tiny"),
|
||||
LOCAL_STT_DEVICE: z.string().min(1).default("auto"),
|
||||
LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"),
|
||||
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1),
|
||||
LOCAL_TTS_LANGUAGE: z.string().min(1).default("KR"),
|
||||
LOCAL_TTS_SPEAKER: z.string().min(1).default("KR"),
|
||||
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
|
||||
LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12),
|
||||
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
|
||||
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
|
||||
LOCAL_AUDIO_SOURCE: emptyToUndefined,
|
||||
@@ -36,10 +43,7 @@ const envSchema = z.object({
|
||||
});
|
||||
|
||||
export type AppConfig = z.infer<typeof envSchema>;
|
||||
export type AssistantRuntimeConfig = AppConfig & {
|
||||
ELEVENLABS_API_KEY: string;
|
||||
ELEVENLABS_VOICE_ID: string;
|
||||
};
|
||||
export type AssistantRuntimeConfig = AppConfig;
|
||||
export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
|
||||
DISCORD_BOT_TOKEN: string;
|
||||
DISCORD_APPLICATION_ID: string;
|
||||
@@ -57,11 +61,7 @@ function requirePresent(value: string | undefined, name: string): string {
|
||||
}
|
||||
|
||||
export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
|
||||
return {
|
||||
...config,
|
||||
ELEVENLABS_API_KEY: requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY"),
|
||||
ELEVENLABS_VOICE_ID: requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID"),
|
||||
};
|
||||
return config;
|
||||
}
|
||||
|
||||
export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
|
||||
|
||||
@@ -15,8 +15,8 @@ import { Client as DiscordClient } from "discord.js";
|
||||
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { type DiscordRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalMeloTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
|
||||
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
|
||||
@@ -37,11 +37,14 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
|
||||
intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
|
||||
});
|
||||
|
||||
const stt = new ElevenLabsSttService(config);
|
||||
const tts = new ElevenLabsTtsService(config);
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts = new LocalMeloTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
await stt.warmup();
|
||||
await tts.warmup();
|
||||
|
||||
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
|
||||
const member = interaction.member as GuildMember | null;
|
||||
return member?.voice.channel ?? null;
|
||||
@@ -174,6 +177,7 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger
|
||||
});
|
||||
}
|
||||
sessions.clear();
|
||||
await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]);
|
||||
await client.destroy();
|
||||
process.exit(exitCode);
|
||||
}
|
||||
|
||||
@@ -5,8 +5,8 @@ import type { AssistantRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { LocalVoiceSession } from "./audio/local-voice-session.js";
|
||||
import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
|
||||
import { LocalMeloTtsService } from "./services/local-tts.js";
|
||||
import { OllamaLlmService } from "./services/ollama-llm.js";
|
||||
|
||||
export async function printLocalAudioDevices(): Promise<void> {
|
||||
@@ -67,9 +67,13 @@ export async function printLocalAudioDevices(): Promise<void> {
|
||||
}
|
||||
|
||||
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
|
||||
const stt = new ElevenLabsSttService(config);
|
||||
const tts = new ElevenLabsTtsService(config);
|
||||
const stt = new LocalFasterWhisperSttService(config, logger);
|
||||
const tts = new LocalMeloTtsService(config, logger);
|
||||
const llm = new OllamaLlmService(config);
|
||||
|
||||
await stt.warmup();
|
||||
await tts.warmup();
|
||||
|
||||
const session = new LocalVoiceSession({
|
||||
config,
|
||||
logger,
|
||||
@@ -91,6 +95,7 @@ export async function runLocalAssistant(config: AssistantRuntimeConfig, logger:
|
||||
await session.destroy().catch((error) => {
|
||||
logger.warn("Local session shutdown failed", error);
|
||||
});
|
||||
await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]);
|
||||
process.exit(exitCode);
|
||||
};
|
||||
|
||||
|
||||
90
src/python-runtime.ts
Normal file
90
src/python-runtime.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { spawnSync } from "node:child_process";
|
||||
import path from "node:path";
|
||||
|
||||
import type { AppConfig } from "./config.js";
|
||||
|
||||
/**
 * Describes how to start a Python interpreter: the executable plus the
 * arguments that must come before any script path.
 */
export interface PythonLaunch {
  /** Executable name or path to spawn. */
  command: string;
  /** Arguments placed directly after `command` (for example `-3` for the `py` launcher). */
  args: string[];
  /**
   * Which lookup step produced this launch: the project virtual environment,
   * the `LOCAL_AI_PYTHON` setting, or a system-wide candidate.
   */
  source: "venv" | "configured" | "system";
}
|
||||
|
||||
/**
 * Splits a command specification such as `py -3` or
 * `"C:\Program Files\Python\python.exe" -u` into an argv-style token list.
 *
 * Tokens are separated by whitespace. A double-quoted segment may contain
 * whitespace and may appear anywhere inside a token; the quote characters
 * themselves are dropped from the result.
 *
 * @param spec Raw command text, e.g. the value of `LOCAL_AI_PYTHON`.
 * @returns The tokens in order, or an empty array when `spec` holds no token.
 */
function splitCommandSpec(spec: string): string[] {
  const tokens = spec.match(/(?:[^\s"]+|"[^"]*")+/g) ?? [];
  // Unwrap every balanced quoted segment, not just quotes at the token edges,
  // so `C:\"Program Files"\python.exe` and `--opt="a b"` lose their quotes too.
  return tokens.map((token) => token.replace(/"([^"]*)"/g, "$1"));
}
|
||||
|
||||
/**
 * Reports whether `command` can be executed by running it with `--version`
 * appended to `args` and checking for exit status 0.
 *
 * The probe is synchronous, so it is bounded by a timeout: a candidate that
 * never exits counts as not runnable instead of blocking the caller forever.
 *
 * @param command Executable name or path to probe.
 * @param args Arguments placed before `--version`.
 * @returns `true` only when the process started and exited with status 0.
 */
function canRun(command: string, args: string[]): boolean {
  const result = spawnSync(command, [...args, "--version"], {
    // The version text is never read, so there is no need to capture it.
    stdio: "ignore",
    timeout: 10_000,
    windowsHide: true,
  });
  // A failed spawn or a timeout leaves `status` as null, which is not 0.
  return result.status === 0;
}
|
||||
|
||||
/**
 * Returns the absolute path of the local-AI virtual environment directory.
 *
 * @param config Application configuration providing `LOCAL_AI_VENV_PATH`.
 * @returns `LOCAL_AI_VENV_PATH` resolved against the current working directory.
 */
export function resolveLocalAiVenvPath(config: AppConfig): string {
  const { LOCAL_AI_VENV_PATH: configuredPath } = config;
  const workingDir = process.cwd();
  return path.resolve(workingDir, configuredPath);
}
|
||||
|
||||
/**
 * Returns the absolute path of the local-AI cache directory.
 *
 * @param config Application configuration providing `LOCAL_AI_CACHE_DIR`.
 * @returns `LOCAL_AI_CACHE_DIR` resolved against the current working directory.
 */
export function resolveLocalAiCachePath(config: AppConfig): string {
  const cacheDir = config.LOCAL_AI_CACHE_DIR;
  const absoluteCacheDir = path.resolve(process.cwd(), cacheDir);
  return absoluteCacheDir;
}
|
||||
|
||||
/**
 * Returns the expected location of the Python interpreter inside the
 * local-AI virtual environment.
 *
 * @param config Application configuration used to locate the virtual environment.
 * @returns `<venv>/Scripts/python.exe` on Windows, `<venv>/bin/python` elsewhere.
 *   The path is computed only; its existence is not checked here.
 */
export function resolveVenvPythonPath(config: AppConfig): string {
  const venvRoot = resolveLocalAiVenvPath(config);

  if (process.platform === "win32") {
    return path.join(venvRoot, "Scripts", "python.exe");
  }

  return path.join(venvRoot, "bin", "python");
}
|
||||
|
||||
/**
 * Picks the Python interpreter to launch, trying in order:
 *
 * 1. the virtual-environment interpreter, when `preferVenv` is not disabled
 *    and the file exists;
 * 2. the command given in `LOCAL_AI_PYTHON`, when it runs successfully;
 * 3. platform-specific system candidates (`py -3`, `python`, `python3` on
 *    Windows; `python3`, `python` elsewhere), first one that runs wins.
 *
 * @param config Application configuration (`LOCAL_AI_PYTHON` and venv location).
 * @param options `preferVenv` defaults to `true`; pass `false` to skip step 1.
 * @returns The command, its leading arguments, and which step selected it.
 * @throws Error with setup instructions when no candidate can be run.
 */
export function resolvePythonLaunch(config: AppConfig, options?: { preferVenv?: boolean }): PythonLaunch {
  const venvAllowed = options?.preferVenv ?? true;
  const venvInterpreter = resolveVenvPythonPath(config);

  // The venv interpreter is accepted on existence alone; it is not probed.
  if (venvAllowed && existsSync(venvInterpreter)) {
    return { command: venvInterpreter, args: [], source: "venv" };
  }

  if (config.LOCAL_AI_PYTHON) {
    const [configuredCommand, ...configuredArgs] = splitCommandSpec(config.LOCAL_AI_PYTHON);
    if (configuredCommand !== undefined && canRun(configuredCommand, configuredArgs)) {
      return { command: configuredCommand, args: configuredArgs, source: "configured" };
    }
  }

  // Each candidate is a non-empty tuple: executable first, then its fixed arguments.
  const systemCandidates: ReadonlyArray<readonly [string, ...string[]]> =
    process.platform === "win32"
      ? [["py", "-3"], ["python"], ["python3"]]
      : [["python3"], ["python"]];

  for (const [systemCommand, ...systemArgs] of systemCandidates) {
    if (canRun(systemCommand, systemArgs)) {
      return { command: systemCommand, args: systemArgs, source: "system" };
    }
  }

  const guidance = [
    "Python 실행 파일을 찾지 못했습니다.",
    "1. Python 3.11 이상을 설치",
    "2. 필요하면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 설정",
    "3. 그 다음 `bun run setup:local-ai` 실행",
  ];
  throw new Error(guidance.join("\n"));
}
|
||||
@@ -1,124 +0,0 @@
|
||||
import WebSocket from "ws";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
|
||||
interface ElevenLabsMessage {
|
||||
message_type?: string;
|
||||
text?: string;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
const NON_FATAL_ERROR_TYPES = new Set([
|
||||
"insufficient_audio_activity",
|
||||
]);
|
||||
|
||||
export class ElevenLabsSttService {
|
||||
constructor(private readonly config: AssistantRuntimeConfig) {}
|
||||
|
||||
async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
|
||||
if (pcm16MonoAudio.byteLength === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
|
||||
url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL);
|
||||
url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
|
||||
url.searchParams.set("audio_format", "pcm_16000");
|
||||
url.searchParams.set("commit_strategy", "manual");
|
||||
url.searchParams.set("include_timestamps", "false");
|
||||
url.searchParams.set("include_language_detection", "false");
|
||||
url.searchParams.set("enable_logging", "false");
|
||||
|
||||
return await new Promise<string | null>((resolve, reject) => {
|
||||
const socket = new WebSocket(url, {
|
||||
headers: {
|
||||
"xi-api-key": this.config.ELEVENLABS_API_KEY,
|
||||
},
|
||||
});
|
||||
|
||||
let settled = false;
|
||||
let lastTranscript = "";
|
||||
|
||||
const timeout = setTimeout(() => {
|
||||
finish(lastTranscript || null);
|
||||
}, 15_000);
|
||||
|
||||
const finish = (result: string | null, error?: Error) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
settled = true;
|
||||
clearTimeout(timeout);
|
||||
try {
|
||||
socket.close();
|
||||
} catch {
|
||||
// Ignore close race.
|
||||
}
|
||||
|
||||
if (error) {
|
||||
reject(error);
|
||||
return;
|
||||
}
|
||||
resolve(result);
|
||||
};
|
||||
|
||||
socket.on("message", (raw) => {
|
||||
let message: ElevenLabsMessage;
|
||||
try {
|
||||
message = JSON.parse(raw.toString()) as ElevenLabsMessage;
|
||||
} catch (error) {
|
||||
finish(null, error as Error);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (message.message_type) {
|
||||
case "session_started":
|
||||
socket.send(
|
||||
JSON.stringify({
|
||||
message_type: "input_audio_chunk",
|
||||
audio_base_64: pcm16MonoAudio.toString("base64"),
|
||||
commit: true,
|
||||
sample_rate: 16000,
|
||||
}),
|
||||
);
|
||||
return;
|
||||
case "partial_transcript":
|
||||
return;
|
||||
case "committed_transcript":
|
||||
case "committed_transcript_with_timestamps": {
|
||||
const transcript = message.text?.trim() ?? "";
|
||||
if (transcript.length > 0) {
|
||||
lastTranscript = transcript;
|
||||
finish(transcript);
|
||||
}
|
||||
return;
|
||||
}
|
||||
default:
|
||||
if (!message.message_type?.endsWith("error") && !message.message_type) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (message.message_type && NON_FATAL_ERROR_TYPES.has(message.message_type)) {
|
||||
finish(null);
|
||||
return;
|
||||
}
|
||||
|
||||
finish(
|
||||
null,
|
||||
new Error(message.error ?? `ElevenLabs STT error: ${message.message_type ?? "unknown"}`),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
socket.on("error", (error) => {
|
||||
finish(null, error as Error);
|
||||
});
|
||||
|
||||
socket.on("close", () => {
|
||||
if (!settled) {
|
||||
finish(lastTranscript || null);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,78 +0,0 @@
|
||||
import { Readable } from "node:stream";
|
||||
|
||||
import prism from "prism-media";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
|
||||
|
||||
export interface PreparedSpeechAudio {
|
||||
stream: Readable;
|
||||
dispose: () => void;
|
||||
}
|
||||
|
||||
export class ElevenLabsTtsService {
|
||||
constructor(private readonly config: AssistantRuntimeConfig) {
|
||||
const resolvedFfmpegPath = resolveFfmpegPath();
|
||||
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
|
||||
process.env.FFMPEG_PATH = resolvedFfmpegPath;
|
||||
}
|
||||
}
|
||||
|
||||
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
|
||||
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
|
||||
url.searchParams.set("output_format", "mp3_44100_128");
|
||||
url.searchParams.set("enable_logging", "false");
|
||||
|
||||
const response = await fetch(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"xi-api-key": this.config.ELEVENLABS_API_KEY,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
text,
|
||||
model_id: this.config.ELEVENLABS_TTS_MODEL,
|
||||
language_code: this.config.BOT_DEFAULT_LANGUAGE,
|
||||
voice_settings: {
|
||||
stability: 0.35,
|
||||
similarity_boost: 0.75,
|
||||
speed: 1.05,
|
||||
},
|
||||
}),
|
||||
signal,
|
||||
});
|
||||
|
||||
if (!response.ok || !response.body) {
|
||||
throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
|
||||
}
|
||||
|
||||
const input = Readable.fromWeb(response.body as never);
|
||||
const ffmpeg = new prism.FFmpeg({
|
||||
args: [
|
||||
"-analyzeduration",
|
||||
"0",
|
||||
"-loglevel",
|
||||
"0",
|
||||
"-i",
|
||||
"pipe:0",
|
||||
"-f",
|
||||
"s16le",
|
||||
"-ar",
|
||||
"48000",
|
||||
"-ac",
|
||||
"2",
|
||||
"pipe:1",
|
||||
],
|
||||
});
|
||||
|
||||
input.pipe(ffmpeg);
|
||||
|
||||
return {
|
||||
stream: ffmpeg,
|
||||
dispose: () => {
|
||||
input.destroy();
|
||||
ffmpeg.destroy();
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
43
src/services/local-stt.ts
Normal file
43
src/services/local-stt.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import type { Logger } from "../logger.js";
|
||||
import { PythonJsonWorker } from "./python-json-worker.js";
|
||||
import type { SttService } from "./stt.js";
|
||||
|
||||
/** Result payload of the worker's `transcribe` request. */
interface TranscribeResult {
  /** Recognized text; when absent the utterance is treated as empty. */
  text?: string;
}
|
||||
|
||||
/**
 * Speech-to-text service that delegates to the `local_stt_worker.py` Python
 * worker through {@link PythonJsonWorker}.
 *
 * The worker receives the `LOCAL_STT_*` settings as environment variables.
 * NOTE(review): the class name implies faster-whisper; the worker script is
 * not visible here, so confirm against `python/local_stt_worker.py`.
 */
export class LocalFasterWhisperSttService implements SttService {
  private readonly worker: PythonJsonWorker;

  /**
   * @param config Runtime configuration; supplies the `LOCAL_STT_*` worker
   *   settings and the language sent with each transcription.
   * @param logger Logger handed to the worker wrapper.
   */
  constructor(private readonly config: AssistantRuntimeConfig, logger: Logger) {
    this.worker = new PythonJsonWorker(config, logger, "local_stt_worker.py", "local-stt", {
      LOCAL_STT_MODEL: config.LOCAL_STT_MODEL,
      LOCAL_STT_DEVICE: config.LOCAL_STT_DEVICE,
      LOCAL_STT_COMPUTE_TYPE: config.LOCAL_STT_COMPUTE_TYPE,
      LOCAL_STT_BEAM_SIZE: String(config.LOCAL_STT_BEAM_SIZE),
    });
  }

  /**
   * Sends a `ping` request, which starts the worker process if it is not
   * already running, and waits for the reply.
   */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Transcribes one audio buffer.
   *
   * @param pcm16MonoAudio Audio bytes, sent to the worker base64-encoded.
   *   Presumably 16-bit mono PCM per the parameter name; the sample rate is
   *   not established in this file — confirm against the caller.
   * @returns The trimmed transcript, or `null` for empty input or when the
   *   worker returns no text.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    // The worker protocol allows a successful reply without a result, so the
    // payload is treated as possibly undefined rather than dereferenced blindly.
    const result = await this.worker.request<TranscribeResult | undefined>("transcribe", {
      audio_base64: pcm16MonoAudio.toString("base64"),
      language: this.config.BOT_DEFAULT_LANGUAGE,
    });

    const text = result?.text;
    const transcript = typeof text === "string" ? text.trim() : "";
    return transcript.length > 0 ? transcript : null;
  }

  /** Stops the worker process; pending requests are rejected by the worker wrapper. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}
|
||||
94
src/services/local-tts.ts
Normal file
94
src/services/local-tts.ts
Normal file
@@ -0,0 +1,94 @@
|
||||
import { Readable } from "node:stream";
|
||||
|
||||
import prism from "prism-media";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import type { Logger } from "../logger.js";
|
||||
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
|
||||
import { PythonJsonWorker } from "./python-json-worker.js";
|
||||
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
|
||||
|
||||
/** Result payload of the worker's `synthesize` request. */
interface SynthesizeResult {
  /**
   * Base64-encoded audio produced by the worker. The field name suggests a
   * WAV container; the worker script is not visible here — confirm there.
   */
  wav_base64?: string;
}
|
||||
|
||||
/**
 * Text-to-speech service that asks the `local_tts_worker.py` Python worker
 * for audio and converts it with ffmpeg into raw PCM for playback.
 *
 * The worker receives the `LOCAL_TTS_*` settings as environment variables.
 * NOTE(review): the class name implies MeloTTS; the worker script is not
 * visible here, so confirm against `python/local_tts_worker.py`.
 */
export class LocalMeloTtsService implements TtsService {
  private readonly worker: PythonJsonWorker;

  /**
   * @param config Runtime configuration supplying the `LOCAL_TTS_*` worker settings.
   * @param logger Logger handed to the worker wrapper.
   */
  constructor(config: AssistantRuntimeConfig, logger: Logger) {
    // Publish the resolved ffmpeg location unless the environment already
    // names one — presumably so the ffmpeg wrapper can find the binary.
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }

    this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
      LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
      LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
      LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
      LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED),
    });
  }

  /**
   * Sends a `ping` request, which starts the worker process if it is not
   * already running, and waits for the reply.
   */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Synthesizes `text` and returns a stream of raw signed 16-bit
   * little-endian PCM at 48 kHz, 2 channels (per the ffmpeg arguments).
   *
   * @param text Text to speak.
   * @param signal Optional abort signal. Aborting rejects a pending synthesis
   *   and, once playback is prepared, tears down the streams.
   * @returns The PCM stream and a `dispose` callback that releases it.
   *   `dispose` is idempotent.
   * @throws Error when the worker returns no audio or the request was aborted.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    // The worker protocol allows a successful reply without a result.
    const result = await this.worker.request<SynthesizeResult | undefined>("synthesize", { text }, signal);

    const wavBase64 = result?.wav_base64;
    if (!wavBase64) {
      throw new Error("로컬 TTS가 빈 오디오를 반환했습니다.");
    }

    // An "abort" listener added to an already-aborted signal never fires, so
    // check before spawning ffmpeg rather than relying on the listener below.
    if (signal?.aborted) {
      throw new Error("local-tts request aborted");
    }

    const input = Readable.from([Buffer.from(wavBase64, "base64")]);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    let released = false;
    // One teardown path shared by abort and dispose; it also detaches the
    // abort listener so a long-lived signal does not keep the streams alive.
    const release = (): void => {
      if (released) {
        return;
      }
      released = true;
      signal?.removeEventListener("abort", release);
      input.destroy();
      ffmpeg.destroy();
    };

    signal?.addEventListener("abort", release, { once: true });

    input.pipe(ffmpeg);

    return {
      stream: ffmpeg,
      dispose: release,
    };
  }

  /** Stops the worker process; pending requests are rejected by the worker wrapper. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}
|
||||
189
src/services/python-json-worker.ts
Normal file
189
src/services/python-json-worker.ts
Normal file
@@ -0,0 +1,189 @@
|
||||
import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process";
|
||||
import { createInterface } from "node:readline";
|
||||
import path from "node:path";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import type { Logger } from "../logger.js";
|
||||
import { resolveLocalAiCachePath, resolvePythonLaunch } from "../python-runtime.js";
|
||||
|
||||
/** One request sent to the Python worker, serialized as a single JSON line on its stdin. */
interface WorkerRequest {
  /** Correlation id; the worker's response carries the same id. */
  id: number;
  /** Name of the operation the worker should perform (e.g. `ping`). */
  method: string;
  /** Operation arguments; must be JSON-serializable. */
  params: Record<string, unknown>;
}
|
||||
|
||||
/** One response read from the Python worker, parsed from a single JSON line on its stdout. */
interface WorkerResponse {
  /** Id of the request this response answers. */
  id: number;
  /** Truthy when the request succeeded; otherwise the request is rejected. */
  ok: boolean;
  /** Success payload handed to the caller as the request's result. */
  result?: unknown;
  /** Failure message used for the rejection error when `ok` is falsy. */
  error?: string;
}
|
||||
|
||||
/**
 * Runs a Python script from the `python/` directory as a long-lived child
 * process and talks to it with line-delimited JSON: each request is one JSON
 * line on the child's stdin, each response one JSON line on its stdout,
 * matched to the request by `id`.
 *
 * The child is started lazily on the first request and started again on a
 * later request if it has exited. Its stderr is logged and the most recent
 * chunks are attached to the error raised when the child exits.
 */
export class PythonJsonWorker {
  private child: ChildProcessWithoutNullStreams | null = null;
  private nextId = 1;
  /** Requests awaiting a response, keyed by request id. */
  private readonly pending = new Map<
    number,
    {
      resolve: (value: unknown) => void;
      reject: (error: Error) => void;
    }
  >();

  /**
   * @param config Runtime configuration used to locate Python and the cache directory.
   * @param logger Receives worker stderr output and protocol warnings.
   * @param scriptName File name of the worker script inside `python/`.
   * @param label Short name used as a prefix in log and error messages.
   * @param workerEnv Extra environment variables for the child process.
   */
  constructor(
    private readonly config: AssistantRuntimeConfig,
    private readonly logger: Logger,
    private readonly scriptName: string,
    private readonly label: string,
    private readonly workerEnv: Record<string, string>,
  ) {}

  /**
   * Sends one request to the worker and resolves with its result.
   *
   * @param method Operation name understood by the worker script.
   * @param params JSON-serializable arguments.
   * @param signal Optional abort signal. Aborting rejects the returned promise;
   *   the worker is not told and may still finish the work.
   * @returns The `result` field of the worker's response, cast to `T` without validation.
   * @throws Error when the request is aborted, the worker reports failure,
   *   the request cannot be written, or the worker exits or is destroyed first.
   */
  async request<T>(method: string, params: Record<string, unknown>, signal?: AbortSignal): Promise<T> {
    // Check before starting anything: an already-aborted request should not
    // spawn the worker or consume an id.
    if (signal?.aborted) {
      throw new Error(`${this.label} request aborted before start`);
    }

    const child = this.ensureStarted();
    const id = this.nextId++;
    const message: WorkerRequest = { id, method, params };
    // Serialize before registering the request so a serialization failure
    // cannot leave a pending entry behind.
    const line = `${JSON.stringify(message)}\n`;

    return await new Promise<T>((resolve, reject) => {
      const abortHandler = (): void => {
        this.pending.delete(id);
        reject(new Error(`${this.label} request aborted`));
      };
      const detachAbort = (): void => {
        signal?.removeEventListener("abort", abortHandler);
      };

      signal?.addEventListener("abort", abortHandler, { once: true });

      this.pending.set(id, {
        resolve: (value) => {
          detachAbort();
          resolve(value as T);
        },
        reject: (error) => {
          detachAbort();
          reject(error);
        },
      });

      child.stdin.write(line, (error) => {
        if (!error) {
          return;
        }
        // The request never reached the worker; fail it now instead of
        // leaving it pending.
        const entry = this.pending.get(id);
        if (entry) {
          this.pending.delete(id);
          entry.reject(error);
        }
      });
    });
  }

  /**
   * Rejects all pending requests and stops the child process.
   *
   * Sends SIGTERM and waits for the child to exit. If it has not exited
   * within 1.5 seconds it is sent SIGKILL and the call returns without
   * waiting further. Safe to call when no child is running.
   */
  async destroy(): Promise<void> {
    this.rejectAll(new Error(`${this.label} worker terminated`));

    const child = this.child;
    if (!child) {
      return;
    }
    this.child = null;

    if (child.exitCode !== null || child.signalCode !== null) {
      return;
    }

    await new Promise<void>((resolve) => {
      const forceKillTimer = setTimeout(() => {
        // The child ignored SIGTERM; do not leave it running after shutdown.
        child.kill("SIGKILL");
        resolve();
      }, 1_500);

      child.once("exit", () => {
        // Clear the timer so it neither fires later nor delays process exit.
        clearTimeout(forceKillTimer);
        resolve();
      });

      child.kill("SIGTERM");
    });
  }

  /**
   * Returns the running child, spawning it first when there is none.
   * Wires up response dispatch, stderr logging and exit/error handling.
   */
  private ensureStarted(): ChildProcessWithoutNullStreams {
    if (this.child) {
      return this.child;
    }

    const launch = resolvePythonLaunch(this.config);
    const scriptPath = path.resolve(process.cwd(), "python", this.scriptName);
    const cachePath = resolveLocalAiCachePath(this.config);
    // Last few stderr chunks, attached to the exit error for diagnosis.
    const recentStderr: string[] = [];

    const child = spawn(launch.command, [...launch.args, scriptPath], {
      stdio: ["pipe", "pipe", "pipe"],
      env: {
        ...process.env,
        HF_HOME: cachePath,
        TRANSFORMERS_CACHE: cachePath,
        PYTHONIOENCODING: "utf-8",
        BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE,
        ...this.workerEnv,
      },
    });

    createInterface({
      input: child.stdout,
      crlfDelay: Number.POSITIVE_INFINITY,
    }).on("line", (line) => {
      if (!line.trim()) {
        return;
      }

      let payload: WorkerResponse | null;
      try {
        payload = JSON.parse(line) as WorkerResponse | null;
      } catch (error) {
        this.logger.warn(`${this.label} stdout parse failed`, error);
        return;
      }

      // Valid JSON that is not an object (e.g. `null` or a bare number)
      // cannot be a response; ignore it instead of dereferencing it.
      if (payload === null || typeof payload !== "object") {
        return;
      }

      const pending = this.pending.get(payload.id);
      if (!pending) {
        return;
      }

      this.pending.delete(payload.id);
      if (payload.ok) {
        pending.resolve(payload.result);
        return;
      }

      pending.reject(new Error(payload.error ?? `${this.label} worker error`));
    });

    // A stream "error" event without a listener is thrown as an uncaught
    // exception; writing to a worker that has died must not crash the host.
    child.stdin.on("error", (error) => {
      this.logger.warn(`${this.label} stdin write failed`, error);
    });

    // Decode at the stream level so multi-byte UTF-8 characters that span
    // two chunks are not corrupted.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk: string) => {
      const text = chunk.trim();
      if (text.length > 0) {
        recentStderr.push(text);
        if (recentStderr.length > 20) {
          recentStderr.shift();
        }
        this.logger.warn(`[${this.label}]`, text);
      }
    });

    child.on("exit", (code, signal) => {
      if (this.child === child) {
        this.child = null;
      }

      const detail = recentStderr.length > 0 ? `\n${recentStderr.join("\n")}` : "";
      this.rejectAll(new Error(`${this.label} worker exited code=${code ?? "null"} signal=${signal ?? "null"}${detail}`));
    });

    child.on("error", (error) => {
      // "exit" is not guaranteed to follow "error" (e.g. a failed spawn), so
      // forget the child here too; otherwise later requests would be written
      // to a process that does not exist and never settle.
      if (this.child === child) {
        this.child = null;
      }
      this.rejectAll(error);
    });

    this.child = child;
    return child;
  }

  /** Rejects every pending request with `error` and empties the pending map. */
  private rejectAll(error: Error): void {
    const pending = [...this.pending.values()];
    this.pending.clear();
    for (const item of pending) {
      item.reject(error);
    }
  }
}
|
||||
4
src/services/stt.ts
Normal file
4
src/services/stt.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
/** Contract for speech-to-text backends. */
export interface SttService {
  /**
   * Transcribes one audio buffer.
   *
   * @param pcm16MonoAudio Audio bytes; presumably 16-bit mono PCM per the
   *   parameter name — the sample rate is not fixed by this interface.
   * @returns The transcript, or `null` when nothing was recognized.
   */
  transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null>;
  /** Optional cleanup hook for backends that hold resources. */
  destroy?(): Promise<void>;
}
|
||||
11
src/services/tts.ts
Normal file
11
src/services/tts.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import type { Readable } from "node:stream";
|
||||
|
||||
/** Synthesized speech that is ready to be played, plus its cleanup hook. */
export interface PreparedSpeechAudio {
  /** Readable stream carrying the audio to play. */
  stream: Readable;
  /** Releases the resources behind `stream`; call when playback ends or is cancelled. */
  dispose: () => void;
}
|
||||
|
||||
/** Contract for text-to-speech backends. */
export interface TtsService {
  /**
   * Synthesizes `text` into playable audio.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal for cancelling the synthesis.
   * @returns The audio stream together with its `dispose` callback.
   */
  preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
  /** Optional cleanup hook for backends that hold resources. */
  destroy?(): Promise<void>;
}
|
||||
88
src/setup-local-ai.ts
Normal file
88
src/setup-local-ai.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { mkdir } from "node:fs/promises";
|
||||
import { spawn } from "node:child_process";
|
||||
import path from "node:path";
|
||||
|
||||
import { loadConfig } from "./config.js";
|
||||
import { resolveLocalAiCachePath, resolveLocalAiVenvPath, resolvePythonLaunch, resolveVenvPythonPath } from "./python-runtime.js";
|
||||
|
||||
/**
 * Runs a command to completion with its stdio attached to this process.
 *
 * @param command Executable to spawn.
 * @param args Arguments for the executable.
 * @param extraEnv Variables merged over `process.env` for the child.
 * @returns Resolves when the child exits with code 0.
 * @throws Error when the child exits with a non-zero code, is terminated by
 *   a signal, or cannot be spawned.
 */
async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise<void> {
  await new Promise<void>((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: "inherit",
      env: {
        ...process.env,
        ...extraEnv,
      },
    });

    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }

      // `code` is null when the child was killed by a signal; name the signal
      // instead of reporting "code null".
      const outcome =
        code !== null ? `exited with code ${code}` : `was terminated by signal ${signal ?? "unknown"}`;
      reject(new Error(`${command} ${args.join(" ")} ${outcome}`));
    });
    child.on("error", reject);
  });
}
|
||||
|
||||
/**
 * Makes sure `pip` is usable with the given interpreter.
 *
 * Probes `python -m pip --version`; when the probe exits non-zero or cannot
 * be spawned, bootstraps pip with `python -m ensurepip --upgrade`.
 *
 * @param pythonBin Interpreter to probe and, if needed, bootstrap.
 * @param env Environment for the probe; also passed on to the bootstrap run.
 * @throws Error when the `ensurepip` bootstrap itself fails.
 */
async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise<void> {
  const pipAvailable = await new Promise<boolean>((resolve) => {
    const probe = spawn(pythonBin, ["-m", "pip", "--version"], {
      stdio: "ignore",
      env,
    });
    // Whichever event arrives first decides the outcome; later calls to
    // `resolve` have no effect.
    probe.on("exit", (code) => resolve(code === 0));
    probe.on("error", () => resolve(false));
  });

  if (pipAvailable) {
    return;
  }

  await run(pythonBin, ["-m", "ensurepip", "--upgrade"], env);
}
|
||||
|
||||
/**
 * Sets up the local-AI Python environment:
 *
 * 1. creates the cache directory;
 * 2. creates the virtual environment when its interpreter is missing, using
 *    a Python found outside the venv;
 * 3. makes sure pip is available in the venv;
 * 4. upgrades pip/setuptools/wheel and installs `python/requirements.txt`;
 * 5. prints the next steps.
 *
 * Every Python invocation gets the cache location and UTF-8 I/O encoding
 * through environment variables.
 */
async function main(): Promise<void> {
  const config = loadConfig();

  const cachePath = resolveLocalAiCachePath(config);
  const venvPath = resolveLocalAiVenvPath(config);
  const venvPython = resolveVenvPythonPath(config);
  const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt");

  const baseEnv = {
    HF_HOME: cachePath,
    TRANSFORMERS_CACHE: cachePath,
    PYTHONIOENCODING: "utf-8",
  };

  await mkdir(cachePath, { recursive: true });

  const venvMissing = !existsSync(venvPython);
  if (venvMissing) {
    // The venv does not exist yet, so its interpreter must not be the one chosen.
    const { command, args } = resolvePythonLaunch(config, { preferVenv: false });
    console.log(`기본 Python 확인: ${command} ${args.join(" ")}`.trim());
    console.log(`가상환경 생성: ${venvPath}`);
    await run(command, [...args, "-m", "venv", venvPath], baseEnv);
  }

  await ensurePip(venvPython, { ...process.env, ...baseEnv });

  console.log("로컬 AI 의존성 설치를 시작합니다.");
  const pipInstall = ["-m", "pip", "install"];
  await run(venvPython, [...pipInstall, "--upgrade", "pip", "setuptools", "wheel"], baseEnv);
  await run(venvPython, [...pipInstall, "-r", requirementsPath], baseEnv);

  const closingLines = ["설치가 끝났습니다.", "다음 순서:", "1. bun run devices", "2. bun run start:local"];
  for (const closingLine of closingLines) {
    console.log(closingLine);
  }
}
|
||||
|
||||
// Script entry point: run the setup and, on failure, print the reason and
// exit with status 1.
void main().catch((error) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(reason);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user