Reset project to README only

This commit is contained in:
2026-05-01 23:14:23 +09:00
parent 53777be675
commit 10e0dd75db
33 changed files with 0 additions and 4155 deletions

View File

@@ -1,59 +0,0 @@
import { existsSync } from "node:fs";
import { spawnSync } from "node:child_process";
import process from "node:process";
import ffmpegStatic from "ffmpeg-static";
/**
 * Returns the first entry that is a non-empty string naming a path that
 * exists on disk, or `null` when no entry qualifies.
 *
 * @param paths Candidate paths in priority order; empty, `null` and
 *   `undefined` entries are skipped.
 */
function firstExisting(paths: Array<string | null | undefined>): string | null {
  const hit = paths.find(
    (entry) => typeof entry === "string" && entry.length > 0 && existsSync(entry),
  );
  return hit ?? null;
}
/**
 * Asks the platform's locator command (`where` on Windows, `which`
 * elsewhere) for an ffmpeg binary.
 *
 * @returns The first non-empty output line that names an existing file, or
 *   `null` when the locator exits with a non-zero status or prints no usable
 *   line.
 */
function findOnPath(): string | null {
  const onWindows = process.platform === "win32";
  const lookup = spawnSync(onWindows ? "where" : "which", [onWindows ? "ffmpeg.exe" : "ffmpeg"], {
    encoding: "utf8",
  });
  if (lookup.status !== 0) {
    return null;
  }
  for (const rawLine of lookup.stdout.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length > 0 && existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
/**
 * Resolves the ffmpeg binary to use.
 *
 * Candidates are tried in priority order: `FFMPEG_PATH`, `FFMPEG_BIN`, the
 * path exported by `ffmpeg-static`, and finally a lookup on the system PATH.
 *
 * @returns The first candidate that exists on disk, or `null` if none does.
 */
export function resolveFfmpegPath(): string | null {
  const staticPath = ffmpegStatic as unknown as string | null;
  // The PATH lookup spawns a subprocess synchronously, so it only runs when
  // none of the cheap candidates exists. findOnPath() already verifies that
  // the path it returns exists.
  return (
    firstExisting([process.env.FFMPEG_PATH, process.env.FFMPEG_BIN, staticPath]) ??
    findOnPath()
  );
}
/**
 * Same lookup as {@link resolveFfmpegPath}, but never returns `null`.
 *
 * @returns The resolved ffmpeg path.
 * @throws Error with multi-line installation hints when no ffmpeg binary
 *   could be resolved.
 */
export function requireFfmpegPath(): string {
  const ffmpegPath = resolveFfmpegPath();
  if (!ffmpegPath) {
    const hints = [
      "ffmpeg를 찾지 못했습니다.",
      "1. `bun install` 재실행",
      "2. 안 되면 `bun pm trust ffmpeg-static` 후 다시 `bun install`",
      "3. 또는 시스템 ffmpeg를 설치해서 PATH에 추가",
    ];
    throw new Error(hints.join("\n"));
  }
  return ffmpegPath;
}

View File

@@ -1,456 +0,0 @@
import { EventEmitter } from "node:events";
import prism from "prism-media";
import { RealTimeVAD } from "avr-vad";
import {
AudioPlayerStatus,
EndBehaviorType,
NoSubscriberBehavior,
VoiceConnectionStatus,
createAudioPlayer,
createAudioResource,
entersState,
joinVoiceChannel,
StreamType,
type AudioPlayer,
type AudioReceiveStream,
type VoiceConnection,
} from "@discordjs/voice";
import type { Client, Guild, VoiceBasedChannel } from "discord.js";
import type { AppConfig } from "../config.js";
import { Logger } from "../logger.js";
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import type { LlmService } from "../services/llm.js";
import type { SttService } from "../services/stt.js";
import type { PreparedSpeechAudio, TtsService } from "../services/tts.js";
/** Dependencies and context handed to `GuildVoiceSession.create`. */
interface GuildVoiceSessionOptions {
/** Discord client; used to look up users and text channels, and to ignore the bot's own speaking events. */
client: Client;
/** Application configuration (e.g. `MAX_CONVERSATION_TURNS`, `DEBUG_TEXT_EVENTS`). */
config: AppConfig;
logger: Logger;
/** Guild whose id and voice adapter are used to join the channel. */
guild: Guild;
/** Voice channel the session joins. */
voiceChannel: VoiceBasedChannel;
/** Optional text channel that receives status/debug announcements. */
textChannelId?: string;
/** Speech-to-text backend; fed 16-bit PCM produced from detected speech. */
stt: SttService;
/** Text-to-speech backend that prepares playable audio for replies. */
tts: TtsService;
/** Language model backend that generates the reply text. */
llm: LlmService;
}
/** One queued piece of text waiting to be synthesized and played. */
interface SpeechJob {
/** Text to synthesize. */
text: string;
/** "assistant" for LLM replies queued after an utterance, "manual" for text queued through `speakText`. */
source: "assistant" | "manual";
}
/**
 * Audio pipeline for a single speaker.
 *
 * Decoded PCM chunks are downsampled with `Stereo48kToMono16kDownsampler`,
 * cut into 1536-sample frames and fed one after another to a `RealTimeVAD`
 * instance. Speech start/end notifications from the VAD are forwarded to the
 * callbacks supplied to {@link UserAudioSession.create}.
 */
class UserAudioSession {
  private readonly downsampler = new Stereo48kToMono16kDownsampler();
  private readonly pendingSamples: number[] = [];
  private readonly vad: RealTimeVAD;
  /** Tail of the promise chain that serializes VAD frame processing. */
  private processing: Promise<void> = Promise.resolve();
  /** Set by destroy(); stops new frames from being queued or processed. */
  private destroyed = false;

  private constructor(
    private readonly logger: Logger,
    private readonly speakerId: string,
    private readonly speakerName: string,
    private readonly receiveStream: AudioReceiveStream,
    private readonly decoder: NodeJS.ReadWriteStream & { destroy: () => void },
    vad: RealTimeVAD,
    private readonly onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void,
  ) {
    this.vad = vad;
  }

  /**
   * Builds the VAD, wires the decoder output into it and returns the session.
   *
   * @param options Speaker identity, the already-piped receive stream and
   *   decoder, and the speech start/end callbacks. `onSpeechEnd` receives an
   *   utterance whose `text` is still empty, plus the captured audio.
   * @returns The ready session. Rejects if the VAD cannot be created.
   */
  static async create(options: {
    logger: Logger;
    speakerId: string;
    speakerName: string;
    receiveStream: AudioReceiveStream;
    decoder: NodeJS.ReadWriteStream & { destroy: () => void };
    onSpeechStart: () => void;
    onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void;
  }): Promise<UserAudioSession> {
    const vadInstance = await RealTimeVAD.new({
      model: "v5",
      sampleRate: 16000,
      frameSamples: 1536,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        options.onSpeechStart();
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        options.onSpeechEnd(
          {
            speakerId: options.speakerId,
            speakerName: options.speakerName,
            text: "",
          },
          audio,
        );
      },
    });
    const session = new UserAudioSession(
      options.logger,
      options.speakerId,
      options.speakerName,
      options.receiveStream,
      options.decoder,
      vadInstance,
      options.onSpeechEnd,
    );
    session.decoder.on("data", (chunk: Buffer) => {
      session.pushPcmChunk(chunk);
    });
    session.decoder.on("error", (error) => {
      options.logger.warn("PCM decoder error", options.speakerId, error);
    });
    session.receiveStream.on("error", (error) => {
      options.logger.warn("Audio receive stream error", options.speakerId, error);
    });
    return session;
  }

  /**
   * Downsamples one decoded chunk and queues every complete frame for VAD
   * processing. Chunks that arrive after destroy() are ignored.
   */
  private pushPcmChunk(chunk: Buffer): void {
    if (this.destroyed) {
      return;
    }
    const mono16k = this.downsampler.pushStereo48kChunk(chunk);
    if (mono16k.length === 0) {
      return;
    }
    for (const sample of mono16k) {
      this.pendingSamples.push(sample);
    }
    while (true) {
      const frame = takeFrame(this.pendingSamples, 1536);
      if (!frame) {
        return;
      }
      const floatFrame = int16ArrayToFloat32(frame);
      // Frames are chained so the VAD sees them strictly in order; a failure
      // on one frame is logged and does not break the chain.
      this.processing = this.processing
        .then(async () => {
          // Frames still queued when destroy() ran must not reach the VAD.
          if (this.destroyed) {
            return;
          }
          await this.vad.processAudio(floatFrame);
        })
        .catch((error) => {
          this.logger.warn("VAD frame processing failed", this.speakerId, this.speakerName, error);
        });
    }
  }

  /**
   * Tears down the streams and the VAD. Safe to call more than once.
   * The VAD is destroyed only after the in-flight frame (if any) has settled.
   */
  destroy(): void {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;
    this.receiveStream.destroy();
    this.decoder.destroy();
    this.pendingSamples.length = 0;
    void this.processing
      .then(() => this.vad.destroy())
      .catch((error) => {
        this.logger.warn("VAD destroy failed", this.speakerId, this.speakerName, error);
      });
  }
}
/**
 * Voice conversation session bound to one guild voice channel.
 *
 * The session joins the channel, tracks each speaking user through a
 * `UserAudioSession`, turns finished utterances into text (STT), asks the LLM
 * for a reply and plays the synthesized reply (TTS) through a shared audio
 * player. Speech detected from any tracked user interrupts current playback.
 */
export class GuildVoiceSession extends EventEmitter {
  readonly guildId: string;
  readonly voiceChannelId: string;
  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  private readonly memory: ConversationMemory;
  private readonly trackedUsers = new Map<string, UserAudioSession>();
  /** In-flight tracker creations, keyed by user id, to avoid duplicates. */
  private readonly pendingUsers = new Map<string, Promise<void>>();
  private readonly queue: SpeechJob[] = [];
  private draining = false;
  /** Set by destroy(); late asynchronous work checks it and backs out. */
  private destroyed = false;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  private textChannelId?: string;

  private constructor(private readonly options: GuildVoiceSessionOptions) {
    super();
    this.guildId = options.guild.id;
    this.voiceChannelId = options.voiceChannel.id;
    this.textChannelId = options.textChannelId;
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
    this.player = createAudioPlayer({
      behaviors: {
        noSubscriber: NoSubscriberBehavior.Pause,
      },
    });
    this.connection = joinVoiceChannel({
      guildId: options.guild.id,
      channelId: options.voiceChannel.id,
      adapterCreator: options.guild.voiceAdapterCreator,
      selfDeaf: false,
      selfMute: false,
    });
  }

  /**
   * Joins the voice channel and resolves once the connection is ready.
   *
   * @throws Whatever `initialize` rejects with (e.g. the connection did not
   *   become ready in time). The voice connection is torn down before the
   *   error is rethrown, so a failed join does not leave it behind.
   */
  static async create(options: GuildVoiceSessionOptions): Promise<GuildVoiceSession> {
    const session = new GuildVoiceSession(options);
    try {
      await session.initialize();
    } catch (error) {
      await session.destroy();
      throw error;
    }
    return session;
  }

  /** Wires player/connection listeners and waits up to 30 s for readiness. */
  private async initialize(): Promise<void> {
    this.player.on("error", (error) => {
      this.options.logger.warn("Audio player error", this.guildId, error);
    });
    this.connection.on("stateChange", (_oldState, newState) => {
      if (newState.status === VoiceConnectionStatus.Destroyed) {
        this.options.logger.info("Voice connection destroyed", this.guildId);
      }
    });
    this.connection.subscribe(this.player);
    await entersState(this.connection, VoiceConnectionStatus.Ready, 30_000);
    this.connection.receiver.speaking.on("start", (userId: string) => {
      // Never track the bot's own audio.
      if (userId === this.options.client.user?.id) {
        return;
      }
      // Fire-and-forget, but a failed tracker setup must not surface as an
      // unhandled promise rejection.
      void this.ensureTrackedUser(userId).catch((error) => {
        this.options.logger.warn("Failed to track speaker", this.guildId, userId, error);
      });
    });
  }

  /** Changes (or clears) the text channel used for announcements. */
  setTextChannel(textChannelId?: string): void {
    this.textChannelId = textChannelId;
  }

  /** Forgets the conversation history and stops any current or queued speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("conversation-reset");
  }

  /** Multi-line, human-readable snapshot of the session state. */
  statusSummary(): string {
    const playbackState = this.player.state.status;
    return [
      `세션 활성: 예`,
      `음성 채널: ${this.options.voiceChannel.name}`,
      `추적 유저 수: ${this.trackedUsers.size}`,
      `재생 상태: ${playbackState}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /**
   * Queues `text` for speech. If no drain loop is active, this call runs it
   * and resolves when the queue is empty; otherwise it returns immediately
   * and the active loop picks the job up.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued speech, aborts in-flight synthesis and stops the player.
   *
   * @param reason Free-form tag written to the log.
   */
  interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.player.state.status !== AudioPlayerStatus.Idle) {
      this.options.logger.info("Interrupting playback", this.guildId, reason);
    }
    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    this.player.stop(true);
  }

  /** Stops playback, releases every tracked speaker and leaves the channel. */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("session-destroy");
    for (const session of this.trackedUsers.values()) {
      session.destroy();
    }
    this.trackedUsers.clear();
    this.pendingUsers.clear();
    // Skip the call when the connection is already destroyed, so destroy()
    // can safely run more than once.
    if (this.connection.state.status !== VoiceConnectionStatus.Destroyed) {
      this.connection.destroy();
    }
  }

  /** Ensures exactly one `UserAudioSession` exists (or is being created) for the user. */
  private async ensureTrackedUser(userId: string): Promise<void> {
    if (this.destroyed || this.trackedUsers.has(userId)) {
      return;
    }
    const existing = this.pendingUsers.get(userId);
    if (existing) {
      await existing;
      return;
    }
    const pending = this.createTrackedUser(userId).finally(() => {
      this.pendingUsers.delete(userId);
    });
    this.pendingUsers.set(userId, pending);
    await pending;
  }

  /** Subscribes to the user's audio, decodes it to PCM and starts VAD tracking. */
  private async createTrackedUser(userId: string): Promise<void> {
    const speakerName = await this.resolveSpeakerName(userId);
    const receiveStream = this.connection.receiver.subscribe(userId, {
      end: {
        behavior: EndBehaviorType.Manual,
      },
    });
    const decoder = new prism.opus.Decoder({
      rate: 48000,
      channels: 2,
      frameSize: 960,
    }) as NodeJS.ReadWriteStream & { destroy: () => void };
    receiveStream.pipe(decoder);
    let session: UserAudioSession;
    try {
      session = await UserAudioSession.create({
        logger: this.options.logger,
        speakerId: userId,
        speakerName,
        receiveStream,
        decoder,
        onSpeechStart: () => {
          this.interruptPlayback(`barge-in:${speakerName}`);
        },
        onSpeechEnd: (utterance, audio) => {
          void this.handleSpeechEnd(utterance, audio).catch((error) => {
            this.options.logger.warn("Speech handling failed", this.guildId, utterance.speakerId, error);
          });
        },
      });
    } catch (error) {
      // Creation failed: release the streams opened above before rethrowing.
      receiveStream.destroy();
      decoder.destroy();
      throw error;
    }
    if (this.destroyed) {
      // The session was destroyed while the VAD was loading.
      session.destroy();
      return;
    }
    this.trackedUsers.set(userId, session);
    this.options.logger.info("Tracking speaker", this.guildId, userId, speakerName);
  }

  /** Display name for the user, or a `user-<last 6 id chars>` placeholder when the lookup fails. */
  private async resolveSpeakerName(userId: string): Promise<string> {
    try {
      const user = await this.options.client.users.fetch(userId);
      return user.globalName ?? user.username;
    } catch {
      return `user-${userId.slice(-6)}`;
    }
  }

  /**
   * Full turn for one finished utterance: STT, LLM reply, memory update and
   * queued playback. Audio shorter than 0.25 s (at 16 kHz) is ignored.
   */
  private async handleSpeechEnd(utterance: UserUtterance, audio: Float32Array): Promise<void> {
    if (this.destroyed || audio.length < 16000 * 0.25) {
      return;
    }
    const pcmBuffer = float32ToPcm16Buffer(audio);
    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(pcmBuffer);
    } catch (error) {
      this.options.logger.warn("STT failed", this.guildId, utterance.speakerId, error);
      await this.announce(`음성 인식 실패: ${utterance.speakerName}`);
      return;
    }
    if (!transcript || transcript.trim().length === 0) {
      return;
    }
    const hydratedUtterance: UserUtterance = {
      ...utterance,
      text: transcript.trim(),
    };
    this.options.logger.info("Transcript committed", this.guildId, hydratedUtterance.speakerName, hydratedUtterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🗣️ ${hydratedUtterance.speakerName}: ${hydratedUtterance.text}`);
    }
    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, hydratedUtterance);
    } catch (error) {
      this.options.logger.warn("LLM failed", this.guildId, utterance.speakerId, error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }
    this.memory.addUserTurn(hydratedUtterance);
    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🤖 ${reply}`);
    }
    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Plays queued jobs one at a time. Only one drain loop runs at once; a
   * call made while another loop is active returns immediately.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining) {
      return;
    }
    this.draining = true;
    try {
      while (this.queue.length > 0) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }
        const abortController = new AbortController();
        this.currentAbortController = abortController;
        let playback: PreparedSpeechAudio;
        try {
          playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          if (abortController.signal.aborted) {
            continue;
          }
          this.options.logger.warn("TTS synthesis failed", this.guildId, job.source, error);
          await this.announce("음성 출력 생성에 실패했습니다.");
          continue;
        }
        if (abortController.signal.aborted) {
          // Interrupted while synthesis was running: the result must not be
          // played, only released.
          playback.dispose();
          continue;
        }
        this.currentPlayback = playback;
        try {
          const resource = createAudioResource(playback.stream, {
            inputType: StreamType.Raw,
          });
          this.player.play(resource);
          // Not reaching Playing within 20 s is tolerated; the Idle wait
          // below bounds the total time spent on this job.
          await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);
          await entersState(this.player, AudioPlayerStatus.Idle, 300_000);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Audio playback failed", this.guildId, error);
          }
        } finally {
          this.currentPlayback?.dispose();
          this.currentPlayback = null;
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /** Best-effort message to the configured text channel; failures are ignored. */
  private async announce(message: string): Promise<void> {
    if (!this.textChannelId) {
      return;
    }
    const channel = await this.options.client.channels.fetch(this.textChannelId).catch(() => null);
    if (!channel?.isTextBased() || !("send" in channel) || typeof channel.send !== "function") {
      return;
    }
    await channel.send(message).catch(() => null);
  }
}

View File

@@ -1,710 +0,0 @@
import { spawn, type ChildProcess, type ChildProcessByStdio } from "node:child_process";
import { once } from "node:events";
import { promises as fs } from "node:fs";
import os from "node:os";
import path from "node:path";
import type { Readable, Writable } from "node:stream";
import { RealTimeVAD } from "avr-vad";
import type { AssistantRuntimeConfig } from "../config.js";
import { Logger } from "../logger.js";
import { requireFfmpegPath } from "./ffmpeg-path.js";
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
import type { LlmService } from "../services/llm.js";
import type { SttService } from "../services/stt.js";
import type { PreparedSpeechAudio, TtsService } from "../services/tts.js";
/** Dependencies handed to the `LocalVoiceSession` constructor. */
interface LocalVoiceSessionOptions {
/** Runtime configuration (audio source/sink names, speaker name, debug flags, turn limit). */
config: AssistantRuntimeConfig;
logger: Logger;
/** Speech-to-text backend; fed 16-bit PCM produced from detected speech. */
stt: SttService;
/** Text-to-speech backend that prepares playable audio for replies. */
tts: TtsService;
/** Language model backend that generates the reply text. */
llm: LlmService;
}
/** One queued piece of text waiting to be synthesized and played. */
interface SpeechJob {
/** Text to synthesize. */
text: string;
/** "assistant" for LLM replies queued after an utterance, "manual" for text queued through `speakText`. */
source: "assistant" | "manual";
}
/**
 * Voice conversation session that uses the local machine's microphone and
 * speakers instead of Discord.
 *
 * Input is raw 16 kHz mono s16le PCM read from a recorder child process
 * (`pw-record`, or ffmpeg/dshow on Windows). Speech is detected with
 * `RealTimeVAD` everywhere except Windows, where a simple peak-amplitude
 * detector is used. Finished utterances go through STT -> LLM -> TTS and the
 * result is played through `pw-play` (or PowerShell's SoundPlayer on
 * Windows). Detected speech interrupts current playback.
 */
export class LocalVoiceSession {
  private readonly memory: ConversationMemory;
  private readonly queue: SpeechJob[] = [];
  private readonly pendingSamples: number[] = [];
  /** Peak level a chunk must reach to count as "non-silent" for the input watchdog. */
  private readonly silenceThreshold = 900;
  // Parameters of the Windows amplitude-based detector (samples at 16 kHz).
  private readonly windowsFrameSamples = 320;
  private readonly windowsPreRollSamples = 3_200;
  private readonly windowsSpeechStartThreshold = 520;
  private readonly windowsSpeechContinueThreshold = 260;
  private readonly windowsSpeechStartFrames = 2;
  private readonly windowsSpeechEndFrames = 18;
  private readonly windowsMinSpeechSamples = 7_200;
  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  private currentPlayer: ChildProcess | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Tail of the promise chain that serializes VAD frame processing. */
  private processing: Promise<void> = Promise.resolve();
  private draining = false;
  private destroyed = false;
  private inputWatchdog: NodeJS.Timeout | null = null;
  private recorderStartedAt = 0;
  private lastPcmChunkAt = 0;
  private lastNonSilentAudioAt = 0;
  private warnedNoPcm = false;
  private warnedSilent = false;
  /** Trailing odd byte of the previous recorder chunk, kept to preserve sample alignment. */
  private pcmCarry: Buffer | null = null;
  private windowsSpeechBuffer: number[] = [];
  private windowsPreRollBuffer: number[] = [];
  private windowsSpeechActive = false;
  private windowsSpeechCandidateFrames = 0;
  private windowsSilenceFrames = 0;

  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }

  /**
   * Creates the speech detector, spawns the recorder process and starts the
   * input health watchdog.
   *
   * @throws Error on Windows when `LOCAL_AUDIO_SOURCE` is not configured or
   *   ffmpeg cannot be resolved.
   */
  async start(): Promise<void> {
    if (process.platform !== "win32") {
      this.vad = await RealTimeVAD.new({
        model: "v5",
        sampleRate: 16000,
        frameSamples: 1536,
        positiveSpeechThreshold: 0.55,
        negativeSpeechThreshold: 0.35,
        redemptionFrames: 8,
        preSpeechPadFrames: 2,
        minSpeechFrames: 3,
        onFrameProcessed: () => undefined,
        onVADMisfire: () => undefined,
        onSpeechStart: () => {
          this.interruptPlayback("local-barge-in");
        },
        onSpeechRealStart: () => undefined,
        onSpeechEnd: (audio: Float32Array) => {
          void this.handleSpeechEnd(audio).catch((error) => {
            this.options.logger.warn("Local speech handling failed", error);
          });
        },
      });
    } else {
      this.options.logger.info("Windows local mode uses amplitude-based speech detection");
    }
    this.recorder = this.spawnRecorder();
    this.recorderStartedAt = Date.now();
    this.lastPcmChunkAt = 0;
    this.lastNonSilentAudioAt = 0;
    this.warnedNoPcm = false;
    this.warnedSilent = false;
    this.pcmCarry = null;
    // A failed spawn (e.g. the binary is missing) is reported through the
    // 'error' event; without a listener Node would throw it as an uncaught
    // exception.
    this.recorder.on("error", (error) => {
      this.options.logger.warn("Local recorder process error", error);
    });
    this.recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });
    this.recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
    this.recorder.on("exit", (code, signal) => {
      if (!this.destroyed) {
        this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
      }
    });
    this.inputWatchdog = setInterval(() => {
      this.reportInputHealth();
    }, 3_000);
  }

  /** Stops playback, the watchdog, the recorder process and the VAD. */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");
    if (this.inputWatchdog) {
      clearInterval(this.inputWatchdog);
      this.inputWatchdog = null;
    }
    const recorder = this.recorder;
    // Only wait for 'exit' when the process is still running; a recorder
    // that already exited (or never spawned) will not emit it again, and
    // waiting would hang forever.
    if (recorder && recorder.exitCode === null && recorder.signalCode === null) {
      const exited = once(recorder, "exit").catch(() => null);
      recorder.kill("SIGTERM");
      await exited;
    }
    if (this.vad) {
      await this.vad.destroy().catch((error) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
      this.vad = null;
    }
  }

  /** Forgets the conversation history and stops any current or queued speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }

  /**
   * Queues `text` for speech. If no drain loop is active, this call runs it
   * and resolves when the queue is empty; otherwise it returns immediately.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /** Multi-line, human-readable snapshot of the session state. */
  statusSummary(): string {
    return [
      "모드: local",
      `플랫폼: ${process.platform}`,
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.describeSink()}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /** Spawns the platform's recorder; stdout carries raw 16 kHz mono s16 PCM. */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    if (process.platform === "win32") {
      return this.spawnWindowsRecorder();
    }
    const args = [
      "--rate",
      "16000",
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }
    args.push("-");
    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });
    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /** Windows recorder: ffmpeg reading a DirectShow audio device. */
  private spawnWindowsRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const ffmpegPath = this.getFfmpegPath();
    const sourceName = this.options.config.LOCAL_AUDIO_SOURCE;
    if (!sourceName) {
      throw new Error("Windows 로컬 모드는 LOCAL_AUDIO_SOURCE 설정이 필요합니다. `bun run audio:devices` 로 이름을 확인해 주세요.");
    }
    const args = [
      "-hide_banner",
      "-loglevel",
      "warning",
      "-f",
      "dshow",
      "-i",
      `audio=${sourceName}`,
      "-ac",
      "1",
      "-ar",
      "16000",
      "-f",
      "s16le",
      "pipe:1",
    ];
    this.options.logger.info("Starting local recorder", {
      source: sourceName,
      backend: "ffmpeg-dshow",
    });
    return spawn(ffmpegPath, args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /**
   * Accepts one raw PCM chunk from the recorder, updates the watchdog
   * timestamps and feeds complete frames to the active speech detector.
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed) {
      return;
    }
    this.lastPcmChunkAt = Date.now();
    // Pipe reads can split anywhere, including in the middle of a 16-bit
    // sample. Carry a trailing odd byte over to the next chunk; dropping it
    // would misalign every following sample.
    let data = chunk;
    if (this.pcmCarry) {
      data = Buffer.concat([this.pcmCarry, chunk]);
      this.pcmCarry = null;
    }
    const usableBytes = data.length - (data.length % 2);
    if (usableBytes < data.length) {
      this.pcmCarry = Buffer.from(data.subarray(usableBytes));
    }
    let peak = 0;
    for (let offset = 0; offset < usableBytes; offset += 2) {
      const sample = data.readInt16LE(offset);
      const abs = Math.abs(sample);
      if (abs > peak) {
        peak = abs;
      }
      this.pendingSamples.push(sample);
    }
    if (peak >= this.silenceThreshold) {
      this.lastNonSilentAudioAt = Date.now();
    }
    if (process.platform === "win32") {
      this.processWindowsSpeechFrames();
      return;
    }
    if (!this.vad) {
      return;
    }
    while (true) {
      const frame = takeFrame(this.pendingSamples, 1536);
      if (!frame) {
        return;
      }
      const floatFrame = int16ArrayToFloat32(frame);
      // Frames are chained so the VAD sees them strictly in order.
      this.processing = this.processing
        .then(async () => {
          await this.vad?.processAudio(floatFrame);
        })
        .catch((error) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }

  /**
   * Amplitude-based speech detector used on Windows.
   *
   * Speech starts after `windowsSpeechStartFrames` consecutive frames whose
   * peak reaches the start threshold, and ends after
   * `windowsSpeechEndFrames` consecutive frames below the continue
   * threshold. A capped pre-roll buffer keeps the audio just before the
   * start so the beginning of the utterance is not cut off.
   */
  private processWindowsSpeechFrames(): void {
    while (true) {
      const frame = takeFrame(this.pendingSamples, this.windowsFrameSamples);
      if (!frame) {
        return;
      }
      let peak = 0;
      for (const sample of frame) {
        const abs = Math.abs(sample);
        if (abs > peak) {
          peak = abs;
        }
      }
      if (!this.windowsSpeechActive) {
        this.appendWithCap(this.windowsPreRollBuffer, frame, this.windowsPreRollSamples);
        if (peak >= this.windowsSpeechStartThreshold) {
          this.windowsSpeechCandidateFrames += 1;
        } else {
          this.windowsSpeechCandidateFrames = 0;
        }
        if (this.windowsSpeechCandidateFrames < this.windowsSpeechStartFrames) {
          continue;
        }
        this.windowsSpeechActive = true;
        this.windowsSilenceFrames = 0;
        // The pre-roll already ends with the current frame, so it is adopted
        // as-is; appending the frame again would duplicate it.
        this.windowsSpeechBuffer = this.windowsPreRollBuffer;
        this.windowsPreRollBuffer = [];
        this.interruptPlayback("local-barge-in");
        this.options.logger.debug("Windows speech start detected", { peak });
      } else {
        this.windowsSpeechBuffer.push(...frame);
      }
      if (peak >= this.windowsSpeechContinueThreshold) {
        this.windowsSilenceFrames = 0;
      } else {
        this.windowsSilenceFrames += 1;
      }
      if (this.windowsSilenceFrames < this.windowsSpeechEndFrames) {
        continue;
      }
      const speech = Int16Array.from(this.windowsSpeechBuffer);
      this.windowsSpeechActive = false;
      this.windowsSpeechBuffer = [];
      this.windowsSilenceFrames = 0;
      this.windowsSpeechCandidateFrames = 0;
      if (speech.length < this.windowsMinSpeechSamples) {
        this.options.logger.debug("Ignored short Windows speech segment", { samples: speech.length });
        continue;
      }
      this.options.logger.debug("Windows speech end detected", { samples: speech.length });
      void this.handleSpeechEnd(int16ArrayToFloat32(speech)).catch((error) => {
        this.options.logger.warn("Local speech handling failed", error);
      });
    }
  }

  /**
   * Full turn for one finished utterance: STT, LLM reply, memory update and
   * queued playback. Audio shorter than 0.25 s (at 16 kHz) is ignored.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    if (audio.length < 16000 * 0.25) {
      this.options.logger.debug("Ignored short local speech segment", { samples: audio.length });
      return;
    }
    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text: "",
    };
    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }
    if (!transcript || transcript.trim().length === 0) {
      this.options.logger.info("Local STT returned empty transcript");
      return;
    }
    utterance.text = transcript.trim();
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }
    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }
    this.memory.addUserTurn(utterance);
    this.memory.addAssistantTurn(reply);
    this.options.logger.info("Local reply", reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }
    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued speech, aborts in-flight synthesis and kills the
   * current player process.
   *
   * @param reason Free-form tag written to the log.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }
    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }

  /**
   * Plays queued jobs one at a time. Only one drain loop runs at once; a
   * call made while another loop is active returns immediately.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }
    this.draining = true;
    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }
        const abortController = new AbortController();
        this.currentAbortController = abortController;
        let playback: PreparedSpeechAudio;
        try {
          playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local TTS synthesis failed", error);
          }
          continue;
        }
        if (abortController.signal.aborted) {
          // Interrupted while synthesis was running: the result must not be
          // played, only released.
          playback.dispose();
          continue;
        }
        this.currentPlayback = playback;
        try {
          await this.playToSink(playback, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local playback failed", error);
          }
        } finally {
          this.currentPlayback?.dispose();
          this.currentPlayback = null;
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /**
   * Streams prepared audio (48 kHz stereo s16) into `pw-play`, or delegates
   * to the Windows player.
   *
   * @throws Error when the player exits with a non-zero code and the
   *   playback was not aborted.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    // An 'abort' listener registered after the abort would never fire.
    if (signal.aborted) {
      return;
    }
    if (process.platform === "win32") {
      await this.playToWindowsDefaultSink(playback, signal);
      return;
    }
    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];
    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }
    args.push("-");
    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;
    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-play]", text);
      }
    });
    // Killing the player while audio is still being piped makes stdin emit
    // an error (EPIPE); without a listener that would crash the process.
    player.stdin.on("error", (error) => {
      this.options.logger.debug("[pw-play] stdin error", error);
    });
    const onAbort = (): void => {
      playback.stream.destroy();
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };
    signal.addEventListener("abort", onAbort, { once: true });
    try {
      playback.stream.pipe(player.stdin);
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];
      if (signal.aborted) {
        return;
      }
      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }

  /**
   * Windows playback. Uses the playback's source file when it has one;
   * otherwise buffers the PCM stream, wraps it in a WAV header, writes it to
   * a temp file and plays that file. The temp file is always removed.
   */
  private async playToWindowsDefaultSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    if (signal.aborted) {
      return;
    }
    if (playback.sourceFilePath) {
      await this.playWindowsWaveFile(playback.sourceFilePath, signal);
      return;
    }
    const chunks: Buffer[] = [];
    await new Promise<void>((resolve, reject) => {
      playback.stream.on("data", (chunk: Buffer) => {
        chunks.push(Buffer.from(chunk));
      });
      playback.stream.once("end", resolve);
      playback.stream.once("error", reject);
      signal.addEventListener(
        "abort",
        () => {
          playback.stream.destroy();
          reject(new Error("playback aborted"));
        },
        { once: true },
      );
    }).catch((error) => {
      if (signal.aborted) {
        return;
      }
      throw error;
    });
    if (signal.aborted) {
      return;
    }
    const pcm = Buffer.concat(chunks);
    const wav = createWaveFileBuffer(pcm, 48000, 2, 16);
    const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-${Date.now()}.wav`);
    await fs.writeFile(tempPath, wav);
    try {
      await this.playWindowsWaveFile(tempPath, signal);
    } finally {
      await fs.unlink(tempPath).catch(() => null);
    }
  }

  /**
   * Plays a WAV file synchronously through PowerShell's
   * `System.Media.SoundPlayer`.
   *
   * @throws Error when PowerShell exits with a non-zero code and the
   *   playback was not aborted.
   */
  private async playWindowsWaveFile(filePath: string, signal: AbortSignal): Promise<void> {
    if (signal.aborted) {
      return;
    }
    const psScript = [
      "Add-Type -AssemblyName System;",
      // Single quotes are doubled so the path stays one PowerShell string literal.
      `$player = New-Object System.Media.SoundPlayer('${filePath.replace(/'/g, "''")}');`,
      "$player.PlaySync();",
    ].join(" ");
    const player = spawn("powershell", ["-NoProfile", "-Command", psScript], {
      stdio: ["ignore", "ignore", "pipe"],
    });
    this.currentPlayer = player;
    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[powershell-player]", text);
      }
    });
    const onAbort = (): void => {
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };
    signal.addEventListener("abort", onAbort, { once: true });
    try {
      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];
      if (signal.aborted) {
        return;
      }
      if (code !== 0) {
        throw new Error(`powershell playback exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }

  private getFfmpegPath(): string {
    return requireFfmpegPath();
  }

  /**
   * Watchdog tick: warns once if no PCM has arrived, or once if only silence
   * has arrived, 6 s or more after the recorder was started.
   */
  private reportInputHealth(): void {
    if (this.destroyed) {
      return;
    }
    const now = Date.now();
    if (!this.warnedNoPcm && this.lastPcmChunkAt === 0 && now - this.recorderStartedAt >= 6_000) {
      this.warnedNoPcm = true;
      this.options.logger.warn(
        [
          "입력 장치에서 PCM 데이터가 들어오지 않습니다.",
          `현재 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
          "Windows에서는 마이크 입력이 아니라 SPDIF/ADAT 같은 디지털 입력을 고르면 반응이 없습니다.",
          "`bun run devices`로 실제 마이크 이름을 다시 고르세요.",
        ].join("\n"),
      );
      return;
    }
    if (!this.warnedSilent && this.lastPcmChunkAt > 0 && this.lastNonSilentAudioAt === 0 && now - this.recorderStartedAt >= 6_000) {
      this.warnedSilent = true;
      this.options.logger.warn(
        [
          "입력 장치에서는 데이터가 오지만 말소리 수준으로 올라오지 않습니다.",
          `현재 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
          "잘못된 입력 채널이거나, 마이크가 그 장치로 라우팅되지 않은 상태일 가능성이 큽니다.",
          "RME Babyface Pro라면 SPDIF/ADAT 대신 아날로그 마이크 입력 채널 이름을 선택해야 합니다.",
        ].join("\n"),
      );
    }
  }

  /** Label of the configured output sink for the status summary. */
  private describeSink(): string {
    if (process.platform === "win32") {
      return this.options.config.LOCAL_AUDIO_SINK ?? "system-default";
    }
    return this.options.config.LOCAL_AUDIO_SINK ?? "default";
  }

  /** Appends `samples` to `target` and trims the oldest entries beyond `cap`. */
  private appendWithCap(target: number[], samples: Int16Array, cap: number): void {
    target.push(...samples);
    if (target.length > cap) {
      target.splice(0, target.length - cap);
    }
  }
}
/**
 * Prepends a 44-byte RIFF/WAVE header (format tag 1, PCM) to raw sample data.
 *
 * @param pcm Raw sample bytes, copied verbatim after the header.
 * @param sampleRate Samples per second written to the header.
 * @param channels Channel count written to the header.
 * @param bitsPerSample Bit depth written to the header.
 * @returns A new buffer holding the header followed by `pcm`.
 */
function createWaveFileBuffer(
  pcm: Buffer,
  sampleRate: number,
  channels: number,
  bitsPerSample: number,
): Buffer {
  const bytesPerSample = bitsPerSample / 8;
  const byteRate = sampleRate * channels * bytesPerSample;
  const blockAlign = channels * bytesPerSample;

  const header = Buffer.alloc(44);
  let cursor = 0;
  const putTag = (tag: string): void => {
    header.write(tag, cursor, 4, "ascii");
    cursor += 4;
  };
  const putU32 = (value: number): void => {
    cursor = header.writeUInt32LE(value, cursor);
  };
  const putU16 = (value: number): void => {
    cursor = header.writeUInt16LE(value, cursor);
  };

  putTag("RIFF");
  putU32(36 + pcm.length); // RIFF chunk size: everything after this field
  putTag("WAVE");
  putTag("fmt ");
  putU32(16); // size of the fmt chunk body
  putU16(1); // format tag 1 = PCM
  putU16(channels);
  putU32(sampleRate);
  putU32(byteRate);
  putU16(blockAlign);
  putU16(bitsPerSample);
  putTag("data");
  putU32(pcm.length);

  return Buffer.concat([header, pcm]);
}

View File

@@ -1,60 +0,0 @@
/**
 * Streaming converter from interleaved stereo s16le PCM to mono at one third
 * of the input sample rate.
 *
 * Each stereo pair is averaged into one mono sample, then every three
 * consecutive mono samples are averaged into one output sample. Mono samples
 * that do not yet fill a group of three are kept for the next call.
 */
export class Stereo48kToMono16kDownsampler {
  private readonly pendingMono48k: number[] = [];

  /**
   * @param chunk Interleaved stereo s16le bytes; a trailing partial stereo
   *   frame (fewer than 4 bytes) is ignored.
   * @returns The output samples completed by this chunk (possibly empty).
   */
  pushStereo48kChunk(chunk: Buffer): Int16Array {
    if (chunk.length < 4) {
      return new Int16Array();
    }
    const stereoFrames = Math.floor(chunk.length / 4);
    for (let frame = 0; frame < stereoFrames; frame += 1) {
      const base = frame * 4;
      const mixed = (chunk.readInt16LE(base) + chunk.readInt16LE(base + 2)) / 2;
      this.pendingMono48k.push(Math.round(mixed));
    }
    const groups = Math.floor(this.pendingMono48k.length / 3);
    if (groups === 0) {
      return new Int16Array();
    }
    // Take the complete groups out of the backlog in one go.
    const consumed = this.pendingMono48k.splice(0, groups * 3);
    const output = new Int16Array(groups);
    for (let group = 0; group < groups; group += 1) {
      const start = group * 3;
      output[group] = Math.round((consumed[start] + consumed[start + 1] + consumed[start + 2]) / 3);
    }
    return output;
  }
}
/**
 * Converts signed 16-bit samples to floats by dividing each by 32768, so the
 * result lies in [-1, 1).
 */
export function int16ArrayToFloat32(input: Int16Array): Float32Array {
  return Float32Array.from(input, (sample) => sample / 32768);
}
/**
 * Encodes float samples as little-endian signed 16-bit PCM.
 *
 * Values are clamped to [-1, 1]; negative values scale by 32768 and
 * non-negative values by 32767, so both ends of the int16 range are reachable
 * without overflow.
 *
 * @returns A buffer of `input.length * 2` bytes.
 */
export function float32ToPcm16Buffer(input: Float32Array): Buffer {
  const encoded = Buffer.allocUnsafe(input.length * 2);
  input.forEach((sample, position) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    const fullScale = clamped < 0 ? 32768 : 32767;
    encoded.writeInt16LE(Math.round(clamped * fullScale), position * 2);
  });
  return encoded;
}
/**
 * Removes the first `frameSize` values from `source` and returns them as an
 * `Int16Array`.
 *
 * @returns The frame, or `null` (leaving `source` untouched) when fewer than
 *   `frameSize` values are available.
 */
export function takeFrame(source: number[], frameSize: number): Int16Array | null {
  if (frameSize > source.length) {
    return null;
  }
  return new Int16Array(source.splice(0, frameSize));
}

View File

@@ -1,78 +0,0 @@
import { config as loadDotenv } from "dotenv";
import { z } from "zod";
// Run dotenv's `config()` at import time, before the schema below reads `process.env`.
loadDotenv();
/**
 * Optional non-empty string schema that treats blank values as "not set".
 *
 * String inputs are trimmed first; a string that is empty after trimming is
 * turned into `undefined` so it passes the `.optional()` check instead of
 * failing `.min(1)`. Non-string inputs are forwarded unchanged.
 */
const emptyToUndefined = z.preprocess(
  (raw) => {
    if (typeof raw === "string") {
      const stripped = raw.trim();
      return stripped.length > 0 ? stripped : undefined;
    }
    return raw;
  },
  z.string().min(1).optional(),
);
// Validation schema for the environment variables this application reads.
// Fields built with `.default(...)` fall back to that value when the variable
// is absent; fields using `emptyToUndefined` are optional and treat blank
// strings as unset.
const envSchema = z.object({
// Discord settings (optional at this level).
DISCORD_BOT_TOKEN: emptyToUndefined,
DISCORD_APPLICATION_ID: emptyToUndefined,
DISCORD_COMMAND_GUILD_ID: emptyToUndefined,
// Ollama settings.
OLLAMA_BASE_URL: z.string().min(1).default("http://127.0.0.1:11434"),
OLLAMA_MODEL: z.string().min(1).default("qwen3:0.6b"),
OLLAMA_KEEP_ALIVE: z.string().min(1).default("5m"),
OLLAMA_NUM_CTX: z.coerce.number().int().min(512).max(32768).default(4096),
// Local AI runtime locations.
LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"),
LOCAL_AI_CACHE_DIR: z.string().min(1).default(".local-ai/cache"),
LOCAL_AI_PYTHON: emptyToUndefined,
// Speech-to-text settings.
LOCAL_STT_MODEL: z.string().min(1).default("small"),
LOCAL_STT_DEVICE: z.string().min(1).default("auto"),
LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"),
LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(3),
// Text-to-speech settings.
LOCAL_TTS_MODEL_PATH: z.string().min(1).default(".local-ai/models/kokoro-v1.0.onnx"),
LOCAL_TTS_VOICES_PATH: z.string().min(1).default(".local-ai/models/voices-v1.0.bin"),
LOCAL_TTS_ENGINE: z.enum(["auto", "windows-media", "system", "kokoro"]).default("auto"),
LOCAL_TTS_VOICE_NAME: emptyToUndefined,
LOCAL_TTS_LANGUAGE: z.string().min(1).default("ko"),
LOCAL_TTS_SPEAKER: z.string().min(1).default("af_heart"),
LOCAL_TTS_DEVICE: z.string().min(1).default("auto"),
LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12),
// Assistant behaviour.
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
// Local-mode audio settings.
LOCAL_AUDIO_SOURCE: emptyToUndefined,
LOCAL_AUDIO_SINK: emptyToUndefined,
LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
// Parsed to a boolean: true only when the variable is exactly "true".
DEBUG_TEXT_EVENTS: z
.string()
.optional()
.transform((value) => value === "true"),
LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});
/** Parsed configuration, inferred from `envSchema`. */
export type AppConfig = z.infer<typeof envSchema>;
/** Configuration required by the assistant runtime; currently identical to `AppConfig`. */
export type AssistantRuntimeConfig = AppConfig;
/** Assistant configuration in which the Discord token and application id are guaranteed to be strings. */
export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
DISCORD_BOT_TOKEN: string;
DISCORD_APPLICATION_ID: string;
};
/**
 * Validates `process.env` against `envSchema`.
 *
 * @returns The parsed configuration with defaults applied.
 * @throws The error raised by `envSchema.parse` when validation fails.
 */
export function loadConfig(): AppConfig {
  const validated = envSchema.parse(process.env);
  return validated;
}
/**
 * Returns `value` when it is a non-empty string.
 *
 * @param value Value to check.
 * @param name Environment variable name used in the error message.
 * @throws Error when `value` is `undefined` or empty.
 */
function requirePresent(value: string | undefined, name: string): string {
  if (value) {
    return value;
  }
  throw new Error(`${name} 환경변수가 필요합니다.`);
}
/**
 * Narrows an `AppConfig` to an `AssistantRuntimeConfig`.
 *
 * The two types are currently aliases, so the same object is returned
 * unchanged and no validation is performed.
 */
export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
  const assistantConfig: AssistantRuntimeConfig = config;
  return assistantConfig;
}
/**
 * Builds the configuration needed to run the Discord bot.
 *
 * @param config Parsed application configuration.
 * @returns A copy of the configuration in which `DISCORD_BOT_TOKEN` and
 *   `DISCORD_APPLICATION_ID` are guaranteed to be present.
 * @throws Error when either Discord value is missing.
 */
export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
  const base = requireAssistantRuntimeConfig(config);
  const token = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
  const applicationId = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");
  return {
    ...base,
    DISCORD_BOT_TOKEN: token,
    DISCORD_APPLICATION_ID: applicationId,
  };
}

View File

@@ -1,238 +0,0 @@
import process from "node:process";
import {
GatewayIntentBits,
REST,
Routes,
SlashCommandBuilder,
type ChatInputCommandInteraction,
type Client,
type GuildMember,
type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import { type DiscordRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { OllamaLlmService } from "./services/ollama-llm.js";
import { createTtsService } from "./services/create-tts-service.js";
/**
 * Starts the Discord voice-assistant bot.
 *
 * Creates the Discord client and the STT/TTS/LLM services, warms up STT and
 * TTS, registers the slash commands (`join`, `leave`, `status`, `reset`,
 * `say`) once the client is ready, and dispatches incoming chat-input
 * interactions to their handlers. At most one voice session is kept per
 * guild. SIGINT/SIGTERM tear everything down and exit the process.
 *
 * @param config Runtime configuration including the Discord credentials.
 * @param logger Logger used for lifecycle and error messages.
 * @returns A promise that resolves once `client.login` has completed.
 */
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
    new SlashCommandBuilder()
      .setName("say")
      .setDescription("텍스트를 바로 음성으로 읽습니다.")
      .addStringOption((option) =>
        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
      ),
  ].map((command) => command.toJSON());

  const client = new DiscordClient({
    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
  });
  const stt = new LocalFasterWhisperSttService(config, logger);
  const tts = createTtsService(config, logger);
  const llm = new OllamaLlmService(config);
  /** Active voice sessions keyed by guild id. */
  const sessions = new Map<string, GuildVoiceSession>();

  // NOTE(review): the LLM is not warmed up here, so an unreachable Ollama
  // server only surfaces on the first reply — confirm this is intentional.
  await stt.warmup();
  await tts.warmup?.();

  /** Returns the voice channel the invoking member is currently in, if any. */
  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    const member = interaction.member as GuildMember | null;
    return member?.voice.channel ?? null;
  }

  /** Registers the slash commands for one guild when configured, otherwise globally. */
  async function registerCommands(_appClient: Client): Promise<void> {
    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
    if (config.DISCORD_COMMAND_GUILD_ID) {
      await rest.put(
        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
        {
          body: commands,
        },
      );
      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
      return;
    }
    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
      body: commands,
    });
    logger.info("Registered global commands");
  }

  /**
   * Returns the guild's session for the caller's voice channel, creating it
   * (and replacing a session bound to a different channel) when necessary.
   */
  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
    const guild = interaction.guild;
    if (!guild) {
      throw new Error("Guild interaction required");
    }
    const voiceChannel = getVoiceChannel(interaction);
    if (!voiceChannel) {
      throw new Error("먼저 음성 채널에 들어가 주세요.");
    }
    const existing = sessions.get(guild.id);
    if (existing && existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }
    if (existing) {
      // Unregister before destroying so a failing destroy() cannot leave a
      // dead session in the map for later commands to pick up.
      sessions.delete(guild.id);
      await existing.destroy();
    }
    const session = await GuildVoiceSession.create({
      client,
      config,
      logger,
      guild,
      voiceChannel,
      textChannelId: interaction.channelId,
      stt,
      tts,
      llm,
    });
    sessions.set(guild.id, session);
    return session;
  }

  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });
    try {
      const session = await createSession(interaction);
      // The channel name is taken from the second line of the status summary.
      await interaction.editReply(
        `음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`,
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
      await interaction.editReply(message);
    }
  }

  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
    const guildId = interaction.guild?.id;
    const session = guildId ? sessions.get(guildId) : undefined;
    if (!guildId || !session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    // Remove first: even if destroy() throws, the guild must not keep
    // pointing at a half-destroyed session.
    sessions.delete(guildId);
    await session.destroy();
    await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
  }

  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    await interaction.reply({
      content: session.statusSummary(),
      ephemeral: true,
    });
  }

  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
  }

  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });
    const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
    if (!session) {
      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
      return;
    }
    const text = interaction.options.getString("text", true).trim();
    await session.speakText(text);
    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  }

  // Shared by all shutdown requests so repeated signals do not run the
  // teardown twice concurrently.
  let shutdownInFlight: Promise<void> | null = null;

  /** Destroys all sessions and services, then exits the process. */
  function shutdown(exitCode = 0): Promise<void> {
    if (shutdownInFlight === null) {
      shutdownInFlight = (async () => {
        let failed = false;
        try {
          logger.info("Shutting down");
          for (const session of sessions.values()) {
            await session.destroy().catch((error) => {
              logger.warn("Session shutdown failed", error);
            });
          }
          sessions.clear();
          await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]);
          await client.destroy();
        } catch (error) {
          // Without this the rejection would be unhandled and the process
          // would never reach process.exit.
          failed = true;
          logger.error("Shutdown failed", error);
        }
        process.exit(failed && exitCode === 0 ? 1 : exitCode);
      })();
    }
    return shutdownInFlight;
  }

  client.once("ready", async () => {
    logger.info("Discord client ready", client.user?.tag ?? "unknown");
    try {
      await registerCommands(client);
    } catch (error) {
      // Registration failure is logged but does not stop the bot.
      logger.error("Command registration failed", error);
    }
  });

  client.on("interactionCreate", async (interaction) => {
    if (!interaction.isChatInputCommand()) {
      return;
    }
    try {
      switch (interaction.commandName) {
        case "join":
          await handleJoin(interaction);
          return;
        case "leave":
          await handleLeave(interaction);
          return;
        case "status":
          await handleStatus(interaction);
          return;
        case "reset":
          await handleReset(interaction);
          return;
        case "say":
          await handleSay(interaction);
          return;
        default:
          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
      }
    } catch (error) {
      logger.error("Interaction handler failed", error);
      // An interaction that was already acknowledged must be edited, not replied to again.
      if (interaction.deferred || interaction.replied) {
        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
        return;
      }
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  });

  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await client.login(config.DISCORD_BOT_TOKEN);
}

View File

@@ -1,52 +0,0 @@
import process from "node:process";
import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
import { runDiscordBot } from "./discord-main.js";
import { Logger } from "./logger.js";
import {
dumpLocalTtsWave,
printLocalAudioDevices,
printLocalTtsVoices,
runLocalAssistant,
runLocalTtsSmokeTest,
} from "./local-main.js";
// Run mode is the first CLI argument; defaults to "discord" when omitted.
const mode = process.argv[2] ?? "discord";
// Configuration is loaded at import time, before `main` runs.
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);
/**
 * Runs the entry point selected by `mode`.
 *
 * For the `local-say*` modes the text to speak is everything after the mode
 * on the command line, with a built-in sample sentence as fallback.
 *
 * @throws Error when `mode` is not one of the supported modes.
 */
async function main(): Promise<void> {
  // Remaining CLI arguments joined into one sentence, or the fallback when empty.
  const cliText = (fallback: string): string => process.argv.slice(3).join(" ").trim() || fallback;

  const handlers = new Map<string, () => Promise<void>>([
    ["discord", () => runDiscordBot(requireDiscordRuntimeConfig(config), logger)],
    ["local", () => runLocalAssistant(requireAssistantRuntimeConfig(config), logger)],
    ["local-devices", () => printLocalAudioDevices()],
    [
      "local-say",
      () =>
        runLocalTtsSmokeTest(
          requireAssistantRuntimeConfig(config),
          logger,
          cliText("안녕하세요. TTS 단독 재생 테스트입니다."),
        ),
    ],
    [
      "local-say-dump",
      () =>
        dumpLocalTtsWave(
          requireAssistantRuntimeConfig(config),
          logger,
          cliText("안녕하세요. TTS WAV 파일 테스트입니다."),
        ),
    ],
    ["local-tts-voices", () => printLocalTtsVoices(requireAssistantRuntimeConfig(config))],
  ]);

  const handler = handlers.get(mode);
  if (!handler) {
    throw new Error(
      `알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices, local-say, local-say-dump, local-tts-voices`,
    );
  }
  await handler();
}
// Start the selected mode; any error that escapes `main` is logged and the
// process exits with status 1.
void main().catch((error) => {
logger.error("Fatal startup error", error);
process.exit(1);
});

View File

@@ -1,232 +0,0 @@
import { spawn } from "node:child_process";
import { copyFile, mkdir } from "node:fs/promises";
import path from "node:path";
import process from "node:process";
import type { AssistantRuntimeConfig } from "./config.js";
import { Logger } from "./logger.js";
import { LocalVoiceSession } from "./audio/local-voice-session.js";
import { requireFfmpegPath } from "./audio/ffmpeg-path.js";
import type { LlmService } from "./services/llm.js";
import { LocalFasterWhisperSttService } from "./services/local-stt.js";
import { OllamaLlmService } from "./services/ollama-llm.js";
import type { SttService } from "./services/stt.js";
import { createTtsService } from "./services/create-tts-service.js";
import { listWindowsMediaVoices } from "./services/windows-media-tts.js";
import { listWindowsSystemVoices } from "./services/windows-system-tts.js";
/**
 * Prints the audio devices available for local mode.
 *
 * On Windows this runs ffmpeg's DirectShow device listing (ffmpeg writes it
 * to stderr; exit codes 0 and 1 are both accepted). On other platforms it
 * runs `wpctl status` and `wpctl status -n`.
 *
 * @throws Error when a child process cannot be started or exits with an
 *   unexpected code.
 */
export async function printLocalAudioDevices(): Promise<void> {
  // Resolves when `child` exits with one of `okCodes`, rejects otherwise.
  const waitForExit = (
    child: ReturnType<typeof spawn>,
    label: string,
    okCodes: readonly number[],
  ): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      child.on("exit", (code) => {
        if (code !== null && okCodes.includes(code)) {
          resolve();
          return;
        }
        reject(new Error(`${label} exited with code ${code ?? "null"}`));
      });
      child.on("error", reject);
    });

  if (process.platform === "win32") {
    const ffmpegPath = requireFfmpegPath();
    console.log("\n=== ffmpeg dshow audio devices ===");
    await waitForExit(
      spawn(ffmpegPath, ["-hide_banner", "-list_devices", "true", "-f", "dshow", "-i", "dummy"], {
        stdio: ["ignore", "ignore", "inherit"],
      }),
      "ffmpeg",
      [0, 1],
    );
    console.log("\n위 목록의 오디오 장치 이름을 `LOCAL_AUDIO_SOURCE` 에 그대로 넣으면 됩니다.");
    console.log("Windows 로컬 모드는 현재 출력 장치 직접 선택 대신 시스템 기본 출력 장치를 사용합니다.");
    return;
  }

  const wpctlRuns = [
    { label: "wpctl status", args: ["status"] },
    { label: "wpctl status -n", args: ["status", "-n"] },
  ] as const;
  for (const { label, args } of wpctlRuns) {
    console.log(`\n=== ${label} ===`);
    await waitForExit(spawn("wpctl", args, { stdio: ["ignore", "inherit", "inherit"] }), "wpctl", [0]);
  }
}
/**
 * Runs the assistant against the local microphone and speakers.
 *
 * Warms up the STT, TTS and LLM services, creates a `LocalVoiceSession`,
 * prints its status, installs SIGINT/SIGTERM handlers that tear everything
 * down, and starts the session.
 *
 * @param config Assistant runtime configuration.
 * @param logger Logger used for warnings.
 */
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  const stt = new LocalFasterWhisperSttService(config, logger);
  const tts = createTtsService(config, logger);
  const llm = new OllamaLlmService(config);
  await stt.warmup();
  await tts.warmup?.();
  await llm.warmup?.();

  if (config.BOT_DEFAULT_LANGUAGE.startsWith("ko") && config.LOCAL_STT_MODEL === "tiny") {
    logger.warn(
      "LOCAL_STT_MODEL=tiny 는 한국어 인식률이 낮을 수 있습니다. GPU 환경이면 small 이상을 권장합니다.",
    );
  }

  const session = new LocalVoiceSession({
    config,
    logger,
    stt,
    tts,
    llm,
  });
  console.log(session.statusSummary());
  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
  if (process.platform === "win32") {
    console.log("Windows 로컬 모드는 현재 시스템 기본 출력 장치로 재생됩니다.");
  }
  if (config.DEBUG_TEXT_EVENTS) {
    console.log("텍스트 로그 출력이 켜져 있습니다.");
  }

  // Shared by all shutdown requests so a second Ctrl+C while teardown is
  // still running does not destroy the session and services twice.
  let shutdownInFlight: Promise<void> | null = null;
  const shutdown = (exitCode = 0): Promise<void> => {
    if (shutdownInFlight === null) {
      shutdownInFlight = (async () => {
        await session.destroy().catch((error) => {
          logger.warn("Local session shutdown failed", error);
        });
        await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]);
        process.exit(exitCode);
      })();
    }
    return shutdownInFlight;
  };

  process.on("SIGINT", () => {
    void shutdown(0);
  });
  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await session.start();
}
/**
 * Speaks a single sentence through the local playback path to test TTS on
 * its own.
 *
 * The session is built with stub STT and LLM services (the STT stub always
 * returns `null`, the LLM stub an empty string), so only the TTS engine is
 * exercised. The session and the TTS service are destroyed afterwards, even
 * when speaking fails.
 *
 * @param config Assistant runtime configuration.
 * @param logger Logger handed to the TTS service and the session.
 * @param text Sentence to speak.
 */
export async function runLocalTtsSmokeTest(
  config: AssistantRuntimeConfig,
  logger: Logger,
  text: string,
): Promise<void> {
  const tts = createTtsService(config, logger);
  const silentStt: SttService = {
    transcribePcm16: async () => null,
  };
  const silentLlm: LlmService = {
    generateReply: async () => "",
  };

  await tts.warmup?.();
  const session = new LocalVoiceSession({ config, logger, stt: silentStt, tts, llm: silentLlm });

  console.log("TTS 단독 재생 테스트를 시작합니다.");
  console.log(`재생 문장: ${text}`);
  const onWindows = process.platform === "win32";
  if (onWindows) {
    console.log("Windows에서는 시스템 기본 출력 장치로 재생됩니다.");
  }

  try {
    await session.speakText(text);
  } finally {
    await Promise.allSettled([session.destroy(), tts.destroy?.()]);
  }
}
/**
 * Synthesizes `text` and copies the resulting audio file to disk
 * (Windows only).
 *
 * @param config Assistant runtime configuration.
 * @param logger Logger handed to the TTS service.
 * @param text Sentence to synthesize.
 * @param outputPath Destination file; defaults to `tts-test.wav`, resolved
 *   with `path.resolve`.
 * @throws Error on non-Windows platforms, or when the prepared playback has
 *   no `sourceFilePath` to copy from.
 */
export async function dumpLocalTtsWave(
  config: AssistantRuntimeConfig,
  logger: Logger,
  text: string,
  outputPath?: string,
): Promise<void> {
  if (process.platform !== "win32") {
    throw new Error("현재 TTS WAV 덤프 모드는 Windows에서만 구현되어 있습니다.");
  }

  const requestedPath = outputPath?.trim() || "tts-test.wav";
  const resolvedPath = path.resolve(requestedPath);
  await mkdir(path.dirname(resolvedPath), { recursive: true });

  const tts = createTtsService(config, logger);
  await tts.warmup?.();
  const playback = await tts.preparePlayback(text);
  try {
    const wavSource = playback.sourceFilePath;
    if (wavSource) {
      await copyFile(wavSource, resolvedPath);
    } else {
      throw new Error("현재 선택된 TTS 엔진은 직접 WAV 덤프를 지원하지 않습니다.");
    }
  } finally {
    // Release the prepared audio and the engine whether or not the copy succeeded.
    playback.dispose();
    await tts.destroy?.();
  }

  for (const line of [
    "TTS WAV 파일 생성 완료",
    `출력 파일: ${resolvedPath}`,
    "이 파일이 재생되면 TTS 합성은 정상이고, 실시간 재생 경로만 따로 보면 됩니다.",
  ]) {
    console.log(line);
  }
}
/**
 * Prints the TTS voices available on this machine.
 *
 * On non-Windows platforms only the configured engine and speaker are
 * printed. On Windows the Windows.Media and System.Speech voice lists are
 * fetched in parallel and printed, followed by a configuration example.
 *
 * @param config Assistant runtime configuration.
 */
export async function printLocalTtsVoices(config: AssistantRuntimeConfig): Promise<void> {
  if (process.platform !== "win32") {
    console.log("현재 플랫폼은 Windows가 아니므로 설치된 시스템 TTS 목록 대신 Kokoro 설정만 사용합니다.");
    console.log(`LOCAL_TTS_ENGINE=${config.LOCAL_TTS_ENGINE}`);
    console.log(`LOCAL_TTS_SPEAKER=${config.LOCAL_TTS_SPEAKER}`);
    return;
  }

  // Prints a heading followed by one line per voice, or the "none installed" message.
  const printSection = (heading: string, voiceLines: string[], emptyMessage: string): void => {
    console.log(heading);
    if (voiceLines.length === 0) {
      console.log(emptyMessage);
      return;
    }
    voiceLines.forEach((line) => console.log(line));
  };

  const [windowsMediaVoices, windowsSystemVoices] = await Promise.all([
    listWindowsMediaVoices(),
    listWindowsSystemVoices(),
  ]);

  printSection(
    "\n=== Windows.Media.SpeechSynthesis voices (권장) ===",
    windowsMediaVoices.map(
      (voice) => `- ${voice.description} | name=${voice.displayName} | lang=${voice.language}`,
    ),
    "설치된 Windows Media 음성이 없습니다.",
  );
  printSection(
    "\n=== System.Speech voices (fallback) ===",
    windowsSystemVoices.map((voice) => `- ${voice.description} | name=${voice.name} | lang=${voice.culture}`),
    "설치된 System.Speech 음성이 없습니다.",
  );

  console.log("\n설정 예시");
  console.log("LOCAL_TTS_ENGINE=windows-media");
  console.log("LOCAL_TTS_VOICE_NAME=위 목록의 description 또는 name");
}

View File

@@ -1,63 +0,0 @@
/** Supported log severities. */
type LogLevel = "debug" | "info" | "warn" | "error";
// Numeric rank of each level; a message is written when its rank is at least
// the rank of the logger's configured level.
const levelOrder: Record<LogLevel, number> = {
debug: 10,
info: 20,
warn: 30,
error: 40,
};
/**
 * Renders log arguments as one space-separated line.
 *
 * Errors become `name: message`, strings are used as-is, and everything else
 * is JSON-encoded. Values JSON cannot represent (circular structures, BigInt,
 * `undefined`, functions, symbols) fall back to a plain string form, so a
 * log call never throws and never silently prints an empty part.
 *
 * @param parts Values passed to a logger method.
 * @returns The formatted line.
 */
function formatParts(parts: unknown[]): string {
  // Conversion that cannot throw: objects (which may lack a usable toString)
  // go through Object.prototype.toString, primitives through String().
  const describe = (value: unknown): string =>
    typeof value === "object" && value !== null
      ? Object.prototype.toString.call(value)
      : String(value);

  return parts
    .map((part) => {
      if (part instanceof Error) {
        return `${part.name}: ${part.message}`;
      }
      if (typeof part === "string") {
        return part;
      }
      try {
        // JSON.stringify yields undefined for undefined, functions and symbols.
        const json: string | undefined = JSON.stringify(part);
        return json === undefined ? describe(part) : json;
      } catch {
        // Circular references and BigInt make JSON.stringify throw.
        return describe(part);
      }
    })
    .join(" ");
}
/**
 * Minimal levelled console logger.
 *
 * Messages below the configured level are dropped. Each written line has the
 * form `[ISO timestamp] [LEVEL] message`; `error` goes to `console.error`,
 * `warn` to `console.warn`, and everything else to `console.log`.
 */
export class Logger {
  /** @param level Lowest severity that is written. */
  constructor(private readonly level: LogLevel) {}

  debug(...parts: unknown[]): void {
    this.write("debug", ...parts);
  }

  info(...parts: unknown[]): void {
    this.write("info", ...parts);
  }

  warn(...parts: unknown[]): void {
    this.write("warn", ...parts);
  }

  error(...parts: unknown[]): void {
    this.write("error", ...parts);
  }

  /** True when `target` is at or above the configured level. */
  private shouldLog(target: LogLevel): boolean {
    const threshold = levelOrder[this.level];
    return levelOrder[target] >= threshold;
  }

  /** Formats and emits one line on the console stream matching `target`. */
  private write(target: LogLevel, ...parts: unknown[]): void {
    if (!this.shouldLog(target)) {
      return;
    }
    const timestamp = new Date().toISOString();
    const line = `[${timestamp}] [${target.toUpperCase()}] ${formatParts(parts)}`;
    switch (target) {
      case "error":
        console.error(line);
        break;
      case "warn":
        console.warn(line);
        break;
      default:
        console.log(line);
    }
  }
}

View File

@@ -1,100 +0,0 @@
import { existsSync } from "node:fs";
import { spawnSync } from "node:child_process";
import path from "node:path";
import type { AppConfig } from "./config.js";
/** How to start a Python interpreter. */
export interface PythonLaunch {
/** Executable (path or command name) to spawn. */
command: string;
/** Arguments that must precede any script arguments (e.g. `-3` for `py`). */
args: string[];
/** Where the interpreter came from: the project venv, `LOCAL_AI_PYTHON`, or a system candidate. */
source: "venv" | "configured" | "system";
}
/**
 * Splits a command specification such as `py -3` or
 * `"C:\Program Files\Python\python.exe" -u` into its parts.
 *
 * Whitespace separates parts; double quotes group text containing whitespace
 * and are removed from the result. Quotes are removed wherever they occur in
 * a part, so `--opt="a b"` yields `--opt=a b` instead of a token with a
 * dangling quote.
 *
 * @param spec Raw command specification.
 * @returns The parts in order; empty when `spec` has no tokens.
 */
function splitCommandSpec(spec: string): string[] {
  const tokens = spec.match(/(?:[^\s"]+|"[^"]*")+/g) ?? [];
  // The tokenizer only accepts balanced quote pairs, so unwrapping every
  // quoted segment removes all quotes from a token.
  return tokens.map((token) => token.replace(/"([^"]*)"/g, "$1"));
}
/**
 * Checks whether `command args... --version` runs and exits with status 0.
 *
 * The command is run through a shell on Windows only.
 *
 * @param command Executable to probe.
 * @param args Arguments placed before `--version`.
 * @returns `true` when the process started and exited with status 0.
 */
function canRun(command: string, args: string[]): boolean {
  const useShell = process.platform === "win32";
  const { error, status } = spawnSync(command, args.concat("--version"), {
    encoding: "utf8",
    shell: useShell,
  });
  return error == null && status === 0;
}
/** Absolute path of the local AI virtualenv (`LOCAL_AI_VENV_PATH` resolved against the current working directory). */
export function resolveLocalAiVenvPath(config: AppConfig): string {
  const configuredPath = config.LOCAL_AI_VENV_PATH;
  return path.resolve(process.cwd(), configuredPath);
}
/** Absolute path of the local AI cache directory (`LOCAL_AI_CACHE_DIR` resolved against the current working directory). */
export function resolveLocalAiCachePath(config: AppConfig): string {
  const configuredPath = config.LOCAL_AI_CACHE_DIR;
  return path.resolve(process.cwd(), configuredPath);
}
/** Absolute path of the TTS model file (`LOCAL_TTS_MODEL_PATH` resolved against the current working directory). */
export function resolveLocalAiTtsModelPath(config: AppConfig): string {
  const configuredPath = config.LOCAL_TTS_MODEL_PATH;
  return path.resolve(process.cwd(), configuredPath);
}
/** Absolute path of the TTS voices file (`LOCAL_TTS_VOICES_PATH` resolved against the current working directory). */
export function resolveLocalAiTtsVoicesPath(config: AppConfig): string {
  const configuredPath = config.LOCAL_TTS_VOICES_PATH;
  return path.resolve(process.cwd(), configuredPath);
}
/**
 * Path of the Python executable inside the local AI virtualenv:
 * `Scripts/python.exe` on Windows, `bin/python` elsewhere.
 */
export function resolveVenvPythonPath(config: AppConfig): string {
  const venvRoot = resolveLocalAiVenvPath(config);
  if (process.platform === "win32") {
    return path.join(venvRoot, "Scripts", "python.exe");
  }
  return path.join(venvRoot, "bin", "python");
}
/**
 * Decides which Python interpreter to launch.
 *
 * Candidates are tried in this order:
 * 1. the venv interpreter, when `preferVenv` is true (default) and the file exists;
 * 2. the command from `LOCAL_AI_PYTHON`, when it answers `--version`;
 * 3. system commands — `py -3`, `python`, `python3` on Windows;
 *    `python3`, `python` elsewhere — the first that answers `--version`.
 *
 * @param config Application configuration.
 * @param options `preferVenv: false` skips the venv interpreter.
 * @throws Error with setup instructions when no interpreter is usable.
 */
export function resolvePythonLaunch(config: AppConfig, options?: { preferVenv?: boolean }): PythonLaunch {
  const venvPython = resolveVenvPythonPath(config);
  const venvAllowed = options?.preferVenv ?? true;
  if (venvAllowed && existsSync(venvPython)) {
    return { command: venvPython, args: [], source: "venv" };
  }

  if (config.LOCAL_AI_PYTHON) {
    const [configuredCommand, ...configuredArgs] = splitCommandSpec(config.LOCAL_AI_PYTHON);
    if (configuredCommand !== undefined && canRun(configuredCommand, configuredArgs)) {
      return { command: configuredCommand, args: configuredArgs, source: "configured" };
    }
  }

  const systemCandidates: Array<{ command: string; args: string[] }> =
    process.platform === "win32"
      ? [
          { command: "py", args: ["-3"] },
          { command: "python", args: [] },
          { command: "python3", args: [] },
        ]
      : [
          { command: "python3", args: [] },
          { command: "python", args: [] },
        ];
  const usable = systemCandidates.find((candidate) => canRun(candidate.command, candidate.args));
  if (usable) {
    return { command: usable.command, args: usable.args, source: "system" };
  }

  const help = [
    "Python 실행 파일을 찾지 못했습니다.",
    "1. Python 3.11 이상을 설치",
    "2. Windows면 `py -3 --version` 이 되는지 먼저 확인",
    "3. 되면 `.env` 에 `LOCAL_AI_PYTHON=py -3` 설정",
    "4. 그 다음 `bun run setup:local-ai` 실행",
  ];
  throw new Error(help.join("\n"));
}

View File

@@ -1,98 +0,0 @@
/** One stored turn of the conversation. */
export interface ConversationTurn {
role: "user" | "assistant";
text: string;
// Speaker fields are only filled in for user turns.
speakerId?: string;
speakerName?: string;
/** `Date.now()` timestamp taken when the turn was stored. */
createdAt: number;
}
/** Something a user said, together with who said it. */
export interface UserUtterance {
speakerId: string;
speakerName: string;
text: string;
}
/** A role/content message as produced by `ConversationMemory.buildMessages`. */
export interface ChatPromptMessage {
role: "user" | "assistant";
content: string;
}
/**
 * Returns the trimmed speaker name to show in front of a user message, or
 * `null` when the name is missing, blank, or one of the placeholder names
 * `unknown` / `local-user`.
 */
function renderSpeakerLabel(speakerName?: string): string | null {
  const label = speakerName?.trim();
  const placeholders = ["unknown", "local-user"];
  return label && !placeholders.includes(label) ? label : null;
}
/**
 * Formats a user message for the prompt: `name: text` when the speaker has a
 * usable label, otherwise just `text`.
 */
function renderUserMessage(speakerName: string | undefined, text: string): string {
  const label = renderSpeakerLabel(speakerName);
  return label ? `${label}: ${text}` : text;
}
/**
 * Bounded, in-memory history of conversation turns.
 *
 * Only the most recent `maxTurns` turns are kept; older ones are dropped as
 * new turns are added.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  /** @param maxTurns Maximum number of turns to retain. */
  constructor(private readonly maxTurns: number) {}

  /** Stores a user turn stamped with the current time. */
  addUserTurn(utterance: UserUtterance): void {
    const { text, speakerId, speakerName } = utterance;
    this.turns.push({ role: "user", text, speakerId, speakerName, createdAt: Date.now() });
    this.trim();
  }

  /** Stores an assistant turn stamped with the current time. */
  addAssistantTurn(text: string): void {
    this.turns.push({ role: "assistant", text, createdAt: Date.now() });
    this.trim();
  }

  /** Drops every stored turn. */
  clear(): void {
    this.turns.length = 0;
  }

  /** Returns a shallow copy of the stored turns, oldest first. */
  recentTurns(): ConversationTurn[] {
    return this.turns.slice();
  }

  /**
   * Builds the chat messages for a request: the retained history followed by
   * `currentUtterance` as the final user message. User messages are prefixed
   * with the speaker label when one is available.
   */
  buildMessages(currentUtterance: UserUtterance): ChatPromptMessage[] {
    const messages: ChatPromptMessage[] = [];
    for (const turn of this.turns.slice(-this.maxTurns)) {
      const content = turn.role === "assistant" ? turn.text : renderUserMessage(turn.speakerName, turn.text);
      messages.push({ role: turn.role, content });
    }
    messages.push({
      role: "user",
      content: renderUserMessage(currentUtterance.speakerName, currentUtterance.text),
    });
    return messages;
  }

  /** Removes the oldest turns until at most `maxTurns` remain. */
  private trim(): void {
    const excess = this.turns.length - this.maxTurns;
    if (excess > 0) {
      this.turns.splice(0, excess);
    }
  }
}

View File

@@ -1,112 +0,0 @@
import process from "node:process";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { LocalKokoroTtsService } from "./local-tts.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { WindowsMediaTtsService } from "./windows-media-tts.js";
import { WindowsSystemTtsService } from "./windows-system-tts.js";
/** A TTS engine paired with the name used for it in log messages. */
interface NamedTtsService {
name: string;
service: TtsService;
}
/**
 * TTS service that tries a list of engines in order.
 *
 * The first engine whose warmup succeeds becomes the active one. When the
 * active engine later fails to prepare playback, the next engine in the list
 * is activated and the request is retried once on it.
 */
class FallbackTtsService implements TtsService {
  /** Index of the engine in use, or `null` before the first activation. */
  private activeIndex: number | null = null;

  /**
   * @param logger Logger for engine selection and failures.
   * @param services Engines in order of preference.
   */
  constructor(
    private readonly logger: Logger,
    private readonly services: NamedTtsService[],
  ) {}

  /** Selects the first engine that warms up successfully. */
  async warmup(): Promise<void> {
    await this.ensureActive();
  }

  /**
   * Prepares playback on the active engine, switching to the next engine
   * when the active one fails.
   *
   * @throws The original error when the request was aborted via `signal` or
   *   no further engine is available; the activation error when the fallback
   *   engine cannot be warmed up.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const active = await this.ensureActive();
    try {
      return await active.service.preparePlayback(text, signal);
    } catch (error) {
      // A cancelled request is not an engine failure: do not switch engines
      // permanently just because the caller aborted.
      if (signal?.aborted) {
        throw error;
      }
      const failedIndex = this.activeIndex;
      if (failedIndex === null || failedIndex >= this.services.length - 1) {
        throw error;
      }
      // activate() records the new index only after warmup succeeded, so a
      // fallback that cannot start never becomes the active engine.
      const fallback = await this.activate(failedIndex + 1);
      this.logger.warn(`TTS 엔진 ${active.name} 이 실패해 ${fallback.name} 로 전환합니다.`, error);
      return await fallback.service.preparePlayback(text, signal);
    }
  }

  /** Destroys every engine, ignoring individual failures. */
  async destroy(): Promise<void> {
    await Promise.allSettled(this.services.map((entry) => entry.service.destroy?.()));
  }

  /** Returns the active engine, selecting the first usable one on first use. */
  private async ensureActive(): Promise<NamedTtsService> {
    if (this.activeIndex !== null) {
      return this.services[this.activeIndex]!;
    }
    let lastError: unknown = null;
    for (let index = 0; index < this.services.length; index += 1) {
      try {
        return await this.activate(index);
      } catch (error) {
        lastError = error;
        this.logger.warn(`TTS 엔진 ${this.services[index]!.name} 초기화 실패`, error);
      }
    }
    throw lastError instanceof Error ? lastError : new Error("사용 가능한 TTS 엔진을 찾지 못했습니다.");
  }

  /** Warms up the engine at `index` and makes it the active one. */
  private async activate(index: number): Promise<NamedTtsService> {
    const selected = this.services[index]!;
    await selected.service.warmup?.();
    this.activeIndex = index;
    this.logger.info("Selected TTS engine", selected.name);
    return selected;
  }
}
/**
 * Creates the TTS service for the current platform and configuration.
 *
 * Off Windows the Kokoro service is always used. On Windows
 * `LOCAL_TTS_ENGINE` selects `system`, `windows-media` or `kokoro`; any other
 * value (including `auto`) yields a fallback chain that tries
 * `windows-media` first and then `system`.
 *
 * @param config Assistant runtime configuration.
 * @param logger Logger handed to the created service.
 */
export function createTtsService(config: AssistantRuntimeConfig, logger: Logger): TtsService {
  if (process.platform !== "win32") {
    return new LocalKokoroTtsService(config, logger);
  }

  const { LOCAL_TTS_SPEED: speed, LOCAL_TTS_VOICE_NAME: voiceName, LOCAL_TTS_LANGUAGE: language } = config;
  const systemTts = new WindowsSystemTtsService(speed, voiceName, language);
  const windowsMediaTts = new WindowsMediaTtsService(speed, voiceName, language);

  const engine = config.LOCAL_TTS_ENGINE;
  if (engine === "system") {
    return systemTts;
  }
  if (engine === "windows-media") {
    return windowsMediaTts;
  }
  if (engine === "kokoro") {
    return new LocalKokoroTtsService(config, logger);
  }
  return new FallbackTtsService(logger, [
    { name: "windows-media", service: windowsMediaTts },
    { name: "system", service: systemTts },
  ]);
}

View File

@@ -1,6 +0,0 @@
import type { ConversationMemory, UserUtterance } from "./conversation.js";
/** Contract for the language model that produces assistant replies. */
export interface LlmService {
/** Optional start-up hook. */
warmup?(): Promise<void>;
/** Produces the assistant's reply to `utterance`, given the conversation memory. */
generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string>;
}

View File

@@ -1,43 +0,0 @@
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { PythonJsonWorker } from "./python-json-worker.js";
import type { SttService } from "./stt.js";
/** Result of the worker's `transcribe` request. */
interface TranscribeResult {
text?: string;
}
/**
 * Speech-to-text service backed by the `local_stt_worker.py` Python worker.
 *
 * The STT settings from the configuration are handed to the worker at
 * construction time.
 */
export class LocalFasterWhisperSttService implements SttService {
  private readonly worker: PythonJsonWorker;

  constructor(private readonly config: AssistantRuntimeConfig, logger: Logger) {
    const workerSettings = {
      LOCAL_STT_MODEL: config.LOCAL_STT_MODEL,
      LOCAL_STT_DEVICE: config.LOCAL_STT_DEVICE,
      LOCAL_STT_COMPUTE_TYPE: config.LOCAL_STT_COMPUTE_TYPE,
      LOCAL_STT_BEAM_SIZE: String(config.LOCAL_STT_BEAM_SIZE),
    };
    this.worker = new PythonJsonWorker(config, logger, "local_stt_worker.py", "local-stt", workerSettings);
  }

  /** Sends a `ping` request to the worker. */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Transcribes audio using the configured default language.
   *
   * @param pcm16MonoAudio Audio bytes; sent to the worker base64-encoded.
   * @returns The trimmed transcript, or `null` when the input is empty or
   *   the worker returned no text.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }
    const { text } = await this.worker.request<TranscribeResult>("transcribe", {
      audio_base64: pcm16MonoAudio.toString("base64"),
      language: this.config.BOT_DEFAULT_LANGUAGE,
    });
    const transcript = (text ?? "").trim();
    return transcript === "" ? null : transcript;
  }

  /** Shuts the worker down. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}

View File

@@ -1,97 +0,0 @@
import { Readable } from "node:stream";
import prism from "prism-media";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveFfmpegPath } from "../audio/ffmpeg-path.js";
import { PythonJsonWorker } from "./python-json-worker.js";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { resolveLocalAiTtsModelPath, resolveLocalAiTtsVoicesPath } from "../python-runtime.js";
/** Result of the worker's `synthesize` request; the field is decoded from base64 by the caller. */
interface SynthesizeResult {
wav_base64?: string;
}
/**
 * Text-to-speech service backed by the `local_tts_worker.py` Python worker.
 *
 * The worker returns base64-encoded audio, which is piped through ffmpeg and
 * exposed as a raw `s16le`, 48 kHz, 2-channel stream.
 */
export class LocalKokoroTtsService implements TtsService {
  private readonly worker: PythonJsonWorker;

  constructor(config: AssistantRuntimeConfig, logger: Logger) {
    // Publish the resolved ffmpeg location through FFMPEG_PATH unless the
    // user already set one.
    const resolvedFfmpegPath = resolveFfmpegPath();
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
    this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", {
      LOCAL_TTS_MODEL_PATH: resolveLocalAiTtsModelPath(config),
      LOCAL_TTS_VOICES_PATH: resolveLocalAiTtsVoicesPath(config),
      LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE,
      LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER,
      LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE,
      LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED),
    });
  }

  /** Sends a `ping` request to the worker. */
  async warmup(): Promise<void> {
    await this.worker.request("ping", {});
  }

  /**
   * Synthesizes `text` and returns a PCM stream ready for playback.
   *
   * @param text Sentence to synthesize.
   * @param signal Optional abort signal; aborting destroys the transcoding
   *   pipeline. A signal that is already aborted when synthesis finishes
   *   yields an already-destroyed stream.
   * @returns The PCM stream plus a `dispose` callback that releases it.
   * @throws Error when the worker returns no audio.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const result = await this.worker.request<SynthesizeResult>(
      "synthesize",
      {
        text,
      },
      signal,
    );
    const wavBase64 = result.wav_base64;
    if (!wavBase64) {
      throw new Error("로컬 TTS가 빈 오디오를 반환했습니다.");
    }

    const input = Readable.from([Buffer.from(wavBase64, "base64")]);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    const teardown = (): void => {
      input.destroy();
      ffmpeg.destroy();
    };

    if (signal?.aborted) {
      // An "abort" listener added now would never fire, leaving ffmpeg
      // running; tear the pipeline down right away instead.
      teardown();
    } else {
      signal?.addEventListener("abort", teardown, { once: true });
      input.pipe(ffmpeg);
    }

    return {
      stream: ffmpeg,
      dispose: () => {
        // Drop the listener so a long-lived signal does not keep the
        // finished pipeline alive.
        signal?.removeEventListener("abort", teardown);
        teardown();
      },
    };
  }

  /** Shuts the worker down. */
  async destroy(): Promise<void> {
    await this.worker.destroy();
  }
}

View File

@@ -1,159 +0,0 @@
import type { AssistantRuntimeConfig } from "../config.js";
import type { ConversationMemory, UserUtterance } from "./conversation.js";
import type { LlmService } from "./llm.js";
// System prompt sent with every chat request. The sentences (Korean) are
// joined with single spaces; they tell the model to act as a Korean voice
// assistant, answer only the last utterance in one or two short sentences,
// avoid self-introductions, meta remarks, lists and markdown, and never
// reveal its reasoning.
const ASSISTANT_INSTRUCTIONS = [
"너는 디스코드 음성 채널 또는 로컬 마이크 테스트에서 동작하는 한국어 음성 비서다.",
"사용자의 마지막 말에만 직접 답한다.",
"답변은 짧고 실용적으로 한다.",
"기본은 한 문장, 길어도 두 문장을 넘기지 않는다.",
"말투는 자연스러운 한국어로 유지한다.",
"사용자가 정체를 명확히 묻지 않는 한 자기소개하지 않는다.",
"자기소개가 필요할 때만 '저는 로컬 음성 비서입니다.'처럼 짧게 말한다.",
"\"저는 화자입니다\", \"로컬 음성 비서 모드입니다\" 같은 어색한 메타 응답은 하지 않는다.",
"대화 기록에 이름이 붙어 있어도 이름이나 메타 정보를 그대로 따라 말하지 않는다.",
"잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.",
"목록, 마크다운, 코드블록, 설명문은 쓰지 않는다.",
"생각 과정을 드러내지 말고 최종 답변만 말한다.",
].join(" ");
// Few-shot examples placed between the system prompt and the real
// conversation in every chat request: a greeting, an identity question, and
// a complaint about repetition, each with a short answer.
const EXAMPLE_MESSAGES = [
{
role: "user" as const,
content: "안녕하세요",
},
{
role: "assistant" as const,
content: "안녕하세요. 무엇을 도와드릴까요?",
},
{
role: "user" as const,
content: "당신은 누구십니까?",
},
{
role: "assistant" as const,
content: "저는 로컬 음성 비서입니다.",
},
{
role: "user" as const,
content: "계속 똑같은 말만 반복합니까?",
},
{
role: "assistant" as const,
content: "아니요. 질문에 맞춰 짧게 답변합니다.",
},
];
/** Fields read from the JSON body of an `/api/chat` response. */
interface OllamaChatResponse {
message?: {
content?: string;
thinking?: string;
};
/** Error text, used as the thrown message when the HTTP status is not OK. */
error?: string;
}
/** Fields read from the JSON body of an `/api/tags` response. */
interface OllamaTagsResponse {
models?: Array<{
name?: string;
model?: string;
}>;
}
/**
 * Cleans a raw model reply so it is short enough to be spoken.
 *
 * - Removes `<think>…</think>` blocks, including a trailing block that was
 *   never closed (for example when generation was cut off mid-thought).
 * - Collapses all whitespace runs into single spaces.
 * - Replies longer than 180 characters are reduced to their first two
 *   sentences and then cut to 180 characters.
 *
 * @param text Raw reply text.
 * @returns The cleaned reply; may be empty when nothing but reasoning was present.
 */
function normalizeReply(text: string): string {
  const maxLength = 180;
  const withoutThinking = text
    .replace(/<think>[\s\S]*?<\/think>/gi, " ")
    // Whatever follows an unclosed <think> is reasoning, not the answer.
    .replace(/<think>[\s\S]*$/i, " ");
  const compact = withoutThinking.replace(/\s+/g, " ").trim();
  if (compact.length <= maxLength) {
    return compact;
  }
  const sentences = compact.match(/[^.!?]+[.!?]?/g);
  if (!sentences || sentences.length === 0) {
    return compact.slice(0, maxLength).trim();
  }
  // Every match after the first starts with the separating space; trim
  // before joining so the result has no double spaces.
  return sentences
    .slice(0, 2)
    .map((sentence) => sentence.trim())
    .join(" ")
    .slice(0, maxLength)
    .trim();
}
/**
 * `LlmService` implementation that talks to an Ollama server over HTTP.
 */
export class OllamaLlmService implements LlmService {
  /** Spoken when the model produced nothing usable. */
  private static readonly EMPTY_REPLY_FALLBACK = "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";

  constructor(private readonly config: AssistantRuntimeConfig) {}

  /**
   * Appends Ollama's default `:latest` tag to a model name that has none, so
   * `qwen3` and `qwen3:latest` compare equal. Only the part after the last
   * `/` is inspected, so a registry port is not mistaken for a tag.
   */
  private static withDefaultTag(modelName: string): string {
    const lastSegment = modelName.slice(modelName.lastIndexOf("/") + 1);
    return lastSegment.includes(":") ? modelName : `${modelName}:latest`;
  }

  /**
   * Verifies that the server is reachable and the configured model is
   * installed.
   *
   * @throws Error when the server cannot be reached, answers with a non-OK
   *   status, or does not list the configured model.
   */
  async warmup(): Promise<void> {
    const url = new URL("/api/tags", this.config.OLLAMA_BASE_URL);
    let response: Response;
    try {
      response = await fetch(url);
    } catch {
      throw new Error(
        `Ollama 서버에 연결할 수 없습니다. ${this.config.OLLAMA_BASE_URL} 확인 후 Ollama 앱이 실행 중인지 확인해 주세요. Windows에서는 \`localhost\` 대신 \`http://127.0.0.1:11434\` 를 권장합니다. 모델이 없으면 \`ollama pull ${this.config.OLLAMA_MODEL}\` 를 먼저 실행하세요.`,
      );
    }
    // A body that is not valid JSON is treated as an empty object.
    const body = (await response.json().catch(() => ({}))) as OllamaTagsResponse & { error?: string };
    if (!response.ok) {
      throw new Error(body.error ?? `Ollama 상태 확인 실패: HTTP ${response.status}`);
    }
    const wanted = OllamaLlmService.withDefaultTag(this.config.OLLAMA_MODEL);
    const models = body.models ?? [];
    const exists = models.some((model) =>
      [model.name, model.model].some((candidate) => {
        const trimmed = candidate?.trim();
        return Boolean(trimmed) && OllamaLlmService.withDefaultTag(trimmed as string) === wanted;
      }),
    );
    if (!exists) {
      throw new Error(
        `Ollama 모델 ${this.config.OLLAMA_MODEL} 이 없습니다. \`ollama pull ${this.config.OLLAMA_MODEL}\` 를 먼저 실행해 주세요.`,
      );
    }
  }

  /**
   * Requests a reply for `utterance` (non-streaming).
   *
   * The request contains the system prompt, the few-shot examples and the
   * messages built from `memory`.
   *
   * @returns The normalized reply, or a fixed "please repeat" sentence when
   *   the model returned nothing usable.
   * @throws Error when the server cannot be reached or answers with a non-OK status.
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const url = new URL("/api/chat", this.config.OLLAMA_BASE_URL);
    let response: Response;
    try {
      response = await fetch(url, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: this.config.OLLAMA_MODEL,
          messages: [
            {
              role: "system",
              content: ASSISTANT_INSTRUCTIONS,
            },
            ...EXAMPLE_MESSAGES,
            ...memory.buildMessages(utterance),
          ],
          think: false,
          stream: false,
          keep_alive: this.config.OLLAMA_KEEP_ALIVE,
          options: {
            num_ctx: this.config.OLLAMA_NUM_CTX,
            temperature: 0.4,
            num_predict: 120,
          },
        }),
      });
    } catch {
      throw new Error(
        `Ollama 서버에 연결할 수 없습니다. ${this.config.OLLAMA_BASE_URL} 확인 후 Ollama 앱이 실행 중인지 확인해 주세요. Windows에서는 \`localhost\` 대신 \`http://127.0.0.1:11434\` 를 권장합니다.`,
      );
    }
    const body = (await response.json().catch(() => ({}))) as OllamaChatResponse;
    if (!response.ok) {
      throw new Error(body.error ?? `Ollama request failed with status ${response.status}`);
    }
    const output = body.message?.content?.trim();
    // Normalize before checking for emptiness: a reply that contained only a
    // <think> block is empty after normalization and must also fall back.
    const reply = output ? normalizeReply(output) : "";
    return reply.length > 0 ? reply : OllamaLlmService.EMPTY_REPLY_FALLBACK;
  }
}

View File

@@ -1,208 +0,0 @@
import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process";
import { createInterface } from "node:readline";
import path from "node:path";
import type { AssistantRuntimeConfig } from "../config.js";
import type { Logger } from "../logger.js";
import { resolveLocalAiCachePath, resolvePythonLaunch } from "../python-runtime.js";
/**
 * One request written to the Python worker's stdin as a single JSON line.
 */
interface WorkerRequest {
/** Correlation id; the worker's response is matched to the caller by this id. */
id: number;
/** Name of the operation the worker should perform. */
method: string;
/** Arguments for `method`. */
params: Record<string, unknown>;
}
/**
 * One JSON line read from the Python worker's stdout.
 */
interface WorkerResponse {
/** Id of the request this response answers. */
id: number;
/** `true` resolves the pending request with `result`; `false` rejects it. */
ok: boolean;
result?: unknown;
/** Rejection message used when `ok` is false (a generic message is used when absent). */
error?: string;
}
/**
 * Client for a long-lived Python worker script that speaks line-delimited
 * JSON over stdin/stdout.
 *
 * The worker process is spawned lazily on the first `request`. Each request
 * gets a numeric id; the response line carrying the same id settles the
 * matching promise. Worker stderr is forwarded to the logger and the last
 * 20 chunks are appended to the error raised when the worker exits.
 */
export class PythonJsonWorker {
  private child: ChildProcessWithoutNullStreams | null = null;
  private nextId = 1;
  /** Requests that were sent and have not been answered yet, keyed by request id. */
  private readonly pending = new Map<
    number,
    {
      resolve: (value: unknown) => void;
      reject: (error: Error) => void;
    }
  >();

  /**
   * @param config Runtime configuration used to resolve the Python launcher and cache path.
   * @param logger Receives worker stderr output and parse warnings.
   * @param scriptName File name of the worker script inside `<cwd>/python`.
   * @param label Prefix used in log and error messages.
   * @param workerEnv Extra environment variables; they override the defaults set here.
   */
  constructor(
    private readonly config: AssistantRuntimeConfig,
    private readonly logger: Logger,
    private readonly scriptName: string,
    private readonly label: string,
    private readonly workerEnv: Record<string, string>,
  ) {}

  /**
   * Sends one request to the worker and waits for its response.
   *
   * @param method Operation name understood by the worker script.
   * @param params Arguments for the operation.
   * @param signal Optional abort signal. Aborting rejects the promise and
   *   forgets the request; the worker itself is not interrupted.
   * @returns The `result` field of the worker's response.
   * @throws Error when the request is aborted, the worker reports a failure,
   *   the request cannot be written, or the worker exits first.
   */
  async request<T>(method: string, params: Record<string, unknown>, signal?: AbortSignal): Promise<T> {
    const child = this.ensureStarted();
    const id = this.nextId++;
    return await new Promise<T>((resolve, reject) => {
      if (signal?.aborted) {
        reject(new Error(`${this.label} request aborted before start`));
        return;
      }
      const message: WorkerRequest = {
        id,
        method,
        params,
      };
      // Serialize before registering anything so that a serialization failure
      // rejects the promise without leaving a pending entry or listener behind.
      const line = `${JSON.stringify(message)}\n`;
      const abortHandler = () => {
        this.pending.delete(id);
        reject(new Error(`${this.label} request aborted`));
      };
      if (signal) {
        signal.addEventListener("abort", abortHandler, { once: true });
      }
      this.pending.set(id, {
        resolve: (value) => {
          if (signal) {
            signal.removeEventListener("abort", abortHandler);
          }
          resolve(value as T);
        },
        reject: (error) => {
          if (signal) {
            signal.removeEventListener("abort", abortHandler);
          }
          reject(error);
        },
      });
      child.stdin.write(line, (error) => {
        if (!error) {
          return;
        }
        // The request never reached the worker, so no response will arrive;
        // fail it now instead of leaving the caller waiting.
        const entry = this.pending.get(id);
        if (entry) {
          this.pending.delete(id);
          entry.reject(error);
        }
      });
    });
  }

  /**
   * Rejects all pending requests and terminates the worker with SIGTERM.
   * Resolves when the process has exited or after 1.5 s, whichever is first.
   */
  async destroy(): Promise<void> {
    this.rejectAll(new Error(`${this.label} worker terminated`));
    const child = this.child;
    if (!child) {
      return;
    }
    this.child = null;
    await new Promise<void>((resolve) => {
      const onExit = () => {
        // Clear the fallback timer so it does not keep the event loop alive.
        clearTimeout(fallback);
        resolve();
      };
      const fallback = setTimeout(() => {
        child.off("exit", onExit);
        resolve();
      }, 1_500);
      child.once("exit", onExit);
      child.kill("SIGTERM");
    });
  }

  /** Returns the running worker process, spawning it and wiring its streams on first use. */
  private ensureStarted(): ChildProcessWithoutNullStreams {
    if (this.child) {
      return this.child;
    }
    const launch = resolvePythonLaunch(this.config);
    const scriptPath = path.resolve(process.cwd(), "python", this.scriptName);
    const cachePath = resolveLocalAiCachePath(this.config);
    const recentStderr: string[] = [];
    const child = spawn(launch.command, [...launch.args, scriptPath], {
      stdio: ["pipe", "pipe", "pipe"],
      shell: process.platform === "win32",
      env: {
        ...process.env,
        HF_HOME: cachePath,
        TRANSFORMERS_CACHE: cachePath,
        PYTHONIOENCODING: "utf-8",
        HF_HUB_DISABLE_SYMLINKS_WARNING: "1",
        BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE,
        ...this.workerEnv,
      },
    });

    // Without a listener, a write error on stdin (for example EPIPE after the
    // worker died) would surface as an uncaught exception.
    child.stdin.on("error", (error) => {
      this.logger.warn(`${this.label} stdin error`, error);
    });

    createInterface({
      input: child.stdout,
      crlfDelay: Number.POSITIVE_INFINITY,
    }).on("line", (line) => {
      if (!line.trim()) {
        return;
      }
      let payload: WorkerResponse;
      try {
        // Lines that do not look like a JSON object are ignored as stray output.
        if (!line.startsWith("{")) {
          return;
        }
        payload = JSON.parse(line) as WorkerResponse;
      } catch (error) {
        this.logger.warn(`${this.label} stdout parse failed`, error);
        return;
      }
      const pending = this.pending.get(payload.id);
      if (!pending) {
        return;
      }
      this.pending.delete(payload.id);
      if (payload.ok) {
        pending.resolve(payload.result);
        return;
      }
      pending.reject(new Error(payload.error ?? `${this.label} worker error`));
    });

    // Decode through the stream so multi-byte characters split across chunks
    // are not corrupted.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk: string) => {
      const text = chunk.trim();
      if (text.length > 0) {
        recentStderr.push(text);
        if (recentStderr.length > 20) {
          recentStderr.shift();
        }
        this.logger.warn(`[${this.label}]`, text);
      }
    });

    child.on("exit", (code, signal) => {
      if (this.child === child) {
        this.child = null;
      } else if (this.child !== null) {
        // This process was already replaced by a newer worker. Its own
        // requests were rejected when it was dropped, so the pending map now
        // belongs to the replacement and must not be touched.
        return;
      }
      const detail = recentStderr.length > 0 ? `\n${recentStderr.join("\n")}` : "";
      this.rejectAll(new Error(`${this.label} worker exited code=${code ?? "null"} signal=${signal ?? "null"}${detail}`));
    });

    child.on("error", (error) => {
      // `pid` is undefined when the process could not be spawned. "exit" is
      // not guaranteed to follow in that case, so drop the dead handle here;
      // otherwise every later request would write to it and never settle.
      if (this.child === child && child.pid === undefined) {
        this.child = null;
      }
      const spawnError = error as NodeJS.ErrnoException;
      if (spawnError.code === "ENOENT") {
        this.rejectAll(
          new Error(
            [
              `Python 실행에 실패했습니다: ${launch.command}`,
              "Windows면 `.env` 에 `LOCAL_AI_PYTHON=py -3` 를 넣고 다시 실행하세요.",
              "최초 1회는 `bun run setup:local-ai` 를 먼저 실행해야 합니다.",
            ].join("\n"),
          ),
        );
        return;
      }
      this.rejectAll(spawnError);
    });

    this.child = child;
    return child;
  }

  /** Rejects every pending request with `error` and clears the pending map. */
  private rejectAll(error: Error): void {
    const pending = [...this.pending.values()];
    this.pending.clear();
    for (const item of pending) {
      item.reject(error);
    }
  }
}

View File

@@ -1,4 +0,0 @@
/**
 * Speech-to-text backend contract.
 */
export interface SttService {
/**
 * Transcribes one audio buffer.
 * NOTE(review): the parameter name suggests 16-bit mono PCM and `null`
 * presumably means "nothing recognized" — confirm against the implementations.
 */
transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null>;
/** Optional cleanup hook for backends that hold resources. */
destroy?(): Promise<void>;
}

View File

@@ -1,13 +0,0 @@
import type { Readable } from "node:stream";
/**
 * Synthesized speech that is ready to be played, plus its cleanup hook.
 */
export interface PreparedSpeechAudio {
/** Readable stream that yields the audio data. */
stream: Readable;
/** Path of the file backing `stream`, when the audio was written to disk first. */
sourceFilePath?: string;
/** Releases whatever was created for this audio (the Windows backends delete their temp file). */
dispose: () => void;
}
/**
 * Text-to-speech backend contract.
 */
export interface TtsService {
/** Optional start-up check or preload step. */
warmup?(): Promise<void>;
/**
 * Synthesizes `text` and returns audio that is ready for playback.
 * `signal` is passed through so the synthesis can be aborted.
 */
preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio>;
/** Optional cleanup hook for backends that hold resources. */
destroy?(): Promise<void>;
}

View File

@@ -1,152 +0,0 @@
import { createReadStream } from "node:fs";
import { unlink } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
/**
 * One voice reported by `Windows.Media.SpeechSynthesis.SpeechSynthesizer.AllVoices`,
 * as projected by the PowerShell script in `listWindowsMediaVoices`.
 */
export interface WindowsMediaVoiceInfo {
/** `DisplayName` of the voice. */
displayName: string;
/** `Description` of the voice. */
description: string;
/** `Language` of the voice. */
language: string;
/** `Gender` of the voice, converted to a string. */
gender: string;
/** `Id` of the voice. */
id: string;
}
/**
 * Escapes `text` for embedding between single quotes in a PowerShell script.
 *
 * Line breaks (LF or CRLF) become a single space, and every character that
 * PowerShell accepts as a single-quote delimiter is doubled. That set is the
 * ASCII apostrophe plus the typographic quotes U+2018, U+2019, U+201A and
 * U+201B; leaving those unescaped would end the string early and let the
 * rest of the text run as script.
 *
 * @param text Arbitrary text (for example a model reply or a file path).
 * @returns Text that is safe to place inside `'…'`.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
/**
 * Returns the PowerShell prologue shared by the Windows.Media scripts, as a
 * single line.
 *
 * It makes errors terminating, silences progress output, loads the WinRT
 * interop assembly and the two WinRT types used later, and defines
 * `Await-WinRt`, which blocks on an `IAsyncOperation<T>` by converting it to
 * a .NET task through `WindowsRuntimeSystemExtensions.AsTask`.
 */
function windowsMediaPreamble(): string {
  const sessionSetup = [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Runtime.WindowsRuntime;",
    "$null = [Windows.Media.SpeechSynthesis.SpeechSynthesizer, Windows.Media.SpeechSynthesis, ContentType=WindowsRuntime];",
    "$null = [Windows.Storage.Streams.DataReader, Windows.Storage.Streams, ContentType=WindowsRuntime];",
  ];
  const awaitHelper = [
    "function Await-WinRt($operation) {",
    " $interfaceType = $operation.GetType().GetInterfaces() | Where-Object {",
    " $_.IsGenericType -and $_.GetGenericTypeDefinition().FullName -eq 'Windows.Foundation.IAsyncOperation`1'",
    " } | Select-Object -First 1;",
    " if (-not $interfaceType) { throw 'IAsyncOperation<T> 인터페이스를 찾지 못했습니다.' }",
    " $resultType = $interfaceType.GetGenericArguments()[0];",
    " $method = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {",
    " $_.Name -eq 'AsTask' -and",
    " $_.IsGenericMethodDefinition -and",
    " $_.GetGenericArguments().Count -eq 1 -and",
    " $_.GetParameters().Count -eq 1 -and",
    " $_.GetParameters()[0].ParameterType.IsGenericType -and",
    " $_.GetParameters()[0].ParameterType.GetGenericTypeDefinition().FullName -eq 'Windows.Foundation.IAsyncOperation`1'",
    " } | Select-Object -First 1;",
    " if (-not $method) { throw 'System.WindowsRuntimeSystemExtensions.AsTask(IAsyncOperation<T>) 를 찾지 못했습니다.' }",
    " $task = $method.MakeGenericMethod(@($resultType)).Invoke($null, @($operation));",
    " return $task.GetAwaiter().GetResult();",
    "}",
  ];
  return [...sessionSetup, ...awaitHelper].join(" ");
}
/**
 * Lists the voices exposed by `Windows.Media.SpeechSynthesis`.
 *
 * Runs a PowerShell script that projects every entry of
 * `SpeechSynthesizer.AllVoices` to a plain object and prints the list as
 * compact JSON, then parses that output.
 *
 * @param signal Optional abort signal forwarded to the PowerShell run.
 * @returns The parsed voice descriptions.
 */
export async function listWindowsMediaVoices(signal?: AbortSignal): Promise<WindowsMediaVoiceInfo[]> {
  const projection = [
    "$voices = @([Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices | ForEach-Object {",
    " [PSCustomObject]@{",
    " displayName = $_.DisplayName;",
    " description = $_.Description;",
    " language = $_.Language;",
    " gender = [string]$_.Gender;",
    " id = $_.Id;",
    " }",
    "});",
    "ConvertTo-Json -InputObject $voices -Compress;",
  ];
  const command = [windowsMediaPreamble(), ...projection].join(" ");
  const result = await runPowerShell(command, signal);
  return parsePowerShellJsonArray<WindowsMediaVoiceInfo>(result.stdout);
}
/**
 * Synthesizes `text` with `Windows.Media.SpeechSynthesis` and writes the
 * resulting audio bytes to `outputPath` by running a PowerShell script.
 *
 * Voice selection inside the script, in order:
 * 1. a voice whose display name, description or id equals `voiceName`, or
 *    whose display name or description contains it;
 * 2. otherwise the first voice whose language starts with `language`, with
 *    voices mentioning "Natural" sorted first;
 * 3. otherwise the synthesizer's default voice.
 *
 * @param text Text to speak.
 * @param speed Speaking rate; inserted into the script with two decimals.
 * @param outputPath File that receives the synthesized bytes.
 * @param voiceName Optional preferred voice.
 * @param language Language prefix used when no preferred voice matches.
 * @param signal Optional abort signal forwarded to the PowerShell run.
 * @throws Whatever `runPowerShell` rejects with.
 */
export async function synthesizeWindowsMediaSpeechToWaveFile(
text: string,
speed: number,
outputPath: string,
voiceName?: string,
language = "ko",
signal?: AbortSignal,
): Promise<void> {
const script = [
windowsMediaPreamble(),
// Caller-supplied values are embedded as escaped single-quoted literals.
`$text = '${escapePowerShellSingleQuoted(text)}';`,
`$outputPath = '${escapePowerShellSingleQuoted(outputPath)}';`,
`$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
`$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
`$speakingRate = ${speed.toFixed(2)};`,
"$synth = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::new();",
"try {",
" $voices = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::AllVoices;",
" $selected = $null;",
// Step 1: preferred voice by exact or substring match.
" if ($preferredVoice) {",
" $selected = $voices | Where-Object {",
" $_.DisplayName -eq $preferredVoice -or $_.Description -eq $preferredVoice -or $_.Id -eq $preferredVoice -or $_.DisplayName -like ('*' + $preferredVoice + '*') -or $_.Description -like ('*' + $preferredVoice + '*')",
" } | Select-Object -First 1;",
" }",
// Step 2: first voice for the language, "Natural" voices first.
" if (-not $selected -and $preferredLanguage) {",
" $selected = $voices | Where-Object { $_.Language -like ($preferredLanguage + '*') } | Sort-Object @{Expression={ if ($_.DisplayName -match 'Natural' -or $_.Description -match 'Natural') { 0 } else { 1 } }}, Description | Select-Object -First 1;",
" }",
// Step 3: fall back to the default voice.
" if (-not $selected) { $selected = [Windows.Media.SpeechSynthesis.SpeechSynthesizer]::DefaultVoice }",
" if ($selected) { $synth.Voice = $selected }",
// A failure to set the rate is ignored by the script.
" try { $synth.Options.SpeakingRate = $speakingRate } catch {}",
" $stream = Await-WinRt ($synth.SynthesizeTextToStreamAsync($text));",
" try {",
// Read the whole synthesized stream into a byte array and write it out.
" $size = [uint32]$stream.Size;",
" $reader = [Windows.Storage.Streams.DataReader]::new($stream.GetInputStreamAt(0));",
" try {",
" $null = Await-WinRt ($reader.LoadAsync($size));",
" $bytes = New-Object byte[] ([int]$size);",
" $reader.ReadBytes($bytes);",
" [System.IO.File]::WriteAllBytes($outputPath, $bytes);",
" } finally { $reader.Dispose() }",
" } finally { $stream.Dispose() }",
"} finally { $synth.Dispose() }",
].join(" ");
await runPowerShell(script, signal);
}
/**
 * `TtsService` backed by `Windows.Media.SpeechSynthesis`, driven through
 * PowerShell. Each utterance is synthesized into a temporary WAV file that is
 * streamed to the caller and deleted on `dispose`.
 */
export class WindowsMediaTtsService implements TtsService {
  /**
   * Per-process counter mixed into temp file names. `Date.now()` alone gives
   * two calls in the same millisecond the same path, so they would overwrite
   * and delete each other's audio.
   */
  private static tempFileSequence = 0;

  /**
   * @param speed Speaking rate passed to the synthesizer.
   * @param voiceName Optional preferred voice.
   * @param language Language prefix used when no preferred voice matches.
   */
  constructor(
    private readonly speed: number,
    private readonly voiceName?: string,
    private readonly language = "ko",
  ) {}

  /** Lists the available voices once; this fails early if the PowerShell call does not work. */
  async warmup(): Promise<void> {
    await listWindowsMediaVoices();
  }

  /**
   * Synthesizes `text` into a temp WAV file and returns a read stream for it.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal forwarded to the synthesis.
   * @returns The stream, the temp file path and a `dispose` that deletes the file.
   * @throws The synthesis error, after a best-effort removal of the temp file.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const sequence = WindowsMediaTtsService.tempFileSequence++;
    const tempPath = path.join(
      os.tmpdir(),
      `realtime-voice-bot-wmtts-${Date.now()}-${process.pid}-${sequence}.wav`,
    );
    try {
      await synthesizeWindowsMediaSpeechToWaveFile(
        text,
        this.speed,
        tempPath,
        this.voiceName,
        this.language,
        signal,
      );
    } catch (error) {
      await unlink(tempPath).catch(() => null);
      throw error;
    }
    return {
      stream: createReadStream(tempPath),
      sourceFilePath: tempPath,
      dispose: () => {
        // Best effort: a file that is already gone is not an error.
        void unlink(tempPath).catch(() => null);
      },
    };
  }

  /** Nothing to release; present to satisfy the `TtsService` contract. */
  async destroy(): Promise<void> {
    return;
  }
}

View File

@@ -1,63 +0,0 @@
import { spawn } from "node:child_process";
/**
 * Captured output of a PowerShell run that exited with code 0.
 */
export interface PowerShellRunResult {
/** Everything the process wrote to stdout. */
stdout: string;
/** Everything the process wrote to stderr. */
stderr: string;
}
/**
 * Runs `script` in `powershell -NoProfile` and collects its output.
 *
 * The script is passed with `-EncodedCommand` (base64 of its UTF-16LE bytes),
 * so no shell quoting of the script text is involved.
 *
 * @param script PowerShell source to execute.
 * @param signal Optional abort signal. If it is already aborted, nothing is
 *   started; aborting later kills the process.
 * @returns Captured stdout and stderr when the process exits with code 0.
 * @throws Error `"powershell aborted"` when aborted; otherwise an error
 *   carrying stderr, stdout or the exit code on a non-zero exit, or the
 *   spawn error when the process cannot be started.
 */
export async function runPowerShell(script: string, signal?: AbortSignal): Promise<PowerShellRunResult> {
  const encodedCommand = Buffer.from(script, "utf16le").toString("base64");
  return await new Promise<PowerShellRunResult>((resolve, reject) => {
    // An "abort" listener added after the signal fired never runs, so check
    // up front instead of letting the script run to completion.
    if (signal?.aborted) {
      reject(new Error("powershell aborted"));
      return;
    }

    const child = spawn("powershell", ["-NoProfile", "-EncodedCommand", encodedCommand], {
      stdio: ["ignore", "pipe", "pipe"],
    });

    let stdout = "";
    let stderr = "";
    // Decode through the streams so multi-byte characters split across
    // chunks are not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    const onAbort = () => {
      if (!child.killed) {
        child.kill("SIGKILL");
      }
    };
    signal?.addEventListener("abort", onAbort, { once: true });
    const detachAbort = () => {
      signal?.removeEventListener("abort", onAbort);
    };

    // "close" fires only after the stdio streams have ended. "exit" can fire
    // while output is still buffered, which would truncate `stdout`.
    child.on("close", (code) => {
      detachAbort();
      if (signal?.aborted) {
        reject(new Error("powershell aborted"));
        return;
      }
      if (code === 0) {
        resolve({ stdout, stderr });
        return;
      }
      reject(new Error(stderr.trim() || stdout.trim() || `powershell exited with code ${code ?? "null"}`));
    });
    child.on("error", (error) => {
      detachAbort();
      reject(error);
    });
  });
}
/**
 * Parses `ConvertTo-Json` output into an array.
 *
 * Blank output and a JSON `null` both yield an empty array, a single JSON
 * value is wrapped in a one-element array, and a JSON array is returned
 * as is. Elements are not validated against `T`.
 *
 * @param stdout Raw stdout of the PowerShell run.
 * @returns The parsed items.
 * @throws SyntaxError when the non-blank output is not valid JSON.
 */
export function parsePowerShellJsonArray<T>(stdout: string): T[] {
  const trimmed = stdout.trim();
  if (!trimmed) {
    return [];
  }
  const parsed: unknown = JSON.parse(trimmed);
  // `ConvertTo-Json` prints `null` for a null input; that means "no items",
  // not a list holding one null.
  if (parsed === null) {
    return [];
  }
  return Array.isArray(parsed) ? (parsed as T[]) : [parsed as T];
}

View File

@@ -1,123 +0,0 @@
import { createReadStream } from "node:fs";
import { unlink } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import type { PreparedSpeechAudio, TtsService } from "./tts.js";
import { parsePowerShellJsonArray, runPowerShell } from "./windows-powershell.js";
/**
 * One installed `System.Speech` voice, as projected by the PowerShell script
 * in `listWindowsSystemVoices`.
 */
export interface WindowsSystemVoiceInfo {
/** `VoiceInfo.Name`. */
name: string;
/** `VoiceInfo.Culture.Name`. */
culture: string;
/** `VoiceInfo.Description`. */
description: string;
/** `VoiceInfo.Gender`, converted to a string. */
gender: string;
/** `Enabled` flag of the installed voice. */
enabled: boolean;
}
/**
 * Escapes `text` for embedding between single quotes in a PowerShell script.
 *
 * Line breaks (LF or CRLF) become a single space, and every character that
 * PowerShell accepts as a single-quote delimiter is doubled. That set is the
 * ASCII apostrophe plus the typographic quotes U+2018, U+2019, U+201A and
 * U+201B; leaving those unescaped would end the string early and let the
 * rest of the text run as script.
 *
 * @param text Arbitrary text (for example a model reply or a file path).
 * @returns Text that is safe to place inside `'…'`.
 */
function escapePowerShellSingleQuoted(text: string): string {
  return text.replace(/\r?\n/g, " ").replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
/**
 * Converts a speed multiplier into an integer `System.Speech` rate.
 *
 * A speed of 1 maps to rate 0 and each 0.125 of speed is one rate step; the
 * result is limited to the range -10 … 10.
 *
 * @param speed Speed multiplier (1 = normal).
 * @returns Integer rate between -10 and 10.
 */
function toSpeechRate(speed: number): number {
  const steps = Math.round((speed - 1) * 8);
  if (steps > 10) {
    return 10;
  }
  if (steps < -10) {
    return -10;
  }
  return steps;
}
/**
 * Lists the voices installed for `System.Speech` (SAPI).
 *
 * Runs a PowerShell script that projects every installed voice to a plain
 * object and prints the list as compact JSON, then parses that output.
 *
 * @param signal Optional abort signal forwarded to the PowerShell run.
 * @returns The parsed voice descriptions.
 */
export async function listWindowsSystemVoices(signal?: AbortSignal): Promise<WindowsSystemVoiceInfo[]> {
  const setup = [
    "$ErrorActionPreference = 'Stop';",
    "$ProgressPreference = 'SilentlyContinue';",
    "Add-Type -AssemblyName System.Speech;",
    "$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
  ];
  const projection = [
    "try {",
    " $voices = @($synth.GetInstalledVoices() | ForEach-Object {",
    " [PSCustomObject]@{",
    " name = $_.VoiceInfo.Name;",
    " culture = $_.VoiceInfo.Culture.Name;",
    " description = $_.VoiceInfo.Description;",
    " gender = [string]$_.VoiceInfo.Gender;",
    " enabled = [bool]$_.Enabled;",
    " }",
    " });",
    " ConvertTo-Json -InputObject $voices -Compress;",
    "} finally { $synth.Dispose() }",
  ];
  const command = [...setup, ...projection].join(" ");
  const result = await runPowerShell(command, signal);
  return parsePowerShellJsonArray<WindowsSystemVoiceInfo>(result.stdout);
}
/**
 * Synthesizes `text` with `System.Speech` (SAPI) into a WAV file at
 * `outputPath` by running a PowerShell script.
 *
 * Voice selection inside the script, in order:
 * 1. a voice whose name or description equals or contains `voiceName`;
 * 2. otherwise the first voice whose culture name starts with `language`;
 * 3. otherwise the synthesizer keeps its current voice.
 *
 * @param text Text to speak.
 * @param speed Speed multiplier, converted with `toSpeechRate`.
 * @param outputPath WAV file to write.
 * @param voiceName Optional preferred voice.
 * @param language Culture prefix used when no preferred voice matches.
 * @param signal Optional abort signal forwarded to the PowerShell run.
 * @throws Whatever `runPowerShell` rejects with.
 */
export async function synthesizeWindowsSpeechToWaveFile(
text: string,
speed: number,
outputPath: string,
voiceName?: string,
language = "ko",
signal?: AbortSignal,
): Promise<void> {
const rate = toSpeechRate(speed);
const script = [
"$ErrorActionPreference = 'Stop';",
"$ProgressPreference = 'SilentlyContinue';",
"Add-Type -AssemblyName System.Speech;",
"$synth = New-Object System.Speech.Synthesis.SpeechSynthesizer;",
// Caller-supplied values are embedded as escaped single-quoted literals.
`$preferredVoice = '${escapePowerShellSingleQuoted(voiceName ?? "")}';`,
`$preferredLanguage = '${escapePowerShellSingleQuoted(language)}';`,
"try {",
" $voices = $synth.GetInstalledVoices();",
" $selected = $null;",
// Step 1: preferred voice by exact or substring match.
" if ($preferredVoice) {",
" $selected = $voices | Where-Object {",
" $_.VoiceInfo.Name -eq $preferredVoice -or $_.VoiceInfo.Description -eq $preferredVoice -or $_.VoiceInfo.Name -like ('*' + $preferredVoice + '*') -or $_.VoiceInfo.Description -like ('*' + $preferredVoice + '*')",
" } | Select-Object -First 1;",
" }",
// Step 2: first voice for the culture.
" if (-not $selected -and $preferredLanguage) {",
" $selected = $voices | Where-Object { $_.VoiceInfo.Culture.Name -like ($preferredLanguage + '*') } | Select-Object -First 1;",
" }",
" if ($selected) { $synth.SelectVoice($selected.VoiceInfo.Name) }",
`$synth.Rate = ${rate};`,
`$synth.SetOutputToWaveFile('${escapePowerShellSingleQuoted(outputPath)}');`,
`$synth.Speak('${escapePowerShellSingleQuoted(text)}');`,
"} finally { $synth.Dispose() }",
].join(" ");
await runPowerShell(script, signal);
}
/**
 * `TtsService` backed by `System.Speech` (SAPI), driven through PowerShell.
 * Each utterance is synthesized into a temporary WAV file that is streamed
 * to the caller and deleted on `dispose`.
 */
export class WindowsSystemTtsService implements TtsService {
  /**
   * Per-process counter mixed into temp file names. `Date.now()` alone gives
   * two calls in the same millisecond the same path, so they would overwrite
   * and delete each other's audio.
   */
  private static tempFileSequence = 0;

  /**
   * @param speed Speed multiplier passed to the synthesis.
   * @param voiceName Optional preferred voice.
   * @param language Culture prefix used when no preferred voice matches.
   */
  constructor(
    private readonly speed: number,
    private readonly voiceName?: string,
    private readonly language = "ko",
  ) {}

  /** Lists the installed voices once; this fails early if the PowerShell call does not work. */
  async warmup(): Promise<void> {
    await listWindowsSystemVoices();
  }

  /**
   * Synthesizes `text` into a temp WAV file and returns a read stream for it.
   *
   * @param text Text to speak.
   * @param signal Optional abort signal forwarded to the synthesis.
   * @returns The stream, the temp file path and a `dispose` that deletes the file.
   * @throws The synthesis error, after a best-effort removal of the temp file.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
    const sequence = WindowsSystemTtsService.tempFileSequence++;
    const tempPath = path.join(
      os.tmpdir(),
      `realtime-voice-bot-tts-${Date.now()}-${process.pid}-${sequence}.wav`,
    );
    try {
      await synthesizeWindowsSpeechToWaveFile(text, this.speed, tempPath, this.voiceName, this.language, signal);
    } catch (error) {
      await unlink(tempPath).catch(() => null);
      throw error;
    }
    return {
      stream: createReadStream(tempPath),
      sourceFilePath: tempPath,
      dispose: () => {
        this.cleanupTempWave(tempPath);
      },
    };
  }

  /** Best-effort removal of a temp WAV file; failures are ignored. */
  private cleanupTempWave(filePath: string): void {
    void unlink(filePath).catch(() => null);
  }

  /** Nothing to release; present to satisfy the `TtsService` contract. */
  async destroy(): Promise<void> {
    return;
  }
}

View File

@@ -1,132 +0,0 @@
import { spawn } from "node:child_process";
import { existsSync } from "node:fs";
import { mkdir, rename, rm, writeFile } from "node:fs/promises";
import path from "node:path";
import { loadConfig } from "./config.js";
import {
  resolveLocalAiCachePath,
  resolveLocalAiTtsModelPath,
  resolveLocalAiTtsVoicesPath,
  resolveLocalAiVenvPath,
  resolvePythonLaunch,
  resolveVenvPythonPath,
} from "./python-runtime.js";
// Release-asset URLs of the Kokoro ONNX model and voices files. `main`
// downloads them with `ensureDownload` on non-Windows platforms.
const KOKORO_MODEL_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx";
const KOKORO_VOICES_URL =
"https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin";
/**
 * Runs a command with inherited stdio and waits for it to finish.
 *
 * @param command Executable to start (run through a shell on Windows).
 * @param args Arguments for the command.
 * @param extraEnv Variables layered on top of `process.env`.
 * @throws Error naming the command line and exit code on a non-zero exit, or
 *   the spawn error when the process cannot be started.
 */
async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise<void> {
  const env = { ...process.env, ...extraEnv };
  const shell = process.platform === "win32";
  await new Promise<void>((resolve, reject) => {
    const proc = spawn(command, args, { stdio: "inherit", shell, env });
    proc.on("error", reject);
    proc.on("exit", (code) => {
      if (code !== 0) {
        reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`));
        return;
      }
      resolve();
    });
  });
}
/**
 * Makes sure `pip` is usable with the given Python executable.
 *
 * Probes `python -m pip --version` silently; if the probe exits non-zero or
 * cannot be started, bootstraps pip with `python -m ensurepip --upgrade`.
 *
 * @param pythonBin Python executable to check.
 * @param env Environment for both the probe and the bootstrap.
 * @throws Whatever `run` rejects with when the bootstrap fails.
 */
async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise<void> {
  try {
    await new Promise<void>((resolve, reject) => {
      const probe = spawn(pythonBin, ["-m", "pip", "--version"], {
        stdio: "ignore",
        shell: process.platform === "win32",
        env,
      });
      probe.on("error", reject);
      probe.on("exit", (code) => {
        if (code !== 0) {
          reject(new Error("pip missing"));
          return;
        }
        resolve();
      });
    });
  } catch {
    // Any probe failure means pip is unavailable; bootstrap it.
    await run(pythonBin, ["-m", "ensurepip", "--upgrade"], env);
  }
}
/**
 * Downloads `url` to `filePath` unless the file already exists.
 *
 * The body is first written to `<filePath>.download` and renamed into place
 * only after the write succeeded. Writing straight to `filePath` would let an
 * interrupted run leave a truncated file that every later run accepts as
 * complete, because only the file's existence is checked.
 *
 * @param url Source URL.
 * @param filePath Destination path; parent directories are created.
 * @throws Error when the response status is not OK, or any fetch/file-system
 *   error (the partial file is removed first on a best-effort basis).
 */
async function ensureDownload(url: string, filePath: string): Promise<void> {
  if (existsSync(filePath)) {
    return;
  }
  await mkdir(path.dirname(filePath), { recursive: true });
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`다운로드 실패: ${url} (${response.status})`);
  }
  const bytes = Buffer.from(await response.arrayBuffer());
  const partialPath = `${filePath}.download`;
  try {
    await writeFile(partialPath, bytes);
    await rename(partialPath, filePath);
  } catch (error) {
    // Do not leave a half-written file behind; the original error is what matters.
    await rm(partialPath, { force: true }).catch(() => undefined);
    throw error;
  }
}
/**
 * Sets up the local AI Python environment:
 *
 * 1. create the cache directory;
 * 2. create the virtual environment if its Python executable is missing;
 * 3. make sure pip is available, then upgrade pip/setuptools/wheel and
 *    install the platform's requirements file;
 * 4. on non-Windows platforms, download the Kokoro TTS model and voices;
 * 5. print the next steps.
 */
async function main(): Promise<void> {
  const config = loadConfig();
  const onWindows = process.platform === "win32";

  const venvDir = resolveLocalAiVenvPath(config);
  const venvPythonBin = resolveVenvPythonPath(config);
  const cacheDir = resolveLocalAiCachePath(config);
  const modelFile = resolveLocalAiTtsModelPath(config);
  const voicesFile = resolveLocalAiTtsVoicesPath(config);
  const requirementsFile = onWindows ? "requirements-windows.txt" : "requirements.txt";
  const requirementsPath = path.resolve(process.cwd(), "python", requirementsFile);

  // Applied on top of `process.env` for every Python invocation below.
  const pythonEnv = {
    HF_HOME: cacheDir,
    TRANSFORMERS_CACHE: cacheDir,
    PYTHONIOENCODING: "utf-8",
    HF_HUB_DISABLE_SYMLINKS_WARNING: "1",
  };

  await mkdir(cacheDir, { recursive: true });

  if (!existsSync(venvPythonBin)) {
    // The venv does not exist yet, so it must be created with the base interpreter.
    const basePython = resolvePythonLaunch(config, { preferVenv: false });
    console.log(`기본 Python 확인: ${basePython.command} ${basePython.args.join(" ")}`.trim());
    console.log(`가상환경 생성: ${venvDir}`);
    await run(basePython.command, [...basePython.args, "-m", "venv", venvDir], pythonEnv);
  }

  await ensurePip(venvPythonBin, { ...process.env, ...pythonEnv });

  console.log("로컬 AI 의존성 설치를 시작합니다.");
  if (onWindows) {
    console.log("Windows GPU STT용 CUDA 런타임 패키지도 함께 확인합니다.");
  }
  const pipInstall = ["-m", "pip", "install"];
  await run(venvPythonBin, [...pipInstall, "--upgrade", "pip", "setuptools", "wheel"], pythonEnv);
  await run(venvPythonBin, [...pipInstall, "-r", requirementsPath], pythonEnv);

  if (!onWindows) {
    console.log("로컬 TTS 모델 파일을 확인합니다.");
    await ensureDownload(KOKORO_MODEL_URL, modelFile);
    await ensureDownload(KOKORO_VOICES_URL, voicesFile);
  }

  const closingLines = ["설치가 끝났습니다.", "다음 순서:", "1. bun run devices", "2. bun run start:local"];
  for (const line of closingLines) {
    console.log(line);
  }
}
// Script entry point: run the setup and, on failure, print the error message
// (or the stringified value for non-Error rejections) and exit with status 1.
void main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
});