feat: add local audio test mode
This commit is contained in:
@@ -12,5 +12,8 @@ ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
|
||||
|
||||
BOT_DEFAULT_LANGUAGE=ko
|
||||
MAX_CONVERSATION_TURNS=12
|
||||
LOCAL_AUDIO_SOURCE=
|
||||
LOCAL_AUDIO_SINK=
|
||||
LOCAL_SPEAKER_NAME=local-user
|
||||
DEBUG_TEXT_EVENTS=false
|
||||
LOG_LEVEL=info
|
||||
|
||||
41
README.md
41
README.md
@@ -1,10 +1,11 @@
|
||||
# realtime_voice_bot
|
||||
|
||||
디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
|
||||
디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
|
||||
|
||||
## 현재 구현 범위
|
||||
|
||||
- Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
|
||||
- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력
|
||||
- `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
|
||||
- 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
|
||||
- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
|
||||
@@ -28,16 +29,25 @@
|
||||
|
||||
필수:
|
||||
|
||||
- `DISCORD_BOT_TOKEN`
|
||||
- `DISCORD_APPLICATION_ID`
|
||||
- `OPENAI_API_KEY`
|
||||
- `ELEVENLABS_API_KEY`
|
||||
- `ELEVENLABS_VOICE_ID`
|
||||
|
||||
Discord 모드에서만 필수:
|
||||
|
||||
- `DISCORD_BOT_TOKEN`
|
||||
- `DISCORD_APPLICATION_ID`
|
||||
|
||||
선택:
|
||||
|
||||
- `DISCORD_COMMAND_GUILD_ID`
|
||||
- 테스트 서버에만 slash command를 즉시 반영하려면 설정
|
||||
- `LOCAL_AUDIO_SOURCE`
|
||||
- `pw-record --target` 에 넣을 PipeWire source id 또는 node name
|
||||
- `LOCAL_AUDIO_SINK`
|
||||
- `pw-play --target` 에 넣을 PipeWire sink id 또는 node name
|
||||
- `LOCAL_SPEAKER_NAME`
|
||||
- 로컬 테스트에서 프롬프트에 넣을 화자 이름
|
||||
- `OPENAI_MODEL`
|
||||
- 기본값: `gpt-5.4-mini`
|
||||
- `ELEVENLABS_STT_MODEL`
|
||||
@@ -51,13 +61,24 @@
|
||||
|
||||
```bash
|
||||
bun install
|
||||
bun run start
|
||||
```
|
||||
|
||||
개발 모드:
|
||||
디스코드 모드:
|
||||
|
||||
```bash
|
||||
bun run dev
|
||||
bun run start:discord
|
||||
```
|
||||
|
||||
로컬 장치 목록:
|
||||
|
||||
```bash
|
||||
bun run audio:devices
|
||||
```
|
||||
|
||||
로컬 테스트 모드:
|
||||
|
||||
```bash
|
||||
bun run start:local
|
||||
```
|
||||
|
||||
타입 체크:
|
||||
@@ -74,9 +95,17 @@ bun run check
|
||||
4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
|
||||
5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
|
||||
|
||||
로컬 테스트:
|
||||
|
||||
1. `bun run audio:devices` 로 source/sink id 또는 이름 확인
|
||||
2. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK` 설정
|
||||
3. `bun run start:local`
|
||||
4. 마이크로 바로 말해서 응답 확인
|
||||
|
||||
## 설계 메모
|
||||
|
||||
- 입력은 유저별 병렬 처리
|
||||
- 출력은 길드 세션당 단일 큐
|
||||
- 로컬 모드는 단일 화자 입력 기준
|
||||
- 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
|
||||
- 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.
|
||||
|
||||
@@ -5,7 +5,10 @@
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "bun --watch src/index.ts",
|
||||
"start": "bun src/index.ts",
|
||||
"start": "bun src/index.ts discord",
|
||||
"start:discord": "bun src/index.ts discord",
|
||||
"start:local": "bun src/index.ts local",
|
||||
"audio:devices": "bun src/index.ts local-devices",
|
||||
"check": "tsc --noEmit",
|
||||
"build": "tsc -p tsconfig.json"
|
||||
},
|
||||
|
||||
@@ -8,8 +8,10 @@ import {
|
||||
NoSubscriberBehavior,
|
||||
VoiceConnectionStatus,
|
||||
createAudioPlayer,
|
||||
createAudioResource,
|
||||
entersState,
|
||||
joinVoiceChannel,
|
||||
StreamType,
|
||||
type AudioPlayer,
|
||||
type AudioReceiveStream,
|
||||
type VoiceConnection,
|
||||
@@ -21,7 +23,7 @@ import { Logger } from "../logger.js";
|
||||
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "../services/openai-llm.js";
|
||||
|
||||
interface GuildVoiceSessionOptions {
|
||||
@@ -167,7 +169,7 @@ export class GuildVoiceSession extends EventEmitter {
|
||||
|
||||
private draining = false;
|
||||
private currentAbortController: AbortController | null = null;
|
||||
private currentPlayback: PreparedSpeechPlayback | null = null;
|
||||
private currentPlayback: PreparedSpeechAudio | null = null;
|
||||
private textChannelId?: string;
|
||||
|
||||
private constructor(private readonly options: GuildVoiceSessionOptions) {
|
||||
@@ -415,7 +417,9 @@ export class GuildVoiceSession extends EventEmitter {
|
||||
}
|
||||
|
||||
try {
|
||||
const resource = this.currentPlayback.resource;
|
||||
const resource = createAudioResource(this.currentPlayback.stream, {
|
||||
inputType: StreamType.Raw,
|
||||
});
|
||||
this.player.play(resource);
|
||||
|
||||
await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null);
|
||||
|
||||
339
src/audio/local-voice-session.ts
Normal file
339
src/audio/local-voice-session.ts
Normal file
@@ -0,0 +1,339 @@
|
||||
import { spawn, type ChildProcessByStdio } from "node:child_process";
|
||||
import { once } from "node:events";
|
||||
import type { Readable, Writable } from "node:stream";
|
||||
|
||||
import { RealTimeVAD } from "avr-vad";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import { Logger } from "../logger.js";
|
||||
import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "../services/openai-llm.js";
|
||||
|
||||
/**
 * Dependencies injected into a {@link LocalVoiceSession}.
 *
 * The session does not construct any of these itself; the caller wires them up.
 */
interface LocalVoiceSessionOptions {
  /** Validated runtime configuration (audio targets, speaker name, turn limit, ...). */
  config: AssistantRuntimeConfig;
  /** Logger used for diagnostics of the recorder, player, VAD, STT, LLM and TTS steps. */
  logger: Logger;
  /** Speech-to-text service used to transcribe a finished utterance. */
  stt: ElevenLabsSttService;
  /** Text-to-speech service used to prepare audio for a reply. */
  tts: ElevenLabsTtsService;
  /** LLM service used to generate the assistant reply. */
  llm: OpenAiLlmService;
}
|
||||
|
||||
/** One pending piece of text waiting to be synthesized and played. */
interface SpeechJob {
  /** Text to synthesize. */
  text: string;
  /** Who queued the text: the LLM reply path ("assistant") or a direct `speakText` call ("manual"). */
  source: "assistant" | "manual";
}
|
||||
|
||||
/**
 * Single-speaker voice assistant session that uses the local machine's audio
 * devices instead of a Discord voice channel.
 *
 * Audio is captured by spawning `pw-record` (16 kHz mono s16 raw PCM on stdout),
 * segmented into utterances by a `RealTimeVAD`, transcribed, answered by the LLM,
 * synthesized, and played back by spawning `pw-play` (48 kHz stereo s16 raw PCM on stdin).
 * Detecting new speech interrupts any playback in progress and clears the queue.
 */
export class LocalVoiceSession {
  /** Sample rate requested from `pw-record` and configured on the VAD. */
  private static readonly SAMPLE_RATE = 16000;
  /** Number of samples handed to the VAD per frame. */
  private static readonly FRAME_SAMPLES = 1536;
  /** Utterances shorter than this many samples (0.25 s) are ignored. */
  private static readonly MIN_UTTERANCE_SAMPLES = 16000 * 0.25;

  private readonly memory: ConversationMemory;
  private readonly queue: SpeechJob[] = [];
  private readonly pendingSamples: number[] = [];

  private vad: RealTimeVAD | null = null;
  private recorder: ChildProcessByStdio<null, Readable, Readable> | null = null;
  /** Resolves once the recorder process has exited or failed to spawn. */
  private recorderClosed: Promise<void> = Promise.resolve();
  /** Trailing byte of an odd-length stdout chunk, kept so 16-bit samples stay aligned. */
  private carryByte: number | null = null;
  private currentPlayer: ChildProcessByStdio<Writable, null, Readable> | null = null;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechAudio | null = null;
  /** Promise chain that serializes VAD frame processing. */
  private processing: Promise<void> = Promise.resolve();
  private draining = false;
  private destroyed = false;

  constructor(private readonly options: LocalVoiceSessionOptions) {
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
  }

  /**
   * Creates the VAD and starts the `pw-record` process that feeds it.
   *
   * A recorder spawn failure (for example a missing `pw-record` binary) is logged
   * instead of surfacing as an unhandled `error` event.
   */
  async start(): Promise<void> {
    this.vad = await RealTimeVAD.new({
      model: "v5",
      sampleRate: LocalVoiceSession.SAMPLE_RATE,
      frameSamples: LocalVoiceSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        // Barge-in: the user started talking, so stop whatever is being played.
        this.interruptPlayback("local-barge-in");
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        this.handleSpeechEnd(audio).catch((error: unknown) => {
          this.options.logger.warn("Local utterance handling failed", error);
        });
      },
    });

    const recorder = this.spawnRecorder();
    this.recorder = recorder;

    this.recorderClosed = new Promise<void>((resolve) => {
      recorder.once("exit", (code, signal) => {
        if (!this.destroyed) {
          this.options.logger.warn("pw-record exited unexpectedly", { code, signal });
        }
        resolve();
      });
      // Without this listener a spawn failure would crash the process.
      recorder.on("error", (error) => {
        this.options.logger.error("pw-record failed", error);
        resolve();
      });
    });

    recorder.stdout.on("data", (chunk: Buffer) => {
      this.pushPcm16Chunk(chunk);
    });
    recorder.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-record]", text);
      }
    });
  }

  /**
   * Stops playback, terminates the recorder and releases the VAD.
   *
   * Safe to call when the recorder has already exited and safe to call more than once.
   */
  async destroy(): Promise<void> {
    this.destroyed = true;
    this.interruptPlayback("local-shutdown");

    const recorder = this.recorder;
    this.recorder = null;
    if (recorder) {
      // kill() is a harmless no-op when the process is already gone; waiting on
      // recorderClosed (instead of a fresh "exit" listener) cannot hang in that case.
      recorder.kill("SIGTERM");
      await this.recorderClosed;
    }

    this.pendingSamples.length = 0;
    this.carryByte = null;

    // Let an in-flight VAD frame finish before tearing the VAD down.
    await this.processing;

    const vad = this.vad;
    this.vad = null;
    if (vad) {
      await vad.destroy().catch((error: unknown) => {
        this.options.logger.warn("Local VAD destroy failed", error);
      });
    }
  }

  /** Forgets the conversation history and drops queued/ongoing playback. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("local-reset");
  }

  /**
   * Queues `text` for synthesis and playback.
   *
   * Resolves when the queue has been drained by this call, or immediately if
   * another drain loop is already running.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /** Returns a multi-line, human-readable summary of the session state. */
  statusSummary(): string {
    return [
      "모드: local",
      `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`,
      `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /** Spawns `pw-record` writing raw 16 kHz mono s16 PCM to its stdout. */
  private spawnRecorder(): ChildProcessByStdio<null, Readable, Readable> {
    const args = [
      "--rate",
      String(LocalVoiceSession.SAMPLE_RATE),
      "--channels",
      "1",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SOURCE) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SOURCE);
    }

    // "-" makes pw-record write to stdout instead of a file.
    args.push("-");

    this.options.logger.info("Starting local recorder", {
      source: this.options.config.LOCAL_AUDIO_SOURCE ?? "default",
    });

    return spawn("pw-record", args, {
      stdio: ["ignore", "pipe", "pipe"],
    });
  }

  /**
   * Decodes a raw little-endian s16 chunk into samples and feeds complete
   * frames to the VAD, one after another.
   *
   * Pipe chunks are not guaranteed to end on a sample boundary, so a trailing
   * odd byte is carried over to the next chunk rather than dropped.
   */
  private pushPcm16Chunk(chunk: Buffer): void {
    if (this.destroyed || !this.vad) {
      return;
    }

    let data = chunk;
    if (this.carryByte !== null) {
      data = Buffer.concat([Buffer.from([this.carryByte]), chunk]);
      this.carryByte = null;
    }

    const alignedLength = data.length - (data.length % 2);
    for (let offset = 0; offset < alignedLength; offset += 2) {
      this.pendingSamples.push(data.readInt16LE(offset));
    }
    if (alignedLength < data.length) {
      this.carryByte = data[alignedLength] ?? null;
    }

    while (true) {
      const frame = takeFrame(this.pendingSamples, LocalVoiceSession.FRAME_SAMPLES);
      if (!frame) {
        return;
      }

      const floatFrame = int16ArrayToFloat32(frame);
      // Chain onto the previous frame so frames are processed strictly in order.
      this.processing = this.processing
        .then(async () => {
          if (this.destroyed) {
            return;
          }
          await this.vad?.processAudio(floatFrame);
        })
        .catch((error: unknown) => {
          this.options.logger.warn("Local VAD processing failed", error);
        });
    }
  }

  /**
   * Handles one finished utterance: transcribe, record the user turn, generate
   * a reply, record the assistant turn, and queue the reply for playback.
   */
  private async handleSpeechEnd(audio: Float32Array): Promise<void> {
    if (this.destroyed || audio.length < LocalVoiceSession.MIN_UTTERANCE_SAMPLES) {
      return;
    }

    const utterance: UserUtterance = {
      speakerId: "local-user",
      speakerName: this.options.config.LOCAL_SPEAKER_NAME,
      text: "",
    };

    let transcript: string | null = null;
    try {
      transcript = await this.options.stt.transcribePcm16(float32ToPcm16Buffer(audio));
    } catch (error) {
      this.options.logger.warn("Local STT failed", error);
      return;
    }

    if (!transcript || transcript.trim().length === 0) {
      return;
    }

    utterance.text = transcript.trim();
    this.memory.addUserTurn(utterance);
    this.options.logger.info("Local transcript", utterance.text);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`\n[you] ${utterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, utterance);
    } catch (error) {
      this.options.logger.warn("Local LLM failed", error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      console.log(`[bot] ${reply}\n`);
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /**
   * Drops all queued jobs, aborts the job being prepared or played, disposes
   * its audio, and kills the running `pw-play` process if there is one.
   */
  private interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.currentPlayer) {
      this.options.logger.info("Interrupting local playback", reason);
    }

    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;

    if (this.currentPlayer && !this.currentPlayer.killed) {
      this.currentPlayer.kill("SIGKILL");
    }
    this.currentPlayer = null;
  }

  /**
   * Synthesizes and plays queued jobs one at a time until the queue is empty.
   *
   * Only one drain loop runs at a time; a call made while another loop is
   * active returns immediately and lets that loop pick up the new job.
   */
  private async drainQueue(): Promise<void> {
    if (this.draining || this.destroyed) {
      return;
    }

    this.draining = true;

    try {
      while (this.queue.length > 0 && !this.destroyed) {
        const job = this.queue.shift();
        if (!job) {
          continue;
        }

        const abortController = new AbortController();
        this.currentAbortController = abortController;

        let playback: PreparedSpeechAudio;
        try {
          playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local TTS synthesis failed", error);
          }
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          continue;
        }

        // An interrupt may have arrived while the audio was being prepared; in
        // that case the result must be discarded, not played.
        if (abortController.signal.aborted || this.destroyed) {
          playback.dispose();
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
          continue;
        }

        this.currentPlayback = playback;

        try {
          await this.playToSink(playback, abortController.signal);
        } catch (error) {
          if (!abortController.signal.aborted) {
            this.options.logger.warn("Local playback failed", error);
          }
        } finally {
          // interruptPlayback() already disposed and cleared it if it ran.
          if (this.currentPlayback === playback) {
            playback.dispose();
            this.currentPlayback = null;
          }
          if (this.currentAbortController === abortController) {
            this.currentAbortController = null;
          }
        }
      }
    } finally {
      this.draining = false;
    }
  }

  /**
   * Pipes prepared audio into a freshly spawned `pw-play` and waits for it to exit.
   *
   * Returns without error when `signal` is (or becomes) aborted.
   *
   * @throws Error when `pw-play` exits with a non-zero code without an abort,
   *   or when the process fails to spawn.
   */
  private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise<void> {
    // An "abort" listener added after the fact would never fire.
    if (signal.aborted) {
      return;
    }

    const args = [
      "--rate",
      "48000",
      "--channels",
      "2",
      "--format",
      "s16",
      "--raw",
    ];

    if (this.options.config.LOCAL_AUDIO_SINK) {
      args.push("--target", this.options.config.LOCAL_AUDIO_SINK);
    }

    // "-" makes pw-play read from stdin instead of a file.
    args.push("-");

    const player = spawn("pw-play", args, {
      stdio: ["pipe", "ignore", "pipe"],
    });
    this.currentPlayer = player;

    player.stderr.on("data", (chunk: Buffer) => {
      const text = chunk.toString().trim();
      if (text.length > 0) {
        this.options.logger.debug("[pw-play]", text);
      }
    });

    // Killing the player while data is still being written yields EPIPE on
    // stdin; without a listener that would be an unhandled stream error.
    player.stdin.on("error", (error) => {
      if (!signal.aborted) {
        this.options.logger.debug("[pw-play] stdin error", error);
      }
    });

    const onAbort = (): void => {
      playback.stream.destroy();
      if (!player.killed) {
        player.kill("SIGKILL");
      }
    };
    // pipe() does not end the destination when the source fails, so close
    // stdin explicitly to let pw-play finish instead of waiting forever.
    const onStreamError = (error: unknown): void => {
      if (!signal.aborted) {
        this.options.logger.warn("Local playback stream failed", error);
      }
      player.stdin.end();
    };

    signal.addEventListener("abort", onAbort, { once: true });
    playback.stream.once("error", onStreamError);

    try {
      playback.stream.pipe(player.stdin);

      const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null];

      if (signal.aborted) {
        return;
      }

      if (code !== 0) {
        throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`);
      }
    } finally {
      signal.removeEventListener("abort", onAbort);
      playback.stream.off("error", onStreamError);
      if (this.currentPlayer === player) {
        this.currentPlayer = null;
      }
    }
  }
}
|
||||
@@ -4,17 +4,20 @@ import { z } from "zod";
|
||||
loadDotenv();
|
||||
|
||||
const envSchema = z.object({
|
||||
DISCORD_BOT_TOKEN: z.string().min(1),
|
||||
DISCORD_APPLICATION_ID: z.string().min(1),
|
||||
DISCORD_BOT_TOKEN: z.string().min(1).optional(),
|
||||
DISCORD_APPLICATION_ID: z.string().min(1).optional(),
|
||||
DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
|
||||
OPENAI_API_KEY: z.string().min(1),
|
||||
OPENAI_API_KEY: z.string().min(1).optional(),
|
||||
OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
|
||||
ELEVENLABS_API_KEY: z.string().min(1),
|
||||
ELEVENLABS_VOICE_ID: z.string().min(1),
|
||||
ELEVENLABS_API_KEY: z.string().min(1).optional(),
|
||||
ELEVENLABS_VOICE_ID: z.string().min(1).optional(),
|
||||
ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
|
||||
ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
|
||||
BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
|
||||
MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
|
||||
LOCAL_AUDIO_SOURCE: z.string().min(1).optional(),
|
||||
LOCAL_AUDIO_SINK: z.string().min(1).optional(),
|
||||
LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"),
|
||||
DEBUG_TEXT_EVENTS: z
|
||||
.string()
|
||||
.optional()
|
||||
@@ -23,7 +26,41 @@ const envSchema = z.object({
|
||||
});
|
||||
|
||||
export type AppConfig = z.infer<typeof envSchema>;
|
||||
/**
 * {@link AppConfig} in which the credentials needed by the assistant pipeline
 * (LLM and ElevenLabs) are known to be present.
 */
export type AssistantRuntimeConfig = AppConfig & {
  OPENAI_API_KEY: string;
  ELEVENLABS_API_KEY: string;
  ELEVENLABS_VOICE_ID: string;
};
|
||||
/**
 * {@link AssistantRuntimeConfig} in which the Discord bot token and
 * application id are additionally known to be present.
 */
export type DiscordRuntimeConfig = AssistantRuntimeConfig & {
  DISCORD_BOT_TOKEN: string;
  DISCORD_APPLICATION_ID: string;
};
|
||||
|
||||
/**
 * Parses and validates the process environment against `envSchema`.
 *
 * Variables that are set to an empty string (for example `LOCAL_AUDIO_SOURCE=`
 * left blank in a copied `.env` template) are treated as unset, so optional
 * fields stay optional and defaults apply instead of failing the `min(1)` checks.
 *
 * @returns The validated configuration.
 * @throws The schema's validation error when a value is invalid.
 */
export function loadConfig(): AppConfig {
  const definedEnv = Object.fromEntries(
    Object.entries(process.env).filter(([, value]) => value !== undefined && value !== ""),
  );
  return envSchema.parse(definedEnv);
}
|
||||
|
||||
/**
 * Returns `value` when it is a non-empty string.
 *
 * @param value - The possibly missing configuration value.
 * @param name - Environment variable name used in the error message.
 * @returns The value, narrowed to `string`.
 * @throws Error when `value` is `undefined` or empty.
 */
function requirePresent(value: string | undefined, name: string): string {
  if (value) {
    return value;
  }
  throw new Error(`${name} 환경변수가 필요합니다.`);
}
|
||||
|
||||
/**
 * Narrows a parsed config to one that can run the assistant pipeline.
 *
 * Checks `OPENAI_API_KEY`, `ELEVENLABS_API_KEY` and `ELEVENLABS_VOICE_ID`, in
 * that order, and fails on the first one that is missing.
 *
 * @param config - The parsed application config.
 * @returns A copy of `config` with the three credentials guaranteed present.
 * @throws Error naming the first missing environment variable.
 */
export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig {
  const openAiApiKey = requirePresent(config.OPENAI_API_KEY, "OPENAI_API_KEY");
  const elevenLabsApiKey = requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY");
  const elevenLabsVoiceId = requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID");

  return {
    ...config,
    OPENAI_API_KEY: openAiApiKey,
    ELEVENLABS_API_KEY: elevenLabsApiKey,
    ELEVENLABS_VOICE_ID: elevenLabsVoiceId,
  };
}
|
||||
|
||||
/**
 * Narrows a parsed config to one that can run the Discord bot.
 *
 * The assistant credentials are validated first, then `DISCORD_BOT_TOKEN` and
 * `DISCORD_APPLICATION_ID`.
 *
 * @param config - The parsed application config.
 * @returns A copy of `config` with assistant and Discord credentials guaranteed present.
 * @throws Error naming the first missing environment variable.
 */
export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig {
  const assistantConfig = requireAssistantRuntimeConfig(config);
  const botToken = requirePresent(config.DISCORD_BOT_TOKEN, "DISCORD_BOT_TOKEN");
  const applicationId = requirePresent(config.DISCORD_APPLICATION_ID, "DISCORD_APPLICATION_ID");

  return {
    ...assistantConfig,
    DISCORD_BOT_TOKEN: botToken,
    DISCORD_APPLICATION_ID: applicationId,
  };
}
|
||||
|
||||
234
src/discord-main.ts
Normal file
234
src/discord-main.ts
Normal file
@@ -0,0 +1,234 @@
|
||||
import process from "node:process";
|
||||
|
||||
import {
|
||||
GatewayIntentBits,
|
||||
REST,
|
||||
Routes,
|
||||
SlashCommandBuilder,
|
||||
type ChatInputCommandInteraction,
|
||||
type Client,
|
||||
type GuildMember,
|
||||
type VoiceBasedChannel,
|
||||
} from "discord.js";
|
||||
import { Client as DiscordClient } from "discord.js";
|
||||
|
||||
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { type DiscordRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "./services/openai-llm.js";
|
||||
|
||||
/**
 * Runs the assistant as a Discord bot.
 *
 * Creates the Discord client and the shared STT/TTS/LLM services, registers
 * the slash commands (`/join`, `/leave`, `/status`, `/reset`, `/say`) once the
 * client is ready, dispatches incoming chat-input interactions, and installs
 * SIGINT/SIGTERM handlers that tear down all sessions before exiting.
 *
 * @param config - Config with Discord and assistant credentials present.
 * @param logger - Logger used for lifecycle and error reporting.
 * @returns Resolves once `client.login` has completed.
 */
export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise<void> {
  const commands = [
    new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
    new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
    new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
    new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
    new SlashCommandBuilder()
      .setName("say")
      .setDescription("텍스트를 바로 음성으로 읽습니다.")
      .addStringOption((option) =>
        option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
      ),
  ].map((command) => command.toJSON());

  const client = new DiscordClient({
    intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
  });

  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);
  // One voice session per guild, keyed by guild id.
  const sessions = new Map<string, GuildVoiceSession>();
  let shuttingDown = false;

  /** Returns the voice channel the invoking member is currently in, if any. */
  function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
    // `interaction.member` may be a raw API member object without a `voice`
    // accessor, so every step of the lookup is treated as optional.
    const member = interaction.member as Partial<Pick<GuildMember, "voice">> | null;
    return member?.voice?.channel ?? null;
  }

  /** Returns the active session for the interaction's guild, if any. */
  function findSession(interaction: ChatInputCommandInteraction): GuildVoiceSession | undefined {
    return interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  }

  /** Registers the slash commands for one guild when configured, otherwise globally. */
  async function registerCommands(_appClient: Client): Promise<void> {
    const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
    if (config.DISCORD_COMMAND_GUILD_ID) {
      await rest.put(
        Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
        {
          body: commands,
        },
      );
      logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
      return;
    }

    await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
      body: commands,
    });
    logger.info("Registered global commands");
  }

  /**
   * Returns the guild's session for the caller's voice channel, reusing an
   * existing session for the same channel and replacing one for a different channel.
   */
  async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
    if (!interaction.guild) {
      throw new Error("Guild interaction required");
    }

    const voiceChannel = getVoiceChannel(interaction);
    if (!voiceChannel) {
      throw new Error("먼저 음성 채널에 들어가 주세요.");
    }

    const existing = sessions.get(interaction.guild.id);
    if (existing && existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }

    if (existing) {
      // Unregister first so a failing destroy() cannot leave a dead session in the map.
      sessions.delete(interaction.guild.id);
      await existing.destroy();
    }

    const session = await GuildVoiceSession.create({
      client,
      config,
      logger,
      guild: interaction.guild,
      voiceChannel,
      textChannelId: interaction.channelId,
      stt,
      tts,
      llm,
    });
    sessions.set(interaction.guild.id, session);
    return session;
  }

  /** `/join`: start (or reuse) a session in the caller's voice channel. */
  async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });

    try {
      const session = await createSession(interaction);
      await interaction.editReply(
        `음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`,
      );
    } catch (error) {
      const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
      await interaction.editReply(message);
    }
  }

  /** `/leave`: destroy the guild's session. */
  async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
    const guildId = interaction.guild?.id;
    const session = guildId ? sessions.get(guildId) : undefined;
    if (!session || !guildId) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    // Tearing a session down can be slow; acknowledge first so the interaction
    // does not expire before the reply is sent.
    await interaction.deferReply({ ephemeral: true });
    sessions.delete(guildId);
    await session.destroy();
    await interaction.editReply("음성 세션을 종료했습니다.");
  }

  /** `/status`: report the session's status summary. */
  async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    await interaction.reply({
      content: session.statusSummary(),
      ephemeral: true,
    });
  }

  /** `/reset`: clear conversation context and the playback queue. */
  async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
    const session = findSession(interaction);
    if (!session) {
      await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
      return;
    }

    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
  }

  /** `/say`: speak the given text through the session. */
  async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
    await interaction.deferReply({ ephemeral: true });

    const session = findSession(interaction);
    if (!session) {
      await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
      return;
    }

    const text = interaction.options.getString("text", true).trim();
    if (text.length === 0) {
      // A whitespace-only option passes Discord's "required" check.
      await interaction.editReply("읽을 문장이 비어 있습니다.");
      return;
    }

    await session.speakText(text);
    await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  }

  /**
   * Destroys every session and the client, then exits the process.
   * Repeated calls (for example SIGINT followed by SIGTERM) are ignored.
   */
  async function shutdown(exitCode = 0): Promise<void> {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;

    logger.info("Shutting down");
    try {
      for (const session of sessions.values()) {
        await session.destroy().catch((error: unknown) => {
          logger.warn("Session shutdown failed", error);
        });
      }
      sessions.clear();
      await client.destroy();
    } catch (error) {
      logger.error("Shutdown failed", error);
    } finally {
      // Exit even if client teardown threw.
      process.exit(exitCode);
    }
  }

  client.once("ready", async () => {
    logger.info("Discord client ready", client.user?.tag ?? "unknown");
    try {
      await registerCommands(client);
    } catch (error) {
      logger.error("Command registration failed", error);
    }
  });

  client.on("interactionCreate", async (interaction) => {
    if (!interaction.isChatInputCommand()) {
      return;
    }

    try {
      switch (interaction.commandName) {
        case "join":
          await handleJoin(interaction);
          return;
        case "leave":
          await handleLeave(interaction);
          return;
        case "status":
          await handleStatus(interaction);
          return;
        case "reset":
          await handleReset(interaction);
          return;
        case "say":
          await handleSay(interaction);
          return;
        default:
          await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
      }
    } catch (error) {
      logger.error("Interaction handler failed", error);
      // An already acknowledged interaction must be edited, not replied to again.
      if (interaction.deferred || interaction.replied) {
        await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
        return;
      }
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  });

  process.on("SIGINT", () => {
    void shutdown(0);
  });

  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await client.login(config.DISCORD_BOT_TOKEN);
}
|
||||
243
src/index.ts
243
src/index.ts
@@ -1,237 +1,28 @@
|
||||
import process from "node:process";
|
||||
|
||||
import {
|
||||
GatewayIntentBits,
|
||||
REST,
|
||||
Routes,
|
||||
SlashCommandBuilder,
|
||||
type ChatInputCommandInteraction,
|
||||
type Client,
|
||||
type GuildMember,
|
||||
type VoiceBasedChannel,
|
||||
} from "discord.js";
|
||||
import { Client as DiscordClient } from "discord.js";
|
||||
|
||||
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { loadConfig } from "./config.js";
|
||||
import { loadConfig, requireAssistantRuntimeConfig, requireDiscordRuntimeConfig } from "./config.js";
|
||||
import { runDiscordBot } from "./discord-main.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "./services/openai-llm.js";
|
||||
import { printLocalAudioDevices, runLocalAssistant } from "./local-main.js";
|
||||
|
||||
const mode = process.argv[2] ?? "discord";
|
||||
const config = loadConfig();
|
||||
const logger = new Logger(config.LOG_LEVEL);
|
||||
|
||||
const commands = [
|
||||
new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
|
||||
new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
|
||||
new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
|
||||
new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
|
||||
new SlashCommandBuilder()
|
||||
.setName("say")
|
||||
.setDescription("텍스트를 바로 음성으로 읽습니다.")
|
||||
.addStringOption((option) =>
|
||||
option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
|
||||
),
|
||||
].map((command) => command.toJSON());
|
||||
|
||||
const client = new DiscordClient({
|
||||
intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
|
||||
});
|
||||
|
||||
const stt = new ElevenLabsSttService(config);
|
||||
const tts = new ElevenLabsTtsService(config);
|
||||
const llm = new OpenAiLlmService(config);
|
||||
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
|
||||
const member = interaction.member as GuildMember | null;
|
||||
return member?.voice.channel ?? null;
|
||||
}
|
||||
|
||||
async function registerCommands(appClient: Client): Promise<void> {
|
||||
const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
|
||||
if (config.DISCORD_COMMAND_GUILD_ID) {
|
||||
await rest.put(
|
||||
Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
|
||||
{
|
||||
body: commands,
|
||||
},
|
||||
);
|
||||
logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
|
||||
return;
|
||||
}
|
||||
|
||||
await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
|
||||
body: commands,
|
||||
});
|
||||
logger.info("Registered global commands");
|
||||
}
|
||||
|
||||
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
|
||||
if (!interaction.guild) {
|
||||
throw new Error("Guild interaction required");
|
||||
}
|
||||
|
||||
const voiceChannel = getVoiceChannel(interaction);
|
||||
if (!voiceChannel) {
|
||||
throw new Error("먼저 음성 채널에 들어가 주세요.");
|
||||
}
|
||||
|
||||
const existing = sessions.get(interaction.guild.id);
|
||||
if (existing && existing.voiceChannelId === voiceChannel.id) {
|
||||
existing.setTextChannel(interaction.channelId);
|
||||
return existing;
|
||||
}
|
||||
|
||||
if (existing) {
|
||||
await existing.destroy();
|
||||
sessions.delete(interaction.guild.id);
|
||||
}
|
||||
|
||||
const session = await GuildVoiceSession.create({
|
||||
client,
|
||||
config,
|
||||
logger,
|
||||
guild: interaction.guild,
|
||||
voiceChannel,
|
||||
textChannelId: interaction.channelId,
|
||||
stt,
|
||||
tts,
|
||||
llm,
|
||||
});
|
||||
sessions.set(interaction.guild.id, session);
|
||||
return session;
|
||||
}
|
||||
|
||||
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
|
||||
await interaction.deferReply({ ephemeral: true });
|
||||
|
||||
try {
|
||||
const session = await createSession(interaction);
|
||||
await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${session.statusSummary().split("\n")[1]?.replace("음성 채널: ", "") ?? "알 수 없음"}`);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
|
||||
await interaction.editReply(message);
|
||||
}
|
||||
}
|
||||
|
||||
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
|
||||
const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
|
||||
if (!session) {
|
||||
await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
|
||||
return;
|
||||
}
|
||||
|
||||
await session.destroy();
|
||||
sessions.delete(interaction.guildId!);
|
||||
await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
|
||||
}
|
||||
|
||||
/**
 * Handles `/status`: replies ephemerally with the active session's status
 * summary, or with a notice when the guild has no active session.
 *
 * @param interaction The `/status` slash-command interaction.
 */
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  const content = session ? session.statusSummary() : "현재 활성화된 음성 세션이 없습니다.";
  await interaction.reply({ content, ephemeral: true });
}
|
||||
|
||||
/**
 * Handles `/reset`: calls `clearConversation()` on the guild's active session
 * and confirms to the user, or reports that no session is active.
 *
 * @param interaction The `/reset` slash-command interaction.
 */
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const guild = interaction.guild;
  const session = guild ? sessions.get(guild.id) : undefined;

  let content = "현재 활성화된 음성 세션이 없습니다.";
  if (session) {
    session.clearConversation();
    content = "대화 문맥과 재생 큐를 초기화했습니다.";
  }

  await interaction.reply({ content, ephemeral: true });
}
|
||||
|
||||
/**
 * Handles `/say`: asks the guild's active session to speak the given text.
 *
 * The `text` option is trimmed; if nothing remains (e.g. the user sent only
 * whitespace) the request is rejected instead of handing an empty string to
 * `speakText`.
 *
 * @param interaction The `/say` slash-command interaction; must carry a
 *   required string option named `text`.
 */
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }

  const text = interaction.options.getString("text", true).trim();
  if (text.length === 0) {
    // Whitespace-only input: there is nothing to synthesize.
    await interaction.editReply("읽을 텍스트를 입력해 주세요.");
    return;
  }

  await session.speakText(text);
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
|
||||
|
||||
/**
 * Tears down every registered session and the Discord client, then exits the
 * process.
 *
 * Sessions are destroyed one after another; a failure in one session is
 * logged and does not stop the others. The process exit sits in a `finally`
 * so that a failure while destroying the client (or any other cleanup error)
 * can no longer leave the process running after a shutdown was requested.
 *
 * @param exitCode Exit status passed to `process.exit` (defaults to 0).
 */
async function shutdown(exitCode = 0): Promise<void> {
  logger.info("Shutting down");

  try {
    for (const session of sessions.values()) {
      await session.destroy().catch((error) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();

    await client.destroy();
  } catch (error) {
    logger.warn("Shutdown cleanup failed", error);
  } finally {
    process.exit(exitCode);
  }
}
|
||||
|
||||
// Runs once when the client emits "ready": logs the bot's tag and registers
// the slash commands. A registration failure is logged, not rethrown.
client.once("ready", async () => {
  const botTag = client.user?.tag ?? "unknown";
  logger.info("Discord client ready", botTag);

  try {
    await registerCommands(client);
  } catch (error) {
    logger.error("Command registration failed", error);
  }
});
|
||||
|
||||
// Maps each slash-command name to its handler.
const commandHandlers = new Map<string, (interaction: ChatInputCommandInteraction) => Promise<void>>([
  ["join", handleJoin],
  ["leave", handleLeave],
  ["status", handleStatus],
  ["reset", handleReset],
  ["say", handleSay],
]);

// Routes chat-input commands to their handlers. Unknown commands get an
// ephemeral notice. If a handler throws, the error is logged and the user is
// told about the failure: by editing the reply when the interaction was
// already deferred/replied, otherwise by sending a fresh ephemeral reply.
// Failures while delivering that notice are deliberately ignored.
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }

  try {
    const handler = commandHandlers.get(interaction.commandName);
    if (handler) {
      await handler(interaction);
    } else {
      await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
    }
  } catch (error) {
    logger.error("Interaction handler failed", error);

    const alreadyAcknowledged = interaction.deferred || interaction.replied;
    if (alreadyAcknowledged) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
    } else {
      await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
    }
  }
});
|
||||
|
||||
// Both termination signals trigger the same shutdown with exit code 0.
for (const signal of ["SIGINT", "SIGTERM"] as const) {
  process.on(signal, () => {
    void shutdown(0);
  });
}
|
||||
|
||||
/**
 * Entry point: dispatches to the runtime selected by `mode`.
 *
 * - `discord`: logs the Discord client in and runs the Discord bot.
 * - `local`: runs the local microphone/speaker assistant.
 * - `local-devices`: prints the local audio device listing.
 *
 * The Discord login happens only in the `discord` branch, so the local modes
 * no longer attempt a login with a bot token they do not need.
 *
 * @throws Error when `mode` is not one of the supported values.
 */
async function main(): Promise<void> {
  switch (mode) {
    case "discord":
      // NOTE(review): if runDiscordBot performs its own login, this call is
      // redundant and can be dropped — confirm against its implementation.
      await client.login(config.DISCORD_BOT_TOKEN);
      await runDiscordBot(requireDiscordRuntimeConfig(config), logger);
      return;
    case "local":
      await runLocalAssistant(requireAssistantRuntimeConfig(config), logger);
      return;
    case "local-devices":
      await printLocalAudioDevices();
      return;
    default:
      throw new Error(`알 수 없는 실행 모드입니다: ${mode}. 사용 가능: discord, local, local-devices`);
  }
}
|
||||
|
||||
void main().catch((error) => {
|
||||
|
||||
75
src/local-main.ts
Normal file
75
src/local-main.ts
Normal file
@@ -0,0 +1,75 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import process from "node:process";
|
||||
|
||||
import type { AssistantRuntimeConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { LocalVoiceSession } from "./audio/local-voice-session.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "./services/openai-llm.js";
|
||||
|
||||
/**
 * Runs `wpctl` with the given arguments, forwarding its stdout/stderr to this
 * process, and settles when the child exits.
 *
 * @param args Arguments passed to `wpctl`.
 * @throws Error when the process cannot be spawned, exits with a non-zero
 *   code, or is terminated by a signal (the signal name is included).
 */
function runWpctl(args: readonly string[]): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const child = spawn("wpctl", args, {
      stdio: ["ignore", "inherit", "inherit"],
    });

    // Spawn failures (e.g. the binary is missing) arrive via "error".
    child.on("error", reject);

    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      if (code === null) {
        // A null exit code means the child was ended by a signal.
        reject(new Error(`wpctl was terminated by signal ${signal ?? "unknown"}`));
        return;
      }
      reject(new Error(`wpctl exited with code ${code}`));
    });
  });
}

/**
 * Prints the local audio device listing by running `wpctl status` and then
 * `wpctl status -n`, each under a labelled header.
 *
 * The commands run sequentially; the first failure rejects and stops the
 * remaining runs.
 */
export async function printLocalAudioDevices(): Promise<void> {
  const runs = [
    { label: "wpctl status", args: ["status"] },
    { label: "wpctl status -n", args: ["status", "-n"] },
  ] as const;

  for (const run of runs) {
    console.log(`\n=== ${run.label} ===`);
    await runWpctl(run.args);
  }
}
|
||||
|
||||
/**
 * Runs the assistant against local audio devices instead of Discord.
 *
 * Builds the STT, TTS and LLM services from `config`, wires them into a
 * `LocalVoiceSession`, prints the session status, installs SIGINT/SIGTERM
 * handlers that destroy the session and exit, and then starts the session.
 *
 * The shutdown routine is guarded against re-entry, so pressing Ctrl+C
 * repeatedly (or receiving SIGINT and SIGTERM together) destroys the session
 * only once instead of running several teardowns concurrently.
 *
 * @param config Runtime configuration shared by the assistant services.
 * @param logger Logger used by the session and for shutdown warnings.
 */
export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise<void> {
  const stt = new ElevenLabsSttService(config);
  const tts = new ElevenLabsTtsService(config);
  const llm = new OpenAiLlmService(config);
  const session = new LocalVoiceSession({
    config,
    logger,
    stt,
    tts,
    llm,
  });

  console.log(session.statusSummary());
  console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다.");
  if (config.DEBUG_TEXT_EVENTS) {
    console.log("텍스트 로그 출력이 켜져 있습니다.");
  }

  let shuttingDown = false;
  const shutdown = async (exitCode = 0): Promise<void> => {
    if (shuttingDown) {
      // A teardown is already in flight; it will exit the process itself.
      return;
    }
    shuttingDown = true;

    await session.destroy().catch((error) => {
      logger.warn("Local session shutdown failed", error);
    });
    process.exit(exitCode);
  };

  process.on("SIGINT", () => {
    void shutdown(0);
  });

  process.on("SIGTERM", () => {
    void shutdown(0);
  });

  await session.start();
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import WebSocket from "ws";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
|
||||
interface ElevenLabsMessage {
|
||||
message_type?: string;
|
||||
@@ -13,7 +13,7 @@ const NON_FATAL_ERROR_TYPES = new Set([
|
||||
]);
|
||||
|
||||
export class ElevenLabsSttService {
|
||||
constructor(private readonly config: AppConfig) {}
|
||||
constructor(private readonly config: AssistantRuntimeConfig) {}
|
||||
|
||||
async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
|
||||
if (pcm16MonoAudio.byteLength === 0) {
|
||||
|
||||
@@ -2,24 +2,23 @@ import { Readable } from "node:stream";
|
||||
|
||||
import ffmpegStatic from "ffmpeg-static";
|
||||
import prism from "prism-media";
|
||||
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
|
||||
export interface PreparedSpeechPlayback {
|
||||
resource: AudioResource;
|
||||
export interface PreparedSpeechAudio {
|
||||
stream: Readable;
|
||||
dispose: () => void;
|
||||
}
|
||||
|
||||
export class ElevenLabsTtsService {
|
||||
constructor(private readonly config: AppConfig) {
|
||||
constructor(private readonly config: AssistantRuntimeConfig) {
|
||||
const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
|
||||
if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
|
||||
process.env.FFMPEG_PATH = resolvedFfmpegPath;
|
||||
}
|
||||
}
|
||||
|
||||
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
|
||||
async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechAudio> {
|
||||
const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
|
||||
url.searchParams.set("output_format", "mp3_44100_128");
|
||||
url.searchParams.set("enable_logging", "false");
|
||||
@@ -68,12 +67,8 @@ export class ElevenLabsTtsService {
|
||||
|
||||
input.pipe(ffmpeg);
|
||||
|
||||
const resource = createAudioResource(ffmpeg, {
|
||||
inputType: StreamType.Raw,
|
||||
});
|
||||
|
||||
return {
|
||||
resource,
|
||||
stream: ffmpeg,
|
||||
dispose: () => {
|
||||
input.destroy();
|
||||
ffmpeg.destroy();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { AssistantRuntimeConfig } from "../config.js";
|
||||
import type { ConversationMemory, UserUtterance } from "./conversation.js";
|
||||
|
||||
const ASSISTANT_INSTRUCTIONS = [
|
||||
@@ -30,7 +30,7 @@ function normalizeReply(text: string): string {
|
||||
export class OpenAiLlmService {
|
||||
private readonly client: OpenAI;
|
||||
|
||||
constructor(private readonly config: AppConfig) {
|
||||
constructor(private readonly config: AssistantRuntimeConfig) {
|
||||
this.client = new OpenAI({
|
||||
apiKey: this.config.OPENAI_API_KEY,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user