feat: scaffold realtime Korean voice assistant bot
This commit is contained in:
77
src/services/conversation.ts
Normal file
77
src/services/conversation.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
/**
 * A single entry in the rolling conversation history.
 */
export interface ConversationTurn {
  /** Which side of the conversation produced this turn. */
  role: "user" | "assistant";

  /** What was said in this turn. */
  text: string;

  /** Identifier of the speaker; `ConversationMemory` sets it for user turns only. */
  speakerId?: string;

  /** Display name of the speaker; `ConversationMemory` sets it for user turns only. */
  speakerName?: string;

  /** Timestamp of when the turn was recorded (`Date.now()` when stored by `ConversationMemory`). */
  createdAt: number;
}
|
||||
|
||||
/**
 * One transcribed utterance from a user, tagged with who said it.
 */
export interface UserUtterance {
  /** Identifier of the speaker; rendered as `speaker_id` in the prompt header. */
  speakerId: string;

  /** Display name of the speaker; rendered as `speaker_name` in the prompt header. */
  speakerName: string;

  /** The utterance text. */
  text: string;
}
|
||||
|
||||
/**
 * Bounded, in-memory conversation history that can render itself as an LLM prompt.
 *
 * The oldest turns are dropped whenever the number of stored turns exceeds
 * the `maxTurns` value given to the constructor.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  /**
   * @param maxTurns Maximum number of turns kept in memory.
   */
  constructor(private readonly maxTurns: number) {}

  /** Records something a user said, stamped with the current time. */
  addUserTurn(utterance: UserUtterance): void {
    const { text, speakerId, speakerName } = utterance;
    this.append({ role: "user", text, speakerId, speakerName, createdAt: Date.now() });
  }

  /** Records something the assistant said, stamped with the current time. */
  addAssistantTurn(text: string): void {
    this.append({ role: "assistant", text, createdAt: Date.now() });
  }

  /** Forgets every stored turn. */
  clear(): void {
    this.turns.length = 0;
  }

  /**
   * @returns A new array holding the stored turns, oldest first. Mutating the
   *   returned array does not affect the memory.
   */
  recentTurns(): ConversationTurn[] {
    return this.turns.slice();
  }

  /**
   * Renders the stored history followed by the utterance being answered.
   *
   * @param currentUtterance The utterance the model should respond to. It is
   *   rendered under its own heading and is not added to the history here.
   * @returns The prompt text, with sections separated by newlines.
   */
  buildPrompt(currentUtterance: UserUtterance): string {
    const rendered: string[] = [];
    for (const turn of this.turns.slice(-this.maxTurns)) {
      if (turn.role === "assistant") {
        rendered.push(`[assistant]\n${turn.text}`);
        continue;
      }
      rendered.push(
        `[user speaker_id=${turn.speakerId ?? "unknown"} speaker_name=${turn.speakerName ?? "unknown"}]\n${turn.text}`,
      );
    }

    const history = rendered.join("\n\n");
    const lines: string[] = ["최근 대화:"];
    // An empty history gets an explicit placeholder instead of a blank section.
    lines.push(history.length > 0 ? history : "(이전 대화 없음)");
    lines.push("");
    lines.push("이번 발화:");
    lines.push(`[user speaker_id=${currentUtterance.speakerId} speaker_name=${currentUtterance.speakerName}]`);
    lines.push(currentUtterance.text);
    return lines.join("\n");
  }

  /** Stores a turn, then evicts the oldest turns beyond `maxTurns`. */
  private append(turn: ConversationTurn): void {
    this.turns.push(turn);

    const excess = this.turns.length - this.maxTurns;
    if (excess > 0) {
      this.turns.splice(0, excess);
    }
  }
}
|
||||
124
src/services/elevenlabs-stt.ts
Normal file
124
src/services/elevenlabs-stt.ts
Normal file
@@ -0,0 +1,124 @@
|
||||
import WebSocket from "ws";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/**
 * The subset of a server-sent realtime STT message that this module reads.
 * Every field is optional because the payload is parsed from JSON without validation.
 */
interface ElevenLabsMessage {
  /** Discriminator used to route the message (e.g. `session_started`, `committed_transcript`). */
  message_type?: string;

  /** Transcript text carried by committed-transcript messages. */
  text?: string;

  /** Error description; used as the rejection message when present. */
  error?: string;
}
|
||||
|
||||
/**
 * Server message types that end a transcription without counting as a failure:
 * the request resolves with `null` instead of rejecting.
 */
const NON_FATAL_ERROR_TYPES = new Set<string>(["insufficient_audio_activity"]);
|
||||
|
||||
/**
 * One-shot speech-to-text client for the ElevenLabs realtime WebSocket endpoint.
 *
 * Every call opens its own socket, uploads the whole utterance as a single
 * manually committed chunk, and settles on the first non-empty committed
 * transcript.
 */
export class ElevenLabsSttService {
  constructor(private readonly config: AppConfig) {}

  /**
   * Transcribes one complete utterance.
   *
   * @param pcm16MonoAudio Audio bytes, sent unchanged. The request declares
   *   them as `pcm_16000` with `sample_rate: 16000`.
   * @returns The trimmed transcript, or `null` when the buffer is empty, the
   *   server reports a type listed in `NON_FATAL_ERROR_TYPES`, the socket
   *   closes first, or nothing usable arrives within 15 seconds.
   * @throws Error on a socket error, on a message that is not a JSON object,
   *   or on any message type that is neither handled below nor listed in
   *   `NON_FATAL_ERROR_TYPES`.
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL);
    url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
    url.searchParams.set("audio_format", "pcm_16000");
    url.searchParams.set("commit_strategy", "manual");
    url.searchParams.set("include_timestamps", "false");
    url.searchParams.set("include_language_detection", "false");
    url.searchParams.set("enable_logging", "false");

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(url, {
        headers: {
          "xi-api-key": this.config.ELEVENLABS_API_KEY,
        },
      });

      let settled = false;

      // Give up quietly if the server never produces a usable transcript.
      const timeout = setTimeout(() => {
        finish(null);
      }, 15_000);

      // Single exit point: settles the promise at most once and releases the
      // timer and the socket.
      const finish = (result: string | null, error?: Error): void => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeout);
        try {
          socket.close();
        } catch {
          // Ignore close race.
        }

        if (error) {
          reject(error);
          return;
        }
        resolve(result);
      };

      socket.on("message", (raw) => {
        // Frames that arrive after the request settled carry nothing we can use.
        if (settled) {
          return;
        }

        let message: ElevenLabsMessage;
        try {
          const parsed: unknown = JSON.parse(raw.toString());
          // `JSON.parse` also accepts `null` and primitives; reading fields off
          // `null` would throw outside this try block.
          if (typeof parsed !== "object" || parsed === null) {
            throw new Error("ElevenLabs STT sent a message that is not a JSON object");
          }
          message = parsed as ElevenLabsMessage;
        } catch (error) {
          finish(null, error instanceof Error ? error : new Error(String(error)));
          return;
        }

        const messageType = message.message_type;
        switch (messageType) {
          case "session_started":
            // The whole utterance goes up as one chunk and is committed at once.
            socket.send(
              JSON.stringify({
                message_type: "input_audio_chunk",
                audio_base_64: pcm16MonoAudio.toString("base64"),
                commit: true,
                sample_rate: 16000,
              }),
            );
            return;
          case "partial_transcript":
            return;
          case "committed_transcript":
          case "committed_transcript_with_timestamps": {
            const transcript = message.text?.trim() ?? "";
            // NOTE(review): an empty committed transcript keeps the request
            // open until the socket closes or the timeout fires. Confirm
            // whether the server can still send text after an empty commit;
            // if it cannot, resolving `null` here would avoid the wait.
            if (transcript.length > 0) {
              finish(transcript);
            }
            return;
          }
          default:
            break;
        }

        // Messages without a type are ignored. Every other unrecognised type
        // is treated as an error report from the server.
        if (!messageType) {
          return;
        }

        if (NON_FATAL_ERROR_TYPES.has(messageType)) {
          finish(null);
          return;
        }

        finish(null, new Error(message.error ?? `ElevenLabs STT error: ${messageType}`));
      });

      socket.on("error", (error) => {
        finish(null, error as Error);
      });

      socket.on("close", () => {
        finish(null);
      });
    });
  }
}
|
||||
83
src/services/elevenlabs-tts.ts
Normal file
83
src/services/elevenlabs-tts.ts
Normal file
@@ -0,0 +1,83 @@
|
||||
import { Readable } from "node:stream";
|
||||
|
||||
import ffmpegStatic from "ffmpeg-static";
|
||||
import prism from "prism-media";
|
||||
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/**
 * A synthesized reply that is ready to be played, paired with its cleanup hook.
 */
export interface PreparedSpeechPlayback {
  /** Audio resource wrapping the decoded speech stream. */
  resource: AudioResource;

  /** Destroys the streams that feed `resource`. */
  dispose: () => void;
}
|
||||
|
||||
/**
 * Text-to-speech client that streams ElevenLabs audio through FFmpeg and
 * exposes it as a `@discordjs/voice` audio resource.
 */
export class ElevenLabsTtsService {
  /**
   * Sets `process.env.FFMPEG_PATH` to the `ffmpeg-static` binary when that
   * package resolved a path and the variable is not already set.
   */
  constructor(private readonly config: AppConfig) {
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests streamed speech for `text` and wires it into a playable resource.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal, forwarded to the HTTP request.
   * @returns The audio resource plus a `dispose` callback that destroys the
   *   HTTP body stream and the FFmpeg transcoder.
   * @throws Error when the response is not OK or has no body. Failures from
   *   `fetch` itself (including aborts) propagate unchanged.
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Release the unread error body; failing to cancel must not mask the real error.
      try {
        await response.body?.cancel();
      } catch {
        // Best effort only.
      }
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    const input = Readable.fromWeb(response.body as never);

    // Transcode the MP3 stream to raw 48 kHz stereo signed 16-bit PCM.
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    // `pipe()` neither listens for source errors nor closes the destination
    // when the source fails. Without this listener an aborted or broken
    // download raises an unhandled 'error' event and leaves FFmpeg waiting
    // for input that never arrives.
    input.on("error", () => {
      ffmpeg.destroy();
    });

    input.pipe(ffmpeg);

    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });

    return {
      resource,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}
|
||||
64
src/services/openai-llm.ts
Normal file
64
src/services/openai-llm.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { ConversationMemory, UserUtterance } from "./conversation.js";
|
||||
|
||||
/** Individual system-prompt sentences, kept separate so each rule is easy to edit. */
const ASSISTANT_INSTRUCTION_SENTENCES: readonly string[] = [
  "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다.",
  "답변은 짧고 실용적으로 한다.",
  "기본은 한 문장, 길어도 두 문장을 넘기지 않는다.",
  "말투는 자연스러운 한국어로 유지한다.",
  "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다.",
  "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.",
  "목록, 마크다운, 코드블록은 쓰지 않는다.",
];

/** System instructions sent with every LLM request: the sentences above joined by single spaces. */
const ASSISTANT_INSTRUCTIONS = ASSISTANT_INSTRUCTION_SENTENCES.join(" ");
|
||||
|
||||
/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and trims the result.
 * If the cut lands between the two halves of a surrogate pair, the dangling
 * high surrogate is dropped so the output never ends in half a character.
 */
function truncateReply(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text.trim();
  }

  let cut = text.slice(0, maxLength);
  const lastUnit = cut.charCodeAt(cut.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut = cut.slice(0, -1);
  }
  return cut.trim();
}

/**
 * Squeezes a model reply into something short enough to speak.
 *
 * Whitespace runs collapse to single spaces. Replies of up to 180 characters
 * are returned as they are; longer ones are reduced to their first two
 * sentences (split on `.`, `!`, `?`) and then cut to 180 characters.
 *
 * @param text Raw reply text from the model.
 * @returns The normalized reply, never longer than 180 UTF-16 code units.
 */
function normalizeReply(text: string): string {
  const maxLength = 180;

  const compact = text.replace(/\s+/g, " ").trim();
  if (compact.length <= maxLength) {
    return compact;
  }

  const sentences = compact.match(/[^.!?]+[.!?]?/g);
  if (!sentences || sentences.length === 0) {
    return truncateReply(compact, maxLength);
  }

  // Each match keeps the space that followed the previous sentence. Trim
  // before re-joining, otherwise the result contains a double space.
  const firstTwo = sentences
    .slice(0, 2)
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0)
    .join(" ");

  return truncateReply(firstTwo, maxLength);
}
|
||||
|
||||
/**
 * Produces short spoken-style replies through the OpenAI Responses API.
 */
export class OpenAiLlmService {
  private readonly client: OpenAI;

  /** Creates the OpenAI client from the API key in `config`. */
  constructor(private readonly config: AppConfig) {
    const { OPENAI_API_KEY: apiKey } = this.config;
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Asks the model to answer `utterance` in the context of `memory`.
   *
   * @param memory Conversation history; only read here (via `buildPrompt`).
   * @param utterance The user utterance to answer.
   * @returns The normalized reply, or a fixed Korean "please repeat" message
   *   when the model returns no text.
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const prompt = memory.buildPrompt(utterance);

    const response = await this.client.responses.create({
      model: this.config.OPENAI_MODEL,
      instructions: ASSISTANT_INSTRUCTIONS,
      input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }],
      max_output_tokens: 120,
    });

    const replyText = response.output_text?.trim();
    return replyText ? normalizeReply(replyText) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
  }
}
|
||||
Reference in New Issue
Block a user