feat: scaffold realtime Korean voice assistant bot

2026-04-30 02:29:18 +09:00
commit 9dee708b64
15 changed files with 1574 additions and 0 deletions
--- a/src/services/conversation.ts
+++ b/src/services/conversation.ts
@@ -0,0 +1,77 @@
+/** One entry in the rolling conversation history kept by `ConversationMemory`. */
+export interface ConversationTurn {
+  /** Which side of the dialogue produced this turn. */
+  role: "user" | "assistant";
+  /** Text content of the turn. */
+  text: string;
+  /** Speaker identifier; filled in for user turns, left unset for assistant turns. */
+  speakerId?: string;
+  /** Speaker display name; filled in for user turns, left unset for assistant turns. */
+  speakerName?: string;
+  /** `Date.now()` value (epoch milliseconds) captured when the turn was recorded. */
+  createdAt: number;
+}
+
+/** A single user utterance together with the identity of whoever said it. */
+export interface UserUtterance {
+  /** Identifier of the speaker; rendered as `speaker_id` in the prompt header. */
+  speakerId: string;
+  /** Display name of the speaker; rendered as `speaker_name` in the prompt header. */
+  speakerName: string;
+  /** Text of the utterance. */
+  text: string;
+}
+
+/**
+ * Bounded, in-memory log of the most recent conversation turns, plus a
+ * renderer that flattens that log into a plain-text prompt.
+ *
+ * Speaker labels are user-controlled (display names), so they are sanitised
+ * whenever they are rendered into a `[user ...]` header: a label containing a
+ * newline or a square bracket could otherwise close the header early and
+ * forge an `[assistant]` turn inside the prompt.
+ */
+export class ConversationMemory {
+  private readonly turns: ConversationTurn[] = [];
+
+  /**
+   * @param maxTurns Maximum number of turns retained; the oldest turns are
+   *   discarded first once the limit is exceeded.
+   */
+  constructor(private readonly maxTurns: number) {}
+
+  /** Records a user utterance (with speaker identity) as the newest turn. */
+  addUserTurn(utterance: UserUtterance): void {
+    this.turns.push({
+      role: "user",
+      text: utterance.text,
+      speakerId: utterance.speakerId,
+      speakerName: utterance.speakerName,
+      createdAt: Date.now(),
+    });
+    this.trim();
+  }
+
+  /** Records an assistant reply as the newest turn. */
+  addAssistantTurn(text: string): void {
+    this.turns.push({
+      role: "assistant",
+      text,
+      createdAt: Date.now(),
+    });
+    this.trim();
+  }
+
+  /** Drops every stored turn. */
+  clear(): void {
+    this.turns.length = 0;
+  }
+
+  /**
+   * Returns the stored turns, oldest first.
+   *
+   * Both the array and the turn objects are copies, so callers cannot alter
+   * the stored history through the returned value.
+   */
+  recentTurns(): ConversationTurn[] {
+    return this.turns.map((turn) => ({ ...turn }));
+  }
+
+  /**
+   * Renders the stored history followed by the utterance being answered.
+   *
+   * @param currentUtterance The utterance the model should respond to. It is
+   *   rendered under its own heading and is not added to the history here.
+   * @returns A prompt with a "recent conversation" section (or a placeholder
+   *   line when the history is empty) and a "current utterance" section.
+   */
+  buildPrompt(currentUtterance: UserUtterance): string {
+    const rendered = this.turns
+      .slice(-this.maxTurns)
+      .map((turn) =>
+        turn.role === "assistant"
+          ? `[assistant]\n${turn.text}`
+          : `${ConversationMemory.userHeader(turn.speakerId, turn.speakerName)}\n${turn.text}`,
+      )
+      .join("\n\n");
+
+    const historyBlock = rendered.length > 0 ? rendered : "(이전 대화 없음)";
+
+    return [
+      "최근 대화:",
+      historyBlock,
+      "",
+      "이번 발화:",
+      ConversationMemory.userHeader(currentUtterance.speakerId, currentUtterance.speakerName),
+      currentUtterance.text,
+    ].join("\n");
+  }
+
+  /** Formats the `[user ...]` header line shared by history turns and the current utterance. */
+  private static userHeader(speakerId: string | undefined, speakerName: string | undefined): string {
+    const id = ConversationMemory.sanitizeLabel(speakerId);
+    const name = ConversationMemory.sanitizeLabel(speakerName);
+    return `[user speaker_id=${id} speaker_name=${name}]`;
+  }
+
+  /**
+   * Makes a label safe to embed in a single-line bracketed header: square
+   * brackets become parentheses, whitespace runs (including newlines) collapse
+   * to one space, and a missing or blank label becomes "unknown".
+   */
+  private static sanitizeLabel(value: string | undefined): string {
+    const cleaned = (value ?? "")
+      .replace(/\[/g, "(")
+      .replace(/\]/g, ")")
+      .replace(/\s+/g, " ")
+      .trim();
+    return cleaned.length > 0 ? cleaned : "unknown";
+  }
+
+  /** Discards the oldest turns until at most `maxTurns` remain. */
+  private trim(): void {
+    const overflow = this.turns.length - this.maxTurns;
+    if (overflow > 0) {
+      this.turns.splice(0, overflow);
+    }
+  }
+}
--- a/src/services/elevenlabs-stt.ts
+++ b/src/services/elevenlabs-stt.ts
@@ -0,0 +1,124 @@
+import WebSocket from "ws";
+
+import type { AppConfig } from "../config.js";
+
+/** The fields of a realtime STT server message that this module reads; all are optional. */
+interface ElevenLabsMessage {
+  /** Discriminator the message handler switches on (e.g. "session_started"). */
+  message_type?: string;
+  /** Transcript text; read from committed-transcript messages. */
+  text?: string;
+  /** Error description; used as the rejection message when present. */
+  error?: string;
+}
+
+// Message types that end a transcription with "no transcript" (null) instead of a rejection.
+const NON_FATAL_ERROR_TYPES = new Set<string>(["insufficient_audio_activity"]);
+
+/**
+ * One-shot speech-to-text client for the ElevenLabs realtime WebSocket
+ * endpoint. Every call opens a fresh socket, uploads the whole clip as a
+ * single committed chunk, and settles on the first committed transcript.
+ */
+export class ElevenLabsSttService {
+  constructor(private readonly config: AppConfig) {}
+
+  /**
+   * Transcribes one audio clip.
+   *
+   * @param pcm16MonoAudio Raw audio bytes. They are declared to the API as
+   *   `pcm_16000` with `sample_rate: 16000`, so this presumably must be
+   *   16 kHz 16-bit mono PCM — confirm against the capture pipeline.
+   * @returns The trimmed transcript, or `null` when the input is empty, the
+   *   committed transcript is blank, the server reports a non-fatal condition,
+   *   the socket closes first, or 15 seconds pass without a result.
+   * @throws Rejects on a socket error, an unparseable server message, or any
+   *   typed server message that is neither a known event nor listed as
+   *   non-fatal.
+   */
+  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
+    if (pcm16MonoAudio.byteLength === 0) {
+      return null;
+    }
+
+    const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
+    url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL);
+    url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE);
+    url.searchParams.set("audio_format", "pcm_16000");
+    url.searchParams.set("commit_strategy", "manual");
+    url.searchParams.set("include_timestamps", "false");
+    url.searchParams.set("include_language_detection", "false");
+    url.searchParams.set("enable_logging", "false");
+
+    return await new Promise<string | null>((resolve, reject) => {
+      const socket = new WebSocket(url, {
+        headers: {
+          "xi-api-key": this.config.ELEVENLABS_API_KEY,
+        },
+      });
+
+      let settled = false;
+
+      // Settles the promise exactly once and tears the session down; later
+      // calls (late messages, the close event, the timer) are no-ops.
+      const finish = (result: string | null, error?: Error): void => {
+        if (settled) {
+          return;
+        }
+        settled = true;
+        clearTimeout(timer);
+        try {
+          socket.close();
+        } catch {
+          // Ignore close race.
+        }
+
+        if (error) {
+          reject(error);
+          return;
+        }
+        resolve(result);
+      };
+
+      // Upper bound on the whole exchange; expiry means "no transcript", not an error.
+      const timer = setTimeout(() => {
+        finish(null);
+      }, 15_000);
+
+      socket.on("message", (raw) => {
+        let message: ElevenLabsMessage;
+        try {
+          message = JSON.parse(raw.toString()) as ElevenLabsMessage;
+        } catch (error) {
+          finish(null, error instanceof Error ? error : new Error(String(error)));
+          return;
+        }
+
+        const type = message.message_type;
+        switch (type) {
+          case "session_started":
+            // The whole clip goes up as one chunk and is committed immediately.
+            socket.send(
+              JSON.stringify({
+                message_type: "input_audio_chunk",
+                audio_base_64: pcm16MonoAudio.toString("base64"),
+                commit: true,
+                sample_rate: 16000,
+              }),
+            );
+            return;
+          case "partial_transcript":
+            return;
+          case "committed_transcript":
+          case "committed_transcript_with_timestamps": {
+            // Only one commit is ever sent, so this is the final answer. A
+            // blank transcript resolves to null right away instead of leaving
+            // the caller waiting for the timeout.
+            const transcript = message.text?.trim() ?? "";
+            finish(transcript.length > 0 ? transcript : null);
+            return;
+          }
+          default:
+            // Messages without a type carry nothing actionable.
+            if (!type) {
+              return;
+            }
+
+            if (NON_FATAL_ERROR_TYPES.has(type)) {
+              finish(null);
+              return;
+            }
+
+            // Every other typed message is treated as a fatal error.
+            finish(null, new Error(message.error ?? `ElevenLabs STT error: ${type}`));
+        }
+      });
+
+      socket.on("error", (error) => {
+        finish(null, error);
+      });
+
+      socket.on("close", () => {
+        finish(null);
+      });
+    });
+  }
+}
--- a/src/services/elevenlabs-tts.ts
+++ b/src/services/elevenlabs-tts.ts
@@ -0,0 +1,83 @@
+import { Readable } from "node:stream";
+
+import ffmpegStatic from "ffmpeg-static";
+import prism from "prism-media";
+import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
+
+import type { AppConfig } from "../config.js";
+
+/** Result of `preparePlayback`: a playable resource plus its cleanup hook. */
+export interface PreparedSpeechPlayback {
+  /** Audio resource built on top of the ffmpeg transcoder output. */
+  resource: AudioResource;
+  /** Destroys the HTTP body stream and the ffmpeg transcoder behind `resource`. */
+  dispose: () => void;
+}
+
+/**
+ * Text-to-speech client: streams MP3 from the ElevenLabs streaming endpoint
+ * and transcodes it through ffmpeg into raw 48 kHz stereo s16le PCM wrapped
+ * in a `@discordjs/voice` audio resource.
+ */
+export class ElevenLabsTtsService {
+  constructor(private readonly config: AppConfig) {
+    // Publish the bundled ffmpeg binary through FFMPEG_PATH unless the
+    // environment already names one. NOTE(review): presumably this is what the
+    // transcoder resolves its binary from — confirm.
+    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
+    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
+      process.env.FFMPEG_PATH = resolvedFfmpegPath;
+    }
+  }
+
+  /**
+   * Requests speech for `text` and prepares it for playback.
+   *
+   * @param text Text to synthesise.
+   * @param signal Optional abort signal forwarded to the HTTP request.
+   * @returns The audio resource and a `dispose` callback that tears down the
+   *   download stream and the ffmpeg transcoder.
+   * @throws Error when the response is not OK or has no body; the message
+   *   carries the status code and, when available, the start of the response
+   *   body.
+   */
+  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
+    const voiceId = encodeURIComponent(this.config.ELEVENLABS_VOICE_ID);
+    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`);
+    url.searchParams.set("output_format", "mp3_44100_128");
+    url.searchParams.set("enable_logging", "false");
+
+    const response = await fetch(url, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        "xi-api-key": this.config.ELEVENLABS_API_KEY,
+      },
+      body: JSON.stringify({
+        text,
+        model_id: this.config.ELEVENLABS_TTS_MODEL,
+        language_code: this.config.BOT_DEFAULT_LANGUAGE,
+        voice_settings: {
+          stability: 0.35,
+          similarity_boost: 0.75,
+          speed: 1.05,
+        },
+      }),
+      signal,
+    });
+
+    if (!response.ok || !response.body) {
+      // Reading the body releases the connection and surfaces the API's own
+      // explanation; a failure while reading must not mask the status error.
+      const detail = (await response.text().catch(() => "")).trim();
+      const suffix = detail.length > 0 ? `: ${detail.slice(0, 200)}` : "";
+      throw new Error(`ElevenLabs TTS request failed with status ${response.status}${suffix}`);
+    }
+
+    const input = Readable.fromWeb(response.body as never);
+    const ffmpeg = new prism.FFmpeg({
+      args: [
+        "-analyzeduration",
+        "0",
+        "-loglevel",
+        "0",
+        "-i",
+        "pipe:0",
+        "-f",
+        "s16le",
+        "-ar",
+        "48000",
+        "-ac",
+        "2",
+        "pipe:1",
+      ],
+    });
+
+    // pipe() does not forward source errors. Without a listener, a network
+    // failure or an abort while the body is still streaming would raise an
+    // unhandled "error" event and leave ffmpeg waiting for input forever, so
+    // shut the transcoder down and let playback end.
+    input.once("error", () => {
+      ffmpeg.destroy();
+    });
+    // If the transcoder goes away first, stop downloading audio nobody will read.
+    ffmpeg.once("close", () => {
+      input.destroy();
+    });
+
+    input.pipe(ffmpeg);
+
+    const resource = createAudioResource(ffmpeg, {
+      inputType: StreamType.Raw,
+    });
+
+    return {
+      resource,
+      dispose: () => {
+        input.destroy();
+        ffmpeg.destroy();
+      },
+    };
+  }
+}
--- a/src/services/openai-llm.ts
+++ b/src/services/openai-llm.ts
@@ -0,0 +1,64 @@
+import OpenAI from "openai";
+
+import type { AppConfig } from "../config.js";
+import type { ConversationMemory, UserUtterance } from "./conversation.js";
+
+// Instruction text passed as `instructions` on every request: one line,
+// sentences separated by a single space.
+const ASSISTANT_INSTRUCTIONS =
+  "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다. " +
+  "답변은 짧고 실용적으로 한다. " +
+  "기본은 한 문장, 길어도 두 문장을 넘기지 않는다. " +
+  "말투는 자연스러운 한국어로 유지한다. " +
+  "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다. " +
+  "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다. " +
+  "목록, 마크다운, 코드블록은 쓰지 않는다.";
+
+/**
+ * Squeezes a model reply into something short enough to speak.
+ *
+ * Whitespace runs collapse to single spaces. A reply of at most 180 UTF-16
+ * units is returned as is; a longer one is cut down to its first two
+ * sentences and then, if still too long, clipped to 180 units.
+ *
+ * A sentence ends at a run of terminators (`. ! ?`, their CJK forms, or `…`)
+ * that is followed by whitespace or the end of the text, so decimals such as
+ * "3.14" stay intact and "?!" or "..." are kept whole.
+ *
+ * @param text Raw reply text.
+ * @returns The normalised reply, never longer than 180 UTF-16 units.
+ */
+function normalizeReply(text: string): string {
+  const maxChars = 180;
+
+  const compact = text.replace(/\s+/g, " ").trim();
+  if (compact.length <= maxChars) {
+    return compact;
+  }
+
+  const sentences = compact.match(/.+?(?:[.!?。！？…]+(?=\s|$)|$)/gu) ?? [compact];
+
+  // Each match after the first starts with its separating space; trim before
+  // re-joining so the result has exactly one space between sentences.
+  const leading = sentences
+    .slice(0, 2)
+    .map((sentence) => sentence.trim())
+    .join(" ");
+  if (leading.length <= maxChars) {
+    return leading;
+  }
+
+  let clipped = leading.slice(0, maxChars);
+  // Never end on half of a surrogate pair (e.g. a split emoji).
+  const lastUnit = clipped.charCodeAt(clipped.length - 1);
+  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+    clipped = clipped.slice(0, -1);
+  }
+  return clipped.trim();
+}
+
+/**
+ * Reply generator backed by the OpenAI Responses API. The conversation is
+ * sent as one rendered text prompt alongside the fixed assistant instructions.
+ */
+export class OpenAiLlmService {
+  private readonly client: OpenAI;
+
+  constructor(private readonly config: AppConfig) {
+    const apiKey = this.config.OPENAI_API_KEY;
+    this.client = new OpenAI({ apiKey });
+  }
+
+  /**
+   * Produces the assistant's reply to `utterance`.
+   *
+   * @param memory Conversation history used to render the prompt.
+   * @param utterance The utterance being answered.
+   * @returns The normalised reply text, or a fixed "please say that again"
+   *   sentence when the model returns no text.
+   */
+  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
+    const prompt = memory.buildPrompt(utterance);
+
+    const { output_text: rawText } = await this.client.responses.create({
+      model: this.config.OPENAI_MODEL,
+      instructions: ASSISTANT_INSTRUCTIONS,
+      input: [
+        {
+          role: "user",
+          content: [{ type: "input_text", text: prompt }],
+        },
+      ],
+      max_output_tokens: 120,
+    });
+
+    const reply = rawText?.trim();
+    return reply ? normalizeReply(reply) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
+  }
+}