diff --git a/src/audio/local-voice-session.ts b/src/audio/local-voice-session.ts index da0ce4a..e908e27 100644 --- a/src/audio/local-voice-session.ts +++ b/src/audio/local-voice-session.ts @@ -1,7 +1,11 @@ -import { spawn, type ChildProcessByStdio } from "node:child_process"; +import { spawn, type ChildProcess, type ChildProcessByStdio } from "node:child_process"; import { once } from "node:events"; +import { promises as fs } from "node:fs"; +import os from "node:os"; +import path from "node:path"; import type { Readable, Writable } from "node:stream"; +import ffmpegStatic from "ffmpeg-static"; import { RealTimeVAD } from "avr-vad"; import type { AssistantRuntimeConfig } from "../config.js"; @@ -32,7 +36,7 @@ export class LocalVoiceSession { private vad: RealTimeVAD | null = null; private recorder: ChildProcessByStdio | null = null; - private currentPlayer: ChildProcessByStdio | null = null; + private currentPlayer: ChildProcess | null = null; private currentAbortController: AbortController | null = null; private currentPlayback: PreparedSpeechAudio | null = null; private processing = Promise.resolve(); @@ -114,14 +118,19 @@ export class LocalVoiceSession { statusSummary(): string { return [ "모드: local", + `플랫폼: ${process.platform}`, `입력 source: ${this.options.config.LOCAL_AUDIO_SOURCE ?? "default"}`, - `출력 sink: ${this.options.config.LOCAL_AUDIO_SINK ?? "default"}`, + `출력 sink: ${this.describeSink()}`, `대기열: ${this.queue.length}`, `최근 대화 턴: ${this.memory.recentTurns().length}`, ].join("\n"); } private spawnRecorder(): ChildProcessByStdio { + if (process.platform === "win32") { + return this.spawnWindowsRecorder(); + } + const args = [ "--rate", "16000", @@ -147,6 +156,40 @@ export class LocalVoiceSession { }); } + private spawnWindowsRecorder(): ChildProcessByStdio { + const ffmpegPath = this.getFfmpegPath(); + const sourceName = this.options.config.LOCAL_AUDIO_SOURCE; + if (!sourceName) { + throw new Error("Windows 로컬 모드는 LOCAL_AUDIO_SOURCE 설정이 필요합니다. `bun run audio:devices` 로 이름을 확인해 주세요."); + } + + const args = [ + "-hide_banner", + "-loglevel", + "warning", + "-f", + "dshow", + "-i", + `audio=${sourceName}`, + "-ac", + "1", + "-ar", + "16000", + "-f", + "s16le", + "pipe:1", + ]; + + this.options.logger.info("Starting local recorder", { + source: sourceName, + backend: "ffmpeg-dshow", + }); + + return spawn(ffmpegPath, args, { + stdio: ["ignore", "pipe", "pipe"], + }); + } + private pushPcm16Chunk(chunk: Buffer): void { if (this.destroyed || !this.vad) { return; @@ -284,6 +327,11 @@ export class LocalVoiceSession { } private async playToSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise { + if (process.platform === "win32") { + await this.playToWindowsDefaultSink(playback, signal); + return; + } + const args = [ "--rate", "48000", @@ -336,4 +384,120 @@ export class LocalVoiceSession { throw new Error(`pw-play exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`); } } + + private async playToWindowsDefaultSink(playback: PreparedSpeechAudio, signal: AbortSignal): Promise { + const chunks: Buffer[] = []; + + await new Promise((resolve, reject) => { + playback.stream.on("data", (chunk: Buffer) => { + chunks.push(Buffer.from(chunk)); + }); + playback.stream.once("end", resolve); + playback.stream.once("error", reject); + signal.addEventListener( + "abort", + () => { + playback.stream.destroy(); + reject(new Error("playback aborted")); + }, + { once: true }, + ); + }).catch((error) => { + if (signal.aborted) { + return; + } + throw error; + }); + + if (signal.aborted) { + return; + } + + const pcm = Buffer.concat(chunks); + const wav = createWaveFileBuffer(pcm, 48000, 2, 16); + const tempPath = path.join(os.tmpdir(), `realtime-voice-bot-${Date.now()}.wav`); + await fs.writeFile(tempPath, wav); + + const psScript = [ + "Add-Type -AssemblyName System;", + `$player = New-Object System.Media.SoundPlayer('${tempPath.replace(/'/g, "''")}');`, + "$player.PlaySync();", + ].join(" "); + + const player = spawn("powershell", ["-NoProfile", "-Command", psScript], { + stdio: ["ignore", "ignore", "pipe"], + }); + this.currentPlayer = player; + + player.stderr.on("data", (chunk: Buffer) => { + const text = chunk.toString().trim(); + if (text.length > 0) { + this.options.logger.debug("[powershell-player]", text); + } + }); + + signal.addEventListener( + "abort", + () => { + if (!player.killed) { + player.kill("SIGKILL"); + } + }, + { once: true }, + ); + + const [code, playSignal] = (await once(player, "exit")) as [number | null, NodeJS.Signals | null]; + this.currentPlayer = null; + await fs.unlink(tempPath).catch(() => null); + + if (signal.aborted) { + return; + } + + if (code !== 0) { + throw new Error(`powershell playback exited with code=${code ?? "null"} signal=${playSignal ?? "null"}`); + } + } + + private getFfmpegPath(): string { + const ffmpegPath = ffmpegStatic as unknown as string | null; + if (!ffmpegPath) { + throw new Error("ffmpeg-static 경로를 찾지 못했습니다."); + } + return ffmpegPath; + } + + private describeSink(): string { + if (process.platform === "win32") { + return this.options.config.LOCAL_AUDIO_SINK ?? "system-default"; + } + return this.options.config.LOCAL_AUDIO_SINK ?? "default"; + } +} + +function createWaveFileBuffer( + pcm: Buffer, + sampleRate: number, + channels: number, + bitsPerSample: number, +): Buffer { + const header = Buffer.alloc(44); + const byteRate = sampleRate * channels * (bitsPerSample / 8); + const blockAlign = channels * (bitsPerSample / 8); + + header.write("RIFF", 0, 4, "ascii"); + header.writeUInt32LE(36 + pcm.length, 4); + header.write("WAVE", 8, 4, "ascii"); + header.write("fmt ", 12, 4, "ascii"); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(channels, 22); + header.writeUInt32LE(sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(bitsPerSample, 34); + header.write("data", 36, 4, "ascii"); + header.writeUInt32LE(pcm.length, 40); + + return Buffer.concat([header, pcm]); } diff --git a/src/config.ts b/src/config.ts index 8cf45a0..98e2299 100644 --- a/src/config.ts +++ b/src/config.ts @@ -3,20 +3,28 @@ import { z } from "zod"; loadDotenv(); +const emptyToUndefined = z.preprocess((value) => { + if (typeof value !== "string") { + return value; + } + const trimmed = value.trim(); + return trimmed.length === 0 ? undefined : trimmed; +}, z.string().min(1).optional()); + const envSchema = z.object({ - DISCORD_BOT_TOKEN: z.string().min(1).optional(), - DISCORD_APPLICATION_ID: z.string().min(1).optional(), - DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(), - OPENAI_API_KEY: z.string().min(1).optional(), + DISCORD_BOT_TOKEN: emptyToUndefined, + DISCORD_APPLICATION_ID: emptyToUndefined, + DISCORD_COMMAND_GUILD_ID: emptyToUndefined, + OPENAI_API_KEY: emptyToUndefined, OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"), - ELEVENLABS_API_KEY: z.string().min(1).optional(), - ELEVENLABS_VOICE_ID: z.string().min(1).optional(), + ELEVENLABS_API_KEY: emptyToUndefined, + ELEVENLABS_VOICE_ID: emptyToUndefined, ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"), ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12), - LOCAL_AUDIO_SOURCE: z.string().min(1).optional(), - LOCAL_AUDIO_SINK: z.string().min(1).optional(), + LOCAL_AUDIO_SOURCE: emptyToUndefined, + LOCAL_AUDIO_SINK: emptyToUndefined, LOCAL_SPEAKER_NAME: z.string().min(1).default("local-user"), DEBUG_TEXT_EVENTS: z .string() diff --git a/src/local-main.ts b/src/local-main.ts index b4420c6..e28c929 100644 --- a/src/local-main.ts +++ b/src/local-main.ts @@ -1,6 +1,8 @@ import { spawn } from "node:child_process"; import process from "node:process"; +import ffmpegStatic from "ffmpeg-static"; + import type { AssistantRuntimeConfig } from "./config.js"; import { Logger } from "./logger.js"; import { LocalVoiceSession } from "./audio/local-voice-session.js"; @@ -8,7 +10,38 @@ import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; import { OpenAiLlmService } from "./services/openai-llm.js"; +function resolveFfmpegPath(): string { + const ffmpegPath = ffmpegStatic as unknown as string | null; + if (!ffmpegPath) { + throw new Error("ffmpeg-static 경로를 찾지 못했습니다."); + } + return ffmpegPath; +} + export async function printLocalAudioDevices(): Promise { + if (process.platform === "win32") { + const ffmpegPath = resolveFfmpegPath(); + + console.log("\n=== ffmpeg dshow audio devices ==="); + await new Promise((resolve) => { + const child = spawn( + ffmpegPath, + ["-hide_banner", "-list_devices", "true", "-f", "dshow", "-i", "dummy"], + { + stdio: ["ignore", "ignore", "inherit"], + }, + ); + child.on("exit", () => resolve()); + child.on("error", (error) => { + throw error; + }); + }); + + console.log("\n위 목록의 오디오 장치 이름을 `LOCAL_AUDIO_SOURCE` 에 그대로 넣으면 됩니다."); + console.log("Windows 로컬 모드는 현재 출력 장치 직접 선택 대신 시스템 기본 출력 장치를 사용합니다."); + return; + } + const runs = [ { label: "wpctl status", @@ -52,6 +85,9 @@ export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: console.log(session.statusSummary()); console.log("로컬 음성 테스트를 시작합니다. Ctrl+C 로 종료합니다."); + if (process.platform === "win32") { + console.log("Windows 로컬 모드는 현재 시스템 기본 출력 장치로 재생됩니다."); + } if (config.DEBUG_TEXT_EVENTS) { console.log("텍스트 로그 출력이 켜져 있습니다."); }