commit 9dee708b64fc41780a37afc4e31e718b2a6a5e81 Author: claude-bot Date: Thu Apr 30 02:29:18 2026 +0900 feat: scaffold realtime Korean voice assistant bot diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..81bfc03 --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +DISCORD_BOT_TOKEN= +DISCORD_APPLICATION_ID= +DISCORD_COMMAND_GUILD_ID= + +OPENAI_API_KEY= +OPENAI_MODEL=gpt-5.4-mini + +ELEVENLABS_API_KEY= +ELEVENLABS_VOICE_ID= +ELEVENLABS_STT_MODEL=scribe_v2_realtime +ELEVENLABS_TTS_MODEL=eleven_flash_v2_5 + +BOT_DEFAULT_LANGUAGE=ko +MAX_CONVERSATION_TURNS=12 +DEBUG_TEXT_EVENTS=false +LOG_LEVEL=info diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c97bbd --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +node_modules +dist +.env diff --git a/README.md b/README.md new file mode 100644 index 0000000..b25544e --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +# realtime_voice_bot + +디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다. + +## 현재 구현 범위 + +- Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say` +- `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신 +- 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리 +- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지 +- ElevenLabs Scribe Realtime WebSocket으로 발화 단위 STT +- OpenAI Responses API로 짧은 한국어 답변 생성 +- ElevenLabs Flash v2.5 스트리밍 TTS +- 채널 단위 단일 재생 큐 +- 사용자 발화 시작 시 현재 TTS와 대기열 중단(barge-in) + +## 권장 환경 + +- Bun `1.3+` +- Node.js `22.12+` +- Discord bot with Voice permissions +- ElevenLabs API key + 사용할 Voice ID +- OpenAI API key + +## 환경 변수 + +`.env.example`를 참고해서 `.env`를 채우면 됩니다. 
+ +필수: + +- `DISCORD_BOT_TOKEN` +- `DISCORD_APPLICATION_ID` +- `OPENAI_API_KEY` +- `ELEVENLABS_API_KEY` +- `ELEVENLABS_VOICE_ID` + +선택: + +- `DISCORD_COMMAND_GUILD_ID` + - 테스트 서버에만 slash command를 즉시 반영하려면 설정 +- `OPENAI_MODEL` + - 기본값: `gpt-5.4-mini` +- `ELEVENLABS_STT_MODEL` + - 기본값: `scribe_v2_realtime` +- `ELEVENLABS_TTS_MODEL` + - 기본값: `eleven_flash_v2_5` +- `DEBUG_TEXT_EVENTS` + - `true`면 명령을 실행한 텍스트 채널에 transcript/reply를 같이 올림 + +## 실행 + +```bash +bun install +bun run start +``` + +개발 모드: + +```bash +bun run dev +``` + +타입 체크: + +```bash +bun run check +``` + +## 사용 흐름 + +1. 봇을 서버에 초대하고 음성 권한을 부여합니다. +2. 음성 채널에 들어갑니다. +3. 텍스트 채널에서 `/join` 실행 +4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다. +5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다. + +## 설계 메모 + +- 입력은 유저별 병렬 처리 +- 출력은 길드 세션당 단일 큐 +- 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함 +- 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다. diff --git a/bun.lock b/bun.lock new file mode 100644 index 0000000..4dfaaa2 --- /dev/null +++ b/bun.lock @@ -0,0 +1,225 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "realtime_voice_bot", + "dependencies": { + "@discordjs/voice": "^0.19.2", + "avr-vad": "^1.0.10", + "discord.js": "^14.26.3", + "dotenv": "^17.4.2", + "ffmpeg-static": "^5.3.0", + "openai": "^6.35.0", + "opusscript": "^0.1.1", + "prism-media": "^1.3.5", + "ws": "^8.20.0", + "zod": "^4.3.6", + }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3", + }, + }, + }, + "trustedDependencies": [ + "onnxruntime-node", + ], + "packages": { + "@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="], + + "@discordjs/builders": ["@discordjs/builders@1.14.1", "", { "dependencies": { "@discordjs/formatters": "^0.6.2", 
"@discordjs/util": "^1.2.0", "@sapphire/shapeshift": "^4.0.0", "discord-api-types": "^0.38.40", "fast-deep-equal": "^3.1.3", "ts-mixer": "^6.0.4", "tslib": "^2.6.3" } }, "sha512-gSKkhXLqs96TCzk66VZuHHl8z2bQMJFGwrXC0f33ngK+FLNau4hU1PYny3DNJfNdSH+gVMzE85/d5FQ2BpcNwQ=="], + + "@discordjs/collection": ["@discordjs/collection@1.5.3", "", {}, "sha512-SVb428OMd3WO1paV3rm6tSjM4wC+Kecaa1EUGX7vc6/fddvw/6lg90z4QtCqm21zvVe92vMMDt9+DkIvjXImQQ=="], + + "@discordjs/formatters": ["@discordjs/formatters@0.6.2", "", { "dependencies": { "discord-api-types": "^0.38.33" } }, "sha512-y4UPwWhH6vChKRkGdMB4odasUbHOUwy7KL+OVwF86PvT6QVOwElx+TiI1/6kcmcEe+g5YRXJFiXSXUdabqZOvQ=="], + + "@discordjs/rest": ["@discordjs/rest@2.6.1", "", { "dependencies": { "@discordjs/collection": "^2.1.1", "@discordjs/util": "^1.2.0", "@sapphire/async-queue": "^1.5.3", "@sapphire/snowflake": "^3.5.5", "@vladfrangu/async_event_emitter": "^2.4.6", "discord-api-types": "^0.38.40", "magic-bytes.js": "^1.13.0", "tslib": "^2.6.3", "undici": "6.24.1" } }, "sha512-wwQdgjeaoYFiaG+atbqx6aJDpqW7JHAo0HrQkBTbYzM3/PJ3GweQIpgElNcGZ26DCUOXMyawYd0YF7vtr+fZXg=="], + + "@discordjs/util": ["@discordjs/util@1.2.0", "", { "dependencies": { "discord-api-types": "^0.38.33" } }, "sha512-3LKP7F2+atl9vJFhaBjn4nOaSWahZ/yWjOvA4e5pnXkt2qyXRCHLxoBQy81GFtLGCq7K9lPm9R517M1U+/90Qg=="], + + "@discordjs/voice": ["@discordjs/voice@0.19.2", "", { "dependencies": { "@snazzah/davey": "^0.1.9", "@types/ws": "^8.18.1", "discord-api-types": "^0.38.41", "prism-media": "^1.3.5", "tslib": "^2.8.1", "ws": "^8.19.0" } }, "sha512-3yJ255e4ag3wfZu/DSxeOZK1UtnqNxnspmLaQetGT0pDkThNZoHs+Zg6dgZZ19JEVomXygvfHn9lNpICZuYtEA=="], + + "@discordjs/ws": ["@discordjs/ws@1.2.3", "", { "dependencies": { "@discordjs/collection": "^2.1.0", "@discordjs/rest": "^2.5.1", "@discordjs/util": "^1.1.0", "@sapphire/async-queue": "^1.5.2", "@types/ws": "^8.5.10", "@vladfrangu/async_event_emitter": "^2.2.4", "discord-api-types": "^0.38.1", "tslib": "^2.6.2", "ws": "^8.17.0" } }, 
"sha512-wPlQDxEmlDg5IxhJPuxXr3Vy9AjYq5xCvFWGJyD7w7Np8ZGu+Mc+97LCoEc/+AYCo2IDpKioiH0/c/mj5ZR9Uw=="], + + "@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], + + "@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="], + + "@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w=="], + + "@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.4", "", { "dependencies": { "@tybys/wasm-util": "^0.10.1" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow=="], + + "@sapphire/async-queue": ["@sapphire/async-queue@1.5.5", "", {}, "sha512-cvGzxbba6sav2zZkH8GPf2oGk9yYoD5qrNWdu9fRehifgnFZJMV+nuy2nON2roRO4yQQ+v7MK/Pktl/HgfsUXg=="], + + "@sapphire/shapeshift": ["@sapphire/shapeshift@4.0.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "lodash": "^4.17.21" } }, "sha512-d9dUmWVA7MMiKobL3VpLF8P2aeanRTu6ypG2OIaEv/ZHH/SUQ2iHOVyi5wAPjQ+HmnMuL0whK9ez8I/raWbtIg=="], + + "@sapphire/snowflake": ["@sapphire/snowflake@3.5.3", "", {}, "sha512-jjmJywLAFoWeBi1W7994zZyiNWPIiqRRNAmSERxyg93xRGzNYvGjlZ0gR6x0F4gPRi2+0O6S71kOZYyr3cxaIQ=="], + + "@snazzah/davey": ["@snazzah/davey@0.1.11", "", { "optionalDependencies": { "@snazzah/davey-android-arm-eabi": "0.1.11", "@snazzah/davey-android-arm64": "0.1.11", "@snazzah/davey-darwin-arm64": "0.1.11", "@snazzah/davey-darwin-x64": "0.1.11", "@snazzah/davey-freebsd-x64": "0.1.11", "@snazzah/davey-linux-arm-gnueabihf": "0.1.11", "@snazzah/davey-linux-arm64-gnu": "0.1.11", 
"@snazzah/davey-linux-arm64-musl": "0.1.11", "@snazzah/davey-linux-x64-gnu": "0.1.11", "@snazzah/davey-linux-x64-musl": "0.1.11", "@snazzah/davey-wasm32-wasi": "0.1.11", "@snazzah/davey-win32-arm64-msvc": "0.1.11", "@snazzah/davey-win32-ia32-msvc": "0.1.11", "@snazzah/davey-win32-x64-msvc": "0.1.11" } }, "sha512-oBN+msHzPnm1M5DDx3wVD7iBwpNXFUtkh2MrAbUJu0OhKjliLChi28hq++mu1+qdMpAVQO5JKAvQQxYVbyneiw=="], + + "@snazzah/davey-android-arm-eabi": ["@snazzah/davey-android-arm-eabi@0.1.11", "", { "os": "android", "cpu": "arm" }, "sha512-T1RYbNYKN6tLOcGIDKJd8OI6FBSEemwL7DOYdTMmhqfhhMr3YVN8WOhfoxGg63OcnpTN2e2c5tdY2bAx25RmQQ=="], + + "@snazzah/davey-android-arm64": ["@snazzah/davey-android-arm64@0.1.11", "", { "os": "android", "cpu": "arm64" }, "sha512-ksJn/x2VU8h6w9eku1HT96ugSRZ7lKVkKNKbFleaFN+U99DJaPM+gMu2YvnFU4V54HR06ZBnRihnVG6VLXQpDw=="], + + "@snazzah/davey-darwin-arm64": ["@snazzah/davey-darwin-arm64@0.1.11", "", { "os": "darwin", "cpu": "arm64" }, "sha512-E1d7PbaaVMO3Lj9EiAPqOVbuV0xg5+PsHzHH097DDXiD1+zUDXvJaTnUWsnm5z50pJniHpi4GtaYmk+ieB/guA=="], + + "@snazzah/davey-darwin-x64": ["@snazzah/davey-darwin-x64@0.1.11", "", { "os": "darwin", "cpu": "x64" }, "sha512-Tl4TI/LTmgJZepgbgVMYDi8RqlAkPtPg1OEBPl7a9Tn3AwR36Vs6lyIT1cs/lGy/ds/+B+mKI4rPObN1cyILTw=="], + + "@snazzah/davey-freebsd-x64": ["@snazzah/davey-freebsd-x64@0.1.11", "", { "os": "freebsd", "cpu": "x64" }, "sha512-T8Iw9FXkuI1T+YBAFzh9v/TXf9IOTOSqnd/BFpTRTrlW72PR2lhIidzSmg027VxO7r5pX47iFwiOkb9I/NU/EA=="], + + "@snazzah/davey-linux-arm-gnueabihf": ["@snazzah/davey-linux-arm-gnueabihf@0.1.11", "", { "os": "linux", "cpu": "arm" }, "sha512-1Txj+8pqA8uq/OGtaUaBFWAPnNMQzFgIywj0iA7EI4xZl+mab48/pv+YZ1pNb/suC6ynsW44oB9efiXSdcUAgA=="], + + "@snazzah/davey-linux-arm64-gnu": ["@snazzah/davey-linux-arm64-gnu@0.1.11", "", { "os": "linux", "cpu": "arm64" }, "sha512-ERzF5nM/IYW1BcN3wLXpEwBCGLFf0kGJUVhaV6yfiInz0tkU8UmvrrgpaMaACfMjIhfWdq5CcX+aTkXo/saNcg=="], + + "@snazzah/davey-linux-arm64-musl": 
["@snazzah/davey-linux-arm64-musl@0.1.11", "", { "os": "linux", "cpu": "arm64" }, "sha512-e6pX6Hiabtz99q+H/YHNkm9JVlpqN8HGh0qPib8G2+UY4/SSH8WvqWipk3v581dMy2oyCHt7MOoY1aU1P1N/xA=="], + + "@snazzah/davey-linux-x64-gnu": ["@snazzah/davey-linux-x64-gnu@0.1.11", "", { "os": "linux", "cpu": "x64" }, "sha512-TW5bSoqChOJMbvsDb4wAATYrxmAXuNnse7wFNVSAJUaZKSeRfZbu3UAiPWSNn7GwLwSfU6hg322KZUn8IWCuvg=="], + + "@snazzah/davey-linux-x64-musl": ["@snazzah/davey-linux-x64-musl@0.1.11", "", { "os": "linux", "cpu": "x64" }, "sha512-5j6Pmc+Wzv5lSxVP6quA7teYRJXibkZqQyYGfTDnTsUOO5dPpcojpqlXlkhyvsA1OAQTj4uxbOCciN3cVWwzug=="], + + "@snazzah/davey-wasm32-wasi": ["@snazzah/davey-wasm32-wasi@0.1.11", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.2" }, "cpu": "none" }, "sha512-rKOwZ/0J8lp+4VEyOdMDBRP9KR+PksZpa9V1Qn0veMzy4FqTVKthkxwGqewheFe0SFg9fdvt798l/PBFrfDeZw=="], + + "@snazzah/davey-win32-arm64-msvc": ["@snazzah/davey-win32-arm64-msvc@0.1.11", "", { "os": "win32", "cpu": "arm64" }, "sha512-5fptJU4tX901m3mj0SHiBljMrPT4ZEsynbBhR7bK1yn9TY1jjyhN8EFi7QF5IWtUEni+0mia2BCMHZ5ZkmFZqQ=="], + + "@snazzah/davey-win32-ia32-msvc": ["@snazzah/davey-win32-ia32-msvc@0.1.11", "", { "os": "win32", "cpu": "ia32" }, "sha512-ualexn8SeLsiMHhWfzVrzRcjHgcBapg++FPaVgJJxoh2S/jCRiklXOu3luqIZdJdNKvhe2V9SwO/cImPeIIBKw=="], + + "@snazzah/davey-win32-x64-msvc": ["@snazzah/davey-win32-x64-msvc@0.1.11", "", { "os": "win32", "cpu": "x64" }, "sha512-muNhc8UKXtknzsH/w4AIkbPR2I8BuvApn0pDXar0IEvY8PCjqU/M8MPbOOEYwQVvQRMwVTgExtxzrkBPSXB4nA=="], + + "@tybys/wasm-util": ["@tybys/wasm-util@0.10.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg=="], + + "@types/node": ["@types/node@25.6.0", "", { "dependencies": { "undici-types": "~7.19.0" } }, "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ=="], + + "@types/ws": ["@types/ws@8.18.1", "", { "dependencies": { "@types/node": "*" } }, 
"sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg=="], + + "@vladfrangu/async_event_emitter": ["@vladfrangu/async_event_emitter@2.4.7", "", {}, "sha512-Xfe6rpCTxSxfbswi/W/Pz7zp1WWSNn4A0eW4mLkQUewCrXXtMj31lCg+iQyTkh/CkusZSq9eDflu7tjEDXUY6g=="], + + "adm-zip": ["adm-zip@0.5.17", "", {}, "sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ=="], + + "agent-base": ["agent-base@6.0.2", "", { "dependencies": { "debug": "4" } }, "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ=="], + + "avr-vad": ["avr-vad@1.0.10", "", { "dependencies": { "onnxruntime-node": "^1.22.0-rev" } }, "sha512-gM8SiQIebujfKMfy5w74tRPH+Fg78CMrBoDkMhCN3TmYVmmD8fmuVag7Q7ZCBITpFvYkOZnWEdGWuCb3YukBJw=="], + + "boolean": ["boolean@3.2.0", "", {}, "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw=="], + + "buffer-from": ["buffer-from@1.1.2", "", {}, "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="], + + "caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="], + + "concat-stream": ["concat-stream@2.0.0", "", { "dependencies": { "buffer-from": "^1.0.0", "inherits": "^2.0.3", "readable-stream": "^3.0.2", "typedarray": "^0.0.6" } }, "sha512-MWufYdFw53ccGjCA+Ol7XJYpAlW6/prSMzuPOTRnJGcGzuhLn4Scrz7qf6o8bROZ514ltazcIFJZevcfbo0x7A=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "define-data-property": ["define-data-property@1.1.4", "", { "dependencies": { "es-define-property": "^1.0.0", "es-errors": "^1.3.0", "gopd": "^1.0.1" } }, "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A=="], + + "define-properties": 
["define-properties@1.2.1", "", { "dependencies": { "define-data-property": "^1.0.1", "has-property-descriptors": "^1.0.0", "object-keys": "^1.1.1" } }, "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg=="], + + "detect-node": ["detect-node@2.1.0", "", {}, "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g=="], + + "discord-api-types": ["discord-api-types@0.38.47", "", {}, "sha512-XgXQodHQBAE6kfD7kMvVo30863iHX1LHSqNq6MGUTDwIFCCvHva13+rwxyxVXDqudyApMNAd32PGjgVETi5rjA=="], + + "discord.js": ["discord.js@14.26.3", "", { "dependencies": { "@discordjs/builders": "^1.14.1", "@discordjs/collection": "1.5.3", "@discordjs/formatters": "^0.6.2", "@discordjs/rest": "^2.6.1", "@discordjs/util": "^1.2.0", "@discordjs/ws": "^1.2.3", "@sapphire/snowflake": "3.5.3", "discord-api-types": "^0.38.40", "fast-deep-equal": "3.1.3", "lodash.snakecase": "4.1.1", "magic-bytes.js": "^1.13.0", "tslib": "^2.6.3", "undici": "6.24.1" } }, "sha512-XEKtYn28YFsiJ5l4fLRyikdbo6RD5oFyqfVHQlvXz2104JhH/E8slN28dbky05w3DCrJcNVWvhVvcJCTSl/KIg=="], + + "dotenv": ["dotenv@17.4.2", "", {}, "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw=="], + + "env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es6-error": ["es6-error@4.1.1", "", {}, "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg=="], + + "escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, 
"sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], + + "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], + + "ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="], + + "global-agent": ["global-agent@3.0.0", "", { "dependencies": { "boolean": "^3.0.1", "es6-error": "^4.1.1", "matcher": "^3.0.0", "roarr": "^2.15.3", "semver": "^7.3.2", "serialize-error": "^7.0.1" } }, "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q=="], + + "globalthis": ["globalthis@1.0.4", "", { "dependencies": { "define-properties": "^1.2.1", "gopd": "^1.0.1" } }, "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ=="], + + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "has-property-descriptors": ["has-property-descriptors@1.0.2", "", { "dependencies": { "es-define-property": "^1.0.0" } }, "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg=="], + + "http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="], + + "https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="], + + "inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="], + + 
"json-stringify-safe": ["json-stringify-safe@5.0.1", "", {}, "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="], + + "lodash": ["lodash@4.18.1", "", {}, "sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q=="], + + "lodash.snakecase": ["lodash.snakecase@4.1.1", "", {}, "sha512-QZ1d4xoBHYUeuouhEq3lk3Uq7ldgyFXGBhg04+oRLnIz8o9T65Eh+8YdroUwn846zchkA9yDsDl5CVVaV2nqYw=="], + + "magic-bytes.js": ["magic-bytes.js@1.13.0", "", {}, "sha512-afO2mnxW7GDTXMm5/AoN1WuOcdoKhtgXjIvHmobqTD1grNplhGdv3PFOyjCVmrnOZBIT/gD/koDKpYG+0mvHcg=="], + + "matcher": ["matcher@3.0.0", "", { "dependencies": { "escape-string-regexp": "^4.0.0" } }, "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "object-keys": ["object-keys@1.1.1", "", {}, "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA=="], + + "onnxruntime-common": ["onnxruntime-common@1.24.3", "", {}, "sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA=="], + + "onnxruntime-node": ["onnxruntime-node@1.24.3", "", { "dependencies": { "adm-zip": "^0.5.16", "global-agent": "^3.0.0", "onnxruntime-common": "1.24.3" }, "os": [ "linux", "win32", "darwin", ] }, "sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg=="], + + "openai": ["openai@6.35.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-L/skwIGnt5xQZHb0UfTu9uAUKbis3ehKypOuJKi20QvG7UStV6C8IC3myGYHcdiF4kms/bAvOJ9UqqNWqi8x/Q=="], + + "opusscript": ["opusscript@0.1.1", "", {}, "sha512-mL0fZZOUnXdZ78woRXp18lApwpp0lF5tozJOD1Wut0dgrA9WuQTgSels/CSmFleaAZrJi/nci5KOVtbuxeWoQA=="], + + "parse-cache-control": 
["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="], + + "prism-media": ["prism-media@1.3.5", "", { "peerDependencies": { "@discordjs/opus": ">=0.8.0 <1.0.0", "ffmpeg-static": "^5.0.2 || ^4.2.7 || ^3.0.0 || ^2.4.0", "node-opus": "^0.3.3", "opusscript": "^0.0.8" }, "optionalPeers": ["@discordjs/opus", "ffmpeg-static", "node-opus", "opusscript"] }, "sha512-IQdl0Q01m4LrkN1EGIE9lphov5Hy7WWlH6ulf5QdGePLlPas9p2mhgddTEHrlaXYjjFToM1/rWuwF37VF4taaA=="], + + "progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="], + + "readable-stream": ["readable-stream@3.6.2", "", { "dependencies": { "inherits": "^2.0.3", "string_decoder": "^1.1.1", "util-deprecate": "^1.0.1" } }, "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA=="], + + "roarr": ["roarr@2.15.4", "", { "dependencies": { "boolean": "^3.0.1", "detect-node": "^2.0.4", "globalthis": "^1.0.1", "json-stringify-safe": "^5.0.1", "semver-compare": "^1.0.0", "sprintf-js": "^1.1.2" } }, "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A=="], + + "safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="], + + "semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="], + + "semver-compare": ["semver-compare@1.0.0", "", {}, "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow=="], + + "serialize-error": ["serialize-error@7.0.1", "", { "dependencies": { "type-fest": "^0.13.1" } }, "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw=="], + + "sprintf-js": ["sprintf-js@1.1.3", "", {}, 
"sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA=="], + + "string_decoder": ["string_decoder@1.3.0", "", { "dependencies": { "safe-buffer": "~5.2.0" } }, "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA=="], + + "ts-mixer": ["ts-mixer@6.0.4", "", {}, "sha512-ufKpbmrugz5Aou4wcr5Wc1UUFWOLhq+Fm6qa6P0w0K5Qw2yhaUoiWszhCVuNQyNwrlGiscHOmqYoAox1PtvgjA=="], + + "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + + "type-fest": ["type-fest@0.13.1", "", {}, "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg=="], + + "typedarray": ["typedarray@0.0.6", "", {}, "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA=="], + + "typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="], + + "undici": ["undici@6.24.1", "", {}, "sha512-sC+b0tB1whOCzbtlx20fx3WgCXwkW627p4EA9uM+/tNNPkSS+eSEld6pAs9nDv7WbY1UUljBMYPtu9BCOrCWKA=="], + + "undici-types": ["undici-types@7.19.2", "", {}, "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg=="], + + "util-deprecate": ["util-deprecate@1.0.2", "", {}, "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="], + + "ws": ["ws@8.20.0", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "@discordjs/rest/@discordjs/collection": ["@discordjs/collection@2.1.1", "", 
{}, "sha512-LiSusze9Tc7qF03sLCujF5iZp7K+vRNEDBZ86FT9aQAv3vxMLihUvKvpsCWiQ2DJq1tVckopKm1rxomgNUc9hg=="], + + "@discordjs/rest/@sapphire/snowflake": ["@sapphire/snowflake@3.5.5", "", {}, "sha512-xzvBr1Q1c4lCe7i6sRnrofxeO1QTP/LKQ6A6qy0iB4x5yfiSfARMEQEghojzTNALDTcv8En04qYNIco9/K9eZQ=="], + + "@discordjs/ws/@discordjs/collection": ["@discordjs/collection@2.1.1", "", {}, "sha512-LiSusze9Tc7qF03sLCujF5iZp7K+vRNEDBZ86FT9aQAv3vxMLihUvKvpsCWiQ2DJq1tVckopKm1rxomgNUc9hg=="], + + "http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="], + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..2e95195 --- /dev/null +++ b/package.json @@ -0,0 +1,35 @@ +{ + "name": "realtime_voice_bot", + "version": "0.1.0", + "private": true, + "type": "module", + "scripts": { + "dev": "bun --watch src/index.ts", + "start": "bun src/index.ts", + "check": "tsc --noEmit", + "build": "tsc -p tsconfig.json" + }, + "engines": { + "bun": ">=1.3.0", + "node": ">=22.12.0" + }, + "dependencies": { + "@discordjs/voice": "^0.19.2", + "avr-vad": "^1.0.10", + "discord.js": "^14.26.3", + "dotenv": "^17.4.2", + "ffmpeg-static": "^5.3.0", + "openai": "^6.35.0", + "opusscript": "^0.1.1", + "prism-media": "^1.3.5", + "ws": "^8.20.0", + "zod": "^4.3.6" + }, + "devDependencies": { + "@types/node": "^25.6.0", + "typescript": "^6.0.3" + }, + "trustedDependencies": [ + "onnxruntime-node" + ] +} diff --git a/src/audio/guild-voice-session.ts b/src/audio/guild-voice-session.ts new file mode 100644 index 0000000..5644d49 --- /dev/null +++ b/src/audio/guild-voice-session.ts @@ -0,0 +1,452 @@ +import { EventEmitter } from "node:events"; + +import prism from "prism-media"; +import { RealTimeVAD } from "avr-vad"; +import { + AudioPlayerStatus, + EndBehaviorType, + NoSubscriberBehavior, + VoiceConnectionStatus, + createAudioPlayer, + entersState, + joinVoiceChannel, + type AudioPlayer, + type 
AudioReceiveStream, + type VoiceConnection, +} from "@discordjs/voice"; +import type { Client, Guild, VoiceBasedChannel } from "discord.js"; + +import type { AppConfig } from "../config.js"; +import { Logger } from "../logger.js"; +import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js"; +import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; +import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; +import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js"; +import { OpenAiLlmService } from "../services/openai-llm.js"; + +interface GuildVoiceSessionOptions { + client: Client; + config: AppConfig; + logger: Logger; + guild: Guild; + voiceChannel: VoiceBasedChannel; + textChannelId?: string; + stt: ElevenLabsSttService; + tts: ElevenLabsTtsService; + llm: OpenAiLlmService; +} + +interface SpeechJob { + text: string; + source: "assistant" | "manual"; +} + +class UserAudioSession { + private readonly downsampler = new Stereo48kToMono16kDownsampler(); + private readonly pendingSamples: number[] = []; + private readonly vad: RealTimeVAD; + private processing = Promise.resolve(); + + private constructor( + private readonly logger: Logger, + private readonly speakerId: string, + private readonly speakerName: string, + private readonly receiveStream: AudioReceiveStream, + private readonly decoder: NodeJS.ReadWriteStream & { destroy: () => void }, + vad: RealTimeVAD, + private readonly onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void, + ) { + this.vad = vad; + } + + static async create(options: { + logger: Logger; + speakerId: string; + speakerName: string; + receiveStream: AudioReceiveStream; + decoder: NodeJS.ReadWriteStream & { destroy: () => void }; + onSpeechStart: () => void; + onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void; + }): Promise { + const vadInstance = await RealTimeVAD.new({ + 
model: "v5", + sampleRate: 16000, + frameSamples: 1536, + positiveSpeechThreshold: 0.55, + negativeSpeechThreshold: 0.35, + redemptionFrames: 8, + preSpeechPadFrames: 2, + minSpeechFrames: 3, + onFrameProcessed: () => undefined, + onVADMisfire: () => undefined, + onSpeechStart: () => { + options.onSpeechStart(); + }, + onSpeechRealStart: () => undefined, + onSpeechEnd: (audio: Float32Array) => { + options.onSpeechEnd( + { + speakerId: options.speakerId, + speakerName: options.speakerName, + text: "", + }, + audio, + ); + }, + }); + + const session = new UserAudioSession( + options.logger, + options.speakerId, + options.speakerName, + options.receiveStream, + options.decoder, + vadInstance, + options.onSpeechEnd, + ); + + session.decoder.on("data", (chunk: Buffer) => { + session.pushPcmChunk(chunk); + }); + + session.decoder.on("error", (error) => { + options.logger.warn("PCM decoder error", options.speakerId, error); + }); + + session.receiveStream.on("error", (error) => { + options.logger.warn("Audio receive stream error", options.speakerId, error); + }); + + return session; + } + + private pushPcmChunk(chunk: Buffer): void { + const mono16k = this.downsampler.pushStereo48kChunk(chunk); + if (mono16k.length === 0) { + return; + } + + for (const sample of mono16k) { + this.pendingSamples.push(sample); + } + + while (true) { + const frame = takeFrame(this.pendingSamples, 1536); + if (!frame) { + return; + } + + const floatFrame = int16ArrayToFloat32(frame); + this.processing = this.processing + .then(() => this.vad.processAudio(floatFrame)) + .catch((error) => { + this.logger.warn("VAD frame processing failed", this.speakerId, this.speakerName, error); + }); + } + } + + destroy(): void { + this.receiveStream.destroy(); + this.decoder.destroy(); + void this.vad.destroy().catch((error) => { + this.logger.warn("VAD destroy failed", this.speakerId, this.speakerName, error); + }); + } +} + +export class GuildVoiceSession extends EventEmitter { + readonly guildId: string; 
+ readonly voiceChannelId: string; + + private readonly connection: VoiceConnection; + private readonly player: AudioPlayer; + private readonly memory: ConversationMemory; + private readonly trackedUsers = new Map(); + private readonly pendingUsers = new Map>(); + private readonly queue: SpeechJob[] = []; + + private draining = false; + private currentAbortController: AbortController | null = null; + private currentPlayback: PreparedSpeechPlayback | null = null; + private textChannelId?: string; + + private constructor(private readonly options: GuildVoiceSessionOptions) { + super(); + + this.guildId = options.guild.id; + this.voiceChannelId = options.voiceChannel.id; + this.textChannelId = options.textChannelId; + this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS); + this.player = createAudioPlayer({ + behaviors: { + noSubscriber: NoSubscriberBehavior.Pause, + }, + }); + this.connection = joinVoiceChannel({ + guildId: options.guild.id, + channelId: options.voiceChannel.id, + adapterCreator: options.guild.voiceAdapterCreator, + selfDeaf: false, + selfMute: false, + }); + } + + static async create(options: GuildVoiceSessionOptions): Promise { + const session = new GuildVoiceSession(options); + await session.initialize(); + return session; + } + + private async initialize(): Promise { + this.player.on("error", (error) => { + this.options.logger.warn("Audio player error", this.guildId, error); + }); + + this.connection.on("stateChange", (_oldState, newState) => { + if (newState.status === VoiceConnectionStatus.Destroyed) { + this.options.logger.info("Voice connection destroyed", this.guildId); + } + }); + + this.connection.subscribe(this.player); + await entersState(this.connection, VoiceConnectionStatus.Ready, 30_000); + + this.connection.receiver.speaking.on("start", (userId: string) => { + if (userId === this.options.client.user?.id) { + return; + } + + void this.ensureTrackedUser(userId); + }); + } + + setTextChannel(textChannelId?: 
string): void { + this.textChannelId = textChannelId; + } + + clearConversation(): void { + this.memory.clear(); + this.interruptPlayback("conversation-reset"); + } + + statusSummary(): string { + const playbackState = this.player.state.status; + return [ + `세션 활성: 예`, + `음성 채널: ${this.options.voiceChannel.name}`, + `추적 유저 수: ${this.trackedUsers.size}`, + `재생 상태: ${playbackState}`, + `대기열: ${this.queue.length}`, + `최근 대화 턴: ${this.memory.recentTurns().length}`, + ].join("\n"); + } + + async speakText(text: string): Promise { + this.queue.push({ + text, + source: "manual", + }); + await this.drainQueue(); + } + + interruptPlayback(reason: string): void { + if (this.queue.length > 0 || this.player.state.status !== AudioPlayerStatus.Idle) { + this.options.logger.info("Interrupting playback", this.guildId, reason); + } + + this.queue.splice(0, this.queue.length); + this.currentAbortController?.abort(); + this.currentAbortController = null; + this.currentPlayback?.dispose(); + this.currentPlayback = null; + this.player.stop(true); + } + + async destroy(): Promise { + this.interruptPlayback("session-destroy"); + for (const session of this.trackedUsers.values()) { + session.destroy(); + } + this.trackedUsers.clear(); + this.pendingUsers.clear(); + this.connection.destroy(); + } + + private async ensureTrackedUser(userId: string): Promise { + if (this.trackedUsers.has(userId)) { + return; + } + + const existing = this.pendingUsers.get(userId); + if (existing) { + await existing; + return; + } + + const pending = this.createTrackedUser(userId).finally(() => { + this.pendingUsers.delete(userId); + }); + this.pendingUsers.set(userId, pending); + await pending; + } + + private async createTrackedUser(userId: string): Promise { + const speakerName = await this.resolveSpeakerName(userId); + const receiveStream = this.connection.receiver.subscribe(userId, { + end: { + behavior: EndBehaviorType.Manual, + }, + }); + + const decoder = new prism.opus.Decoder({ + rate: 48000, + 
channels: 2, + frameSize: 960, + }) as NodeJS.ReadWriteStream & { destroy: () => void }; + + receiveStream.pipe(decoder); + + const session = await UserAudioSession.create({ + logger: this.options.logger, + speakerId: userId, + speakerName, + receiveStream, + decoder, + onSpeechStart: () => { + this.interruptPlayback(`barge-in:${speakerName}`); + }, + onSpeechEnd: (utterance, audio) => { + void this.handleSpeechEnd(utterance, audio); + }, + }); + + this.trackedUsers.set(userId, session); + this.options.logger.info("Tracking speaker", this.guildId, userId, speakerName); + } + + private async resolveSpeakerName(userId: string): Promise { + try { + const user = await this.options.client.users.fetch(userId); + return user.globalName ?? user.username; + } catch { + return `user-${userId.slice(-6)}`; + } + } + + private async handleSpeechEnd(utterance: UserUtterance, audio: Float32Array): Promise { + if (audio.length < 16000 * 0.25) { + return; + } + + const pcmBuffer = float32ToPcm16Buffer(audio); + let transcript: string | null = null; + + try { + transcript = await this.options.stt.transcribePcm16(pcmBuffer); + } catch (error) { + this.options.logger.warn("STT failed", this.guildId, utterance.speakerId, error); + await this.announce(`음성 인식 실패: ${utterance.speakerName}`); + return; + } + + if (!transcript || transcript.trim().length === 0) { + return; + } + + const hydratedUtterance: UserUtterance = { + ...utterance, + text: transcript.trim(), + }; + + this.options.logger.info("Transcript committed", this.guildId, hydratedUtterance.speakerName, hydratedUtterance.text); + this.memory.addUserTurn(hydratedUtterance); + + if (this.options.config.DEBUG_TEXT_EVENTS) { + await this.announce(`🗣️ ${hydratedUtterance.speakerName}: ${hydratedUtterance.text}`); + } + + let reply: string; + try { + reply = await this.options.llm.generateReply(this.memory, hydratedUtterance); + } catch (error) { + this.options.logger.warn("LLM failed", this.guildId, utterance.speakerId, error); + 
reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요."; + } + + this.memory.addAssistantTurn(reply); + if (this.options.config.DEBUG_TEXT_EVENTS) { + await this.announce(`🤖 ${reply}`); + } + + this.queue.push({ + text: reply, + source: "assistant", + }); + await this.drainQueue(); + } + + private async drainQueue(): Promise { + if (this.draining) { + return; + } + + this.draining = true; + + try { + while (this.queue.length > 0) { + const job = this.queue.shift(); + if (!job) { + continue; + } + + const abortController = new AbortController(); + this.currentAbortController = abortController; + + try { + this.currentPlayback = await this.options.tts.preparePlayback(job.text, abortController.signal); + } catch (error) { + if (abortController.signal.aborted) { + continue; + } + + this.options.logger.warn("TTS synthesis failed", this.guildId, job.source, error); + await this.announce("음성 출력 생성에 실패했습니다."); + continue; + } + + try { + const resource = this.currentPlayback.resource; + this.player.play(resource); + + await entersState(this.player, AudioPlayerStatus.Playing, 20_000).catch(() => null); + await entersState(this.player, AudioPlayerStatus.Idle, 300_000); + } catch (error) { + if (!abortController.signal.aborted) { + this.options.logger.warn("Audio playback failed", this.guildId, error); + } + } finally { + this.currentPlayback?.dispose(); + this.currentPlayback = null; + if (this.currentAbortController === abortController) { + this.currentAbortController = null; + } + } + } + } finally { + this.draining = false; + } + } + + private async announce(message: string): Promise { + if (!this.textChannelId) { + return; + } + + const channel = await this.options.client.channels.fetch(this.textChannelId).catch(() => null); + if (!channel?.isTextBased() || !("send" in channel) || typeof channel.send !== "function") { + return; + } + + await channel.send(message).catch(() => null); + } +} diff --git a/src/audio/pcm.ts b/src/audio/pcm.ts new file mode 100644 index 0000000..efdc468 
// --- /dev/null
// +++ b/src/audio/pcm.ts
// @@ -0,0 +1,60 @@
/**
 * Converts interleaved 48 kHz stereo PCM16 (little-endian) into 16 kHz mono.
 *
 * Each stereo frame is averaged to one mono sample, then every three mono
 * samples are averaged into one output sample. Mono samples that do not fill
 * a group of three are carried over to the next call.
 */
export class Stereo48kToMono16kDownsampler {
  private readonly pendingMono48k: number[] = [];

  /**
   * @param chunk Interleaved L/R int16 samples; trailing bytes that do not
   *   form a whole 4-byte stereo frame are ignored.
   * @returns The 16 kHz mono samples completed by this chunk (possibly empty).
   */
  pushStereo48kChunk(chunk: Buffer): Int16Array {
    if (chunk.length < 4) {
      return new Int16Array();
    }

    for (let offset = 0; offset + 3 < chunk.length; offset += 4) {
      const left = chunk.readInt16LE(offset);
      const right = chunk.readInt16LE(offset + 2);
      this.pendingMono48k.push(Math.round((left + right) / 2));
    }

    const outputLength = Math.floor(this.pendingMono48k.length / 3);
    if (outputLength === 0) {
      return new Int16Array();
    }

    const output = new Int16Array(outputLength);
    let readIndex = 0;
    for (let index = 0; index < outputLength; index += 1) {
      const a = this.pendingMono48k[readIndex];
      const b = this.pendingMono48k[readIndex + 1];
      const c = this.pendingMono48k[readIndex + 2];
      output[index] = Math.round((a + b + c) / 3);
      readIndex += 3;
    }

    // Keep only the 0-2 samples that did not fill a complete group.
    this.pendingMono48k.splice(0, readIndex);
    return output;
  }
}

/** Scales int16 samples to floats by dividing by 32768. */
export function int16ArrayToFloat32(input: Int16Array): Float32Array {
  const output = new Float32Array(input.length);
  for (let index = 0; index < input.length; index += 1) {
    output[index] = input[index] / 32768;
  }
  return output;
}

/** Clamps floats to [-1, 1] and encodes them as little-endian PCM16. */
export function float32ToPcm16Buffer(input: Float32Array): Buffer {
  const buffer = Buffer.allocUnsafe(input.length * 2);
  for (let index = 0; index < input.length; index += 1) {
    const value = Math.max(-1, Math.min(1, input[index]));
    // Asymmetric scale: int16 spans -32768..32767.
    const scaled = value < 0 ? value * 32768 : value * 32767;
    buffer.writeInt16LE(Math.round(scaled), index * 2);
  }
  return buffer;
}

/**
 * Removes the first `frameSize` samples from `source` (mutating it) and
 * returns them, or returns `null` when fewer than `frameSize` are buffered.
 */
export function takeFrame(source: number[], frameSize: number): Int16Array | null {
  if (source.length < frameSize) {
    return null;
  }

  const values = source.splice(0, frameSize);
  return Int16Array.from(values);
}
// diff --git a/src/config.ts b/src/config.ts
// new file mode 100644
// index 0000000..a122c90
// --- /dev/null
// +++ b/src/config.ts
// @@ -0,0 +1,29 @@
import { config as loadDotenv } from "dotenv";
import { z } from "zod";

loadDotenv();

const envSchema = z.object({
  DISCORD_BOT_TOKEN: z.string().min(1),
  DISCORD_APPLICATION_ID: z.string().min(1),
  DISCORD_COMMAND_GUILD_ID: z.string().min(1).optional(),
  OPENAI_API_KEY: z.string().min(1),
  OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
  ELEVENLABS_API_KEY: z.string().min(1),
  ELEVENLABS_VOICE_ID: z.string().min(1),
  ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
  ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
  BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
  MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
  DEBUG_TEXT_EVENTS: z
    .string()
    .optional()
    .transform((value) => value === "true"),
  LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});

export type AppConfig = z.infer<typeof envSchema>;

/**
 * Copies `env` without entries that are unset or blank.
 *
 * A line such as `DISCORD_COMMAND_GUILD_ID=` (as shipped in `.env.example`)
 * yields an empty string, which would fail `min(1)` instead of being treated
 * as "not configured" and falling back to optional/default handling.
 */
function dropBlankValues(env: NodeJS.ProcessEnv): Record<string, string> {
  const result: Record<string, string> = {};
  for (const [key, value] of Object.entries(env)) {
    if (value !== undefined && value.trim() !== "") {
      result[key] = value;
    }
  }
  return result;
}

/**
 * Validates the process environment against the schema above.
 *
 * @throws ZodError when a required variable is missing/blank or a value is invalid.
 */
export function loadConfig(): AppConfig {
  return envSchema.parse(dropBlankValues(process.env));
}
// diff --git a/src/index.ts b/src/index.ts
// new file mode 100644
// index 0000000..59cd390
// --- /dev/null
// +++ b/src/index.ts
// @@ -0,0 +1,240 @@
import process from "node:process";

import {
  GatewayIntentBits,
  REST,
  Routes,
  SlashCommandBuilder,
  type ChatInputCommandInteraction,
  type Client,
  type GuildMember,
  type VoiceBasedChannel,
} from "discord.js";
import { Client as DiscordClient } from "discord.js";

import { GuildVoiceSession } from "./audio/guild-voice-session.js";
import {
loadConfig } from "./config.js";
import { Logger } from "./logger.js";
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
import { OpenAiLlmService } from "./services/openai-llm.js";

const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);

const commands = [
  new SlashCommandBuilder().setName("join").setDescription("현재 들어가 있는 음성 채널에 봇을 입장시킵니다."),
  new SlashCommandBuilder().setName("leave").setDescription("현재 음성 세션을 종료합니다."),
  new SlashCommandBuilder().setName("status").setDescription("현재 음성 세션 상태를 확인합니다."),
  new SlashCommandBuilder().setName("reset").setDescription("대화 문맥과 재생 큐를 초기화합니다."),
  new SlashCommandBuilder()
    .setName("say")
    .setDescription("텍스트를 바로 음성으로 읽습니다.")
    .addStringOption((option) =>
      option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
    ),
].map((command) => command.toJSON());

const client = new DiscordClient({
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});

const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);
/** Active voice sessions, keyed by guild id (at most one per guild). */
const sessions = new Map<string, GuildVoiceSession>();

/** Voice channel the invoking member is currently in, or `null`. */
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  const member = interaction.member as GuildMember | null;
  return member?.voice.channel ?? null;
}

/** Session of the guild the interaction came from, if one is active. */
function findSession(interaction: ChatInputCommandInteraction): GuildVoiceSession | undefined {
  return interaction.guildId ? sessions.get(interaction.guildId) : undefined;
}

/**
 * Registers the slash commands, guild-scoped when `DISCORD_COMMAND_GUILD_ID`
 * is configured and globally otherwise.
 */
async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  if (config.DISCORD_COMMAND_GUILD_ID) {
    await rest.put(
      Routes.applicationGuildCommands(config.DISCORD_APPLICATION_ID, config.DISCORD_COMMAND_GUILD_ID),
      {
        body: commands,
      },
    );
    logger.info("Registered guild commands", config.DISCORD_COMMAND_GUILD_ID);
    return;
  }

  await rest.put(Routes.applicationCommands(config.DISCORD_APPLICATION_ID), {
    body: commands,
  });
  logger.info("Registered global commands");
}

/**
 * Returns the guild's session for the caller's voice channel, creating it
 * (and replacing a session bound to a different channel) when needed.
 *
 * @throws Error with a user-facing message when the interaction has no guild
 *   or the caller is not in a voice channel.
 */
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  if (!interaction.guild) {
    throw new Error("Guild interaction required");
  }

  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }

  const existing = sessions.get(interaction.guild.id);
  if (existing && existing.voiceChannelId === voiceChannel.id) {
    existing.setTextChannel(interaction.channelId);
    return existing;
  }

  if (existing) {
    // Remove the entry first so a failing teardown cannot leave a dead session registered.
    sessions.delete(interaction.guild.id);
    await existing.destroy();
  }

  const session = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild: interaction.guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });
  sessions.set(interaction.guild.id, session);
  return session;
}

async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  try {
    await createSession(interaction);
    // createSession only succeeds for the caller's current channel, so read the
    // name from it instead of parsing the localized status text.
    const channelName = getVoiceChannel(interaction)?.name ?? "알 수 없음";
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
  } catch (error) {
    const message = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
    await interaction.editReply(message);
  }
}

async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guildId;
  const session = guildId ? sessions.get(guildId) : undefined;
  if (!guildId || !session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  sessions.delete(guildId);
  await session.destroy();
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}

async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const session = findSession(interaction);
  if (!session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  await interaction.reply({
    content: session.statusSummary(),
    ephemeral: true,
  });
}

async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const session = findSession(interaction);
  if (!session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  session.clearConversation();
  await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
}

async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  const session = findSession(interaction);
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }

  const text = interaction.options.getString("text", true).trim();
  if (text.length === 0) {
    // Whitespace-only input would otherwise be sent to TTS as an empty request.
    await interaction.editReply("읽을 문장이 비어 있습니다.");
    return;
  }

  // Acknowledge as soon as the job is queued; speakText may not settle until
  // playback has finished.
  const speaking = session.speakText(text);
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
  await speaking;
}

let shuttingDown = false;

/** Destroys every session and the client, then exits. Repeated calls are ignored. */
async function shutdown(exitCode = 0): Promise<void> {
  if (shuttingDown) {
    // A second SIGINT/SIGTERM must not start a parallel teardown.
    return;
  }
  shuttingDown = true;

  logger.info("Shutting down");
  try {
    for (const session of sessions.values()) {
      await session.destroy().catch((error) => {
        logger.warn("Session shutdown failed", error);
      });
    }
    sessions.clear();
    await client.destroy();
  } catch (error) {
    logger.warn("Client shutdown failed", error);
  } finally {
    process.exit(exitCode);
  }
}

client.once("ready", async () => {
  logger.info("Discord client ready", client.user?.tag ?? "unknown");
  try {
    await registerCommands(client);
  } catch (error) {
    logger.error("Command registration failed", error);
  }
});

client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }

  try {
    switch (interaction.commandName) {
      case "join":
        await handleJoin(interaction);
        return;
      case "leave":
        await handleLeave(interaction);
        return;
      case "status":
        await handleStatus(interaction);
        return;
      case "reset":
        await handleReset(interaction);
        return;
      case "say":
        await handleSay(interaction);
        return;
      default:
        await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
    }
  } catch (error) {
    logger.error("Interaction handler failed", error);
    // Use whichever response path is still open for this interaction.
    if (interaction.deferred || interaction.replied) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
      return;
    }
    await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
  }
});

process.on("SIGINT", () => {
  void shutdown(0);
});

process.on("SIGTERM", () => {
  void shutdown(0);
});

async function main(): Promise<void> {
  await client.login(config.DISCORD_BOT_TOKEN);
}

void main().catch((error) => {
  logger.error("Fatal startup error", error);
process.exit(1); +}); diff --git a/src/logger.ts b/src/logger.ts new file mode 100644 index 0000000..28a8ef8 --- /dev/null +++ b/src/logger.ts @@ -0,0 +1,63 @@ +type LogLevel = "debug" | "info" | "warn" | "error"; + +const levelOrder: Record = { + debug: 10, + info: 20, + warn: 30, + error: 40, +}; + +function formatParts(parts: unknown[]): string { + return parts + .map((part) => { + if (part instanceof Error) { + return `${part.name}: ${part.message}`; + } + if (typeof part === "string") { + return part; + } + return JSON.stringify(part); + }) + .join(" "); +} + +export class Logger { + constructor(private readonly level: LogLevel) {} + + private shouldLog(target: LogLevel): boolean { + return levelOrder[target] >= levelOrder[this.level]; + } + + private write(target: LogLevel, ...parts: unknown[]): void { + if (!this.shouldLog(target)) { + return; + } + + const line = `[${new Date().toISOString()}] [${target.toUpperCase()}] ${formatParts(parts)}`; + if (target === "error") { + console.error(line); + return; + } + if (target === "warn") { + console.warn(line); + return; + } + console.log(line); + } + + debug(...parts: unknown[]): void { + this.write("debug", ...parts); + } + + info(...parts: unknown[]): void { + this.write("info", ...parts); + } + + warn(...parts: unknown[]): void { + this.write("warn", ...parts); + } + + error(...parts: unknown[]): void { + this.write("error", ...parts); + } +} diff --git a/src/services/conversation.ts b/src/services/conversation.ts new file mode 100644 index 0000000..f3cdb9d --- /dev/null +++ b/src/services/conversation.ts @@ -0,0 +1,77 @@ +export interface ConversationTurn { + role: "user" | "assistant"; + text: string; + speakerId?: string; + speakerName?: string; + createdAt: number; +} + +export interface UserUtterance { + speakerId: string; + speakerName: string; + text: string; +} + +export class ConversationMemory { + private readonly turns: ConversationTurn[] = []; + + constructor(private readonly maxTurns: number) {} + 
+ addUserTurn(utterance: UserUtterance): void { + this.turns.push({ + role: "user", + text: utterance.text, + speakerId: utterance.speakerId, + speakerName: utterance.speakerName, + createdAt: Date.now(), + }); + this.trim(); + } + + addAssistantTurn(text: string): void { + this.turns.push({ + role: "assistant", + text, + createdAt: Date.now(), + }); + this.trim(); + } + + clear(): void { + this.turns.splice(0, this.turns.length); + } + + recentTurns(): ConversationTurn[] { + return [...this.turns]; + } + + buildPrompt(currentUtterance: UserUtterance): string { + const recent = this.turns + .slice(-this.maxTurns) + .map((turn) => { + if (turn.role === "assistant") { + return `[assistant]\n${turn.text}`; + } + return `[user speaker_id=${turn.speakerId ?? "unknown"} speaker_name=${turn.speakerName ?? "unknown"}]\n${turn.text}`; + }) + .join("\n\n"); + + const historyBlock = recent.length > 0 ? recent : "(이전 대화 없음)"; + + return [ + "최근 대화:", + historyBlock, + "", + "이번 발화:", + `[user speaker_id=${currentUtterance.speakerId} speaker_name=${currentUtterance.speakerName}]`, + currentUtterance.text, + ].join("\n"); + } + + private trim(): void { + const overflow = this.turns.length - this.maxTurns; + if (overflow > 0) { + this.turns.splice(0, overflow); + } + } +} diff --git a/src/services/elevenlabs-stt.ts b/src/services/elevenlabs-stt.ts new file mode 100644 index 0000000..1c3719b --- /dev/null +++ b/src/services/elevenlabs-stt.ts @@ -0,0 +1,124 @@ +import WebSocket from "ws"; + +import type { AppConfig } from "../config.js"; + +interface ElevenLabsMessage { + message_type?: string; + text?: string; + error?: string; +} + +const NON_FATAL_ERROR_TYPES = new Set([ + "insufficient_audio_activity", +]); + +export class ElevenLabsSttService { + constructor(private readonly config: AppConfig) {} + + async transcribePcm16(pcm16MonoAudio: Buffer): Promise { + if (pcm16MonoAudio.byteLength === 0) { + return null; + } + + const url = new 
URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime"); + url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL); + url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE); + url.searchParams.set("audio_format", "pcm_16000"); + url.searchParams.set("commit_strategy", "manual"); + url.searchParams.set("include_timestamps", "false"); + url.searchParams.set("include_language_detection", "false"); + url.searchParams.set("enable_logging", "false"); + + return await new Promise((resolve, reject) => { + const socket = new WebSocket(url, { + headers: { + "xi-api-key": this.config.ELEVENLABS_API_KEY, + }, + }); + + let settled = false; + let lastTranscript = ""; + + const timeout = setTimeout(() => { + finish(lastTranscript || null); + }, 15_000); + + const finish = (result: string | null, error?: Error) => { + if (settled) { + return; + } + settled = true; + clearTimeout(timeout); + try { + socket.close(); + } catch { + // Ignore close race. + } + + if (error) { + reject(error); + return; + } + resolve(result); + }; + + socket.on("message", (raw) => { + let message: ElevenLabsMessage; + try { + message = JSON.parse(raw.toString()) as ElevenLabsMessage; + } catch (error) { + finish(null, error as Error); + return; + } + + switch (message.message_type) { + case "session_started": + socket.send( + JSON.stringify({ + message_type: "input_audio_chunk", + audio_base_64: pcm16MonoAudio.toString("base64"), + commit: true, + sample_rate: 16000, + }), + ); + return; + case "partial_transcript": + return; + case "committed_transcript": + case "committed_transcript_with_timestamps": { + const transcript = message.text?.trim() ?? 
""; + if (transcript.length > 0) { + lastTranscript = transcript; + finish(transcript); + } + return; + } + default: + if (!message.message_type?.endsWith("error") && !message.message_type) { + return; + } + + if (message.message_type && NON_FATAL_ERROR_TYPES.has(message.message_type)) { + finish(null); + return; + } + + finish( + null, + new Error(message.error ?? `ElevenLabs STT error: ${message.message_type ?? "unknown"}`), + ); + } + }); + + socket.on("error", (error) => { + finish(null, error as Error); + }); + + socket.on("close", () => { + if (!settled) { + finish(lastTranscript || null); + } + }); + }); + } +} diff --git a/src/services/elevenlabs-tts.ts b/src/services/elevenlabs-tts.ts new file mode 100644 index 0000000..24f83a3 --- /dev/null +++ b/src/services/elevenlabs-tts.ts @@ -0,0 +1,83 @@ +import { Readable } from "node:stream"; + +import ffmpegStatic from "ffmpeg-static"; +import prism from "prism-media"; +import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice"; + +import type { AppConfig } from "../config.js"; + +export interface PreparedSpeechPlayback { + resource: AudioResource; + dispose: () => void; +} + +export class ElevenLabsTtsService { + constructor(private readonly config: AppConfig) { + const resolvedFfmpegPath = ffmpegStatic as unknown as string | null; + if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { + process.env.FFMPEG_PATH = resolvedFfmpegPath; + } + } + + async preparePlayback(text: string, signal?: AbortSignal): Promise { + const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`); + url.searchParams.set("output_format", "mp3_44100_128"); + url.searchParams.set("enable_logging", "false"); + + const response = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + "xi-api-key": this.config.ELEVENLABS_API_KEY, + }, + body: JSON.stringify({ + text, + model_id: this.config.ELEVENLABS_TTS_MODEL, + language_code: 
this.config.BOT_DEFAULT_LANGUAGE, + voice_settings: { + stability: 0.35, + similarity_boost: 0.75, + speed: 1.05, + }, + }), + signal, + }); + + if (!response.ok || !response.body) { + throw new Error(`ElevenLabs TTS request failed with status ${response.status}`); + } + + const input = Readable.fromWeb(response.body as never); + const ffmpeg = new prism.FFmpeg({ + args: [ + "-analyzeduration", + "0", + "-loglevel", + "0", + "-i", + "pipe:0", + "-f", + "s16le", + "-ar", + "48000", + "-ac", + "2", + "pipe:1", + ], + }); + + input.pipe(ffmpeg); + + const resource = createAudioResource(ffmpeg, { + inputType: StreamType.Raw, + }); + + return { + resource, + dispose: () => { + input.destroy(); + ffmpeg.destroy(); + }, + }; + } +} diff --git a/src/services/openai-llm.ts b/src/services/openai-llm.ts new file mode 100644 index 0000000..c6b02e9 --- /dev/null +++ b/src/services/openai-llm.ts @@ -0,0 +1,64 @@ +import OpenAI from "openai"; + +import type { AppConfig } from "../config.js"; +import type { ConversationMemory, UserUtterance } from "./conversation.js"; + +const ASSISTANT_INSTRUCTIONS = [ + "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다.", + "답변은 짧고 실용적으로 한다.", + "기본은 한 문장, 길어도 두 문장을 넘기지 않는다.", + "말투는 자연스러운 한국어로 유지한다.", + "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다.", + "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.", + "목록, 마크다운, 코드블록은 쓰지 않는다.", +].join(" "); + +function normalizeReply(text: string): string { + const compact = text.replace(/\s+/g, " ").trim(); + if (compact.length <= 180) { + return compact; + } + + const sentences = compact.match(/[^.!?]+[.!?]?/g); + if (!sentences || sentences.length === 0) { + return compact.slice(0, 180).trim(); + } + + return sentences.slice(0, 2).join(" ").trim().slice(0, 180).trim(); +} + +export class OpenAiLlmService { + private readonly client: OpenAI; + + constructor(private readonly config: AppConfig) { + this.client = new OpenAI({ + apiKey: this.config.OPENAI_API_KEY, + }); + } + + async generateReply(memory: ConversationMemory, 
utterance: UserUtterance): Promise { + const response = await this.client.responses.create({ + model: this.config.OPENAI_MODEL, + instructions: ASSISTANT_INSTRUCTIONS, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: memory.buildPrompt(utterance), + }, + ], + }, + ], + max_output_tokens: 120, + }); + + const output = response.output_text?.trim(); + if (!output) { + return "잘 못 들었습니다. 한 번만 다시 말씀해 주세요."; + } + + return normalizeReply(output); + } +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..7edb43f --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "strict": true, + "noEmit": false, + "rootDir": "src", + "outDir": "dist", + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "types": [ + "node" + ] + }, + "include": [ + "src/**/*.ts" + ] +}